更新专业术语索引文件，优化意图识别逻辑，添加后缀项更新功能，调整重排序参数以提高相关性，同时修正文档中的描述信息。

2025-06-16 15:18:04 +08:00
parent f1b3f7e158
commit 503c7ff0bc
6 changed files with 57 additions and 25 deletions
@@ -127,7 +127,9 @@ class ProfessionalNounVectorizer:
            
            # 准备数据
            texts, metadatas = self._prepare_terms_for_faiss(deduplicated_terms)
-            
+            suffix_text,suffix_metadatas = self._updata_suffix_item()
+            texts.extend(suffix_text)
+            metadatas.extend(suffix_metadatas)
            # 创建索引
            faiss_index = self._create_index(texts, metadatas)
            
@@ -140,6 +142,30 @@ class ProfessionalNounVectorizer:
            logging.error(f"多文件向量化处理失败: {e}")
            return False
    
+    def _updata_suffix_item(self)->Tuple[List[str], List[Dict]] :
+        """
+        Load suffix keywords from ./data/nouns/suffix_keywords.json; the file is only read, never written.
+
+        Returns:
+            (texts, metadatas): upper-cased names and one {"name", "synonymous": [], "description": ""} dict per name; both stay empty if the file is missing or loading fails.
+        """
+        # Parallel result lists, filled from suffix_keywords.json below
+        text=[]
+        meta_info=[]
+
+        suffix_keywords_path = os.path.join(".", 'data', 'nouns', 'suffix_keywords.json')  # relative to the current working directory
+        if os.path.exists(suffix_keywords_path):
+            try:
+                with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
+                    suffix_terms = json.load(f)
+                    suffix_terms = [{"name": term["name"].upper(), "synonymous": [], "description": ""} for term in suffix_terms]  # keep only the upper-cased name; other fields are blanked
+                    for cur_suffix in suffix_terms:
+                        text.append(cur_suffix["name"].upper())  # name was already upper-cased above, so this .upper() changes nothing
+                        meta_info.append(cur_suffix)
+                    logging.info(f"加载{suffix_keywords_path}，共{len(suffix_terms)}条")
+            except Exception as e:  # best-effort: any failure is logged as a warning and the lists collected so far are returned
+                logging.warning(f"读取{suffix_keywords_path}失败: {e}")
+        return text,meta_info
    
    def _prepare_terms_for_faiss(self, terms: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict]]:
        """
@@ -156,15 +182,9 @@ class ProfessionalNounVectorizer:
        
        for term in terms:
            name = term["name"]
-            texts.append(name.strip())
            synonymous = term.get("synonymous", [])
            description = term.get("description", "")
            # 记录元数据
-            metadatas.append({
-                "name": name,
-                "synonymous": synonymous,
-                "description": description
-            })

            if len(synonymous) > 0:
                for synonyms_str in synonymous:
@@ -175,13 +195,21 @@ class ProfessionalNounVectorizer:
                        "description": description
                    })
            
-            if len(description) > 0:
-                texts.append(description.strip())
-                metadatas.append({
-                    "name": name,
-                    "synonymous": synonymous,
-                    "description": description
-                })
+            # texts.append(name.strip())
+            # metadatas.append({
+            #     "name": name,
+            #     "synonymous": synonymous,
+            #     "description": description
+            # })
+
+            # 不检索描述字段
+            # if len(description) > 0:
+            #     texts.append(description.strip())
+            #     metadatas.append({
+            #         "name": name,
+            #         "synonymous": synonymous,
+            #         "description": description
+            #     })
        
        return texts, metadatas