新增同义词词典支持：优化AsyncIntentRecognizer类以加载jieba自定义词典和同义词字典，调整关键词提取流程（以同义词字典查找替代原有的向量检索与重排序），简化日志记录，更新PromptTemplates以规范同义词处理规则。

2025-07-31 09:23:27 +08:00
parent 728262cc65
commit 6a72233a97
3 changed files with 297 additions and 63 deletions
@@ -80,8 +80,49 @@ class AsyncIntentRecognizer:
        # 加载软件词条名称库
        self._soft_wiki_library = self._load_soft_wiki_library()
        # 异步检索器将在create方法中初始化
-        self._noun_retriever = None
-    
+        # self._noun_retriever = None
+        # 初始化jieba自定义词典
+        self._init_jieba_dict()
+        self._synonymous_dict=self._init_synonymous_dict()
+
+    def _init_jieba_dict(self):
+        """Load the jieba user dictionary (data/nouns/all_synonymous_jieba.txt); failures are logged, never raised."""
+        try:
+            current_dir = os.path.dirname(os.path.abspath(__file__))  # path is resolved relative to this module's file
+            dict_path = os.path.join(current_dir, "..", "..", "data", "nouns", "all_synonymous_jieba.txt")
+            
+            # Only load when the dictionary file exists; a missing file is logged as a warning
+            if os.path.exists(dict_path):
+                jieba.load_userdict(dict_path)
+                logging.info("成功加载jieba自定义词典")
+            else:
+                logging.warning(f"自定义词典文件不存在: {dict_path}")
+        except Exception as e:  # any error is logged and swallowed, so the caller continues
+            logging.error(f"加载jieba自定义词典失败: {e}")
+
+    def _init_synonymous_dict(self):
+        """Build the synonym lookup from data/nouns/merged_nouns.json (key: synonym, value: noun name); returns {} if the file is missing or loading fails."""
+        try:
+            current_dir = os.path.dirname(os.path.abspath(__file__))  # path is resolved relative to this module's file
+            dict_path = os.path.join(current_dir, "..", "..", "data", "nouns", "merged_nouns.json")
+            
+            # Start with an empty mapping; it is only filled when the file exists
+            synonymous_dict={}
+            if os.path.exists(dict_path):
+                with open(dict_path, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                for cur_data in data:  # assumes a JSON list of objects with "name" and "synonymous" keys -- TODO confirm
+                    synonymous=cur_data["synonymous"]  # NOTE(review): if this is a ';'-joined string, the loop below iterates characters -- confirm schema
+                    name=cur_data["name"]
+                    for cur_synonymous in synonymous:
+                        synonymous_dict[cur_synonymous]=name  # a synonym seen again overwrites the earlier mapping
+            else:
+                logging.warning(f"名词库文件不存在: {dict_path}")
+            return synonymous_dict
+        except Exception as e:  # includes a missing key or malformed JSON; partial results are discarded
+            logging.error(f"加载名词库文件失败: {e}")
+            return {}
+
    def _load_soft_wiki_library(self):
        """
        加载软件wiki库
@@ -105,7 +146,7 @@ class AsyncIntentRecognizer:
        """
        instance = cls()
        # 异步初始化名词检索器
-        instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
+        # instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
        return instance
    
    def _load_suffix_keywords(self, filepath: str = None) -> List[str]:
@@ -277,7 +318,7 @@ class AsyncIntentRecognizer:
        """
        start_time = time.time()
        query_keys=[]
-        # 步骤1: 使用LLM提取查询中的关键词
+        # 步骤1: 提取查询中的关键词
        try:
            llm_start_time = time.time()
            extracted_terms = await self._extract_keywords_async(query, use_jieba)
@@ -289,44 +330,14 @@ class AsyncIntentRecognizer:
            raise RuntimeError(f"异步LLM关键词提取失败: {e}") from e
        
        matched_terms = []  # 存储匹配到的Term对象       
-        # 步骤2: 使用向量检索找到相似的专业名词
-        try:
-            vector_start_time = time.time()
-            
-            # 创建并行任务列表
-            async def process_single_keyword(current_key: str) -> List[Term]:
-                """处理单个关键词的向量检索和重排序"""
-                vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
-                current_key_terms = set()
-                # 添加向量检索结果
-                for result in vector_results:
-                    if isinstance(result.get('synonymous', []), str):
-                        result['synonymous'] = result['synonymous'].split(';')
-                    term = Term(
-                        name=result.get('name'),
-                        synonymous=result.get('synonymous', []),
-                        description=result.get('description', '')
-                    )
-                    current_key_terms.add(term)
-                if len(current_key_terms) > 0:
-                    reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
-                    return reranked_terms
-                return []
-            
-            # 并行处理所有关键词
-            keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys]
-            keyword_results = await asyncio.gather(*keyword_tasks)
-            
-            # 合并所有结果
-            for result in keyword_results:
-                if len(result) > 0:
-                    matched_terms.extend(result)
-                
-            vector_end_time = time.time()
-            vector_time = vector_end_time - vector_start_time
-        except Exception as e:
-            raise RuntimeError(f"异步向量检索关键词时出错: {e}") from e
        
+        # 查找同义词
+        for cur_key in query_keys:
+            if cur_key not in self._synonymous_dict:
+                continue
+            name = self._synonymous_dict[cur_key]
+            matched_terms.append(Term(name=name,synonymous=[cur_key],description=""))
+
        # 提取所有Term对象的名称并排序
        # 将set类型的matched_terms转换为TermList类型
        term_list = TermList(terms=list(matched_terms))
@@ -334,7 +345,7 @@ class AsyncIntentRecognizer:
        total_time = end_time - start_time

        # 输出整合的时间日志
-        logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}秒, 问题关键词提取: {llm_time:.2f}秒, 向量检索+重排序: {vector_time:.2f}秒")
+        logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}秒")
        
        return term_list, query_keys