新增同义词词典支持，优化IntentRecognition类以加载jieba自定义词典和同义词字典，调整关键词提取流程，简化日志记录，更新PromptTemplates以规范同义词处理规则。

This commit is contained in:
2025-07-31 09:23:27 +08:00
parent 728262cc65
commit 6a72233a97
3 changed files with 297 additions and 63 deletions
+53 -42
View File
@@ -80,8 +80,49 @@ class AsyncIntentRecognizer:
# 加载软件词条名称库
self._soft_wiki_library = self._load_soft_wiki_library()
# 异步检索器将在create方法中初始化
self._noun_retriever = None
# self._noun_retriever = None
# 初始化jieba自定义词典
self._init_jieba_dict()
self._synonymous_dict=self._init_synonymous_dict()
def _init_jieba_dict(self):
    """Load the custom jieba user dictionary, if it is present.

    The dictionary path is built relative to this module's directory:
    ``../../data/nouns/all_synonymous_jieba.txt``. When the file exists
    it is passed to ``jieba.load_userdict``; when it does not, a warning
    is logged and nothing is loaded.

    Any exception raised along the way is logged as an error and
    swallowed, so this method never raises and always returns ``None``.
    """
    try:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        user_dict_file = os.path.join(
            module_dir, "..", "..", "data", "nouns", "all_synonymous_jieba.txt"
        )
        # Guard clause: without the file there is nothing to register.
        if not os.path.exists(user_dict_file):
            logging.warning(f"自定义词典文件不存在: {user_dict_file}")
            return
        jieba.load_userdict(user_dict_file)
        logging.info("成功加载jieba自定义词典")
    except Exception as exc:
        logging.error(f"加载jieba自定义词典失败: {exc}")
def _init_synonymous_dict(self):
    """Build the synonym lookup table from ``merged_nouns.json``.

    The file is located relative to this module's directory at
    ``../../data/nouns/merged_nouns.json`` and is iterated as a sequence
    of entries, each read through its ``"name"`` and ``"synonymous"``
    keys.

    Returns:
        dict: Maps each synonym to the ``name`` of the entry that lists
        it. If two entries list the same synonym, the later entry wins.
        An empty dict is returned when the file is missing (a warning is
        logged) or when reading/parsing fails (an error is logged).

    Entry handling:
        * Entries that are not dicts, or that have no usable ``name`` or
          ``synonymous`` value, are skipped individually, so one
          malformed record no longer discards the whole table.
        * A ``synonymous`` value given as a single string is split on
          ``;`` rather than iterated character by character.
        * Empty synonyms are ignored.
    """
    synonymous_dict = {}
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        dict_path = os.path.join(current_dir, "..", "..", "data", "nouns", "merged_nouns.json")
        if not os.path.exists(dict_path):
            logging.warning(f"名词库文件不存在: {dict_path}")
            return synonymous_dict

        with open(dict_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        for cur_data in data:
            if not isinstance(cur_data, dict):
                continue
            name = cur_data.get("name")
            synonymous = cur_data.get("synonymous")
            if not name or not synonymous:
                continue
            # A plain string would otherwise be walked one character at a
            # time; treat it as a ';'-separated list instead.
            if isinstance(synonymous, str):
                synonymous = [part.strip() for part in synonymous.split(";")]
            for cur_synonymous in synonymous:
                if cur_synonymous:
                    synonymous_dict[cur_synonymous] = name
        return synonymous_dict
    except Exception as e:
        # Best-effort loader: a broken file must not prevent start-up.
        logging.error(f"加载名词库文件失败: {e}")
        return {}
def _load_soft_wiki_library(self):
"""
加载软件wiki库
@@ -105,7 +146,7 @@ class AsyncIntentRecognizer:
"""
instance = cls()
# 异步初始化名词检索器
instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
# instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
return instance
def _load_suffix_keywords(self, filepath: str = None) -> List[str]:
@@ -277,7 +318,7 @@ class AsyncIntentRecognizer:
"""
start_time = time.time()
query_keys=[]
# 步骤1: 使用LLM提取查询中的关键词
# 步骤1: 提取查询中的关键词
try:
llm_start_time = time.time()
extracted_terms = await self._extract_keywords_async(query, use_jieba)
@@ -289,44 +330,14 @@ class AsyncIntentRecognizer:
raise RuntimeError(f"异步LLM关键词提取失败: {e}") from e
matched_terms = [] # 存储匹配到的Term对象
# 步骤2: 使用向量检索找到相似的专业名词
try:
vector_start_time = time.time()
# 创建并行任务列表
async def process_single_keyword(current_key: str) -> List[Term]:
"""处理单个关键词的向量检索和重排序"""
vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
current_key_terms = set()
# 添加向量检索结果
for result in vector_results:
if isinstance(result.get('synonymous', []), str):
result['synonymous'] = result['synonymous'].split(';')
term = Term(
name=result.get('name'),
synonymous=result.get('synonymous', []),
description=result.get('description', '')
)
current_key_terms.add(term)
if len(current_key_terms) > 0:
reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
return reranked_terms
return []
# 并行处理所有关键词
keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys]
keyword_results = await asyncio.gather(*keyword_tasks)
# 合并所有结果
for result in keyword_results:
if len(result) > 0:
matched_terms.extend(result)
vector_end_time = time.time()
vector_time = vector_end_time - vector_start_time
except Exception as e:
raise RuntimeError(f"异步向量检索关键词时出错: {e}") from e
# 查找同义词
for cur_key in query_keys:
if cur_key not in self._synonymous_dict:
continue
name = self._synonymous_dict[cur_key]
matched_terms.append(Term(name=name,synonymous=[cur_key],description=""))
# 提取所有Term对象的名称并排序
# 将matched_terms列表封装为TermList类型
term_list = TermList(terms=list(matched_terms))
@@ -334,7 +345,7 @@ class AsyncIntentRecognizer:
total_time = end_time - start_time
# 输出整合的时间日志
logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}, 问题关键词提取: {llm_time:.2f}秒, 向量检索+重排序: {vector_time:.2f}")
logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}")
return term_list, query_keys