新增同义词词典支持,优化IntentRecognition类以加载jieba自定义词典和同义词字典,调整关键词提取流程,简化日志记录,更新PromptTemplates以规范同义词处理规则。
This commit is contained in:
@@ -80,8 +80,49 @@ class AsyncIntentRecognizer:
|
||||
# 加载软件词条名称库
|
||||
self._soft_wiki_library = self._load_soft_wiki_library()
|
||||
# 异步检索器将在create方法中初始化
|
||||
self._noun_retriever = None
|
||||
|
||||
# self._noun_retriever = None
|
||||
# 初始化jieba自定义词典
|
||||
self._init_jieba_dict()
|
||||
self._synonymous_dict=self._init_synonymous_dict()
|
||||
|
||||
def _init_jieba_dict(self):
    """Register the project's custom word list with jieba.

    The dictionary is looked up relative to this module at
    ``../../data/nouns/all_synonymous_jieba.txt``. Loading is best-effort:
    a missing file is reported with a warning, any other failure is
    reported with an error, and in neither case is an exception raised.

    Returns:
        None
    """
    try:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        userdict_file = os.path.join(
            module_dir, "..", "..", "data", "nouns", "all_synonymous_jieba.txt"
        )

        # Guard clause: nothing to load when the dictionary file is absent.
        if not os.path.exists(userdict_file):
            logging.warning(f"自定义词典文件不存在: {userdict_file}")
            return

        jieba.load_userdict(userdict_file)
        logging.info("成功加载jieba自定义词典")
    except Exception as err:
        # Deliberately broad: a broken dictionary must not abort construction.
        logging.error(f"加载jieba自定义词典失败: {err}")
|
||||
|
||||
def _init_synonymous_dict(self) -> dict:
    """Build the synonym -> canonical noun lookup table.

    Reads ``../../data/nouns/merged_nouns.json`` (relative to this module),
    which is expected to contain a JSON list of objects, each with a
    ``"name"`` string and a ``"synonymous"`` collection of strings.

    Loading is best-effort and never raises:

    * a missing file logs a warning and yields an empty dict;
    * an unreadable / unparsable file, or a top-level value that is not a
      list, logs an error and yields an empty dict;
    * a malformed record is skipped (and counted in a single warning)
      instead of discarding every valid record that was already parsed.

    Returns:
        dict: maps each synonym string to the ``"name"`` of the record that
        lists it. When the same synonym appears in several records, the
        last record wins.
    """
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        dict_path = os.path.join(current_dir, "..", "..", "data", "nouns", "merged_nouns.json")

        if not os.path.exists(dict_path):
            logging.warning(f"名词库文件不存在: {dict_path}")
            return {}

        with open(dict_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Deliberately broad: a broken noun library must not abort construction.
        logging.error(f"加载名词库文件失败: {e}")
        return {}

    if not isinstance(data, list):
        logging.error(f"加载名词库文件失败: 顶层结构应为列表, 实际为 {type(data).__name__}")
        return {}

    synonymous_dict = {}
    skipped = 0
    for cur_data in data:
        if not isinstance(cur_data, dict):
            skipped += 1
            continue

        name = cur_data.get("name")
        synonymous = cur_data.get("synonymous")

        # NOTE(review): elsewhere in this class a string-valued "synonymous"
        # field is treated as a ';'-separated list; the same normalisation is
        # applied here so a string is not iterated character by character.
        # Confirm the separator against the data file.
        if isinstance(synonymous, str):
            synonymous = [part.strip() for part in synonymous.split(";")]

        if not isinstance(name, str) or not name or not isinstance(synonymous, (list, tuple)):
            skipped += 1
            continue

        for cur_synonymous in synonymous:
            # Only non-empty strings are usable as lookup keys.
            if isinstance(cur_synonymous, str) and cur_synonymous:
                synonymous_dict[cur_synonymous] = name

    if skipped:
        logging.warning(f"名词库中有 {skipped} 条记录格式不正确,已跳过: {dict_path}")
    return synonymous_dict
|
||||
|
||||
def _load_soft_wiki_library(self):
|
||||
"""
|
||||
加载软件wiki库
|
||||
@@ -105,7 +146,7 @@ class AsyncIntentRecognizer:
|
||||
"""
|
||||
instance = cls()
|
||||
# 异步初始化名词检索器
|
||||
instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
|
||||
# instance._noun_retriever = await AsyncProfessionalNounRetriever.create()
|
||||
return instance
|
||||
|
||||
def _load_suffix_keywords(self, filepath: str = None) -> List[str]:
|
||||
@@ -277,7 +318,7 @@ class AsyncIntentRecognizer:
|
||||
"""
|
||||
start_time = time.time()
|
||||
query_keys=[]
|
||||
# 步骤1: 使用LLM提取查询中的关键词
|
||||
# 步骤1: 提取查询中的关键词
|
||||
try:
|
||||
llm_start_time = time.time()
|
||||
extracted_terms = await self._extract_keywords_async(query, use_jieba)
|
||||
@@ -289,44 +330,14 @@ class AsyncIntentRecognizer:
|
||||
raise RuntimeError(f"异步LLM关键词提取失败: {e}") from e
|
||||
|
||||
matched_terms = [] # 存储匹配到的Term对象
|
||||
# 步骤2: 使用向量检索找到相似的专业名词
|
||||
try:
|
||||
vector_start_time = time.time()
|
||||
|
||||
# 创建并行任务列表
|
||||
async def process_single_keyword(current_key: str) -> List[Term]:
|
||||
"""处理单个关键词的向量检索和重排序"""
|
||||
vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
|
||||
current_key_terms = set()
|
||||
# 添加向量检索结果
|
||||
for result in vector_results:
|
||||
if isinstance(result.get('synonymous', []), str):
|
||||
result['synonymous'] = result['synonymous'].split(';')
|
||||
term = Term(
|
||||
name=result.get('name'),
|
||||
synonymous=result.get('synonymous', []),
|
||||
description=result.get('description', '')
|
||||
)
|
||||
current_key_terms.add(term)
|
||||
if len(current_key_terms) > 0:
|
||||
reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
|
||||
return reranked_terms
|
||||
return []
|
||||
|
||||
# 并行处理所有关键词
|
||||
keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys]
|
||||
keyword_results = await asyncio.gather(*keyword_tasks)
|
||||
|
||||
# 合并所有结果
|
||||
for result in keyword_results:
|
||||
if len(result) > 0:
|
||||
matched_terms.extend(result)
|
||||
|
||||
vector_end_time = time.time()
|
||||
vector_time = vector_end_time - vector_start_time
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"异步向量检索关键词时出错: {e}") from e
|
||||
|
||||
# 查找同义词
|
||||
for cur_key in query_keys:
|
||||
if cur_key not in self._synonymous_dict:
|
||||
continue
|
||||
name = self._synonymous_dict[cur_key]
|
||||
matched_terms.append(Term(name=name,synonymous=[cur_key],description=""))
|
||||
|
||||
# 提取所有Term对象的名称并排序
|
||||
# 将set类型的matched_terms转换为TermList类型
|
||||
term_list = TermList(terms=list(matched_terms))
|
||||
@@ -334,7 +345,7 @@ class AsyncIntentRecognizer:
|
||||
total_time = end_time - start_time
|
||||
|
||||
# 输出整合的时间日志
|
||||
logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}秒, 问题关键词提取: {llm_time:.2f}秒, 向量检索+重排序: {vector_time:.2f}秒")
|
||||
logging.info(f"异步关键词匹配耗时统计 - 总耗时: {total_time:.2f}秒")
|
||||
|
||||
return term_list, query_keys
|
||||
|
||||
|
||||
Reference in New Issue
Block a user