更新专业术语索引文件,优化意图识别逻辑,添加后缀项更新功能,调整重排序参数以提高相关性,同时修正文档中的描述信息。
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -63,7 +63,7 @@ def intent_recognize():
|
||||
for term in keywords["terms"]:
|
||||
term_info = {
|
||||
"名称": term["name"],
|
||||
# "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "",
|
||||
# "同义词": ";".join(term["synonymous"]) if term["synonymous"] else [],
|
||||
# "描述": term["description"]
|
||||
}
|
||||
term_details.append(term_info)
|
||||
|
||||
@@ -184,7 +184,7 @@ class IntentRecognizer:
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e
|
||||
|
||||
def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2) -> List[Term]:
|
||||
def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2, rerank_score:float = 0.6) -> List[Term]:
|
||||
"""
|
||||
对召回的专业术语进行重排序,按与用户查询的相关性排序
|
||||
|
||||
@@ -199,9 +199,13 @@ class IntentRecognizer:
|
||||
if not matched_terms:
|
||||
return []
|
||||
|
||||
if len(matched_terms) <= top_k:
|
||||
return list(matched_terms)
|
||||
|
||||
try:
|
||||
# 将每个术语转换为可用于重排序的文本表示
|
||||
term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms]
|
||||
# term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms]
|
||||
term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) for term in matched_terms]
|
||||
|
||||
# 使用重排序模型
|
||||
xinference_reranker = SiliconFlowReRankerModel()
|
||||
@@ -211,7 +215,7 @@ class IntentRecognizer:
|
||||
matched_terms_list = list(matched_terms)
|
||||
|
||||
# 根据重排序结果获取排序后的术语列表
|
||||
reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= 0.6]
|
||||
reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= rerank_score]
|
||||
|
||||
return reranked_terms
|
||||
|
||||
@@ -279,7 +283,8 @@ class IntentRecognizer:
|
||||
改写结果
|
||||
"""
|
||||
# 准备问题改写提示
|
||||
terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms]
|
||||
# terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms]
|
||||
terms_dict = [term.model_dump() for term in keywords.terms]
|
||||
keywords_str = json.dumps(terms_dict, ensure_ascii=False)
|
||||
query_rewrite_parser = PydanticOutputParser(pydantic_object=QueryRewrite)
|
||||
# formatted_prompt = query_rewrite_prompt.format(query=query,
|
||||
@@ -369,7 +374,7 @@ class IntentRecognizer:
|
||||
)
|
||||
|
||||
# 步骤3: 进行意图识别和槽位填充
|
||||
result = self._process_intent_and_slot(query, conversation_context, chat_history, previous_slots)
|
||||
result = self._process_intent_and_slot(rewrite.rewrite, conversation_context, chat_history, previous_slots)
|
||||
result.update({"keywords": keywords_terms.model_dump(),
|
||||
"rewrite": rewrite.model_dump(),
|
||||
"query_keys": query_keys})
|
||||
|
||||
@@ -159,7 +159,7 @@ graph TD
|
||||
|
||||
### 三、重构优先级
|
||||
1. **背景补充**
|
||||
- 历史对话中确定的软件/地区必须继承(例:"这软件"→"【配网工程D3】")
|
||||
- 历史对话中确定的背景信息需要保留(例:"这软件"→"【配网工程D3】")
|
||||
|
||||
2. **术语处理**
|
||||
- 同义词转标准词 → 批量设置定额
|
||||
@@ -190,9 +190,7 @@ graph TD
|
||||
|
||||
|
||||
intent_and_slot_prompt = """
|
||||
# 电力造价软件意图分类与槽位填充统一提示词
|
||||
|
||||
你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。
|
||||
# 你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。
|
||||
|
||||
{classification_info}
|
||||
|
||||
@@ -206,6 +204,7 @@ intent_and_slot_prompt = """
|
||||
- **技改检修工程计价通T1软件**:别名包括技改T1、T1软件、技改检修软件等
|
||||
- **技改检修清单计价通T1软件**:别名包括技改清单T1、T1清单软件、技改检修清单软件等
|
||||
- **主网电力建设计价通软件**:别名包括主网软件、电力建设软件、主网建设软件、博微电力建设计价通等
|
||||
不在上述软件之列的,使用用户输入中的软件名称
|
||||
|
||||
## 【任务要求】
|
||||
|
||||
|
||||
@@ -127,7 +127,9 @@ class ProfessionalNounVectorizer:
|
||||
|
||||
# 准备数据
|
||||
texts, metadatas = self._prepare_terms_for_faiss(deduplicated_terms)
|
||||
|
||||
suffix_text,suffix_metadatas = self._updata_suffix_item()
|
||||
texts.extend(suffix_text)
|
||||
metadatas.extend(suffix_metadatas)
|
||||
# 创建索引
|
||||
faiss_index = self._create_index(texts, metadatas)
|
||||
|
||||
@@ -140,6 +142,30 @@ class ProfessionalNounVectorizer:
|
||||
logging.error(f"多文件向量化处理失败: {e}")
|
||||
return False
|
||||
|
||||
def _updata_suffix_item(self) -> Tuple[List[str], List[Dict]]:
    """Load suffix keywords from ``./data/nouns/suffix_keywords.json``.

    Despite its name, this method only reads the file; it never writes to it.
    The parsed JSON is iterated and each entry's ``"name"`` value is
    upper-cased. Every upper-cased name is returned twice: once as the text
    entry and once inside a metadata dict that carries an empty synonym list
    and an empty description.

    NOTE(review): the file is presumably a JSON array of objects with a
    string ``"name"`` field; confirm against the data file.

    Returns:
        A ``(texts, metadatas)`` tuple of two equal-length lists. Both lists
        are empty when the file is missing, unreadable, or malformed.
    """
    # The path is relative to the current working directory of the process.
    suffix_keywords_path = os.path.join(".", 'data', 'nouns', 'suffix_keywords.json')

    try:
        with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
            raw_terms = json.load(f)
        # Normalise once here; the same value is used as text and metadata name.
        names = [term["name"].upper() for term in raw_terms]
    except FileNotFoundError:
        # The suffix file is optional: a missing file yields no extra terms
        # and is not reported.
        return [], []
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
        # OSError: unreadable path; ValueError: invalid JSON or bad encoding;
        # KeyError / TypeError / AttributeError: entries of an unexpected shape.
        logging.warning(f"读取{suffix_keywords_path}失败: {e}")
        return [], []

    # Each metadata dict gets its own list object so later mutation of one
    # entry cannot leak into the others.
    metadatas = [{"name": name, "synonymous": [], "description": ""} for name in names]
    logging.info(f"加载{suffix_keywords_path},共{len(names)}条")
    return names, metadatas
|
||||
|
||||
def _prepare_terms_for_faiss(self, terms: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict]]:
|
||||
"""
|
||||
@@ -156,15 +182,9 @@ class ProfessionalNounVectorizer:
|
||||
|
||||
for term in terms:
|
||||
name = term["name"]
|
||||
texts.append(name.strip())
|
||||
synonymous = term.get("synonymous", [])
|
||||
description = term.get("description", "")
|
||||
# 记录元数据
|
||||
metadatas.append({
|
||||
"name": name,
|
||||
"synonymous": synonymous,
|
||||
"description": description
|
||||
})
|
||||
|
||||
if len(synonymous) > 0:
|
||||
for synonyms_str in synonymous:
|
||||
@@ -175,13 +195,21 @@ class ProfessionalNounVectorizer:
|
||||
"description": description
|
||||
})
|
||||
|
||||
if len(description) > 0:
|
||||
texts.append(description.strip())
|
||||
metadatas.append({
|
||||
"name": name,
|
||||
"synonymous": synonymous,
|
||||
"description": description
|
||||
})
|
||||
# texts.append(name.strip())
|
||||
# metadatas.append({
|
||||
# "name": name,
|
||||
# "synonymous": synonymous,
|
||||
# "description": description
|
||||
# })
|
||||
|
||||
# 不检索描述字段
|
||||
# if len(description) > 0:
|
||||
# texts.append(description.strip())
|
||||
# metadatas.append({
|
||||
# "name": name,
|
||||
# "synonymous": synonymous,
|
||||
# "description": description
|
||||
# })
|
||||
|
||||
return texts, metadatas
|
||||
|
||||
|
||||
Reference in New Issue
Block a user