# REVIEW NOTES (added commentary only; the patch text that follows is unchanged).
# This is a git patch whose line breaks have been collapsed into four long lines.
#
# What the patch changes, as far as the visible hunks show:
#   * data/nouns/professional_nouns_index/index.faiss and index.pkl:
#     binary files differ.
#   * rag2_0/dify/intent_recognition_api.py: the only changed line is a
#     commented-out dict entry (fallback "" -> []).
#   * rag2_0/intent_recognition/IntentRecognition.py:
#       - _rerank_matched_terms gains rerank_score (default 0.6), which
#         replaces the literal 0.6 in the score filter, and now returns
#         list(matched_terms) ahead of the try block that runs the reranker
#         when len(matched_terms) <= top_k.
#       - the per-term rerank text drops the description part; the previous
#         expression is kept as a comment.
#       - the rewrite step serialises terms with model_dump() instead of
#         model_dump(exclude={"description"}).
#       - _process_intent_and_slot is called with rewrite.rewrite instead of
#         query.
#   * rag2_0/intent_recognition/Multi_PromptTemplates.py: prompt text edits -
#     the background rule under "重构优先级" is reworded, the title line of
#     intent_and_slot_prompt is removed and the role sentence becomes the
#     heading, and a line is added telling the model to use the software name
#     from the user input when it is not one of the listed products.
#   * rag2_0/intent_recognition/ProfessionalNounVector.py:
#       - new method _updata_suffix_item reads
#         ./data/nouns/suffix_keywords.json when it exists and returns
#         (texts, metadata) with upper-cased names, an empty synonymous list
#         and an empty description; any exception while reading is logged as
#         a warning.
#       - its results are appended to texts/metadatas before _create_index.
#       - _prepare_terms_for_faiss no longer appends the term-name entry or
#         the description entry (the removed code is re-added as comments).
#
# NOTE(review): the early return in _rerank_matched_terms skips the
#   rerank_score filter, so candidate sets of size <= top_k are returned
#   unfiltered - confirm this is intended.
# NOTE(review): _updata_suffix_item upper-cases each name twice; its docstring
#   says it updates suffix_keywords.json but the visible code only reads it;
#   the method name looks like a typo of "update" - TODO confirm before
#   renaming (the call site in this patch uses the same spelling).
# NOTE(review): the suffix file path starts at "." and is therefore resolved
#   against the process working directory - confirm how the job is launched.
# NOTE(review): after this patch no visible hunk line indexes the term name
#   itself; terms with an empty synonymous list may end up with no vectors -
#   verify against the code between the hunks.
# NOTE(review): hunk headers (@@ -a,b +c,d @@) encode line counts; keep them
#   in sync if any hunk body is edited.
diff --git a/data/nouns/professional_nouns_index/index.faiss b/data/nouns/professional_nouns_index/index.faiss index c177d79..d4698b0 100644 Binary files a/data/nouns/professional_nouns_index/index.faiss and b/data/nouns/professional_nouns_index/index.faiss differ diff --git a/data/nouns/professional_nouns_index/index.pkl b/data/nouns/professional_nouns_index/index.pkl index 61ed265..23b245f 100644 Binary files a/data/nouns/professional_nouns_index/index.pkl and b/data/nouns/professional_nouns_index/index.pkl differ diff --git a/rag2_0/dify/intent_recognition_api.py b/rag2_0/dify/intent_recognition_api.py index 4b990e4..42da11e 100644 --- a/rag2_0/dify/intent_recognition_api.py +++ b/rag2_0/dify/intent_recognition_api.py @@ -63,7 +63,7 @@ def intent_recognize(): for term in keywords["terms"]: term_info = { "名称": term["name"], - # "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "", + # "同义词": ";".join(term["synonymous"]) if term["synonymous"] else [], # "描述": term["description"] } term_details.append(term_info) diff --git a/rag2_0/intent_recognition/IntentRecognition.py b/rag2_0/intent_recognition/IntentRecognition.py index 4ba9584..d1e0cc0 100644 --- a/rag2_0/intent_recognition/IntentRecognition.py +++ b/rag2_0/intent_recognition/IntentRecognition.py @@ -184,7 +184,7 @@ class IntentRecognizer: except Exception as e: raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e - def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2) -> List[Term]: + def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2, rerank_score:float = 0.6) -> List[Term]: """ 对召回的专业术语进行重排序,按与用户查询的相关性排序 @@ -198,10 +198,14 @@ class IntentRecognizer: """ if not matched_terms: return [] - + + if len(matched_terms) <= top_k: + return list(matched_terms) + try: # 将每个术语转换为可用于重排序的文本表示 - term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms] + # term_texts = 
["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms] + term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) for term in matched_terms] # 使用重排序模型 xinference_reranker = SiliconFlowReRankerModel() @@ -211,7 +215,7 @@ class IntentRecognizer: matched_terms_list = list(matched_terms) # 根据重排序结果获取排序后的术语列表 - reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= 0.6] + reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= rerank_score] return reranked_terms @@ -279,7 +283,8 @@ class IntentRecognizer: 改写结果 """ # 准备问题改写提示 - terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms] + # terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms] + terms_dict = [term.model_dump() for term in keywords.terms] keywords_str = json.dumps(terms_dict, ensure_ascii=False) query_rewrite_parser = PydanticOutputParser(pydantic_object=QueryRewrite) # formatted_prompt = query_rewrite_prompt.format(query=query, @@ -369,7 +374,7 @@ class IntentRecognizer: ) # 步骤3: 进行意图识别和槽位填充 - result = self._process_intent_and_slot(query, conversation_context, chat_history, previous_slots) + result = self._process_intent_and_slot(rewrite.rewrite, conversation_context, chat_history, previous_slots) result.update({"keywords": keywords_terms.model_dump(), "rewrite": rewrite.model_dump(), "query_keys": query_keys}) diff --git a/rag2_0/intent_recognition/Multi_PromptTemplates.py b/rag2_0/intent_recognition/Multi_PromptTemplates.py index eca6d26..a29534a 100644 --- a/rag2_0/intent_recognition/Multi_PromptTemplates.py +++ b/rag2_0/intent_recognition/Multi_PromptTemplates.py @@ -159,7 +159,7 @@ graph TD ### 三、重构优先级 1. **背景补充** - - 历史对话中确定的软件/地区必须继承(例:"这软件"→"【配网工程D3】") + - 历史对话中确定的背景信息需要保留(例:"这软件"→"【配网工程D3】") 2. 
**术语处理** - 同义词转标准词 → 批量设置定额 @@ -190,9 +190,7 @@ graph TD intent_and_slot_prompt = """ -# 电力造价软件意图分类与槽位填充统一提示词 - -你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。 +# 你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。 {classification_info} @@ -206,6 +204,7 @@ intent_and_slot_prompt = """ - **技改检修工程计价通T1软件**:别名包括技改T1、T1软件、技改检修软件等 - **技改检修清单计价通T1软件**:别名包括技改清单T1、T1清单软件、技改检修清单软件等 - **主网电力建设计价通软件**:别名包括主网软件、电力建设软件、主网建设软件、博微电力建设计价通等 +不在上述软件之列的,使用用户输入中的软件名称 ## 【任务要求】 diff --git a/rag2_0/intent_recognition/ProfessionalNounVector.py b/rag2_0/intent_recognition/ProfessionalNounVector.py index ce139bf..9921db0 100644 --- a/rag2_0/intent_recognition/ProfessionalNounVector.py +++ b/rag2_0/intent_recognition/ProfessionalNounVector.py @@ -127,7 +127,9 @@ class ProfessionalNounVectorizer: # 准备数据 texts, metadatas = self._prepare_terms_for_faiss(deduplicated_terms) - + suffix_text,suffix_metadatas = self._updata_suffix_item() + texts.extend(suffix_text) + metadatas.extend(suffix_metadatas) # 创建索引 faiss_index = self._create_index(texts, metadatas) @@ -140,6 +142,30 @@ class ProfessionalNounVectorizer: logging.error(f"多文件向量化处理失败: {e}") return False + def _updata_suffix_item(self)->Tuple[List[str], List[Dict]] : + """ + 更新suffix_keywords.json文件 + + Returns: + 更新后的术语列表 + """ + # 加载suffix_keywords.json文件 + text=[] + meta_info=[] + + suffix_keywords_path = os.path.join(".", 'data', 'nouns', 'suffix_keywords.json') + if os.path.exists(suffix_keywords_path): + try: + with open(suffix_keywords_path, 'r', encoding='utf-8') as f: + suffix_terms = json.load(f) + suffix_terms = [{"name": term["name"].upper(), "synonymous": [], "description": ""} for term in suffix_terms] + for cur_suffix in suffix_terms: + text.append(cur_suffix["name"].upper()) + meta_info.append(cur_suffix) + logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}条") + except Exception as e: + logging.warning(f"读取{suffix_keywords_path}失败: {e}") + return text,meta_info def _prepare_terms_for_faiss(self, terms: List[Dict[str, Any]]) -> 
Tuple[List[str], List[Dict]]: """ @@ -156,15 +182,9 @@ class ProfessionalNounVectorizer: for term in terms: name = term["name"] - texts.append(name.strip()) synonymous = term.get("synonymous", []) description = term.get("description", "") # 记录元数据 - metadatas.append({ - "name": name, - "synonymous": synonymous, - "description": description - }) if len(synonymous) > 0: for synonyms_str in synonymous: @@ -175,13 +195,21 @@ class ProfessionalNounVectorizer: "description": description }) - if len(description) > 0: - texts.append(description.strip()) - metadatas.append({ - "name": name, - "synonymous": synonymous, - "description": description - }) + # texts.append(name.strip()) + # metadatas.append({ + # "name": name, + # "synonymous": synonymous, + # "description": description + # }) + + # 不检索描述字段 + # if len(description) > 0: + # texts.append(description.strip()) + # metadatas.append({ + # "name": name, + # "synonymous": synonymous, + # "description": description + # }) return texts, metadatas