Compare commits

..

3 Commits

14 changed files with 567 additions and 895 deletions
File diff suppressed because it is too large Load Diff
Binary file not shown.
Binary file not shown.
@@ -9815,7 +9815,6 @@
{ {
"name": "措施项目一", "name": "措施项目一",
"synonymous": [ "synonymous": [
"措施费"
], ],
"description": "费用项目分类,包含特殊地区施工增加费等临时性工程费用;在建安预算中手动添加的不按费率计取的措施项目费,用于计算一笔性费用。" "description": "费用项目分类,包含特殊地区施工增加费等临时性工程费用;在建安预算中手动添加的不按费率计取的措施项目费,用于计算一笔性费用。"
}, },
@@ -79,11 +79,6 @@
"synonymous": [], "synonymous": [],
"description": "施工机具使用费的价格依据" "description": "施工机具使用费的价格依据"
}, },
{
"name": "博微储能计价通C1",
"synonymous": [],
"description": "用于锂离子电池储能工程编制的软件"
},
{ {
"name": "试用账号", "name": "试用账号",
"synonymous": [], "synonymous": [],
@@ -108,7 +103,8 @@
"name": "博微储能计价通C1软件", "name": "博微储能计价通C1软件",
"synonymous": [ "synonymous": [
"储能C1软件", "储能C1软件",
"储能软件" "储能软件",
"博微储能计价通C1"
], ],
"description": "用于锂离子电池储能工程编制的软件" "description": "用于锂离子电池储能工程编制的软件"
}, },
@@ -235,12 +235,12 @@
"description": "查看定额升级的具体规则" "description": "查看定额升级的具体规则"
}, },
{ {
"name": "西藏电力工程造价2016 V2.7.1.0", "name": "西藏电力工程造价2016",
"synonymous": [], "synonymous": [],
"description": "西藏电力工程造价的旧版定额标准" "description": "西藏电力工程造价的旧版定额标准"
}, },
{ {
"name": "西藏配网造价2017 V2.12.3.0", "name": "西藏配网造价2017",
"synonymous": [], "synonymous": [],
"description": "西藏配网造价的旧版定额标准" "description": "西藏配网造价的旧版定额标准"
}, },
@@ -4207,11 +4207,6 @@
"synonymous": [], "synonymous": [],
"description": "用于批量设置或修改多个工程的模板、参数和物料信息" "description": "用于批量设置或修改多个工程的模板、参数和物料信息"
}, },
{
"name": "参数",
"synonymous": [],
"description": "工程设置中的各项参数"
},
{ {
"name": "地区范围", "name": "地区范围",
"synonymous": [], "synonymous": [],
+33 -10
View File
@@ -71,16 +71,16 @@ class TermMerger:
logging.warning(f"读取{file}失败: {e}") logging.warning(f"读取{file}失败: {e}")
# 加载suffix_keywords.json文件 # 加载suffix_keywords.json文件
suffix_keywords_path = os.path.join(os.path.dirname(os.path.dirname(self.EXTRACTED_NOUNS_DIR)), 'data', 'nouns', 'suffix_keywords.json') # suffix_keywords_path = os.path.join(os.path.dirname(os.path.dirname(self.EXTRACTED_NOUNS_DIR)), 'data', 'nouns', 'suffix_keywords.json')
if os.path.exists(suffix_keywords_path): # if os.path.exists(suffix_keywords_path):
try: # try:
with open(suffix_keywords_path, 'r', encoding='utf-8') as f: # with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
suffix_terms = json.load(f) # suffix_terms = json.load(f)
suffix_terms = [{"name": term["name"].upper(), "synonymous": "", "description": ""} for term in suffix_terms] # suffix_terms = [{"name": term["name"].upper(), "synonymous": "", "description": ""} for term in suffix_terms]
all_terms.extend(suffix_terms) # all_terms.extend(suffix_terms)
logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}") # logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}条")
except Exception as e: # except Exception as e:
logging.warning(f"读取{suffix_keywords_path}失败: {e}") # logging.warning(f"读取{suffix_keywords_path}失败: {e}")
return all_terms return all_terms
@@ -154,6 +154,9 @@ class TermMerger:
for result in tqdm(executor.map(self.process_term, items_to_process), total=len(items_to_process)): for result in tqdm(executor.map(self.process_term, items_to_process), total=len(items_to_process)):
merged_terms.append(result) merged_terms.append(result)
# 4. 去重
merged_terms = self.deduplicate_synonymous_name(merged_terms)
# 4. 保存合并结果 # 4. 保存合并结果
os.makedirs(os.path.dirname(self.OUTPUT_PATH), exist_ok=True) os.makedirs(os.path.dirname(self.OUTPUT_PATH), exist_ok=True)
with open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f: with open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f:
@@ -162,6 +165,26 @@ class TermMerger:
return merged_terms return merged_terms
def deduplicate_synonymous_name(self, terms):
    """Remove duplicate term entries and synonyms that collide with term names.

    The clean-up runs in two passes:

    1. Entries are de-duplicated by ``name``; the first entry seen for a
       name is kept and later ones are dropped.
    2. For every kept entry, any synonym that equals the ``name`` of a kept
       entry (its own included) is removed from ``synonymous``.

    The input dicts are not modified; each returned entry is a shallow copy
    carrying the filtered ``synonymous`` list.

    Args:
        terms: Iterable of term dicts. Each must have a ``name`` key.
            ``synonymous`` may be a list of strings, a single string,
            ``None`` or absent.

    Returns:
        A new list of term dicts in first-seen order, each with a
        ``synonymous`` list free of values that are also entry names.

    Raises:
        KeyError: If an entry has no ``name`` key.
    """
    # Pass 1: keep only the first entry for each distinct name.
    seen_names = set()
    unique_terms = []
    for entry in terms:
        entry_name = entry["name"]
        if entry_name in seen_names:
            continue
        seen_names.add(entry_name)
        unique_terms.append(entry)

    # Pass 2: drop synonyms that are themselves the name of some entry.
    # After pass 1, seen_names is exactly the set of kept names.
    cleaned = []
    for entry in unique_terms:
        synonyms = entry.get("synonymous") or []
        if isinstance(synonyms, str):
            # A bare string would otherwise be iterated character by character.
            synonyms = [synonyms]
        kept_synonyms = [syn for syn in synonyms if syn not in seen_names]
        # Copy instead of mutating so the caller's dicts stay untouched.
        cleaned.append({**entry, "synonymous": kept_synonyms})
    return cleaned
def main(): def main():
"""主函数,创建TermMerger实例并执行合并""" """主函数,创建TermMerger实例并执行合并"""
+1 -1
View File
@@ -63,7 +63,7 @@ def intent_recognize():
for term in keywords["terms"]: for term in keywords["terms"]:
term_info = { term_info = {
"名称": term["name"], "名称": term["name"],
# "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "", # "同义词": ";".join(term["synonymous"]) if term["synonymous"] else [],
# "描述": term["description"] # "描述": term["description"]
} }
term_details.append(term_info) term_details.append(term_info)
+2 -2
View File
@@ -139,7 +139,7 @@ class SlotBase(BaseModel):
if v in SOFTWARE_NAME_ALIAS_MAP: if v in SOFTWARE_NAME_ALIAS_MAP:
return SOFTWARE_NAME_ALIAS_MAP[v].value return SOFTWARE_NAME_ALIAS_MAP[v].value
# 如果无法匹配,返回原值用于错误提示 # 如果无法匹配,返回原值
return v return v
return v return v
@@ -204,7 +204,7 @@ class ProfessionalConsultingSlots(SlotBase):
# 2.2 数据问题 # 2.2 数据问题
class DataProblemSlots(SlotBase): class DataProblemSlots(SlotBase):
expense_type: str = Field(default="", description="费用类型") expense_type: str = Field(default="", description="费用(数据)项、费用类型")
operation_purpose: str = Field(default="", description="操作目的") operation_purpose: str = Field(default="", description="操作目的")
software_name: Optional[str] = Field(default="", description="软件名称") software_name: Optional[str] = Field(default="", description="软件名称")
project_type: Optional[str] = Field(default="", description="工程类型") project_type: Optional[str] = Field(default="", description="工程类型")
+15 -8
View File
@@ -184,7 +184,7 @@ class IntentRecognizer:
except Exception as e: except Exception as e:
raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e
def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2) -> List[Term]: def _rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2, rerank_score:float = 0.6) -> List[Term]:
""" """
对召回的专业术语进行重排序,按与用户查询的相关性排序 对召回的专业术语进行重排序,按与用户查询的相关性排序
@@ -198,10 +198,14 @@ class IntentRecognizer:
""" """
if not matched_terms: if not matched_terms:
return [] return []
if len(matched_terms) <= top_k:
return list(matched_terms)
try: try:
# 将每个术语转换为可用于重排序的文本表示 # 将每个术语转换为可用于重排序的文本表示
term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms] # term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms]
term_texts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) for term in matched_terms]
# 使用重排序模型 # 使用重排序模型
xinference_reranker = SiliconFlowReRankerModel() xinference_reranker = SiliconFlowReRankerModel()
@@ -211,7 +215,7 @@ class IntentRecognizer:
matched_terms_list = list(matched_terms) matched_terms_list = list(matched_terms)
# 根据重排序结果获取排序后的术语列表 # 根据重排序结果获取排序后的术语列表
reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= 0.6] reranked_terms = [matched_terms_list[result["index"]] for result in rerank_results if result["score"] >= rerank_score]
return reranked_terms return reranked_terms
@@ -266,19 +270,21 @@ class IntentRecognizer:
term_list = TermList(terms=list(matched_terms)) term_list = TermList(terms=list(matched_terms))
return term_list, query_keys return term_list, query_keys
def _rewrite_query(self, query: str, keywords: TermList, chat_history: List[Dict[str, str]] = None, context: str = "") -> QueryRewrite: def _rewrite_query(self, query: str, keywords: TermList, query_keys:List[str], chat_history: List[Dict[str, str]] = None, context: str = "") -> QueryRewrite:
""" """
对用户问题进行改写 对用户问题进行改写
Args: Args:
query: 用户原始问题 query: 用户原始问题
keywords: 匹配到的关键词列表 keywords: 匹配到的关键词列表
query_keys: 用户查询中提取的关键词列表
Returns: Returns:
改写结果 改写结果
""" """
# 准备问题改写提示 # 准备问题改写提示
terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms] # terms_dict = [term.model_dump(exclude={"description"}) for term in keywords.terms]
terms_dict = [term.model_dump() for term in keywords.terms]
keywords_str = json.dumps(terms_dict, ensure_ascii=False) keywords_str = json.dumps(terms_dict, ensure_ascii=False)
query_rewrite_parser = PydanticOutputParser(pydantic_object=QueryRewrite) query_rewrite_parser = PydanticOutputParser(pydantic_object=QueryRewrite)
# formatted_prompt = query_rewrite_prompt.format(query=query, # formatted_prompt = query_rewrite_prompt.format(query=query,
@@ -362,12 +368,13 @@ class IntentRecognizer:
rewrite = self._rewrite_query( rewrite = self._rewrite_query(
query=query, query=query,
keywords=keywords_terms, keywords=keywords_terms,
query_keys=query_keys,
chat_history=chat_history, chat_history=chat_history,
context=conversation_context context=conversation_context
) )
# 步骤3: 进行意图识别和槽位填充 # 步骤3: 进行意图识别和槽位填充
result = self._process_intent_and_slot(query, conversation_context, chat_history, previous_slots) result = self._process_intent_and_slot(rewrite.rewrite, conversation_context, chat_history, previous_slots)
result.update({"keywords": keywords_terms.model_dump(), result.update({"keywords": keywords_terms.model_dump(),
"rewrite": rewrite.model_dump(), "rewrite": rewrite.model_dump(),
"query_keys": query_keys}) "query_keys": query_keys})
@@ -159,7 +159,7 @@ graph TD
### 三、重构优先级 ### 三、重构优先级
1. **背景补充** 1. **背景补充**
- 历史对话中确定的软件/地区必须继承(例:"这软件""【配网工程D3】" - 历史对话中确定的背景信息需要保留(例:"这软件""【配网工程D3】"
2. **术语处理** 2. **术语处理**
- 同义词转标准词 → 批量设置定额 - 同义词转标准词 → 批量设置定额
@@ -190,9 +190,7 @@ graph TD
intent_and_slot_prompt = """ intent_and_slot_prompt = """
# 电力造价软件意图分类与槽位填充统一提示词 # 你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。
你是一个专业的电力造价领域智能助手,负责对用户输入进行意图分类识别和关键信息槽位填充。
{classification_info} {classification_info}
@@ -206,6 +204,7 @@ intent_and_slot_prompt = """
- **技改检修工程计价通T1软件**:别名包括技改T1、T1软件、技改检修软件等 - **技改检修工程计价通T1软件**:别名包括技改T1、T1软件、技改检修软件等
- **技改检修清单计价通T1软件**:别名包括技改清单T1、T1清单软件、技改检修清单软件等 - **技改检修清单计价通T1软件**:别名包括技改清单T1、T1清单软件、技改检修清单软件等
- **主网电力建设计价通软件**:别名包括主网软件、电力建设软件、主网建设软件、博微电力建设计价通等 - **主网电力建设计价通软件**:别名包括主网软件、电力建设软件、主网建设软件、博微电力建设计价通等
不在上述软件之列的,使用用户输入中的软件名称
## 【任务要求】 ## 【任务要求】
@@ -127,7 +127,9 @@ class ProfessionalNounVectorizer:
# 准备数据 # 准备数据
texts, metadatas = self._prepare_terms_for_faiss(deduplicated_terms) texts, metadatas = self._prepare_terms_for_faiss(deduplicated_terms)
suffix_text,suffix_metadatas = self._updata_suffix_item()
texts.extend(suffix_text)
metadatas.extend(suffix_metadatas)
# 创建索引 # 创建索引
faiss_index = self._create_index(texts, metadatas) faiss_index = self._create_index(texts, metadatas)
@@ -140,6 +142,30 @@ class ProfessionalNounVectorizer:
logging.error(f"多文件向量化处理失败: {e}") logging.error(f"多文件向量化处理失败: {e}")
return False return False
def _updata_suffix_item(self)->Tuple[List[str], List[Dict]] :
"""
更新suffix_keywords.json文件
Returns:
更新后的术语列表
"""
# 加载suffix_keywords.json文件
text=[]
meta_info=[]
suffix_keywords_path = os.path.join(".", 'data', 'nouns', 'suffix_keywords.json')
if os.path.exists(suffix_keywords_path):
try:
with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
suffix_terms = json.load(f)
suffix_terms = [{"name": term["name"].upper(), "synonymous": [], "description": ""} for term in suffix_terms]
for cur_suffix in suffix_terms:
text.append(cur_suffix["name"].upper())
meta_info.append(cur_suffix)
logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}")
except Exception as e:
logging.warning(f"读取{suffix_keywords_path}失败: {e}")
return text,meta_info
def _prepare_terms_for_faiss(self, terms: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict]]: def _prepare_terms_for_faiss(self, terms: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict]]:
""" """
@@ -156,15 +182,9 @@ class ProfessionalNounVectorizer:
for term in terms: for term in terms:
name = term["name"] name = term["name"]
texts.append(name.strip())
synonymous = term.get("synonymous", []) synonymous = term.get("synonymous", [])
description = term.get("description", "") description = term.get("description", "")
# 记录元数据 # 记录元数据
metadatas.append({
"name": name,
"synonymous": synonymous,
"description": description
})
if len(synonymous) > 0: if len(synonymous) > 0:
for synonyms_str in synonymous: for synonyms_str in synonymous:
@@ -175,13 +195,21 @@ class ProfessionalNounVectorizer:
"description": description "description": description
}) })
if len(description) > 0: # texts.append(name.strip())
texts.append(description.strip()) # metadatas.append({
metadatas.append({ # "name": name,
"name": name, # "synonymous": synonymous,
"synonymous": synonymous, # "description": description
"description": description # })
})
# 不检索描述字段
# if len(description) > 0:
# texts.append(description.strip())
# metadatas.append({
# "name": name,
# "synonymous": synonymous,
# "description": description
# })
return texts, metadatas return texts, metadatas
+1 -1
View File
@@ -34,7 +34,7 @@ classification_info="""【垂直领域分类】:
4. 其他 -- 指与软件或电力造价专业无关的日常对话、问候、感慨、情绪表达等。 4. 其他 -- 指与软件或电力造价专业无关的日常对话、问候、感慨、情绪表达等。
【软件问题包括以下两类】: 【软件问题包括以下两类】:
1. 软件功能:询问软件功能的使用、操作、位置 1. 软件功能:询问软件功能的使用、功能操作(调整)、功能位置、如何设置、如何转换
2. 故障排查:软件运行异常、软件报错、软件显示错误等 2. 故障排查:软件运行异常、软件报错、软件显示错误等
【业务问题包括以下两类】: 【业务问题包括以下两类】:
+5 -6
View File
@@ -18,6 +18,11 @@ import requests
# sk-dvbaktabkdwdpjgxyoozlwnejosjyhdgqwllfeborqahndxs # sk-dvbaktabkdwdpjgxyoozlwnejosjyhdgqwllfeborqahndxs
API_KEY_LIST=[ API_KEY_LIST=[
"sk-kvgfuqeqvpmfsccykyoohheshclcrtvjlnewratvrjpkpbkc",
"sk-zhnbqnpuumuuvegnvbgoggxafpukbzchpgrugpkobiwkzsar",
"sk-kzhxlqvqcxlnbdgnpalqnzumkmspepkttkgbophnkqanainw",
"sk-bzttugqtlskrvguvhckwamdssvgmgnrqpsialpdbskfsyyak",
"sk-tovmogiablsoeabwgqyvevpcfichyjpuzqdymmvksspdrtqt",
"sk-wqdpapdkisovziexgcyxvumpwzbjnhqbxvcqcspzctjhyhjk", "sk-wqdpapdkisovziexgcyxvumpwzbjnhqbxvcqcspzctjhyhjk",
"sk-bbntrnifrtdzhhgrtlrhvwbnaysuszviemshdakxonnnymnb", "sk-bbntrnifrtdzhhgrtlrhvwbnaysuszviemshdakxonnnymnb",
"sk-vmpnwjxersrwybmfhfxgsvbmhsmpjldxseiyxovnysrlbuzi", "sk-vmpnwjxersrwybmfhfxgsvbmhsmpjldxseiyxovnysrlbuzi",
@@ -91,12 +96,6 @@ API_KEY_LIST=[
"sk-jrdzerhmvrtvzawkksowbgkggkubwfquplmrxbdhespqgtis", "sk-jrdzerhmvrtvzawkksowbgkggkubwfquplmrxbdhespqgtis",
"sk-jjbpnkbeupsxyclcivbhizcfpfjrppddunbqynyjkqhtmpwu", "sk-jjbpnkbeupsxyclcivbhizcfpfjrppddunbqynyjkqhtmpwu",
"sk-oqehupcveovkjqqtxypqyifidcdissuyehwrkdwgruoyjkpq", "sk-oqehupcveovkjqqtxypqyifidcdissuyehwrkdwgruoyjkpq",
"sk-orhfntzrbpmpavybcjyylofxncdvufdmvlznofmhxmnjymjl",
"sk-kvgfuqeqvpmfsccykyoohheshclcrtvjlnewratvrjpkpbkc",
"sk-zhnbqnpuumuuvegnvbgoggxafpukbzchpgrugpkobiwkzsar",
"sk-kzhxlqvqcxlnbdgnpalqnzumkmspepkttkgbophnkqanainw",
"sk-bzttugqtlskrvguvhckwamdssvgmgnrqpsialpdbskfsyyak",
"sk-tovmogiablsoeabwgqyvevpcfichyjpuzqdymmvksspdrtqt",
] ]
class APIKeyManager: class APIKeyManager: