更新对话转工单处理逻辑,增强用户问题和解决方案提取功能,添加槽位填充支持,调整最大工作线程数为10,优化意图识别API,重排序匹配术语,改进数据模型以支持软件名称枚举,提升代码结构和可读性。
This commit is contained in:
@@ -9,7 +9,27 @@ Description: 提取和分类的数据模型
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from enum import Enum
|
||||
|
||||
class SoftwareName(str, Enum):
    """Closed set of software product names accepted by the slot models.

    Each member's value is the canonical product name. Because the class
    mixes in ``str``, a member compares equal to its plain string value.
    ``UNKNOWN`` is the empty string and is therefore falsy.
    """

    D3 = "配网工程计价通D3软件"
    C1 = "新型储能电站建设计价通C1软件"
    Z1 = "西藏电力工程计价通Z1软件"
    T1 = "技改检修工程计价通T1软件"
    T1_LIST = "技改检修清单计价通T1软件"
    MAIN = "主网电力建设计价通软件"
    UNKNOWN = ""  # unknown / not recognised


# Alias mapping: canonical software name -> colloquial names users type.
# It is built AFTER the class on purpose: any plain assignment inside an Enum
# class body is turned into an enum member, so defining the dict in the body
# would add a bogus "ALIASES" member (visible in iteration, __members__ and
# any schema generated from the enum) instead of a lookup table.
_SOFTWARE_NAME_ALIASES: Dict[SoftwareName, List[str]] = {
    SoftwareName.D3: ["配网D3", "D3软件", "配网工程软件"],
    SoftwareName.C1: ["储能C1", "C1软件", "储能电站软件", "储能软件"],
    SoftwareName.Z1: ["西藏Z1", "Z1软件", "西藏电力软件"],
    SoftwareName.T1: ["技改T1", "T1软件", "技改检修软件"],
    SoftwareName.T1_LIST: ["技改清单T1", "T1清单软件", "技改检修清单软件"],
    # The duplicated "主网软件" entry from the original list is dropped.
    SoftwareName.MAIN: ["主网软件", "电力建设软件", "主网建设软件"],
}

# Keep ``SoftwareName.ALIASES`` reachable for existing callers, but as an
# ordinary class attribute (a dict) rather than as an enum member.
SoftwareName.ALIASES = _SOFTWARE_NAME_ALIASES
|
||||
|
||||
# 定义输出模型
|
||||
class Term(BaseModel):
|
||||
@@ -38,7 +58,7 @@ class QueryRewrite(BaseModel):
|
||||
# 1. 软件问题
|
||||
# 1.1 软件功能
|
||||
class SoftwareFunction(BaseModel):
|
||||
software_name: str = Field(description="软件名称")
|
||||
software_name: SoftwareName = Field(description="软件名称")
|
||||
function_name: str = Field(description="具体功能名称")
|
||||
operation: str = Field(description="用户操作意图(如何使用功能、功能入口、功能使用场景)")
|
||||
software_version: Optional[str] = Field(None, description="软件版本")
|
||||
@@ -57,7 +77,7 @@ class SoftwareFunction(BaseModel):
|
||||
|
||||
# 1.2 故障排查
|
||||
class TroubleShooting(BaseModel):
|
||||
software_name: str = Field(description="软件名称")
|
||||
software_name: SoftwareName = Field(description="软件名称")
|
||||
function_name: str = Field(description="具体功能名称/操作描述")
|
||||
error_message: str = Field(description="报错信息/异常现象")
|
||||
software_version: Optional[str] = Field(None, description="软件版本")
|
||||
@@ -80,7 +100,7 @@ class TroubleShooting(BaseModel):
|
||||
class ProfessionalConsulting(BaseModel):
|
||||
scene_subject: str = Field(description="场景主体")
|
||||
business_scene: str = Field(description="业务场景描述")
|
||||
software_name: Optional[str] = Field(None, description="软件名称")
|
||||
software_name: Optional[SoftwareName] = Field(None, description="软件名称")
|
||||
|
||||
def check_required_slots(self) -> Tuple[bool, Dict[str, str]]:
|
||||
"""检查必填槽位是否都存在"""
|
||||
@@ -95,7 +115,7 @@ class ProfessionalConsulting(BaseModel):
|
||||
class DataProblem(BaseModel):
|
||||
expense_type: str = Field(description="费用类型")
|
||||
operation_purpose: str = Field(description="操作目的")
|
||||
software_name: Optional[str] = Field(None, description="软件名称")
|
||||
software_name: Optional[SoftwareName] = Field(None, description="软件名称")
|
||||
project_type: Optional[str] = Field(None, description="工程类型")
|
||||
|
||||
def check_required_slots(self) -> Tuple[bool, Dict[str, str]]:
|
||||
@@ -141,7 +161,9 @@ class SoftwareLock(BaseModel):
|
||||
|
||||
# 3.3 安装下载类
|
||||
class InstallationDownload(BaseModel):
|
||||
software_name: str = Field(description="软件/插件名称")
|
||||
|
||||
software_name: SoftwareName = Field(description="软件/插件名称,与file_name二选一")
|
||||
file_name: str = Field(description="文件名,与software_name二选一")
|
||||
operation_stage: str = Field(description="操作阶段")
|
||||
os_version: Optional[str] = Field(None, description="操作系统版本")
|
||||
package_source: Optional[str] = Field(None, description="安装包来源/版本号")
|
||||
@@ -149,8 +171,9 @@ class InstallationDownload(BaseModel):
|
||||
def check_required_slots(self) -> Tuple[bool, Dict[str, str]]:
|
||||
"""检查必填槽位是否都存在"""
|
||||
missing_slots = {}
|
||||
if not self.software_name:
|
||||
if not self.software_name and not self.file_name:
|
||||
missing_slots["software_name"] = InstallationDownload.model_fields["software_name"].description
|
||||
missing_slots["file_name"] = InstallationDownload.model_fields["file_name"].description
|
||||
if not self.operation_stage:
|
||||
missing_slots["operation_stage"] = InstallationDownload.model_fields["operation_stage"].description
|
||||
return len(missing_slots) == 0, missing_slots
|
||||
@@ -158,7 +181,7 @@ class InstallationDownload(BaseModel):
|
||||
# 3.4 问题排查类
|
||||
class ProblemDiagnosis(BaseModel):
|
||||
error_message: str = Field(description="报错信息/异常现象")
|
||||
software_name: Optional[str] = Field(None, description="软件名称")
|
||||
software_name: Optional[SoftwareName] = Field(None, description="软件名称")
|
||||
os_version: Optional[str] = Field(None, description="操作系统版本")
|
||||
|
||||
def check_required_slots(self) -> Tuple[bool, Dict[str, str]]:
|
||||
|
||||
@@ -148,6 +148,40 @@ class IntentRecognizer:
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"无法解析LLM关键词提取响应: {e}") from e
|
||||
|
||||
def rerank_matched_terms(self, query_key: str, matched_terms: set, top_k: int = 2, min_score: float = 0.6) -> List[Term]:
    """
    Rerank the terms recalled for one query keyword and keep the relevant ones.

    Args:
        query_key: Keyword extracted from the user query; passed to the
            reranker as the query text.
        matched_terms: Candidate Term objects recalled for that keyword.
        top_k: Value forwarded to the reranker as ``top_k``.
        min_score: Minimum rerank score a term needs to be kept. Defaults to
            0.6, the threshold that used to be hard-coded.

    Returns:
        The terms whose rerank score is at least ``min_score``, in the order
        the reranker returned them. An empty list when ``matched_terms`` is
        empty. If reranking raises, every candidate is returned unfiltered
        (best-effort fallback).
    """
    if not matched_terms:
        return []

    # Snapshot the candidates once so the texts sent to the reranker and the
    # later lookup by result["index"] are guaranteed to share one ordering.
    candidates = list(matched_terms)

    try:
        # Text representation of each term handed to the rerank model.
        term_texts = [
            "名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description
            for term in candidates
        ]

        reranker = SiliconFlowReRankerModel()
        rerank_results = reranker.rerank(query_key, term_texts, top_k=top_k)

        # Map each result back to its Term, dropping low-scoring matches.
        return [candidates[result["index"]] for result in rerank_results if result["score"] >= min_score]

    except Exception:
        # Deliberate best-effort: a reranker failure must not break keyword
        # matching, but it should be visible instead of silently swallowed.
        import logging

        logging.getLogger(__name__).warning(
            "Reranking failed for keyword %r; returning unranked terms", query_key, exc_info=True
        )
        return candidates
|
||||
|
||||
def match_keywords(self, query: str) -> Tuple[TermList, List[str]]:
|
||||
"""
|
||||
从用户问题中匹配关键词,结合LLM提取和向量检索
|
||||
@@ -158,7 +192,6 @@ class IntentRecognizer:
|
||||
Returns:
|
||||
匹配到的关键词列表
|
||||
"""
|
||||
matched_terms = set() # 存储匹配到的Term对象
|
||||
query_keys=[]
|
||||
# 步骤2: 使用LLM提取查询中的关键词
|
||||
try:
|
||||
@@ -168,12 +201,13 @@ class IntentRecognizer:
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"LLM关键词提取失败: {e}") from e
|
||||
|
||||
matched_terms = [] # 存储匹配到的Term对象
|
||||
# 步骤3: 使用向量检索找到相似的专业名词
|
||||
try:
|
||||
# Run a vector search for each keyword in query_keys
|
||||
for current_key in query_keys:
|
||||
vector_results = self.noun_retriever.query(current_key, top_k=3, use_intersection=True)
|
||||
|
||||
current_key_terms = set()
|
||||
# 添加向量检索结果
|
||||
for result in vector_results:
|
||||
term = Term(
|
||||
@@ -181,18 +215,12 @@ class IntentRecognizer:
|
||||
synonymous=result.get('synonymous', []),
|
||||
description=result.get('description', '')
|
||||
)
|
||||
matched_terms.add(term)
|
||||
|
||||
current_key_terms.add(term)
|
||||
reranked_terms = self.rerank_matched_terms(current_key, current_key_terms)
|
||||
matched_terms.extend(reranked_terms)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"向量检索关键词时出错: {e}") from e
|
||||
|
||||
if len(matched_terms) != 0:
|
||||
txts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms]
|
||||
# txts = [term.name for term in matched_terms]
|
||||
xinference_reranker = SiliconFlowReRankerModel()
|
||||
rerank_results = xinference_reranker.rerank(query, txts, top_k=5)
|
||||
matched_terms_list = list(matched_terms)
|
||||
matched_terms = [matched_terms_list[result["index"]] for result in rerank_results]
|
||||
# 提取所有Term对象的名称并排序
|
||||
# Wrap the matched terms (a list at this point) in a TermList
|
||||
term_list = TermList(terms=list(matched_terms))
|
||||
@@ -295,7 +323,7 @@ class IntentRecognizer:
|
||||
# rewrite = QueryRewrite(rewrite=query)
|
||||
return classification, keywords_terms, rewrite, query_keys
|
||||
|
||||
def fill_slots(self, query: str, classification: Classification, keywords: TermList) -> Dict[str, Any]:
|
||||
def fill_slots(self, query: str, classification: Classification) -> Dict[str, Any]:
|
||||
"""
|
||||
根据分类结果对问题进行槽位填充
|
||||
|
||||
@@ -313,7 +341,7 @@ class IntentRecognizer:
|
||||
return {"error": "未找到匹配的槽位模型"}
|
||||
|
||||
# 使用LLM进行槽位填充
|
||||
filled_slots = self._fill_slots_with_llm(query, classification, keywords, slot_model)
|
||||
filled_slots = self._fill_slots_with_llm(query, classification, slot_model)
|
||||
|
||||
# 检查必填槽位是否都已填充
|
||||
is_complete, missing_slots = filled_slots.check_required_slots()
|
||||
@@ -349,7 +377,7 @@ class IntentRecognizer:
|
||||
return DataProblem
|
||||
|
||||
# 安装下载注册
|
||||
elif classification.vertical_classification == "安装下载":
|
||||
elif classification.vertical_classification == "安装下载注册":
|
||||
if classification.sub_classification == "后缀名咨询":
|
||||
return FileExtensionConsulting
|
||||
elif classification.sub_classification == "软件锁类":
|
||||
@@ -361,14 +389,13 @@ class IntentRecognizer:
|
||||
|
||||
return None
|
||||
|
||||
def _fill_slots_with_llm(self, query: str, classification: Classification, keywords: TermList, slot_model_class: type) -> Any:
|
||||
def _fill_slots_with_llm(self, query: str, classification: Classification, slot_model_class: type) -> Any:
|
||||
"""
|
||||
使用LLM进行槽位填充
|
||||
|
||||
Args:
|
||||
query: 用户原始问题
|
||||
classification: 意图分类结果
|
||||
keywords: 匹配的关键词列表
|
||||
slot_model_class: 槽位模型类
|
||||
|
||||
Returns:
|
||||
@@ -377,15 +404,11 @@ class IntentRecognizer:
|
||||
# 准备提示词
|
||||
slot_parser = PydanticOutputParser(pydantic_object=slot_model_class)
|
||||
model_schema = json.dumps(slot_model_class.model_json_schema(), ensure_ascii=False)
|
||||
terms_dict = [term.model_dump() for term in keywords.terms]
|
||||
keywords_str = json.dumps(terms_dict, ensure_ascii=False)
|
||||
|
||||
formatted_prompt = slot_filling_prompt.format(
|
||||
query=query,
|
||||
vertical_classification=classification.vertical_classification,
|
||||
sub_classification=classification.sub_classification,
|
||||
keywords=keywords_str,
|
||||
model_schema=model_schema,
|
||||
output_format=slot_parser.get_format_instructions()
|
||||
)
|
||||
|
||||
@@ -417,7 +440,7 @@ class IntentRecognizer:
|
||||
# 如果是有效分类,进行槽位填充
|
||||
slot_filling_result = {}
|
||||
if classification.vertical_classification not in ["其他", "闲聊"] and classification.sub_classification not in ["其他", "闲聊"]:
|
||||
slot_filling_result = self.fill_slots(rewrite.rewrite, classification, keywords)
|
||||
slot_filling_result = self.fill_slots(rewrite.rewrite, classification)
|
||||
|
||||
return {
|
||||
"classification": classification.model_dump(),
|
||||
|
||||
@@ -157,21 +157,21 @@ class ProfessionalNounVectorizer:
|
||||
for term in terms:
|
||||
name = term["name"]
|
||||
texts.append(name.strip())
|
||||
synonyms = term.get("synonymous", [])
|
||||
synonymous = term.get("synonymous", [])
|
||||
description = term.get("description", "")
|
||||
# 记录元数据
|
||||
metadatas.append({
|
||||
"name": name,
|
||||
"synonyms": synonyms,
|
||||
"synonymous": synonymous,
|
||||
"description": description
|
||||
})
|
||||
|
||||
if len(synonyms) > 0:
|
||||
synonyms_str = ', '.join(synonyms)
|
||||
if len(synonymous) > 0:
|
||||
synonyms_str = ', '.join(synonymous)
|
||||
texts.append(synonyms_str.strip())
|
||||
metadatas.append({
|
||||
"name": name,
|
||||
"synonyms": synonyms,
|
||||
"synonymous": synonymous,
|
||||
"description": description
|
||||
})
|
||||
|
||||
@@ -179,7 +179,7 @@ class ProfessionalNounVectorizer:
|
||||
texts.append(description.strip())
|
||||
metadatas.append({
|
||||
"name": name,
|
||||
"synonyms": synonyms,
|
||||
"synonymous": synonymous,
|
||||
"description": description
|
||||
})
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ query_rewrite_prompt = """
|
||||
|
||||
## 第三阶段:专业重构
|
||||
3. 术语规范化处理
|
||||
a. 实施术语映射:将口语表达替换为知识库标准术语
|
||||
a. 实施术语映射:将口语表达替换为知识库标准术语,优先保留原问题中的术语
|
||||
b. 执行结构优化:
|
||||
- 采用【术语标记】规范标注关键概念
|
||||
- 构建主谓宾明确的问题句式
|
||||
@@ -118,14 +118,13 @@ query_rewrite_prompt = """
|
||||
# 质量约束条款
|
||||
1. 语义内容保真原则
|
||||
- 禁止修改原问题核心诉求(如转换主语/变更操作对象)
|
||||
- 保留原始问题的限定条件
|
||||
- 保留原始问题的限定条件(包括:软件名称等)
|
||||
|
||||
2. 术语使用规范
|
||||
- 仅使用检索返回的关键词进行术语替换
|
||||
- 新增术语必须来自关键词集合
|
||||
|
||||
3. 结构优化标准
|
||||
- 问题长度控制在20字内
|
||||
- 必须包含≥1个【标注术语】
|
||||
- 禁止添加解释性语句
|
||||
|
||||
@@ -144,12 +143,6 @@ slot_filling_prompt = """
|
||||
垂直领域分类: {vertical_classification}
|
||||
子分类: {sub_classification}
|
||||
|
||||
【已识别关键词】
|
||||
{keywords}
|
||||
|
||||
【目标数据结构】
|
||||
{model_schema}
|
||||
|
||||
【输出格式】
|
||||
{output_format}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user