diff --git a/rag2_0/demo/dialogue_to_workorder.py b/rag2_0/demo/dialogue_to_workorder.py old mode 100644 new mode 100755 index fe7af05..6f8052d --- a/rag2_0/demo/dialogue_to_workorder.py +++ b/rag2_0/demo/dialogue_to_workorder.py @@ -89,7 +89,7 @@ class DialogueToWorkorder: # 初始化LLM模型 self.llm_params = llm_params or { - "temperature": 0.6, + "temperature": 0.2, "model": os.getenv("LLM_MODEL_NAME"), "api_key": os.getenv("OPENAI_API_KEY"), "base_url": os.getenv("OPENAI_API_BASE") @@ -207,37 +207,43 @@ class DialogueToWorkorder: """分析用户问题和解决方案""" dialogue_str = self.get_dialogue_str(conversation_rows) - prompt = f""" -请从以下电力造价相关的客服对话记录中,精准提取用户提出的专业问题及对应坐席提供的解决方案。要求: + prompt = """请从以下电力造价相关的客服对话记录中,识别并精准提取用户提出的问题及对应坐席提供的解决方案。 +1、理解对话记录,识别用户在此次对话中提出的诉求 +2、根据用户提出的诉求,分析坐席提供的解决方法 +3、使用json格式输出: +{output_format} -1. 专业识别: -- 重点识别电力工程领域的专业术语(如:定额套用、工程量清单、概预算编制、造价指标分析等) -- 注意区分不同业务场景(输变电工程、配网改造、新能源项目等) -- 识别政策文件引用(如:国网Q/GDW 11337-2014标准) +输出示例: +{{ + "user_question": "软件打开报错", + "solution": "通过远程引导解决" +}} -2. 信息提取: -用户问题提取: -- 核心诉求(成本核算/计价争议/软件操作等) -- 涉及的专业环节(设计概算/施工图预算/竣工结算) -- 具体技术参数(电压等级/线路长度/设备型号) - -坐席解决方案提取: -- 提供的计算方法(单位工程法/实物量法) -- 推荐的计价依据(电力建设工程定额2018版) -- 指导的软件操作步骤(博微软件操作) -- 政策法规应用建议 -- 文件模板提供情况 - -3. 结构化输出: -{self.user_question_and_solution_list_parser.get_format_instructions()} -访客与坐席的对话记录如下: +=======对话记录如下所示======= {dialogue_str} +============================ """ + output_format = self.user_question_and_solution_parser.get_format_instructions() + llm_prompt = prompt.format(output_format=output_format, dialogue_str=dialogue_str) - response = self.llm.invoke(user_prompt=prompt) - user_question_and_solution_list = self.user_question_and_solution_list_parser.parse(response.content) + response = self.llm.invoke(user_prompt=llm_prompt) + if 'reasoning_content' not in response.model_extra and self.llm._model == 'deepseek-ai/DeepSeek-R1': + print("deepseek-ai/DeepSeek-R1 解析失败") - return user_question_and_solution_list.user_question_list + try: + if response.content.count('user_question') == 1: + user_question_and_solution = self.user_question_and_solution_parser.parse(response.content) + return [user_question_and_solution] + else: + raise Exception("解析失败") + except Exception as e: + output_format = self.user_question_and_solution_list_parser.get_format_instructions() + llm_prompt = prompt.format(output_format=output_format, dialogue_str=dialogue_str) + response = self.llm.invoke(user_prompt=llm_prompt) + user_question_and_solution = self.user_question_and_solution_list_parser.parse(response.content) + return user_question_and_solution.user_question_list + + return [user_question_and_solution] @retry_llm_call(max_retries=3, delay=2) def get_product_name_and_module_name(self, product_line, conversation_rows, product_detail_dict, user_question_str, solution_str): @@ -343,16 +349,14 @@ class DialogueToWorkorder: prompt = f""" 请根据以下对话记录分析访客情绪是否对博微软件或者坐席服务存在明显抱怨,并按照以下结构输出JSON格式分析结果: -1. 抱怨识别:判断访客是否对博微软件功能或者坐席服务存在明显抱怨语气或词语 +1. 抱怨识别:判断访客是否对博微软件功能或者坐席服务存在**明显抱怨语气或词语** 2. 抱怨分级(如存在抱怨): - - 一般抱怨:对博微软件功者坐席服务存在轻微不满但情绪稳定 - - 中等抱怨:对博微软件或者坐席服务明确表达不满并提出具体问题 - - 严重抱怨:对博微软件或者坐席服务使用激烈言辞或威胁性语言 - - 抗议行为:明确表示投诉/退费/法律手段 + - 一般抱怨:明确提出对博微软件功能或者坐席服务存在不满 + - 中等抱怨:明确提出对博微软件功能或者坐席服务存在不满,语气较为强烈 + - 严重抱怨:对博微软件功能或者坐席服务使用激烈言辞或威胁性语言 3. 投诉倾向:是否明确/暗示将进行投诉 4. 抱怨对象:坐席服务态度/业务能力 或 博微功能问题(注意忽略对非博微软件或坐席的抱怨) 5. 内容摘录:标注具体抱怨语句 -6. 分析理由:结合语义与上下文的判断依据 示例输出: {{ @@ -387,13 +391,13 @@ class DialogueToWorkorder: """处理单个会话的函数,用于多线程并发""" # 获取工单基本信息 workorder_dict = self.get_workorder_dict(conversation_rows) - + # 分析用户问题和解决方案 + user_question_list = self.get_user_question_and_solution(conversation_rows) + # 分析是否抱怨、是否投诉、抱怨级别 is_dissatisfaction, dissatisfaction_level, dissatisfaction_reasoning, is_complaint = ( self.get_is_complaint_and_is_complaint_level(conversation_rows)) - # 分析用户问题和解决方案 - user_question_list = self.get_user_question_and_solution(conversation_rows) for user_question in user_question_list: user_question_str = user_question.user_question solution_str = user_question.solution @@ -554,18 +558,9 @@ def main(): # 设置默认文件路径 conversation_excel_path = args.conversation_file or os.path.join('data', 'excel', '会话内容详情20250528110230.xlsx') product_detail_excel_path = args.product_detail_file or os.path.join('data', 'excel', '产品详情_工单.xlsx') - output_file = args.output_file - - # 配置LLM参数 - llm_params = { - "temperature": args.temperature, - "model": args.model_name or os.getenv("LLM_MODEL_NAME"), - "api_key": os.getenv("OPENAI_API_KEY"), - "base_url": os.getenv("OPENAI_API_BASE") - } # 创建处理实例 - processor = DialogueToWorkorder(llm_params=llm_params) + processor = DialogueToWorkorder() # 分析会话数据 workorder_dict_list = processor.analyze_conversation_data( @@ -573,7 +568,7 @@ def main(): product_detail_excel_path, max_workers=args.max_workers ) - + output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx') # 保存结果 processor.save_results_to_excel(workorder_dict_list, output_file) diff --git a/rag2_0/demo/intent_recognition_example.py b/rag2_0/demo/intent_recognition_example.py index a28e90f..5c6dbce 100644 --- a/rag2_0/demo/intent_recognition_example.py +++ b/rag2_0/demo/intent_recognition_example.py @@ -15,6 +15,7 @@ import json import concurrent.futures from tqdm import tqdm import time +import sys # 加载环境变量 load_dotenv() @@ -43,7 +44,7 @@ def load_questions_from_excel(file_path=None): def process_query(recognizer, query): """ - 处理单个查询,支持重试机制 + 处理单个查询,支持重试机制,并包含槽位填充 Args: recognizer: 意图识别器实例 @@ -57,32 +58,48 @@ def process_query(recognizer, query): while retry_count <= max_retries: try: - # 如果是重试,添加重试信息到日志 - classification, keywords, rewrite, query_keys = recognizer.process_query(query) + # 使用新的process_query_with_slots方法处理查询 + result = recognizer.process_query_with_slots(query) - # 将keywords对象转换为字符串 + # 提取分类信息 + classification = result["classification"] + + # 提取关键词信息 + keywords = result["keywords"] keywords_str = "" - if keywords and keywords.terms: + if keywords and keywords.get("terms"): term_details = [] - for term in keywords.terms: + for term in keywords["terms"]: term_info = { - "名称": term.name, - "同义词": ";".join(term.synonymous) if term.synonymous else "", - "描述": term.description + "名称": term["name"], + "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "", + "描述": term["description"] } term_details.append(term_info) # 将term_details转换为JSON字符串,确保中文正确显示 keywords_str = json.dumps(term_details, ensure_ascii=False, indent=2) + # 提取槽位填充信息 + slot_filling = result.get("slot_filling", {}) + slot_filling_str = "" + if slot_filling and "filled_data" in slot_filling: + # 格式化槽位填充结果 + slot_filling_str = json.dumps({ + "是否完整": slot_filling.get("is_complete", False), + "缺失槽位": slot_filling.get("missing_slots", {}), + "填充数据": slot_filling.get("filled_data", {}) + }, ensure_ascii=False, indent=2) + # 处理成功,返回结果 return { "提问": query, - "问题拆解": query_keys, - "一级分类": classification.vertical_classification, - "二级分类": classification.sub_classification, - "问题改写": rewrite.rewrite, - "检索的关键词": keywords_str + "问题拆解": result["query_keys"], + "一级分类": classification["vertical_classification"], + "二级分类": classification["sub_classification"], + "问题改写": result["rewrite"]["rewrite"], + "检索的关键词": keywords_str, + "槽位填充": slot_filling_str } except Exception as e: @@ -96,13 +113,15 @@ def process_query(recognizer, query): "一级分类": "处理出错", "二级分类": "处理出错", "问题改写": "处理出错", - "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}" + "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}", + "槽位填充": "处理出错" } else: # 可以在这里添加延迟,避免过快重试 time.sleep(10 * retry_count) -examples_query = """下载软件在哪下载?""" +# 示例查询 +examples_query = """这个安全文明费费率在哪里调""" def main(): """ @@ -119,56 +138,67 @@ def main(): # 读取提问数据 current_dir = os.path.dirname(os.path.abspath(__file__)) - data_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据.xlsx") - examples = load_questions_from_excel(data_file) - # examples = examples_query.split("\n") - max_workers = 20 - logging.info(f"共有 {len(examples)} 个问题需要处理,使用 {max_workers} 个并发线程") + data_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据.xlsx") - # 创建一个与输入顺序相同的结果列表 - results = [None] * len(examples) + # 检测是否为调试模式,调试模式下使用examples_query,否则从Excel读取 + + is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None + if is_debug: + examples = examples_query.strip().split("\n") + else: + examples = load_questions_from_excel(data_file) - # 使用线程池进行并发处理 - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - # 提交所有任务并记录它们的索引 - future_to_index = {} + if not is_debug: + + max_workers = 5 # 减少并发数以避免API限制 + logging.info(f"共有 {len(examples)} 个问题需要处理,使用 {max_workers} 个并发线程") + # 创建一个与输入顺序相同的结果列表 + results = [None] * len(examples) + # 使用线程池进行并发处理 + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务并记录它们的索引 + future_to_index = {} + for idx, query in enumerate(examples): + future = executor.submit(process_query, recognizer, query) + future_to_index[future] = idx + + # 使用tqdm显示进度条 + for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"): + idx = future_to_index[future] + result = future.result() + # 将结果放在与输入相同的位置 + results[idx] = result + + # 将结果保存到Excel文件 + results_df = pd.DataFrame(results) + + output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx") + + # 使用ExcelWriter设置格式 + with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer: + results_df.to_excel(writer, index=False, sheet_name='Sheet1') + + # 获取工作簿和工作表对象 + workbook = writer.book + worksheet = writer.sheets['Sheet1'] + + # 设置列宽(单位:像素) + # 定义列宽(厘米转为Excel单位,1cm约等于4.7个Excel单位) + worksheet.set_column('A:A', 60) # 提问列 60个Excel单位 + worksheet.set_column('B:B', 20) # 问题拆解 20个Excel单位 + worksheet.set_column('C:C', 20) # 一级分类 20个Excel单位 + worksheet.set_column('D:D', 20) # 二级分类 20个Excel单位 + worksheet.set_column('E:E', 60) # 问题改写 60个Excel单位 + worksheet.set_column('F:F', 60) # 检索到的关键词 60个Excel单位 + worksheet.set_column('G:G', 80) # 槽位填充 80个Excel单位 + + # 设置所有行高为20磅 + for i in range(len(results_df) + 1): # +1 是为了包括表头 + worksheet.set_row(i, 20) + else: for idx, query in enumerate(examples): - future = executor.submit(process_query, recognizer, query) - future_to_index[future] = idx + process_query(recognizer, query) - # 使用tqdm显示进度条 - for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"): - idx = future_to_index[future] - result = future.result() - # 将结果放在与输入相同的位置 - results[idx] = result - - # 将结果保存到Excel文件 - results_df = pd.DataFrame(results) - - output_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据_重写结果.xlsx") - - # 使用ExcelWriter设置格式 - with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer: - results_df.to_excel(writer, index=False, sheet_name='Sheet1') - - # 获取工作簿和工作表对象 - workbook = writer.book - worksheet = writer.sheets['Sheet1'] - - # 设置列宽(单位:像素) - # 定义列宽(厘米转为Excel单位,1cm约等于4.7个Excel单位) - worksheet.set_column('A:A', 60) # 提问列 60个Excel单位 - worksheet.set_column('B:B', 20) # 问题拆解 20个Excel单位 - worksheet.set_column('C:C', 20) # 一级分类 20个Excel单位 - worksheet.set_column('D:D', 20) # 二级分类 20个Excel单位 - worksheet.set_column('E:E', 60) # 问题改写 60个Excel单位 - worksheet.set_column('F:F', 60) # 检索到的关键词 60个Excel单位 - - # 设置所有行高为20磅 - for i in range(len(results_df) + 1): # +1 是为了包括表头 - worksheet.set_row(i, 20) - logging.info(f"处理完成,结果已保存至: {output_file}") def setup_logging(): diff --git a/rag2_0/dify/test_dify_chatapi.py b/rag2_0/dify/test_dify_chatapi.py index fe3e881..ae74288 100755 --- a/rag2_0/dify/test_dify_chatapi.py +++ b/rag2_0/dify/test_dify_chatapi.py @@ -29,7 +29,7 @@ class DifyComparisonTester: Dify新旧流程对比测试类,用于比较两个不同流程的问答效果并进行评判 """ def __init__(self, excel_path:str, baseurl:str, old_workflow_api_key:str, new_workflow_api_key:str, - wiki_excel_path:str=None, output_path:str=None, max_workers:int=5): + wiki_excel_path:str=None, output_path:str=None, max_workers:int=1): """ 初始化对比测试器 @@ -394,6 +394,13 @@ content: "{content}" Returns: dict: 包含问题分类结果的字典 """ + retrieve_content=[] + max_score=0 + min_score=0 + avg_score=0 + rewrite_query="" + vertical_classification="" + sub_classification="" try: new_message_info = DifyTool.get_message_debug_info_by_id(message_id=new_message_id) for workflow_node in new_message_info["workflow_node_executions_info"]: @@ -429,6 +436,11 @@ content: "{content}" Returns: dict: 包含问题分类结果的字典 """ + retrieve_content=[] + max_score=0 + min_score=0 + avg_score=0 + rewrite_query="" try: old_message_info = DifyTool.get_message_debug_info_by_id(message_id=old_message_id) for workflow_node in old_message_info["workflow_node_executions_info"]: @@ -526,9 +538,8 @@ content: "{content}" """ # 读取Excel文件中的问题 df = pd.read_excel(self.excel_path) - questions = df['补全后的提问'].tolist() + questions = df['问题'].tolist() results = [] - # 选择处理函数 process_func = self.process_question_with_judge if with_judge else self.process_question @@ -572,7 +583,7 @@ content: "{content}" if __name__ == "__main__": # 定义Excel路径 - excel_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".." ,"data/excel/历史提问数据(like)_提问明确.xlsx") + excel_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", ".." ,"data/excel/400条答案差异的.xlsx") if not os.path.exists(excel_path): print(f"错误:Excel文件不存在: {excel_path}") diff --git a/rag2_0/intent_recognition/DataModels.py b/rag2_0/intent_recognition/DataModels.py index a933ab7..c7d28ab 100644 --- a/rag2_0/intent_recognition/DataModels.py +++ b/rag2_0/intent_recognition/DataModels.py @@ -8,7 +8,7 @@ Description: 提取和分类的数据模型 """ from pydantic import BaseModel, Field -from typing import List, Optional +from typing import List, Optional, Dict, Tuple # 定义输出模型 @@ -33,4 +33,138 @@ class Classification(BaseModel): sub_classification:str = Field(description="一级分类下的二级分类") class QueryRewrite(BaseModel): - rewrite:str = Field(description="问题改写") \ No newline at end of file + rewrite:str = Field(description="问题改写") + +# 1. 软件问题 +# 1.1 软件功能 +class SoftwareFunction(BaseModel): + software_name: str = Field(description="软件名称") + function_name: str = Field(description="具体功能名称") + operation: str = Field(description="用户操作意图(如何使用功能、功能入口、功能使用场景)") + software_version: Optional[str] = Field(None, description="软件版本") + operation_steps: Optional[str] = Field(None, description="操作步骤描述") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.software_name: + missing_slots["software_name"] = SoftwareFunction.model_fields["software_name"].description + if not self.function_name: + missing_slots["function_name"] = SoftwareFunction.model_fields["function_name"].description + if not self.operation: + missing_slots["operation"] = SoftwareFunction.model_fields["operation"].description + return len(missing_slots) == 0, missing_slots + +# 1.2 故障排查 +class TroubleShooting(BaseModel): + software_name: str = Field(description="软件名称") + function_name: str = Field(description="具体功能名称/操作描述") + error_message: str = Field(description="报错信息/异常现象") + software_version: Optional[str] = Field(None, description="软件版本") + os_version: Optional[str] = Field(None, description="操作系统及版本") + reproduction_steps: Optional[str] = Field(None, description="故障重现步骤") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.software_name: + missing_slots["software_name"] = TroubleShooting.model_fields["software_name"].description + if not self.function_name: + missing_slots["function_name"] = TroubleShooting.model_fields["function_name"].description + if not self.error_message: + missing_slots["error_message"] = TroubleShooting.model_fields["error_message"].description + return len(missing_slots) == 0, missing_slots + +# 2. 业务问题 +# 2.1 专业咨询 +class ProfessionalConsulting(BaseModel): + scene_subject: str = Field(description="场景主体") + business_scene: str = Field(description="业务场景描述") + software_name: Optional[str] = Field(None, description="软件名称") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.scene_subject: + missing_slots["scene_subject"] = ProfessionalConsulting.model_fields["scene_subject"].description + if not self.business_scene: + missing_slots["business_scene"] = ProfessionalConsulting.model_fields["business_scene"].description + return len(missing_slots) == 0, missing_slots + +# 2.2 数据问题 +class DataProblem(BaseModel): + expense_type: str = Field(description="费用类型") + operation_purpose: str = Field(description="操作目的") + software_name: Optional[str] = Field(None, description="软件名称") + project_type: Optional[str] = Field(None, description="工程类型") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.expense_type: + missing_slots["expense_type"] = DataProblem.model_fields["expense_type"].description + if not self.operation_purpose: + missing_slots["operation_purpose"] = DataProblem.model_fields["operation_purpose"].description + return len(missing_slots) == 0, missing_slots + +# 3. 安装下载注册 +# 3.1 后缀名咨询 +class FileExtensionConsulting(BaseModel): + file_extension: str = Field(description="文件后缀名") + operation_purpose: str = Field(description="操作目的") + file_source: Optional[str] = Field(None, description="文件来源场景") + related_software: Optional[str] = Field(None, description="相关软件名称") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.file_extension: + missing_slots["file_extension"] = FileExtensionConsulting.model_fields["file_extension"].description + if not self.operation_purpose: + missing_slots["operation_purpose"] = FileExtensionConsulting.model_fields["operation_purpose"].description + return len(missing_slots) == 0, missing_slots + +# 3.2 软件锁类 +class SoftwareLock(BaseModel): + lock_type: str = Field(description="锁类型") + operation_purpose: str = Field(description="操作目的") + lock_number: Optional[str] = Field(None, description="软件锁编号/注册号") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.lock_type: + missing_slots["lock_type"] = SoftwareLock.model_fields["lock_type"].description + if not self.operation_purpose: + missing_slots["operation_purpose"] = SoftwareLock.model_fields["operation_purpose"].description + return len(missing_slots) == 0, missing_slots + +# 3.3 安装下载类 +class InstallationDownload(BaseModel): + software_name: str = Field(description="软件/插件名称") + operation_stage: str = Field(description="操作阶段") + os_version: Optional[str] = Field(None, description="操作系统版本") + package_source: Optional[str] = Field(None, description="安装包来源/版本号") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.software_name: + missing_slots["software_name"] = InstallationDownload.model_fields["software_name"].description + if not self.operation_stage: + missing_slots["operation_stage"] = InstallationDownload.model_fields["operation_stage"].description + return len(missing_slots) == 0, missing_slots + +# 3.4 问题排查类 +class ProblemDiagnosis(BaseModel): + error_message: str = Field(description="报错信息/异常现象") + software_name: Optional[str] = Field(None, description="软件名称") + os_version: Optional[str] = Field(None, description="操作系统版本") + + def check_required_slots(self) -> Tuple[bool, Dict[str, str]]: + """检查必填槽位是否都存在""" + missing_slots = {} + if not self.error_message: + missing_slots["error_message"] = ProblemDiagnosis.model_fields["error_message"].description + return len(missing_slots) == 0, missing_slots + diff --git a/rag2_0/intent_recognition/IntentRecognition.py b/rag2_0/intent_recognition/IntentRecognition.py index 33f1ded..fd00a48 100644 --- a/rag2_0/intent_recognition/IntentRecognition.py +++ b/rag2_0/intent_recognition/IntentRecognition.py @@ -11,12 +11,17 @@ import os from langchain_openai import ChatOpenAI from langchain.output_parsers import PydanticOutputParser import json -from typing import List, Tuple +from typing import List, Tuple, Dict, Any, Optional, Union import re -from .PromptTemplates import classification_prompt, query_rewrite_prompt, extract_nouns_prompt, classification_info -from .DataModels import Classification, QueryRewrite, Term, TermList +from .PromptTemplates import classification_prompt, query_rewrite_prompt, extract_nouns_prompt, classification_info, slot_filling_prompt +from .DataModels import ( + Classification, QueryRewrite, Term, TermList, + SoftwareFunction, TroubleShooting, ProfessionalConsulting, + DataProblem, FileExtensionConsulting, SoftwareLock, + InstallationDownload, ProblemDiagnosis +) from .ProfessionalNounVector import ProfessionalNounRetriever -from rag2_0.tool.ModelTool import XinferenceReRankerModel, OpenAiLLM +from rag2_0.tool.ModelTool import XinferenceReRankerModel, OpenAiLLM, SiliconFlowReRankerModel class IntentRecognizer: @@ -184,7 +189,7 @@ class IntentRecognizer: if len(matched_terms) != 0: txts = ["名称:" + term.name + "|" + "同义词:" + ";".join(term.synonymous) + "|" + "描述:" + term.description for term in matched_terms] # txts = [term.name for term in matched_terms] - xinference_reranker = XinferenceReRankerModel() + xinference_reranker = SiliconFlowReRankerModel() rerank_results = xinference_reranker.rerank(query, txts, top_k=5) matched_terms_list = list(matched_terms) matched_terms = [matched_terms_list[result["index"]] for result in rerank_results] @@ -288,4 +293,136 @@ class IntentRecognizer: return classification, TermList(terms=[]), QueryRewrite(rewrite=query),[] # rewrite = QueryRewrite(rewrite=query) - return classification, keywords_terms, rewrite, query_keys \ No newline at end of file + return classification, keywords_terms, rewrite, query_keys + + def fill_slots(self, query: str, classification: Classification, keywords: TermList) -> Dict[str, Any]: + """ + 根据分类结果对问题进行槽位填充 + + Args: + query: 用户原始问题 + classification: 意图分类结果 + keywords: 匹配的关键词列表 + + Returns: + 填充后的槽位数据模型 + """ + # 根据分类结果选择对应的数据模型 + slot_model = self._get_slot_model(classification) + if not slot_model: + return {"error": "未找到匹配的槽位模型"} + + # 使用LLM进行槽位填充 + filled_slots = self._fill_slots_with_llm(query, classification, keywords, slot_model) + + # 检查必填槽位是否都已填充 + is_complete, missing_slots = filled_slots.check_required_slots() + + return { + "is_complete": is_complete, + "missing_slots": missing_slots, + "filled_data": filled_slots.model_dump() + } + + def _get_slot_model(self, classification: Classification) -> Optional[type]: + """ + 根据分类结果获取对应的槽位模型类 + + Args: + classification: 意图分类结果 + + Returns: + 对应的槽位模型类 + """ + # 软件问题 + if classification.vertical_classification == "软件问题": + if classification.sub_classification == "软件功能": + return SoftwareFunction + elif classification.sub_classification == "故障排查": + return TroubleShooting + + # 业务问题 + elif classification.vertical_classification == "业务问题": + if classification.sub_classification == "专业咨询": + return ProfessionalConsulting + elif classification.sub_classification == "数据问题": + return DataProblem + + # 安装下载注册 + elif classification.vertical_classification == "安装下载": + if classification.sub_classification == "后缀名咨询": + return FileExtensionConsulting + elif classification.sub_classification == "软件锁类": + return SoftwareLock + elif classification.sub_classification == "安装下载类": + return InstallationDownload + elif classification.sub_classification == "问题排查类": + return ProblemDiagnosis + + return None + + def _fill_slots_with_llm(self, query: str, classification: Classification, keywords: TermList, slot_model_class: type) -> Any: + """ + 使用LLM进行槽位填充 + + Args: + query: 用户原始问题 + classification: 意图分类结果 + keywords: 匹配的关键词列表 + slot_model_class: 槽位模型类 + + Returns: + 填充后的槽位数据模型实例 + """ + # 准备提示词 + slot_parser = PydanticOutputParser(pydantic_object=slot_model_class) + model_schema = json.dumps(slot_model_class.model_json_schema(), ensure_ascii=False) + terms_dict = [term.model_dump() for term in keywords.terms] + keywords_str = json.dumps(terms_dict, ensure_ascii=False) + + formatted_prompt = slot_filling_prompt.format( + query=query, + vertical_classification=classification.vertical_classification, + sub_classification=classification.sub_classification, + keywords=keywords_str, + model_schema=model_schema, + output_format=slot_parser.get_format_instructions() + ) + + # 调用LLM + response = self.llm.invoke(formatted_prompt, False) + + try: + # 尝试解析LLM响应 + parsed_output = slot_parser.parse(response.content) + return parsed_output + except Exception as e: + # 如果解析失败,创建一个空的模型实例 + empty_instance = slot_model_class() + return empty_instance + + def process_query_with_slots(self, query: str) -> Dict[str, Any]: + """ + 处理用户问题的完整流程,包括槽位填充 + + Args: + query: 用户原始问题 + + Returns: + 包含分类、关键词、改写和槽位填充结果的字典 + """ + # 执行基本处理流程 + classification, keywords, rewrite, query_keys = self.process_query(query) + + # 如果是有效分类,进行槽位填充 + slot_filling_result = {} + if classification.vertical_classification not in ["其他", "闲聊"] and classification.sub_classification not in ["其他", "闲聊"]: + slot_filling_result = self.fill_slots(rewrite.rewrite, classification, keywords) + + return { + "classification": classification.model_dump(), + "keywords": keywords.model_dump(), + "rewrite": rewrite.model_dump(), + "query_keys": query_keys, + "slot_filling": slot_filling_result + } \ No newline at end of file diff --git a/rag2_0/intent_recognition/PromptTemplates.py b/rag2_0/intent_recognition/PromptTemplates.py index 3c307c8..9c15950 100644 --- a/rag2_0/intent_recognition/PromptTemplates.py +++ b/rag2_0/intent_recognition/PromptTemplates.py @@ -38,7 +38,7 @@ classification_info="""【垂直领域分类】: 【业务问题包括以下两类】: 1. 专业咨询:涉及电力造价规范、工程计价规则问题、行业标准解读等 -2. 数据问题:涉及电力造价费用、造价指标等 +2. 数据问题:涉及电力造价费用、造价指标的计算或构成等 【安装下载注册包括以下三类】: 1. 后缀名咨询:所有涉及文件扩展名的使用场景、软件关联等问题,包括但不限于:询问文件是否由特定软件打开、扩展名与软件的匹配关系、扩展名含义及关联等 @@ -94,7 +94,7 @@ query_rewrite_prompt = """ b. 执行结构优化: - 采用【术语标记】规范标注关键概念 - 构建主谓宾明确的问题句式 - - 保持原问题时态与语态特征 + - 保持原问题时态与语态特征, 保留5W2H问题特征 - 执行同义词替换:将synonymous中的同义词替换为对应name字段的标准术语 # 输出规范 @@ -132,4 +132,43 @@ query_rewrite_prompt = """ 4. 异常处理机制 - 当关键词与问题无明显关联时,触发直通输出规则 - 出现术语冲突时优先保留原始表述 - """ \ No newline at end of file + """ + +slot_filling_prompt = """ +你是一个专业的电力造价领域问题槽位填充助手。你需要从用户问题中提取关键信息,并填充到对应的数据结构中。 + +【用户问题】 +{query} + +【问题分类】 +垂直领域分类: {vertical_classification} +子分类: {sub_classification} + +【已识别关键词】 +{keywords} + +【目标数据结构】 +{model_schema} + +【输出格式】 +{output_format} + +【任务要求】 +1. 仔细分析用户问题,从中提取所有可能的槽位信息 +2. 对于必填槽位,必须尽力从问题中提取,如果确实无法提取则留空 +3. 对于选填槽位,如果能从问题中提取则填写,否则留空 +4. 只输出符合格式的JSON数据,不要有任何额外的解释 + +【示例】 +用户问题: "我的西藏Z1软件安装后闪退,提示缺少组件,怎么解决?" +分类: 软件问题/故障排查 +输出: +{{ + "software_name": "西藏Z1软件", + "function_name": "软件安装", + "error_message": "闪退,提示缺少组件", + "software_version": null, + "os_version": null, + "reproduction_steps": "软件安装后" +}} +""" \ No newline at end of file