优化对话转工单处理逻辑，调整LLM参数，增强用户问题和解决方案的提取功能，添加槽位填充支持，提升代码结构和可读性。

2025-05-30 11:10:24 +08:00
parent 05caedc4fa
commit d4ff7b6fad
6 changed files with 469 additions and 123 deletions
@@ -89,7 +89,7 @@ class DialogueToWorkorder:
        
        # 初始化LLM模型
        self.llm_params = llm_params or {
-            "temperature": 0.6,
+            "temperature": 0.2,
            "model": os.getenv("LLM_MODEL_NAME"),
            "api_key": os.getenv("OPENAI_API_KEY"),
            "base_url": os.getenv("OPENAI_API_BASE")
@@ -207,37 +207,43 @@ class DialogueToWorkorder:
        """分析用户问题和解决方案"""
        dialogue_str = self.get_dialogue_str(conversation_rows)
        
-        prompt = f"""
-请从以下电力造价相关的客服对话记录中，精准提取用户提出的专业问题及对应坐席提供的解决方案。要求：
+        prompt = """请从以下电力造价相关的客服对话记录中，识别并精准提取用户提出的问题及对应坐席提供的解决方案。
+1、理解对话记录，识别用户在此次对话中提出的诉求
+2、根据用户提出的诉求，分析坐席提供的解决方法
+3、使用json格式输出：
+{output_format}

-1. 专业识别：
- 重点识别电力工程领域的专业术语（如：定额套用、工程量清单、概预算编制、造价指标分析等）
- 注意区分不同业务场景（输变电工程、配网改造、新能源项目等）
- 识别政策文件引用（如：国网Q/GDW 11337-2014标准）
+输出示例：
+{{
+  "user_question": "软件打开报错",
+  "solution": "通过远程引导解决"
+}} 

-2. 信息提取：
-用户问题提取：
- 核心诉求（成本核算/计价争议/软件操作等）
- 涉及的专业环节（设计概算/施工图预算/竣工结算）
- 具体技术参数（电压等级/线路长度/设备型号）
-
-坐席解决方案提取：
- 提供的计算方法（单位工程法/实物量法）
- 推荐的计价依据（电力建设工程定额2018版）
- 指导的软件操作步骤（博微软件操作）
- 政策法规应用建议
- 文件模板提供情况
-
-3. 结构化输出：
-{self.user_question_and_solution_list_parser.get_format_instructions()}
-访客与坐席的对话记录如下：
+=======对话记录如下所示=======
 {dialogue_str}
+============================
        """
+        output_format = self.user_question_and_solution_parser.get_format_instructions()
+        llm_prompt = prompt.format(output_format=output_format, dialogue_str=dialogue_str)
        
-        response = self.llm.invoke(user_prompt=prompt)
-        user_question_and_solution_list = self.user_question_and_solution_list_parser.parse(response.content)
+        response = self.llm.invoke(user_prompt=llm_prompt)
+        if 'reasoning_content' not in response.model_extra and self.llm._model == 'deepseek-ai/DeepSeek-R1':
+            print("deepseek-ai/DeepSeek-R1 解析失败")
        
-        return user_question_and_solution_list.user_question_list
+        try:
+            if response.content.count('user_question') == 1:
+                user_question_and_solution = self.user_question_and_solution_parser.parse(response.content)
+                return [user_question_and_solution]
+            else:
+                raise Exception("解析失败")
+        except Exception as e:
+            output_format = self.user_question_and_solution_list_parser.get_format_instructions()
+            llm_prompt = prompt.format(output_format=output_format, dialogue_str=dialogue_str)
+            response = self.llm.invoke(user_prompt=llm_prompt)
+            user_question_and_solution = self.user_question_and_solution_list_parser.parse(response.content)
+            return user_question_and_solution.user_question_list
+
+        return [user_question_and_solution]
    
    @retry_llm_call(max_retries=3, delay=2)
    def get_product_name_and_module_name(self, product_line, conversation_rows, product_detail_dict, user_question_str, solution_str):
@@ -343,16 +349,14 @@ class DialogueToWorkorder:
        prompt = f"""
 请根据以下对话记录分析访客情绪是否对博微软件或者坐席服务存在明显抱怨，并按照以下结构输出JSON格式分析结果：

-1. 抱怨识别：判断访客是否对博微软件功能或者坐席服务存在明显抱怨语气或词语
+1. 抱怨识别：判断访客是否对博微软件功能或者坐席服务存在**明显抱怨语气或词语**
 2. 抱怨分级（如存在抱怨）：
-   - 一般抱怨：对博微软件功者坐席服务存在轻微不满但情绪稳定
-   - 中等抱怨：对博微软件或者坐席服务明确表达不满并提出具体问题
-   - 严重抱怨：对博微软件或者坐席服务使用激烈言辞或威胁性语言
-   - 抗议行为：明确表示投诉/退费/法律手段
+   - 一般抱怨：明确提出对博微软件功能或者坐席服务存在不满
+   - 中等抱怨：明确提出对博微软件功能或者坐席服务存在不满，语气较为强烈
+   - 严重抱怨：对博微软件功能或者坐席服务使用激烈言辞或威胁性语言
 3. 投诉倾向：是否明确/暗示将进行投诉
 4. 抱怨对象：坐席服务态度/业务能力 或 博微功能问题(注意忽略对非博微软件或坐席的抱怨)
 5. 内容摘录：标注具体抱怨语句
-6. 分析理由：结合语义与上下文的判断依据

 示例输出：
 {{
@@ -387,13 +391,13 @@ class DialogueToWorkorder:
        """处理单个会话的函数，用于多线程并发"""
        # 获取工单基本信息
        workorder_dict = self.get_workorder_dict(conversation_rows)
-        
+        # 分析用户问题和解决方案
+        user_question_list = self.get_user_question_and_solution(conversation_rows)
+
        # 分析是否抱怨、是否投诉、抱怨级别
        is_dissatisfaction, dissatisfaction_level, dissatisfaction_reasoning, is_complaint = (
            self.get_is_complaint_and_is_complaint_level(conversation_rows))

-        # 分析用户问题和解决方案
-        user_question_list = self.get_user_question_and_solution(conversation_rows)
        for user_question in user_question_list:
            user_question_str = user_question.user_question
            solution_str = user_question.solution
@@ -554,18 +558,9 @@ def main():
    # 设置默认文件路径
    conversation_excel_path = args.conversation_file or os.path.join('data', 'excel', '会话内容详情20250528110230.xlsx')
    product_detail_excel_path = args.product_detail_file or os.path.join('data', 'excel', '产品详情_工单.xlsx')
-    output_file = args.output_file
-    
-    # 配置LLM参数
-    llm_params = {
-        "temperature": args.temperature,
-        "model": args.model_name or os.getenv("LLM_MODEL_NAME"),
-        "api_key": os.getenv("OPENAI_API_KEY"),
-        "base_url": os.getenv("OPENAI_API_BASE")
-    }
    
    # 创建处理实例
-    processor = DialogueToWorkorder(llm_params=llm_params)
+    processor = DialogueToWorkorder()
    
    # 分析会话数据
    workorder_dict_list = processor.analyze_conversation_data(
@@ -573,7 +568,7 @@ def main():
        product_detail_excel_path, 
        max_workers=args.max_workers
    )
-    
+    output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx')
    # 保存结果
    processor.save_results_to_excel(workorder_dict_list, output_file)

@@ -15,6 +15,7 @@ import json
 import concurrent.futures
 from tqdm import tqdm
 import time
+import sys
 # 加载环境变量
 load_dotenv()

@@ -43,7 +44,7 @@ def load_questions_from_excel(file_path=None):

 def process_query(recognizer, query):
    """
-    处理单个查询，支持重试机制
+    处理单个查询，支持重试机制，并包含槽位填充
    
    Args:
        recognizer: 意图识别器实例
@@ -57,32 +58,48 @@ def process_query(recognizer, query):
    
    while retry_count <= max_retries:
        try:
-            # 如果是重试，添加重试信息到日志        
-            classification, keywords, rewrite, query_keys = recognizer.process_query(query)
+            # 使用新的process_query_with_slots方法处理查询
+            result = recognizer.process_query_with_slots(query)
            
-            # 将keywords对象转换为字符串
+            # 提取分类信息
+            classification = result["classification"]
+            
+            # 提取关键词信息
+            keywords = result["keywords"]
            keywords_str = ""
-            if keywords and keywords.terms:
+            if keywords and keywords.get("terms"):
                term_details = []
-                for term in keywords.terms:
+                for term in keywords["terms"]:
                    term_info = {
-                        "名称": term.name,
-                        "同义词": ";".join(term.synonymous) if term.synonymous else "",
-                        "描述": term.description
+                        "名称": term["name"],
+                        "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "",
+                        "描述": term["description"]
                    }
                    term_details.append(term_info)
                
                # 将term_details转换为JSON字符串,确保中文正确显示
                keywords_str = json.dumps(term_details, ensure_ascii=False, indent=2)
            
+            # 提取槽位填充信息
+            slot_filling = result.get("slot_filling", {})
+            slot_filling_str = ""
+            if slot_filling and "filled_data" in slot_filling:
+                # 格式化槽位填充结果
+                slot_filling_str = json.dumps({
+                    "是否完整": slot_filling.get("is_complete", False),
+                    "缺失槽位": slot_filling.get("missing_slots", {}),
+                    "填充数据": slot_filling.get("filled_data", {})
+                }, ensure_ascii=False, indent=2)
+            
            # 处理成功，返回结果
            return {
                "提问": query,
-                "问题拆解": query_keys,
-                "一级分类": classification.vertical_classification,
-                "二级分类": classification.sub_classification,
-                "问题改写": rewrite.rewrite,
-                "检索的关键词": keywords_str
+                "问题拆解": result["query_keys"],
+                "一级分类": classification["vertical_classification"],
+                "二级分类": classification["sub_classification"],
+                "问题改写": result["rewrite"]["rewrite"],
+                "检索的关键词": keywords_str,
+                "槽位填充": slot_filling_str
            }
            
        except Exception as e:
@@ -96,13 +113,15 @@ def process_query(recognizer, query):
                    "一级分类": "处理出错",
                    "二级分类": "处理出错",
                    "问题改写": "处理出错",
-                    "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}"
+                    "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}",
+                    "槽位填充": "处理出错"
                }
            else:
                # 可以在这里添加延迟，避免过快重试
                time.sleep(10 * retry_count)

-examples_query = """下载软件在哪下载？"""
+# 示例查询
+examples_query = """这个安全文明费费率在哪里调"""

 def main():
    """
@@ -119,56 +138,67 @@ def main():
    
    # 读取提问数据
    current_dir = os.path.dirname(os.path.abspath(__file__))
-    data_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据.xlsx")
-    examples = load_questions_from_excel(data_file)
-    # examples = examples_query.split("\n")
-    max_workers = 20
-    logging.info(f"共有 {len(examples)} 个问题需要处理，使用 {max_workers} 个并发线程")
+    data_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据.xlsx")
    
-    # 创建一个与输入顺序相同的结果列表
-    results = [None] * len(examples)
+    # 检测是否为调试模式，调试模式下使用examples_query，否则从Excel读取
+
+    is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
+    if is_debug:
+        examples = examples_query.strip().split("\n")
+    else:
+        examples = load_questions_from_excel(data_file)
    
-    # 使用线程池进行并发处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # 提交所有任务并记录它们的索引
-        future_to_index = {}
+    if not is_debug:
+
+        max_workers = 5  # 减少并发数以避免API限制
+        logging.info(f"共有 {len(examples)} 个问题需要处理，使用 {max_workers} 个并发线程")
+        # 创建一个与输入顺序相同的结果列表
+        results = [None] * len(examples)
+        # 使用线程池进行并发处理
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务并记录它们的索引
+            future_to_index = {}
+            for idx, query in enumerate(examples):
+                future = executor.submit(process_query, recognizer, query)
+                future_to_index[future] = idx
+            
+            # 使用tqdm显示进度条
+            for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
+                idx = future_to_index[future]
+                result = future.result()
+                # 将结果放在与输入相同的位置
+                results[idx] = result
+        
+            # 将结果保存到Excel文件
+        results_df = pd.DataFrame(results)
+        
+        output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx")
+        
+        # 使用ExcelWriter设置格式
+        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
+            results_df.to_excel(writer, index=False, sheet_name='Sheet1')
+            
+            # 获取工作簿和工作表对象
+            workbook = writer.book
+            worksheet = writer.sheets['Sheet1']
+            
+            # 设置列宽（单位：Excel列宽单位，即默认字体的字符宽度，不是像素）
+            # 换算参考：厘米转为Excel单位时，1cm约等于4.7个Excel单位
+            worksheet.set_column('A:A', 60)  # 提问列 60个Excel单位
+            worksheet.set_column('B:B', 20)   # 问题拆解 20个Excel单位
+            worksheet.set_column('C:C', 20)   # 一级分类 20个Excel单位
+            worksheet.set_column('D:D', 20)   # 二级分类 20个Excel单位
+            worksheet.set_column('E:E', 60)  # 问题改写 60个Excel单位
+            worksheet.set_column('F:F', 60)  # 检索到的关键词 60个Excel单位
+            worksheet.set_column('G:G', 80)  # 槽位填充 80个Excel单位
+            
+            # 设置所有行高为20磅
+            for i in range(len(results_df) + 1):  # +1 是为了包括表头
+                worksheet.set_row(i, 20)
+    else:
        for idx, query in enumerate(examples):
-            future = executor.submit(process_query, recognizer, query)
-            future_to_index[future] = idx
+                process_query(recognizer, query)
        
-        # 使用tqdm显示进度条
-        for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
-            idx = future_to_index[future]
-            result = future.result()
-            # 将结果放在与输入相同的位置
-            results[idx] = result
-    
-    # 将结果保存到Excel文件
-    results_df = pd.DataFrame(results)
-    
-    output_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据_重写结果.xlsx")
-    
-    # 使用ExcelWriter设置格式
-    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
-        results_df.to_excel(writer, index=False, sheet_name='Sheet1')
-        
-        # 获取工作簿和工作表对象
-        workbook = writer.book
-        worksheet = writer.sheets['Sheet1']
-        
-        # 设置列宽（单位：像素）
-        # 定义列宽（厘米转为Excel单位，1cm约等于4.7个Excel单位）
-        worksheet.set_column('A:A', 60)  # 提问列 60个Excel单位
-        worksheet.set_column('B:B', 20)   # 问题拆解 20个Excel单位
-        worksheet.set_column('C:C', 20)   # 一级分类 20个Excel单位
-        worksheet.set_column('D:D', 20)   # 二级分类 20个Excel单位
-        worksheet.set_column('E:E', 60)  # 问题改写 60个Excel单位
-        worksheet.set_column('F:F', 60)  # 检索到的关键词 60个Excel单位
-        
-        # 设置所有行高为20磅
-        for i in range(len(results_df) + 1):  # +1 是为了包括表头
-            worksheet.set_row(i, 20)
-    
    logging.info(f"处理完成，结果已保存至: {output_file}")

 def setup_logging():