优化对话转工单处理逻辑，调整LLM参数，增强用户问题和解决方案的提取功能，添加槽位填充支持，提升代码结构和可读性。

2025-05-30 11:10:24 +08:00
parent 05caedc4fa
commit d4ff7b6fad
6 changed files with 469 additions and 123 deletions
@@ -15,6 +15,7 @@ import json
 import concurrent.futures
 from tqdm import tqdm
 import time
+import sys
 # 加载环境变量
 load_dotenv()

@@ -43,7 +44,7 @@ def load_questions_from_excel(file_path=None):

 def process_query(recognizer, query):
    """
-    处理单个查询，支持重试机制
+    处理单个查询，支持重试机制，并包含槽位填充
    
    Args:
        recognizer: 意图识别器实例
@@ -57,32 +58,48 @@ def process_query(recognizer, query):
    
    while retry_count <= max_retries:
        try:
-            # 如果是重试，添加重试信息到日志        
-            classification, keywords, rewrite, query_keys = recognizer.process_query(query)
+            # 使用新的process_query_with_slots方法处理查询
+            result = recognizer.process_query_with_slots(query)
            
-            # 将keywords对象转换为字符串
+            # 提取分类信息
+            classification = result["classification"]
+            
+            # 提取关键词信息
+            keywords = result["keywords"]
            keywords_str = ""
-            if keywords and keywords.terms:
+            if keywords and keywords.get("terms"):
                term_details = []
-                for term in keywords.terms:
+                for term in keywords["terms"]:
                    term_info = {
-                        "名称": term.name,
-                        "同义词": ";".join(term.synonymous) if term.synonymous else "",
-                        "描述": term.description
+                        "名称": term["name"],
+                        "同义词": ";".join(term["synonymous"]) if term["synonymous"] else "",
+                        "描述": term["description"]
                    }
                    term_details.append(term_info)
                
                # 将term_details转换为JSON字符串,确保中文正确显示
                keywords_str = json.dumps(term_details, ensure_ascii=False, indent=2)
            
+            # 提取槽位填充信息
+            slot_filling = result.get("slot_filling", {})
+            slot_filling_str = ""
+            if slot_filling and "filled_data" in slot_filling:
+                # 格式化槽位填充结果
+                slot_filling_str = json.dumps({
+                    "是否完整": slot_filling.get("is_complete", False),
+                    "缺失槽位": slot_filling.get("missing_slots", {}),
+                    "填充数据": slot_filling.get("filled_data", {})
+                }, ensure_ascii=False, indent=2)
+            
            # 处理成功，返回结果
            return {
                "提问": query,
-                "问题拆解": query_keys,
-                "一级分类": classification.vertical_classification,
-                "二级分类": classification.sub_classification,
-                "问题改写": rewrite.rewrite,
-                "检索的关键词": keywords_str
+                "问题拆解": result["query_keys"],
+                "一级分类": classification["vertical_classification"],
+                "二级分类": classification["sub_classification"],
+                "问题改写": result["rewrite"]["rewrite"],
+                "检索的关键词": keywords_str,
+                "槽位填充": slot_filling_str
            }
            
        except Exception as e:
@@ -96,13 +113,15 @@ def process_query(recognizer, query):
                    "一级分类": "处理出错",
                    "二级分类": "处理出错",
                    "问题改写": "处理出错",
-                    "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}"
+                    "检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}",
+                    "槽位填充": "处理出错"
                }
            else:
                # 可以在这里添加延迟，避免过快重试
                time.sleep(10 * retry_count)

-examples_query = """下载软件在哪下载？"""
+# 示例查询
+examples_query = """这个安全文明费费率在哪里调"""

 def main():
    """
@@ -119,56 +138,67 @@ def main():
    
    # 读取提问数据
    current_dir = os.path.dirname(os.path.abspath(__file__))
-    data_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据.xlsx")
-    examples = load_questions_from_excel(data_file)
-    # examples = examples_query.split("\n")
-    max_workers = 20
-    logging.info(f"共有 {len(examples)} 个问题需要处理，使用 {max_workers} 个并发线程")
+    data_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据.xlsx")
    
-    # 创建一个与输入顺序相同的结果列表
-    results = [None] * len(examples)
+    # 检测是否为调试模式，调试模式下使用examples_query，否则从Excel读取
+
+    is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
+    if is_debug:
+        examples = examples_query.strip().split("\n")
+    else:
+        examples = load_questions_from_excel(data_file)
    
-    # 使用线程池进行并发处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # 提交所有任务并记录它们的索引
-        future_to_index = {}
+    if not is_debug:
+
+        max_workers = 5  # 减少并发数以避免API限制
+        logging.info(f"共有 {len(examples)} 个问题需要处理，使用 {max_workers} 个并发线程")
+        # 创建一个与输入顺序相同的结果列表
+        results = [None] * len(examples)
+        # 使用线程池进行并发处理
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务并记录它们的索引
+            future_to_index = {}
+            for idx, query in enumerate(examples):
+                future = executor.submit(process_query, recognizer, query)
+                future_to_index[future] = idx
+            
+            # 使用tqdm显示进度条
+            for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
+                idx = future_to_index[future]
+                result = future.result()
+                # 将结果放在与输入相同的位置
+                results[idx] = result
+        
+        # 将结果保存到Excel文件
+        results_df = pd.DataFrame(results)
+        
+        output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx")
+        
+        # 使用ExcelWriter设置格式
+        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
+            results_df.to_excel(writer, index=False, sheet_name='Sheet1')
+            
+            # 获取工作簿和工作表对象
+            workbook = writer.book
+            worksheet = writer.sheets['Sheet1']
+            
+            # 设置列宽（单位：Excel列宽单位，即默认字体下的字符数，而非像素）
+            # 如需按厘米换算：1cm约等于4.7个Excel单位（近似值）
+            worksheet.set_column('A:A', 60)  # 提问列 60个Excel单位
+            worksheet.set_column('B:B', 20)   # 问题拆解 20个Excel单位
+            worksheet.set_column('C:C', 20)   # 一级分类 20个Excel单位
+            worksheet.set_column('D:D', 20)   # 二级分类 20个Excel单位
+            worksheet.set_column('E:E', 60)  # 问题改写 60个Excel单位
+            worksheet.set_column('F:F', 60)  # 检索到的关键词 60个Excel单位
+            worksheet.set_column('G:G', 80)  # 槽位填充 80个Excel单位
+            
+            # 设置所有行高为20磅
+            for i in range(len(results_df) + 1):  # +1 是为了包括表头
+                worksheet.set_row(i, 20)
+    else:
        for idx, query in enumerate(examples):
-            future = executor.submit(process_query, recognizer, query)
-            future_to_index[future] = idx
+                process_query(recognizer, query)
        
-        # 使用tqdm显示进度条
-        for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
-            idx = future_to_index[future]
-            result = future.result()
-            # 将结果放在与输入相同的位置
-            results[idx] = result
-    
-    # 将结果保存到Excel文件
-    results_df = pd.DataFrame(results)
-    
-    output_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据_重写结果.xlsx")
-    
-    # 使用ExcelWriter设置格式
-    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
-        results_df.to_excel(writer, index=False, sheet_name='Sheet1')
-        
-        # 获取工作簿和工作表对象
-        workbook = writer.book
-        worksheet = writer.sheets['Sheet1']
-        
-        # 设置列宽（单位：像素）
-        # 定义列宽（厘米转为Excel单位，1cm约等于4.7个Excel单位）
-        worksheet.set_column('A:A', 60)  # 提问列 60个Excel单位
-        worksheet.set_column('B:B', 20)   # 问题拆解 20个Excel单位
-        worksheet.set_column('C:C', 20)   # 一级分类 20个Excel单位
-        worksheet.set_column('D:D', 20)   # 二级分类 20个Excel单位
-        worksheet.set_column('E:E', 60)  # 问题改写 60个Excel单位
-        worksheet.set_column('F:F', 60)  # 检索到的关键词 60个Excel单位
-        
-        # 设置所有行高为20磅
-        for i in range(len(results_df) + 1):  # +1 是为了包括表头
-            worksheet.set_row(i, 20)
-    
    logging.info(f"处理完成，结果已保存至: {output_file}")

 def setup_logging():