优化意图识别示例，更新文档相关性判断逻辑，增强Excel数据验证功能，改进日志记录，调整参数以提升代码可读性和灵活性。

2025-06-25 09:10:28 +08:00
parent 7142c7c43e
commit 33bc91f0fe
4 changed files with 35 additions and 55 deletions
@@ -33,7 +33,7 @@ class ValidationResult(BaseModel):
 class ExcelDataValidator:
    """Excel数据验证类，用于批量验证Excel数据中的问题分类、问题拆解、检索关键词和问题改写"""
    
-    def __init__(self, input_file=None, output_file=None, workers=4, batch_size=10, debug=False):
+    def __init__(self, input_file=None, output_file=None, workers=4, debug=False):
        """
        初始化验证器
        
@@ -41,7 +41,6 @@ class ExcelDataValidator:
            input_file: 输入Excel文件路径
            output_file: 输出结果Excel文件路径
            workers: 并行工作线程数
-            batch_size: 每批处理的行数
            debug: 是否启用调试模式（串行处理）
        """
        # 加载环境变量
@@ -50,7 +49,6 @@ class ExcelDataValidator:
        self.input_file = input_file
        self.output_file = output_file
        self.workers = workers
-        self.batch_size = batch_size
        self.debug = debug
        self.df = None
        
@@ -86,7 +84,7 @@ class ExcelDataValidator:
            
        try:
            df = pd.read_excel(file_path)
-            required_columns = ["问题", "问题分类", "问题改写", "槽点信息"]
+            required_columns = ["问题", "问题分类", "问题改写", "槽位信息", "检索的内容"]
            for col in required_columns:
                if col not in df.columns:
                    logging.error(f"缺少必要的列: {col}", exc_info=True)
@@ -320,7 +318,7 @@ class ExcelDataValidator:
        query = row["问题"]
        query_class = row.get("问题分类", "")
        rewrite = row.get("问题改写", "")
-        slot_info = row.get("槽点信息", "")
+        slot_info = row.get("槽位信息", "")
        retrieve_content = row.get("检索的内容", "")
        
        if self.debug:
@@ -359,15 +357,16 @@ class ExcelDataValidator:
                if len(query_class_list) >= 2:
                    result = self.validate_classification(llm, rewrite, query_class_list[0], query_class_list[1])
                    if isinstance(result, tuple) and len(result) >= 3:
-                        is_correct, error_reason, confidence_score = result[:3]
+                        is_correct, error_reason, classification_confidence = result[:3]
+                        confidence_score = max(confidence_score, classification_confidence)
                        
                        if self.debug:
-                            logging.info(f"  问题分类验证结果: {'通过' if is_correct else '不通过'}, 置信度: {confidence_score:.2f}")
+                            logging.info(f"  问题分类验证结果: {'通过' if is_correct else '不通过'}, 置信度: {classification_confidence:.2f}")
                            if not is_correct:
                                logging.info(f"  错误原因: {error_reason}")
                        
                        if not is_correct:
-                            return index, False, "问题分类", error_reason, confidence_score
+                            return index, False, "问题分类", error_reason, classification_confidence
            

            
@@ -416,13 +415,6 @@ class ExcelDataValidator:
            logging.error(error_msg, exc_info=True)
            return index, False, "处理错误", error_msg, 0.0
    
-    def process_batch(self, llm, batch_data):
-        """处理一批数据"""
-        results = []
-        for row_data in batch_data:
-            results.append(self.validate_row(llm, row_data))
-        return results
-    
    def create_llm_instances(self, count):
        """创建多个LLM实例"""
        api_key = os.getenv("OPENAI_API_KEY")
@@ -437,7 +429,7 @@ class ExcelDataValidator:

        return [OpenAiLLM(**llm_params) for _ in range(count)]
    
-    def validate(self, input_file=None, output_file=None, workers=None, batch_size=None, debug=None):
+    def validate(self, input_file=None, output_file=None, workers=None, debug=None):
        """
        执行验证过程
        
@@ -445,7 +437,7 @@ class ExcelDataValidator:
            input_file: 输入Excel文件路径
            output_file: 输出结果Excel文件路径
            workers: 并行工作线程数
-            batch_size: 每批处理的行数
+            注意: batch_size 参数（每批处理的行数）已移除，不再接受该参数；现改为逐行并行处理
            debug: 是否启用调试模式（串行处理）
            
        Returns:
@@ -454,7 +446,6 @@ class ExcelDataValidator:
        input_file = input_file or self.input_file
        output_file = output_file or self.output_file
        workers = workers or self.workers
-        batch_size = batch_size or self.batch_size
        debug = debug if debug is not None else self.debug
        
        # 读取数据
@@ -492,21 +483,20 @@ class ExcelDataValidator:
                # 输出当前结果
                logging.info(f"行 {index} 验证结果: {'通过' if is_correct else '不通过'}, 错误环节: {error_phase}, 错误原因: {error_reason}, 置信度: {confidence_score:.2f}")
        else:
-            # 正常模式：并行处理
-            batches = [all_rows[i:i+batch_size] for i in range(0, len(all_rows), batch_size)]
-            llm_instances = self.create_llm_instances(min(workers, len(batches)))
+            # 正常模式：并行处理，每行单独处理
+            llm_instances = self.create_llm_instances(min(workers, len(all_rows)))
            
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-                # 为每个批次分配一个LLM实例
-                future_to_batch = {
-                    executor.submit(self.process_batch, llm_instances[i % len(llm_instances)], batch): 
-                    i for i, batch in enumerate(batches)
+                # 为每行分配一个LLM实例
+                future_to_row = {
+                    executor.submit(self.validate_row, llm_instances[i % len(llm_instances)], row_data): 
+                    i for i, row_data in enumerate(all_rows)
                }
                
                # 使用tqdm显示进度条
-                for future in tqdm(concurrent.futures.as_completed(future_to_batch), total=len(batches), desc="批次处理进度"):
-                    batch_results = future.result()
-                    all_results.extend(batch_results)
+                for future in tqdm(concurrent.futures.as_completed(future_to_row), total=len(all_rows), desc="处理进度"):
+                    result = future.result()
+                    all_results.append(result)
        
        # 按行索引排序结果，确保与原始数据顺序一致
        all_results.sort(key=lambda x: x[0])
@@ -558,16 +548,14 @@ class ExcelDataValidator:
 def main():
    """主函数"""
    # 解析命令行参数
-    input_excel = os.path.join(os.path.dirname(__file__), "..", "..", "data", "excel", "1500条点踩软件问题测试_检索结果.xlsx")
+    input_excel = os.path.join(os.path.dirname(__file__), "..", "..", "data", "excel", "1500条点踩软件问题测试_意图分类.xlsx")
    output_excel = os.path.join(os.path.dirname(__file__), "..", "..", "data", "excel", "自动验证_问题分类重写结果.xlsx")

    parser = argparse.ArgumentParser(description="验证Excel数据中的问题分类、问题拆解、检索关键词和问题改写")
    parser.add_argument("--input", "-i", type=str, help="输入Excel文件路径", default=input_excel)
    parser.add_argument("--output", "-o", type=str, help="输出结果Excel文件路径", default=output_excel)
    parser.add_argument("--workers", "-w", type=int, default=20, help="并行工作线程数")
-    parser.add_argument("--batch-size", "-b", type=int, default=5, help="每批处理的行数")
-    parser.add_argument("--debug", "-d", action="store_true", help="启用调试模式（串行处理）")
-
+    logging.info(f"输入文件路径: {args.input}, 输出文件路径: {args.output}, 并行工作线程数: {args.workers}")
    args = parser.parse_args()
    is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
    
@@ -576,7 +564,6 @@ def main():
        input_file=args.input,
        output_file=args.output,
        workers=args.workers,
-        batch_size=args.batch_size,
        debug=is_debug
    )
    validator.validate()