更新 .gitignore 以忽略临时文件，修改 api_key 文件，重构合并名词的逻辑，删除不再使用的脚本，优化对话到工单的处理流程，添加会话结果保存为 JSON 的功能，调整 API 调用参数，修复部分代码中的错误。

2025-07-25 09:53:47 +08:00
parent 4d7ef54ae7
commit 2cbdc23fc0
13 changed files with 1205 additions and 27522 deletions
@@ -15,6 +15,8 @@ import traceback
 import re
 import logging
 from tqdm import tqdm
+import glob
+import shutil

 # 将项目根目录添加到Python路径
 sys.path.append(os.getcwd())
@@ -84,13 +86,6 @@ class IsComplaint(BaseModel):
    dissatisfaction_reasoning: str = Field(description="抱怨原因")
    is_complaint: bool = Field(description="是否明确/暗示将进行投诉")

-class ProductNameAndModuleName(BaseModel):
-    product_name: str = Field(description="产品名称")
-    module_name: str = Field(description="模块名称")
-
-class ProductLine(BaseModel):
-    product_line: str = Field(description="产品线")
-
 # ================ 工具函数 ================
 def retry_llm_call(max_retries=3, delay=2):
    """
@@ -138,8 +133,6 @@ class DialogueToWorkorder:
        self.user_question_and_solution_list_parser = PydanticOutputParser(pydantic_object=UserQuestionAndSolutionList)
        self.question_type_parser = PydanticOutputParser(pydantic_object=QuestionType)
        self.is_complaint_parser = PydanticOutputParser(pydantic_object=IsComplaint)
-        self.product_name_and_module_name_parser = PydanticOutputParser(pydantic_object=ProductNameAndModuleName)
-        self.product_line_parser = PydanticOutputParser(pydantic_object=ProductLine)
        # 初始化LLM模型
        self.llm_params = llm_params or {
            "temperature": 0.2,
@@ -158,6 +151,10 @@ class DialogueToWorkorder:
        #     "timeout": httpx.Timeout(600.0)
        # }
        self.llm = self._get_llm_instance()
+        
+        # 创建工单JSON文件目录
+        self.workorder_json_dir = "data/temp_workorder_json"
+        os.makedirs(self.workorder_json_dir, exist_ok=True)
    
    def _get_llm_instance(self):
        """获取LLM实例"""
@@ -483,6 +480,66 @@ class DialogueToWorkorder:
                is_complaint.dissatisfaction_reasoning, 
                is_complaint.is_complaint)
    
+    def save_conversation_to_json(self, conversation_id, workorder_list):
+        """
+        将会话处理结果保存为JSON文件
+        
+        参数:
+        conversation_id: 会话ID
+        workorder_list: 工单列表
+        """
+        # 确保目录存在
+        os.makedirs(self.workorder_json_dir, exist_ok=True)
+        
+        # 构建文件路径
+        file_path = os.path.join(self.workorder_json_dir, f"{conversation_id}.json")
+        
+        # 将工单列表转换为可序列化的字典列表
+        serializable_workorder_list = []
+        for workorder in workorder_list:
+            # 处理datetime对象
+            serializable_workorder = {}
+            for key, value in workorder.items():
+                if isinstance(value, datetime):
+                    serializable_workorder[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+                else:
+                    serializable_workorder[key] = value
+            serializable_workorder_list.append(serializable_workorder)
+        
+        # 保存为JSON文件
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(serializable_workorder_list, f, ensure_ascii=False, indent=2)
+        
+        logger.info(f"会话ID: {conversation_id} 的处理结果已保存到 {file_path}")
+    
+    def load_conversation_from_json(self, conversation_id):
+        """
+        从JSON文件加载会话处理结果
+        
+        参数:
+        conversation_id: 会话ID
+        
+        返回:
+        工单列表，如果文件不存在则返回None
+        """
+        # 构建文件路径
+        file_path = os.path.join(self.workorder_json_dir, f"{conversation_id}.json")
+        
+        # 检查文件是否存在
+        if not os.path.exists(file_path):
+            return None
+        
+        # 从JSON文件加载工单列表
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                workorder_list = json.load(f)
+            
+            logger.info(f"已从 {file_path} 加载会话ID: {conversation_id} 的处理结果")
+            return workorder_list
+        except Exception as e:
+            logger.error(f"加载会话ID: {conversation_id} 的处理结果时发生错误: {e}")
+            return None
+    
    def process_conversation(self, conversation_id, conversation_rows):
        """处理单个会话的函数，用于多线程并发"""
        # if conversation_id!="b157aa91-3acb-11f0-a191-4fb224ef4b40":
@@ -534,13 +591,16 @@ class DialogueToWorkorder:
                # 将工单添加到列表中
                workorder_list.append(workorder_dict)
            
+            # 将处理结果保存为JSON文件
+            self.save_conversation_to_json(conversation_id, workorder_list)
+            
            return workorder_list
        except Exception as e:
            logger.error(f"处理会话ID: {conversation_id} 时发生错误: {e}")
            return []
    
    def analyze_conversation_data(self, conversation_excel_path, max_workers=10, start_date=None, end_date=None):
-        """分析会话数据主流程，使用多线程并发处理"""
+        """分析会话数据主流程，使用多线程并发处理，支持失败重试和JSON合并"""
        # 读取Excel文件
        df = pd.read_excel(conversation_excel_path)
        
@@ -575,37 +635,123 @@ class DialogueToWorkorder:
            conversation_dict = new_conversation_dict

        logger.info(f"会话总数为 {len(conversation_dict)}，处理全部会话")
+
+        # ========== 新增：扫描已存在的JSON文件 ==========
+        existing_json_files = set()
+        workorder_json_dir = self.workorder_json_dir
+        if not os.path.exists(workorder_json_dir):
+            os.makedirs(workorder_json_dir, exist_ok=True)
+        for fname in os.listdir(workorder_json_dir):
+            if fname.endswith('.json'):
+                conversation_id = fname[:-5]
+                existing_json_files.add(conversation_id)
+        
+        # 本次新生成的JSON文件
+        newly_generated_json_files = set()
+        # 本次未重新生成但已存在的JSON文件
+        reused_json_files = set()
+        
+        # ========== 线程池处理会话 ==========
+        successful_conversations = set()
+        failed_conversations = set()
+        import threading
+        lock = threading.Lock()
+        
+        def process_wrapper(conversation_id, conversation_rows):
+            json_file_path = os.path.join(workorder_json_dir, f"{conversation_id}.json")
+            if conversation_id in existing_json_files and os.path.exists(json_file_path):
+                # 已存在，直接复用
+                with lock:
+                    reused_json_files.add(conversation_id)
+                return None  # 不处理
+            # 否则正常处理
+            result = self.process_conversation(conversation_id, conversation_rows)
+            if result:
+                with lock:
+                    newly_generated_json_files.add(conversation_id)
+            return result
        
-        # 使用线程池处理每个会话
-        workorder_dict_list = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # 创建任务
            future_to_conversation = {
-                executor.submit(self.process_conversation, conversation_id, conversation_rows): conversation_id
+                executor.submit(process_wrapper, conversation_id, conversation_rows): conversation_id
                for conversation_id, conversation_rows in conversation_dict.items()
            }
-            
-            # 获取结果
-            for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="处理会话进度"):
+            for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="第一轮处理会话进度"):
                conversation_id = future_to_conversation[future]
                try:
                    result_workorders = future.result()
                    if result_workorders:
-                        # 将每个会话的所有工单添加到总列表中
-                        workorder_dict_list.extend(result_workorders)
+                        successful_conversations.add(conversation_id)
                        logger.info(f"完成处理会话ID: {conversation_id}，生成工单数量: {len(result_workorders)}")
+                    elif conversation_id in reused_json_files:
+                        successful_conversations.add(conversation_id)
+                        logger.info(f"跳过已存在JSON，会话ID: {conversation_id}")
+                    else:
+                        failed_conversations.add(conversation_id)
+                        logger.warning(f"会话ID: {conversation_id} 处理可能失败，将在第二轮重试")
                except Exception as exc:
+                    failed_conversations.add(conversation_id)
                    logger.error(f"处理会话ID: {conversation_id} 时发生错误: {exc}")
-        
+
+        # 检查哪些会话没有成功生成JSON文件
+        all_conversation_ids = set(conversation_dict.keys())
+        for conversation_id in all_conversation_ids:
+            json_file_path = os.path.join(workorder_json_dir, f"{conversation_id}.json")
+            if not os.path.exists(json_file_path):
+                failed_conversations.add(conversation_id)
+                if conversation_id in successful_conversations:
+                    successful_conversations.remove(conversation_id)
+
+        # ========== 第二轮重试 ==========
+        if failed_conversations:
+            logger.info(f"第一轮处理后有 {len(failed_conversations)} 个会话需要重试")
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, max_workers // 2)) as executor:
+                future_to_conversation = {
+                    executor.submit(process_wrapper, conversation_id, conversation_dict[conversation_id]): conversation_id
+                    for conversation_id in failed_conversations
+                }
+                for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="第二轮重试处理进度"):
+                    conversation_id = future_to_conversation[future]
+                    try:
+                        result_workorders = future.result()
+                        if result_workorders:
+                            successful_conversations.add(conversation_id)
+                            newly_generated_json_files.add(conversation_id)
+                            failed_conversations.remove(conversation_id)
+                            logger.info(f"重试成功: 会话ID: {conversation_id}，生成工单数量: {len(result_workorders)}")
+                        elif conversation_id in reused_json_files:
+                            successful_conversations.add(conversation_id)
+                            failed_conversations.remove(conversation_id)
+                            logger.info(f"重试跳过已存在JSON，会话ID: {conversation_id}")
+                    except Exception as exc:
+                        logger.error(f"重试处理会话ID: {conversation_id} 时仍然发生错误: {exc}")
+
+        # ========== 合并本次所有成功的JSON文件 ==========
+        logger.info(f"开始合并JSON文件结果，成功处理会话数: {len(successful_conversations)}，失败会话数: {len(failed_conversations)}")
+        workorder_dict_list = []
+        # 只合并本次新生成和本次未重新生成但已存在的JSON
+        all_json_ids_to_merge = newly_generated_json_files.union(reused_json_files)
+        json_files = [os.path.join(workorder_json_dir, f"{cid}.json") for cid in all_json_ids_to_merge if os.path.exists(os.path.join(workorder_json_dir, f"{cid}.json"))]
+        for json_file in tqdm(json_files, desc="合并JSON文件"):
+            conversation_id = os.path.basename(json_file).replace(".json", "")
+            try:
+                with open(json_file, 'r', encoding='utf-8') as f:
+                    workorders = json.load(f)
+                workorder_dict_list.extend(workorders)
+            except Exception as e:
+                logger.error(f"加载JSON文件 {json_file} 时发生错误: {e}")
+        logger.info(f"处理完成，成功处理会话数: {len(successful_conversations)}，失败会话数: {len(failed_conversations)}")
+        if failed_conversations:
+            logger.warning(f"以下会话处理失败: {failed_conversations}")
        return workorder_dict_list
    
    def save_results_to_excel(self, workorder_dict_list, output_file=None):
-        """将结果保存到Excel文件"""
+        """将结果保存到Excel文件，并清理JSON文件"""
        result_df = pd.DataFrame(workorder_dict_list)
        
        # 按照指定的列顺序重新排列DataFrame的列
        columns_order = [
-            '工单编号', '产品线', '产品名称', '模块名称', '问题类型', 
+            '工单编号', '产品线', '产品名称', '问题类型', 
            '客户问题', '解决方案', '是否抱怨', "抱怨内容", '是否投诉', '抱怨级别', 
            '会话id', '访客昵称', '处理坐席', "处理人", "处理技能组",'创建时间'
        ]
@@ -645,7 +791,6 @@ class DialogueToWorkorder:
                '工单编号': 15,
                '产品线': 24,
                '产品名称': 40,
-                '模块名称': 40,
                '问题类型': 9,
                '客户问题': 20,
                '解决方案': 30,
@@ -668,8 +813,7 @@ class DialogueToWorkorder:
                    col_letter = chr(64 + i // 26) + chr(65 + i % 26)
                worksheet.column_dimensions[col_letter].width = column_widths[column]
        
-        logger.info(f"结果已保存到 {output_file}")
-        
+        logger.info(f"结果已保存到 {output_file}")        
        return output_file

 # ================ 参数解析 ================