diff --git a/rag2_0/demo/dialogue_to_workorder.py b/rag2_0/demo/dialogue_to_workorder.py index 4321b5e..bcf13aa 100755 --- a/rag2_0/demo/dialogue_to_workorder.py +++ b/rag2_0/demo/dialogue_to_workorder.py @@ -14,6 +14,7 @@ import httpx import traceback import re import logging +from tqdm import tqdm # 将项目根目录添加到Python路径 sys.path.append(os.getcwd()) @@ -469,9 +470,10 @@ class DialogueToWorkorder: "客户问题": user_question_str, "问题类型": problem_type, "是否抱怨": "是" if is_dissatisfaction else '否', + "抱怨内容": dissatisfaction_reasoning if is_dissatisfaction else '', "抱怨级别": dissatisfaction_level if is_dissatisfaction else '', "是否投诉": "是" if is_complaint else '否', - "解决方案": (solution_str + '\n存在抱怨:' + dissatisfaction_reasoning) if is_dissatisfaction else solution_str + "解决方案": solution_str }) workorder_list.append(base_workorder_dict) # for user_question in user_question_list: @@ -544,7 +546,7 @@ class DialogueToWorkorder: } # 获取结果 - for future in concurrent.futures.as_completed(future_to_conversation): + for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="处理会话进度"): conversation_id = future_to_conversation[future] try: result_workorders = future.result() @@ -637,9 +639,9 @@ def parse_arguments(): help='产品详情Excel文件路径') parser.add_argument('--max_workers', type=int, default=16, help='并发处理线程数,默认为16') - parser.add_argument('--start_date', type=str, required=False, + parser.add_argument('--start_date', type=str, required=False,default="2025-05-25 00:00:00", help='开始日期,格式为YYYY-MM-DD') - parser.add_argument('--end_date', type=str, required=False, + parser.add_argument('--end_date', type=str, required=False,default="2025-05-30 15:54", help='结束日期,格式为YYYY-MM-DD') return parser.parse_args() diff --git a/rag2_0/demo/heli_db_to_excel.py b/rag2_0/demo/heli_db_to_excel.py index 9018c09..823fd90 100755 --- a/rag2_0/demo/heli_db_to_excel.py +++ b/rag2_0/demo/heli_db_to_excel.py @@ -23,6 +23,7 @@ from tqdm import tqdm import concurrent.futures import sys +os.makedirs('./data/log', exist_ok=True) # 配置日志 logging.basicConfig( level=logging.INFO, @@ -33,7 +34,7 @@ logging.basicConfig( ] ) logger = logging.getLogger(__name__) -os.makedirs('./data/log', exist_ok=True) + @dataclass class DatabaseConfig: @@ -492,8 +493,8 @@ def main() -> None: # 创建数据库客户端 with MariaDBClient(config, max_connections=12) as db_client: # 查询会话数据 - start_date = '2025-01-01 00:00:00' - end_date = '2025-06-12 00:00:00' + start_date = '2025-06-12 00:00:00' + end_date = '2025-07-01 00:00:00' logger.info(f"查询时间范围: {start_date} 到 {end_date}") # 创建会话处理器 diff --git a/rag2_0/dify/dify_client/dify_api.py b/rag2_0/dify/dify_client/dify_api.py index 49c4d04..c6e585d 100644 --- a/rag2_0/dify/dify_client/dify_api.py +++ b/rag2_0/dify/dify_client/dify_api.py @@ -402,7 +402,8 @@ class DifyApi: content: str, answer: str, keywords: List[str], - enabled: bool + enabled: bool, + regenerate_child_chunks: bool = True ) -> Dict: """ 更新指定文档的某个分段信息。 @@ -430,7 +431,7 @@ class DifyApi: "answer": answer, "keywords": keywords, "enabled": enabled, - "regenerate_child_chunks": True + "regenerate_child_chunks": regenerate_child_chunks } } diff --git a/rag2_0/dify/export_new_dify.py b/rag2_0/dify/export_new_dify.py index c95f856..04cc6b1 100644 --- a/rag2_0/dify/export_new_dify.py +++ b/rag2_0/dify/export_new_dify.py @@ -37,9 +37,9 @@ class DifyExporter: self.query_log_dir = os.path.join(os.getcwd(), "data", "query_logs") self.query_log_file = query_log_file or os.path.join(self.query_log_dir, "answer_type_logs.json") - # 设置日期过滤 - self.start_date = start_date - self.end_date = end_date + # 设置日期过滤,转换为datetime对象 + self.start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d %H") if start_date else None + self.end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d %H") if end_date else None # 初始化工具类 self.dify_pgsql = PgSql() @@ -49,22 +49,22 @@ class DifyExporter: self.message_info_list = [] self.query_logs = {} - def load_query_logs(self): + def load_query_logs(self,path): """ 从文件加载查询日志 """ try: - with open(self.query_log_file, 'r', encoding='utf-8') as f: + with open(path, 'r', encoding='utf-8') as f: query_logs_list = json.load(f) - # 创建字典来存储每个查询的最新记录 + # 创建字典来存储每个查询的最新记录workflow_run_id for record in query_logs_list: - query = record['query'] + workflow_run_id = record['workflow_run_id'] timestamp = record.get('timestamp') # 如果查询不在字典中或者当前记录的时间戳更新,则更新字典 - if query not in self.query_logs or (timestamp and self.query_logs.get(query, {}).get('timestamp') and + if workflow_run_id not in self.query_logs or (timestamp and self.query_logs.get(workflow_run_id, {}).get('timestamp') and datetime.datetime.fromisoformat(timestamp) > - datetime.datetime.fromisoformat(self.query_logs[query]['timestamp'])): - self.query_logs[query] = record + datetime.datetime.fromisoformat(self.query_logs[workflow_run_id]['timestamp'])): + self.query_logs[workflow_run_id] = record return True except Exception as e: print(f"加载查询日志失败: {e}") @@ -103,6 +103,27 @@ class DifyExporter: message_chain_new.append(msg) return message_chain_new + def get_remark(self, msg_debug_info): + """ + 获取备注 + """ + intent_node_execution_info = [node_execution_info for node_execution_info in msg_debug_info['workflow_node_executions_info'] + if node_execution_info["title"] == "意图识别结果解析"] + if len(intent_node_execution_info) == 0: + return "" + + if intent_node_execution_info[0]["outputs"] is None: + return "" + intent_result = json.loads(intent_node_execution_info[0]["outputs"]) + vertical_classification = intent_result.get("vertical_classification", "") + sub_classification = intent_result.get("sub_classification", "") + if vertical_classification == "固定话术类": + return "使用固定话术" + + if sub_classification == "软件锁类": + return "固定引导至博微软件助手中操作" + return "" + def extract_message_info(self, message): """ 从消息中提取信息 @@ -121,7 +142,7 @@ class DifyExporter: user_name = msg_inputs.get("user_name", "") msg_query = message["query"] msg_answer = message["answer"] - + msg_answer = msg_answer.split("----------------------------------------")[0] # 将UTC+0时间转换为UTC+8时间 created_at_utc = message['created_at'] created_at_utc8 = created_at_utc + datetime.timedelta(hours=8) @@ -143,13 +164,17 @@ class DifyExporter: document_name = knowledge['metadata']['document_name'] wiki_list.append(document_name.split("/")[-1]) + # 获取备注 + remark = self.get_remark(msg_debug_info) + wiki_list = list(set(wiki_list)) wiki_list_str = "\n".join(wiki_list) if wiki_list_str == "": wiki_list_str = "无" rating = self.dify_pgsql.get_message_rating(msg_id) # 直接通过字典键获取query_type - query_type = self.query_logs.get(msg_query, {}).get('query_type', "") + workflow_run_id = message['workflow_run_id'] + query_type = self.query_logs.get(workflow_run_id, {}).get('query_type', "") return { "msg_id": msg_id, @@ -159,7 +184,8 @@ class DifyExporter: "提问时间": created_at, "评价": rating, "问题分类": query_type, - "检索到的词条": wiki_list_str + "检索到的词条": wiki_list_str, + "备注": remark } def process_conversations(self): @@ -184,13 +210,10 @@ class DifyExporter: created_at_utc = message['created_at'] created_at_utc8 = created_at_utc + datetime.timedelta(hours=8) - # 提取消息的创建日期时间,精确到小时 - created_at_hour = created_at_utc8.strftime("%Y-%m-%d %H") - # 应用日期时间过滤 - if self.start_date and created_at_hour < self.start_date: + if self.start_date and created_at_utc8 < self.start_date: continue - if self.end_date and created_at_hour > self.end_date: + if self.end_date and created_at_utc8 > self.end_date: continue message_info = self.extract_message_info(message) @@ -281,7 +304,8 @@ class DifyExporter: 数据库中的时间是UTC+0时区,会自动转换为UTC+8时区进行过滤和显示 """ # 加载查询日志 - self.load_query_logs() + self.load_query_logs(self.query_log_file) + self.load_query_logs("data/query_logs/answer_type_logs_071409.json") # 处理会话数据 self.process_conversations() @@ -294,12 +318,12 @@ class DifyExporter: # 如果指定了日期范围,则在文件名中体现 date_suffix = "" if self.start_date: - # 将空格替换为下划线,使文件名更规范 - formatted_start = self.start_date.replace(" ", "_") + # 格式化日期对象为字符串 + formatted_start = self.start_date.strftime("%Y-%m-%d_%H") date_suffix += f"_from_{formatted_start}" if self.end_date: - # 将空格替换为下划线,使文件名更规范 - formatted_end = self.end_date.replace(" ", "_") + # 格式化日期对象为字符串 + formatted_end = self.end_date.strftime("%Y-%m-%d_%H") date_suffix += f"_to_{formatted_end}" output_file = os.path.join(os.getcwd(), "data", "excel", f"dify_export{date_suffix}_{timestamp}.xlsx") @@ -321,9 +345,9 @@ if __name__ == "__main__": help='Dify应用ID') parser.add_argument('--query_log_file', '-q', type=str, default="data/query_logs/answer_type_logs.json", help='查询日志文件路径') - parser.add_argument('--start_date', '-s', type=str, default="2025-07-09 13", + parser.add_argument('--start_date', '-s', type=str, default="2025-07-14 00", help='开始日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 14表示2025年7月8日14时(UTC+8时区)') - parser.add_argument('--end_date', '-e', type=str, default=None, + parser.add_argument('--end_date', '-e', type=str, default="2025-07-14 15", help='结束日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 18表示2025年7月8日18时(UTC+8时区)') args = parser.parse_args() diff --git a/rag2_0/intent_recognition/IntentRecognition.py b/rag2_0/intent_recognition/IntentRecognition.py index 55b507b..e3c7e1e 100755 --- a/rag2_0/intent_recognition/IntentRecognition.py +++ b/rag2_0/intent_recognition/IntentRecognition.py @@ -288,8 +288,10 @@ class AsyncIntentRecognizer: # 步骤2: 使用向量检索找到相似的专业名词 try: vector_start_time = time.time() - # 对matched_terms中的每个关键字进行向量检索 - for current_key in query_keys: + + # 创建并行任务列表 + async def process_single_keyword(current_key: str) -> List[Term]: + """处理单个关键词的向量检索和重排序""" vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False) current_key_terms = set() # 添加向量检索结果 @@ -304,7 +306,17 @@ class AsyncIntentRecognizer: current_key_terms.add(term) if len(current_key_terms) > 0: reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms) - matched_terms.extend(reranked_terms) + return reranked_terms + return [] + + # 并行处理所有关键词 + keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys] + keyword_results = await asyncio.gather(*keyword_tasks) + + # 合并所有结果 + for result in keyword_results: + matched_terms.extend(result) + vector_end_time = time.time() vector_time = vector_end_time - vector_start_time except Exception as e: @@ -649,7 +661,7 @@ class AsyncIntentRecognizer: formatted_prompt = step_back_prompt.format( query=query, chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]", - conversation_context=conversation_context, + # conversation_context=conversation_context, output_format=step_back_parser.get_format_instructions() ) @@ -688,7 +700,7 @@ class AsyncIntentRecognizer: formatted_prompt = follow_up_questions_prompt.format( query=query, chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]", - conversation_context=conversation_context, + # conversation_context=conversation_context, output_format=follow_up_parser.get_format_instructions() ) @@ -727,7 +739,7 @@ class AsyncIntentRecognizer: formatted_prompt = hyde_prompt.format( query=query, chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]", - conversation_context=conversation_context, + # conversation_context=conversation_context, output_format=hyde_parser.get_format_instructions() ) @@ -766,7 +778,7 @@ class AsyncIntentRecognizer: formatted_prompt = multi_questions_prompt.format( query=query, chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]", - conversation_context=conversation_context, + # conversation_context=conversation_context, output_format=multi_questions_parser.get_format_instructions() ) diff --git a/rag2_0/intent_recognition/PromptTemplates.py b/rag2_0/intent_recognition/PromptTemplates.py index 2c21525..32495d7 100755 --- a/rag2_0/intent_recognition/PromptTemplates.py +++ b/rag2_0/intent_recognition/PromptTemplates.py @@ -226,30 +226,30 @@ step_back_prompt = """ - 涵盖原始问题的核心主题 - 去除过于具体的限制条件(如时间、地点、特定版本、特定工程等) - 保持在同一领域和主题范围内 + - 依次移除问题中的限定词或者修饰词 ## 输入 用户原始问题: {query} 历史对话记录: {chat_history} -会话背景: {conversation_context} ## 输出格式 {output_format} ## 示例 -原始问题: "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?" +原始问题: "2023版本如何在Windows 11系统上导入单位工程量清单?" 后退问题: {{ - "original_query": "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?", + "original_query": "2023版本如何在Windows 11系统上导入单位工程量清单?", "can_use_back_prompt": True, - "step_back_query": ["配网D3软件如何导入工程量清单?", "如何导入单位工程量清单?"] + "step_back_query": ["如何在Windows 11系统上导入单位工程量清单?", "如何导入单位工程量清单?"] }} -原始问题: "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?" +原始问题: "某个设备更换后,如何在系统中更新对应的定额?" 后退问题: {{ - "original_query": "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?", + "original_query": "某个设备更换后,如何在系统中更新对应的定额?", "can_use_back_prompt": True, - "step_back_query": ["技改T1软件中如何更新设备对应的定额?", "如何更新设备对应的定额?"] + "step_back_query": ["如何更新设备对应的定额?", "如何更新定额?"] }} """ @@ -271,7 +271,6 @@ follow_up_questions_prompt = """ ## 输入 历史对话记录: {chat_history} 当前用户问题: {query} -会话背景: {conversation_context} ## 输出格式 {output_format} @@ -308,7 +307,6 @@ hyde_prompt = """ ## 输入 用户问题: {query} 历史对话记录: {chat_history} -会话背景: {conversation_context} ## 输出格式 {output_format} @@ -343,7 +341,6 @@ multi_questions_prompt = """ ## 输入 用户原始问题: {query} 历史对话记录: {chat_history} -会话背景: {conversation_context} ## 输出格式 {output_format}