"""Attribute each disliked answer to the pipeline stage that most likely failed.

The script reads the first-round analysis workbook, keeps the rows rated
"dislike", loads the matching node-execution records from a JSON dump, and
writes the rewritten query plus the diagnosed stage / reason / description
back into a new workbook.
"""
import json

import ijson
import pandas as pd
from regex import search  # not used by the code below; kept so nothing that relies on it breaks

# .copy() makes the filtered frame independent of the frame returned by
# read_excel, so the column assignments further down do not trigger pandas'
# SettingWithCopyWarning.
df = pd.read_excel("data/excel/已分析数据汇总(第一轮).xlsx")
df = df[df["评价"] == "dislike"].copy()
msg_id_list = df["msg_id"].tolist()
# Set copy of the ids: membership is tested once per top-level JSON entry.
_wanted_msg_ids = set(msg_id_list)
msg_debug_list = {}

# Stream-parse the JSON file instead of loading it in one go.
with open("data/excel/msg_debug_list.json", "r", encoding="utf-8") as f:
    # ijson.kvitems with an empty prefix yields the top-level key/value pairs.
    for msg_id, data in ijson.kvitems(f, ''):
        if msg_id in _wanted_msg_ids:
            msg_debug_list[msg_id] = data


def get_rewrite_query(intent_node_execution_info) -> str:
    """Return the ``optimize_query`` field of an intent node's JSON outputs.

    Args:
        intent_node_execution_info: Mapping whose ``'outputs'`` entry is a
            JSON string containing an ``optimize_query`` key.

    Raises:
        KeyError: If ``'outputs'`` or ``optimize_query`` is missing.
        json.JSONDecodeError: If ``'outputs'`` is not valid JSON.
    """
    outputs_result = json.loads(intent_node_execution_info['outputs'])
    return outputs_result['optimize_query']


def _describe_retrieved_entries(source_knowledge) -> str:
    """Format the retrieved entries (short title and rerank score) as text."""
    lines = [
        f"词条名称:{item['title'].split('/')[-1]},重排评分:{item['metadata']['score']:.2f}\n"
        for item in source_knowledge
    ]
    return "检索到的词条如下:\n" + "".join(lines)


def judge_error_node_and_reason(intent_node_execution_info, knowledge_filter_node_execution_info_list, answer_wiki_name) -> dict:
    """Decide which stage failed for one message and explain why.

    The checks run in this order; the first one that matches wins:

    1. ``is_complete`` is false in the intent outputs -> "槽点填充".
    2. ``answer_wiki_name`` does not occur in the titles under
       ``source_kno`` -> "知识检索".
    3. ``answer_wiki_name`` does not occur in the titles under
       ``knowledge_list_metadata`` -> "知识过滤".
    4. Otherwise -> "生成错误".

    Title matching is a substring test against the newline-joined titles.

    Args:
        intent_node_execution_info: Intent node record; ``'outputs'`` is a
            JSON string.
        knowledge_filter_node_execution_info_list: Knowledge-filter node
            records; only the first one is inspected.
        answer_wiki_name: Name of the entry that should have been used.
            ``None``/NaN means there is nothing to judge.

    Returns:
        Dict with the keys "问题改写结果", "错误环节", "错误原因" and
        "具体描述". Every value is ``None`` when ``answer_wiki_name`` is
        missing; only "问题改写结果" is filled when no knowledge-filter
        record is available.
    """
    result = {"问题改写结果": None, "错误环节": None, "错误原因": None, "具体描述": None}
    if answer_wiki_name is None or pd.isna(answer_wiki_name):
        return result

    outputs_result = json.loads(intent_node_execution_info['outputs'])
    result["问题改写结果"] = outputs_result['optimize_query']

    # Kept as an equality test on purpose: a missing/None flag must not be
    # treated as "incomplete".
    if outputs_result['is_complete'] == False:  # noqa: E712
        result["错误环节"] = "槽点填充"
        result["错误原因"] = "槽点缺失"
        result["具体描述"] = f"缺失内容:{outputs_result['missing_slots']}"
        return result

    if not knowledge_filter_node_execution_info_list:
        return result
    knowledge_filter_node_execution_info = knowledge_filter_node_execution_info_list[0]

    # All entries returned by retrieval.
    knowledge_filter_outputs = json.loads(knowledge_filter_node_execution_info['outputs'])
    source_knowledge = knowledge_filter_outputs['source_kno']
    source_knowledge_title = "\n".join(item['title'] for item in source_knowledge)
    if answer_wiki_name not in source_knowledge_title:
        result["错误环节"] = "知识检索"
        result["错误原因"] = "未检索到对应词条"
        # List each retrieved entry with its rerank score.
        result["具体描述"] = _describe_retrieved_entries(source_knowledge)
        return result

    # Entries listed under knowledge_list_metadata.
    knowledge_filter = knowledge_filter_outputs['knowledge_list_metadata']
    knowledge_filter_title = "\n".join(item['title'] for item in knowledge_filter)
    if answer_wiki_name not in knowledge_filter_title:
        result["错误环节"] = "知识过滤"
        result["错误原因"] = "词条被过滤"
        result["具体描述"] = _describe_retrieved_entries(source_knowledge)
        return result

    # Retrieval was correct, so the answer generation is at fault.
    result["错误环节"] = "生成错误"
    result["错误原因"] = ""
    result["具体描述"] = ""
    return result


df["问题改写结果"] = None
df["错误环节"] = None
df["错误原因"] = None
df["具体描述"] = None

for index, row in df.iterrows():
    # Read outside the try block so the error message below can always name
    # the message; the column is known to exist (it was used above).
    msg_id = row["msg_id"]
    try:
        dislike_reason = row["点踩原因"]
        if dislike_reason is None or pd.isna(dislike_reason):
            continue
        answer_wiki_name = row["关联词条"]

        node_executions_info = msg_debug_list[msg_id]
        intent_node_execution_info = [
            node_execution_info
            for node_execution_info in node_executions_info
            if node_execution_info["title"] == "意图识别结果解析"
        ]
        knowledge_filter_node_execution_info_list = [
            node_execution_info
            for node_execution_info in node_executions_info
            if node_execution_info["title"] == "提取处理后的知识"
        ]
        if not intent_node_execution_info:
            print(f"msg_id: {msg_id} 缺少节点信息")
            continue

        rewrite_query = get_rewrite_query(intent_node_execution_info[0])
        df.loc[index, "问题改写结果"] = rewrite_query

        if "有词条" not in dislike_reason:
            continue

        result = judge_error_node_and_reason(
            intent_node_execution_info[0],
            knowledge_filter_node_execution_info_list,
            answer_wiki_name,
        )
        for key, value in result.items():
            # Skip None: the columns already default to None, and writing it
            # would erase the rewritten query stored just above when the row
            # has no associated entry name.
            if value is not None:
                df.loc[index, key] = value
    except Exception as e:
        # Best-effort per row: report the failure and keep processing.
        print(f"msg_id: {msg_id} 处理失败: {e}")
        continue

df.to_excel("data/excel/已分析数据汇总(第一轮)_分析.xlsx", index=False)