""" 答案正确性评判工具 此模块用于评判问题的新旧回答是否正确,通过与标准答案(Wiki内容)进行比较, 或者在没有标准答案的情况下比较新旧回答的差异。 用法示例: judge = AnswerCorrectnessJudge() judge.process() """ import pandas as pd from urllib.parse import unquote from rag2_0.tool.WikijsTool import WikijsTool from rag2_0.tool.html_to_md import convert_html_to_md from rag2_0.tool.ModelTool import OpenAiLLM from dotenv import load_dotenv import os from tqdm import tqdm load_dotenv() class AnswerCorrectnessJudge: """ 答案正确性评判工具类 用于评估问题的新旧回答是否正确,可以通过与标准答案(Wiki内容)进行比较, 或者在没有标准答案的情况下比较新旧回答的差异。 """ def __init__(self, wiki_excel_path="/data/Rag2_0/data/excel/部分提问_软件名称明确.xlsx", answer_excel_path="/data/Rag2_0/data/excel/主网软件提问_对比结果.xlsx", output_path="/data/Rag2_0/data/excel/主网软件提问回答_判断结果.xlsx"): """ 初始化答案正确性评判工具 参数: wiki_excel_path (str): Wiki Excel文件路径 answer_excel_path (str): 答案对比Excel文件路径 output_path (str): 输出Excel文件路径 """ self.wiki_excel_path = wiki_excel_path self.answer_excel_path = answer_excel_path self.output_path = output_path # 读取Excel文件 self.wiki_excel = pd.read_excel(self.wiki_excel_path) self.answer_excel = pd.read_excel(self.answer_excel_path) # 初始化LLM self.api_key = os.getenv("OPENAI_API_KEY") self.base_url = os.getenv("OPENAI_API_BASE") self.model = os.getenv("LLM_MODEL_NAME") if not all([self.api_key, self.base_url, self.model]): raise ValueError("请设置 OPENAI_API_KEY, OPENAI_API_BASE, 和 LLM_MODEL_NAME 环境变量") self.openai_llm = OpenAiLLM(api_key=self.api_key, base_url=self.base_url, model=self.model) def find_wiki_link(self, query) -> str | None: """ 根据查询(对应wiki_excel中的新提问列)找出对应的词条链接 参数: query (str): 查询内容,对应wiki_excel中的新提问列 返回: str: 对应的词条链接,如果没有找到则返回None """ # 确保query不为空 if not query or pd.isna(query): return None # 在"新提问"列中查找匹配的行 matched_rows = self.wiki_excel[self.wiki_excel['新提问'] == query] # 如果找到了匹配的行,返回对应的词条链接 if not matched_rows.empty: return matched_rows.iloc[0]['对应词条链接'] # 如果没有完全匹配,尝试部分匹配 # 去除软件名称部分(如果有) query_parts = query.split(',', 1) if len(query_parts) > 1: clean_query = query_parts[1].strip() # 在"提问"列中查找包含清理后查询的行 for idx, row in self.wiki_excel.iterrows(): if pd.notna(row['提问']) and clean_query in row['提问']: return row['对应词条链接'] return None def get_wiki_content(self, link) -> str: """ 获取词条链接的内容 参数: link (str): 词条链接 返回: str: 链接内容,如果获取失败则返回错误信息 """ try: if not link or pd.isna(link): return "链接为空或无效" # 移除域名部分,只保留路径 path = link.split('/', 3)[-1] decoded_path = unquote(path) path_parts = decoded_path.split('/') doc_path = "/".join(path_parts[1:]) wiki_doc = WikijsTool.get_all_doc_by_path(path=doc_path, path_is_dir=False) html_content = WikijsTool.query_doc_info(wiki_doc[0]["id"]).get('content') if not html_content: return "获取内容失败" options = {"heading_style": '', "keep_inline_images_in": ["figure", "img"], "escape_asterisks": True} new_content = (html_content.replace("h6>", "h7>") .replace("h5>", "h6>") .replace("h4>", "h5>") .replace("h3>", "h4>") .replace("h2>", "h3>") .replace("h1>", "h2>")) # 将HTML内容转换为Markdown markdown_content = convert_html_to_md(new_content, "", **options) markdown_content = f"# {path_parts[-1]}\n\n{markdown_content}" return markdown_content except Exception as e: raise RuntimeError(f"获取词条内容失败: {str(e)}") from e def create_prompt(self, standard_answer: str, answer_to_check: str) -> str: """ 创建用于评判答案的prompt 参数: standard_answer (str): 标准答案 answer_to_check (str): 需要检查的答案 返回: str: 格式化的prompt """ return f"""请作为一个专业的答案评判专家,评估以下回答与标准答案的匹配程度。 标准答案: {standard_answer} 待评估的回答: {answer_to_check} 请仔细分析两个答案的内容,并给出你的判断。只需要回答"正确"或"错误",不需要其他解释。 如果待评估的回答与标准答案在核心内容和关键信息(步骤)上一致,即使表达方式不同,也应判定为"正确"。 如果待评估的回答存在明显的错误信息或重要信息缺失,应判定为"错误"。 请严格按以下格式输出:【正确】或【错误】:""" def judge_old_answer(self, standard_answer: str, old_answer: str) -> bool | None: """ 调用LLM判断旧回答是否正确 参数: standard_answer (str): 标准答案(来自Wiki) old_answer (str): 旧流程的回答 返回: bool | None: 判断结果,True表示正确,False表示错误,None表示判断失败 """ prompt = self.create_prompt(standard_answer, old_answer) try: response = self.openai_llm.invoke(prompt) return "正确" in response.content except Exception as e: return None def judge_new_answer(self, standard_answer: str, new_answer: str) -> bool | None: """ 调用LLM判断新回答是否正确 参数: standard_answer (str): 标准答案(来自Wiki) new_answer (str): 新流程的回答 返回: bool | None: 判断结果,True表示正确,False表示错误,None表示判断失败 """ prompt = self.create_prompt(standard_answer, new_answer) try: response = self.openai_llm.invoke(prompt) return "正确" in response.content except Exception as e: return None def judge_by_standard_answer(self, standard_answer: str, old_answer: str, new_answer: str) -> str | None: """ 综合判断新旧回答的正确性 参数: standard_answer (str): 标准答案(来自Wiki) old_answer (str): 旧流程的回答 new_answer (str): 新流程的回答 返回: str | None: 包含新旧回答判断结果的字符串,None表示判断失败 """ old_result = self.judge_old_answer(standard_answer, old_answer) new_result = self.judge_new_answer(standard_answer, new_answer) if old_result is None or new_result is None: return None if new_result and old_result: return "新旧答案均正确" elif new_result and not old_result: return "新答案正确" elif not new_result and old_result: return "旧答案正确" else: return "新旧答案均错误" def judge_answer_diff(self, old_answer: str, new_answer: str) -> str | None: """ 判断新旧回答是否存在较大差异 参数: old_answer (str): 旧流程的回答 new_answer (str): 新流程的回答 返回: str | None: 差异判断结果,None表示判断失败 """ prompt = f"""请判断以下两个回答是否存在较大差异: 旧回答: {old_answer} 新回答: {new_answer} 主要是关键步骤、关键信息、或者关键主体的差异 请仅回答"存在较大差异"或"差异较小"。""" try: response = self.openai_llm.invoke(prompt) return "无法判断,新老答案差异较大" if "存在较大差异" in response.content else "无法判断,新老答案基本相同" except Exception as e: return None def process(self): """ 处理所有问题并评判答案正确性 读取Excel文件中的问题和答案,进行评判,并将结果保存到输出Excel文件 """ # 创建结果列表 results = [] # 读取Excel文件 for idx, row in tqdm(self.answer_excel.iterrows(), total=len(self.answer_excel), desc="处理问题"): query = row["问题"] old_answer = row["旧流程答案"] new_answer = row["新流程答案"] standard_answer = "" try: wiki_url = self.find_wiki_link(query) if wiki_url and not pd.isna(wiki_url): standard_answer = self.get_wiki_content(wiki_url) except Exception as e: print(f"处理问题 '{query}' 时发生错误: {str(e)}") if standard_answer: # 判断答案正确性 judge_result = self.judge_by_standard_answer(standard_answer, old_answer, new_answer) else: judge_result = self.judge_answer_diff(old_answer, new_answer) if judge_result is None: judge_result = "" results.append({ "问题": query, "旧流程答案": old_answer, "新流程答案": new_answer, "判断结果": judge_result }) # 将结果转换为DataFrame并保存 results_df = pd.DataFrame(results) results_df.to_excel(self.output_path, index=False) print(f"处理完成,共处理 {len(results)} 条记录,结果已保存至 {self.output_path}") # 测试函数 if __name__ == "__main__": # 创建答案正确性评判工具实例 judge = AnswerCorrectnessJudge() # 执行处理 judge.process()