Files
QueryRewrite/rag2_0/demo/judge_answer_right.py
T

293 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
答案正确性评判工具
此模块用于评判问题的新旧回答是否正确,通过与标准答案(Wiki内容)进行比较,
或者在没有标准答案的情况下比较新旧回答的差异。
用法示例:
judge = AnswerCorrectnessJudge()
judge.process()
"""
import os
import re
from urllib.parse import unquote

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

from rag2_0.tool.html_to_md import convert_html_to_md
from rag2_0.tool.ModelTool import OpenAiLLM
from rag2_0.tool.WikijsTool import WikijsTool
load_dotenv()
class AnswerCorrectnessJudge:
"""
答案正确性评判工具类
用于评估问题的新旧回答是否正确,可以通过与标准答案(Wiki内容)进行比较,
或者在没有标准答案的情况下比较新旧回答的差异。
"""
def __init__(self, wiki_excel_path="/data/Rag2_0/data/excel/部分提问_软件名称明确.xlsx",
answer_excel_path="/data/Rag2_0/data/excel/主网软件提问_对比结果.xlsx",
output_path="/data/Rag2_0/data/excel/主网软件提问回答_判断结果.xlsx"):
"""
初始化答案正确性评判工具
参数:
wiki_excel_path (str): Wiki Excel文件路径
answer_excel_path (str): 答案对比Excel文件路径
output_path (str): 输出Excel文件路径
"""
self.wiki_excel_path = wiki_excel_path
self.answer_excel_path = answer_excel_path
self.output_path = output_path
# 读取Excel文件
self.wiki_excel = pd.read_excel(self.wiki_excel_path)
self.answer_excel = pd.read_excel(self.answer_excel_path)
# 初始化LLM
self.api_key = os.getenv("OPENAI_API_KEY")
self.base_url = os.getenv("OPENAI_API_BASE")
self.model = os.getenv("LLM_MODEL_NAME")
if not all([self.api_key, self.base_url, self.model]):
raise ValueError("请设置 OPENAI_API_KEY, OPENAI_API_BASE, 和 LLM_MODEL_NAME 环境变量")
self.openai_llm = OpenAiLLM(api_key=self.api_key, base_url=self.base_url, model=self.model)
def find_wiki_link(self, query) -> str | None:
"""
根据查询(对应wiki_excel中的新提问列)找出对应的词条链接
参数:
query (str): 查询内容,对应wiki_excel中的新提问列
返回:
str: 对应的词条链接,如果没有找到则返回None
"""
# 确保query不为空
if not query or pd.isna(query):
return None
# 在"新提问"列中查找匹配的行
matched_rows = self.wiki_excel[self.wiki_excel['新提问'] == query]
# 如果找到了匹配的行,返回对应的词条链接
if not matched_rows.empty:
return matched_rows.iloc[0]['对应词条链接']
# 如果没有完全匹配,尝试部分匹配
# 去除软件名称部分(如果有)
query_parts = query.split(',', 1)
if len(query_parts) > 1:
clean_query = query_parts[1].strip()
# 在"提问"列中查找包含清理后查询的行
for idx, row in self.wiki_excel.iterrows():
if pd.notna(row['提问']) and clean_query in row['提问']:
return row['对应词条链接']
return None
def get_wiki_content(self, link) -> str:
"""
获取词条链接的内容
参数:
link (str): 词条链接
返回:
str: 链接内容,如果获取失败则返回错误信息
"""
try:
if not link or pd.isna(link):
return "链接为空或无效"
# 移除域名部分,只保留路径
path = link.split('/', 3)[-1]
decoded_path = unquote(path)
path_parts = decoded_path.split('/')
doc_path = "/".join(path_parts[1:])
wiki_doc = WikijsTool.get_all_doc_by_path(path=doc_path, path_is_dir=False)
html_content = WikijsTool.query_doc_info(wiki_doc[0]["id"]).get('content')
if not html_content:
return "获取内容失败"
options = {"heading_style": '', "keep_inline_images_in": ["figure", "img"], "escape_asterisks": True}
new_content = (html_content.replace("h6>", "h7>")
.replace("h5>", "h6>")
.replace("h4>", "h5>")
.replace("h3>", "h4>")
.replace("h2>", "h3>")
.replace("h1>", "h2>"))
# 将HTML内容转换为Markdown
markdown_content = convert_html_to_md(new_content, "", **options)
markdown_content = f"# {path_parts[-1]}\n\n{markdown_content}"
return markdown_content
except Exception as e:
raise RuntimeError(f"获取词条内容失败: {str(e)}") from e
def create_prompt(self, standard_answer: str, answer_to_check: str) -> str:
"""
创建用于评判答案的prompt
参数:
standard_answer (str): 标准答案
answer_to_check (str): 需要检查的答案
返回:
str: 格式化的prompt
"""
return f"""请作为一个专业的答案评判专家,评估以下回答与标准答案的匹配程度。
标准答案:
{standard_answer}
待评估的回答:
{answer_to_check}
请仔细分析两个答案的内容,并给出你的判断。只需要回答"正确"或"错误",不需要其他解释。
如果待评估的回答与标准答案在核心内容和关键信息(步骤)上一致,即使表达方式不同,也应判定为"正确"。
如果待评估的回答存在明显的错误信息或重要信息缺失,应判定为"错误"。
请严格按以下格式输出:【正确】或【错误】:"""
def judge_old_answer(self, standard_answer: str, old_answer: str) -> bool | None:
"""
调用LLM判断旧回答是否正确
参数:
standard_answer (str): 标准答案(来自Wiki
old_answer (str): 旧流程的回答
返回:
bool | None: 判断结果,True表示正确,False表示错误,None表示判断失败
"""
prompt = self.create_prompt(standard_answer, old_answer)
try:
response = self.openai_llm.invoke(prompt)
return "正确" in response.content
except Exception as e:
return None
def judge_new_answer(self, standard_answer: str, new_answer: str) -> bool | None:
"""
调用LLM判断新回答是否正确
参数:
standard_answer (str): 标准答案(来自Wiki
new_answer (str): 新流程的回答
返回:
bool | None: 判断结果,True表示正确,False表示错误,None表示判断失败
"""
prompt = self.create_prompt(standard_answer, new_answer)
try:
response = self.openai_llm.invoke(prompt)
return "正确" in response.content
except Exception as e:
return None
def judge_by_standard_answer(self, standard_answer: str, old_answer: str, new_answer: str) -> str | None:
"""
综合判断新旧回答的正确性
参数:
standard_answer (str): 标准答案(来自Wiki
old_answer (str): 旧流程的回答
new_answer (str): 新流程的回答
返回:
str | None: 包含新旧回答判断结果的字符串,None表示判断失败
"""
old_result = self.judge_old_answer(standard_answer, old_answer)
new_result = self.judge_new_answer(standard_answer, new_answer)
if old_result is None or new_result is None:
return None
if new_result and old_result:
return "新旧答案均正确"
elif new_result and not old_result:
return "新答案正确"
elif not new_result and old_result:
return "旧答案正确"
else:
return "新旧答案均错误"
def judge_answer_diff(self, old_answer: str, new_answer: str) -> str | None:
"""
判断新旧回答是否存在较大差异
参数:
old_answer (str): 旧流程的回答
new_answer (str): 新流程的回答
返回:
str | None: 差异判断结果,None表示判断失败
"""
prompt = f"""请判断以下两个回答是否存在较大差异:
旧回答: {old_answer}
新回答: {new_answer}
主要是关键步骤、关键信息、或者关键主体的差异
请仅回答"存在较大差异"或"差异较小"。"""
try:
response = self.openai_llm.invoke(prompt)
return "无法判断,新老答案差异较大" if "存在较大差异" in response.content else "无法判断,新老答案基本相同"
except Exception as e:
return None
def process(self):
"""
处理所有问题并评判答案正确性
读取Excel文件中的问题和答案,进行评判,并将结果保存到输出Excel文件
"""
# 创建结果列表
results = []
# 读取Excel文件
for idx, row in tqdm(self.answer_excel.iterrows(), total=len(self.answer_excel), desc="处理问题"):
query = row["问题"]
old_answer = row["旧流程答案"]
new_answer = row["新流程答案"]
standard_answer = ""
try:
wiki_url = self.find_wiki_link(query)
if wiki_url and not pd.isna(wiki_url):
standard_answer = self.get_wiki_content(wiki_url)
except Exception as e:
print(f"处理问题 '{query}' 时发生错误: {str(e)}")
if standard_answer:
# 判断答案正确性
judge_result = self.judge_by_standard_answer(standard_answer, old_answer, new_answer)
else:
judge_result = self.judge_answer_diff(old_answer, new_answer)
if judge_result is None:
judge_result = ""
results.append({
"问题": query,
"旧流程答案": old_answer,
"新流程答案": new_answer,
"判断结果": judge_result
})
# 将结果转换为DataFrame并保存
results_df = pd.DataFrame(results)
results_df.to_excel(self.output_path, index=False)
print(f"处理完成,共处理 {len(results)} 条记录,结果已保存至 {self.output_path}")
# Script entry point: run the full judging pipeline with the default paths.
if __name__ == "__main__":
    # Build the judge and process every question in one go.
    AnswerCorrectnessJudge().process()