新增单元测试
This commit is contained in:
@@ -0,0 +1,123 @@
|
||||
from dotenv import load_dotenv
|
||||
# Load variables from a .env file first, so they are already in the
# environment when the `app.*` modules below are imported and initialised.
load_dotenv()
|
||||
|
||||
import os,json,asyncio,nest_asyncio
|
||||
from typing import Dict
|
||||
from llama_index.core.evaluation import CorrectnessEvaluator
|
||||
from app.observability import init_observability
|
||||
from app.settings import init_settings
|
||||
|
||||
# Apply the application settings (helper imported from app.settings).
# NOTE(review): presumably this configures the global LLM used by the
# evaluator created further down -- confirm in app.settings.
init_settings()
|
||||
# Set up observability (helper imported from app.observability).
# NOTE(review): what exactly gets instrumented is not visible from this file.
init_observability()
|
||||
# Patch asyncio so that an event loop can be re-entered; applied before any
# loop is started. Presumably required because synchronous LlamaIndex helpers
# may be invoked while the script's own event loop is already running.
nest_asyncio.apply()
|
||||
|
||||
from llama_index.core.prompts import (
|
||||
ChatMessage,
|
||||
ChatPromptTemplate,
|
||||
MessageRole
|
||||
)
|
||||
|
||||
# System prompt (in Chinese) for the correctness evaluation.
# It tells the model to grade a generated answer against the user query (and
# an optional reference answer) with a single score from 1 to 5, written alone
# on one line, followed by the justification on a separate line.
# The literal must contain only the prompt text: any stray table/diff residue
# (lines consisting of `|` characters) would be sent to the model verbatim.
DEFAULT_SYSTEM_TEMPLATE = """
您是一个问答聊天机器人的专业评估系统。

您将获得以下信息:

- 用户查询,
- 生成的回答,

也可能提供一个参考答案作为评估的依据。

您的任务是判断生成回答的相关性和正确性。
输出一个代表全面评估的单一分数。
您必须在一行中仅返回该分数。
不要以其他任何格式返回答案。
在单独的一行提供给定分数的理由。

请遵循以下评分指南:

- 您的分数必须在1到5之间,其中1是最差,5是最好的。
-如果生成的回答与用户查询不相关,您应该给出1分。
-如果生成的回答相关但包含错误,您应该给出2到3分之间的分数。
-如果生成的回答相关且完全正确,您应该给出4到5分之间的分数。
示例响应:
4.0
生成的回答与参考答案的指标完全相同,但不够精炼。

"""
|
||||
|
||||
# User-message template for the evaluation prompt.
# It exposes three format placeholders: {query}, {reference_answer} and
# {generated_answer}. The literal must contain only the three markdown
# sections; stray `|` residue lines would end up inside the prompt.
DEFAULT_USER_TEMPLATE = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
"""
|
||||
|
||||
# Two-message chat prompt for the evaluation: the system message carries the
# grading instructions, the user message carries the query / reference answer /
# generated answer placeholders.
DEFAULT_EVAL_TEMPLATE = ChatPromptTemplate(
    message_templates=[
        ChatMessage(content=DEFAULT_SYSTEM_TEMPLATE, role=MessageRole.SYSTEM),
        ChatMessage(content=DEFAULT_USER_TEMPLATE, role=MessageRole.USER),
    ],
)
|
||||
|
||||
|
||||
# Initialise the correctness evaluator.
# DEFAULT_EVAL_TEMPLATE is passed explicitly; without it the evaluator falls
# back to the library's built-in prompt and the Chinese grading prompt defined
# in this module is never used.
# No `llm` argument is given, so the evaluator uses the globally configured
# model (presumably the one set up by init_settings() -- confirm).
corr_evaluator_qwen = CorrectnessEvaluator(eval_template=DEFAULT_EVAL_TEMPLATE)
|
||||
|
||||
|
||||
# Coroutine that grades all answers of one section and appends the result
# to the output file.
async def evaluate_query(title: str, contents: list, output_file: str) -> None:
    """Grade every entry of ``contents`` and append the results to a file.

    Args:
        title: Section name; used as the single key of the emitted JSON object.
        contents: Sequence of indexable records where item ``[0]`` is the
            question, ``[1]`` the reference answer and ``[2]`` the generated
            response to be graded.
        output_file: Path of the file the results are appended to (UTF-8).

    The file is opened in append mode and each call writes one pretty-printed
    JSON object ``{title: [records...]}`` followed by ``",\\n"``. The file as a
    whole is therefore a comma-separated sequence of objects, not a single
    JSON document, and repeated runs keep adding to it.
    """
    results = []
    # enumerate() supplies the running number. Previously a counter was set to
    # 0 and never incremented, so every record was written with number 0.
    # Numbering starts at 1, like the enumerate(..., start=1) used by the
    # caller in this module.
    for index, content in enumerate(contents, start=1):
        question, answer, response = content[0], content[1], content[2]

        # Await the async API instead of calling the blocking `evaluate`
        # wrapper from inside a running event loop.
        result = await corr_evaluator_qwen.aevaluate(
            query=question,
            response=response,
            reference=answer,
        )

        results.append(
            {
                "编号": index,
                "问题": question,
                "答案": answer,
                "回答": result.response,
                "得分(1~5)": result.score,
                "评价": result.feedback,
            }
        )

    outInfo = {str(title): results}
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(outInfo, ensure_ascii=False, indent=4))
        f.write(',\n')
|
||||
|
||||
# Driver coroutine for a single answers file.
async def excute(filePath:str,outFilePath:str):
    """Evaluate every section of one answers file.

    ``filePath`` is a UTF-8 JSON file mapping a section title to a list of
    records, each providing the keys '问题' (question), '答案' (reference
    answer) and '回答' (generated response). For each section the records are
    converted to ``(question, answer, response)`` tuples and handed to
    ``evaluate_query`` together with ``outFilePath``.
    """
    with open(filePath, 'r', encoding='utf-8') as source:
        sections: dict = json.load(source)

    for title, items in sections.items():
        triples = [
            (entry['问题'], entry['答案'], entry['回答'])
            for entry in items
        ]
        await evaluate_query(title, triples, outFilePath)
|
||||
|
||||
async def main():
    """Evaluate every ``*.json`` file found in ``<cwd>/unit_test/Answers``.

    For each answers file a result file with the same name is produced in
    ``<cwd>/unit_test/Evaluate`` (created on demand). Files are processed one
    after another.

    Raises:
        OSError: if the answers directory cannot be listed (e.g. it does not
            exist); the error from ``os.listdir`` is not caught here.
    """
    # Join the path components separately: a hard-coded backslash is only a
    # separator on Windows and would be part of the directory name elsewhere.
    base_dir = os.getcwd()
    ans_Dir = os.path.join(base_dir, 'unit_test', 'Answers')
    eva_Dir = os.path.join(base_dir, 'unit_test', 'Evaluate')

    ans_files = [name for name in os.listdir(ans_Dir) if name.endswith('.json')]
    if not ans_files:
        # Nothing to evaluate; do not create the output directory either.
        return

    # Creating the output directory once is enough for all files.
    os.makedirs(eva_Dir, exist_ok=True)
    for ans_file in ans_files:
        filePath = os.path.join(ans_Dir, ans_file)
        output_file_path = os.path.join(eva_Dir, ans_file)
        await excute(filePath, output_file_path)
|
||||
|
||||
# Run the main coroutine
|
||||
# NOTE(review): there is no `if __name__ == "__main__":` guard, so merely
# importing this module also starts the complete evaluation run.
asyncio.run(main())
|
||||
Reference in New Issue
Block a user