Files
zjdataai-app/backend/unit_test/question.py
T
2024-09-05 17:02:42 +08:00

59 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Script bootstrap. The imports and calls below are deliberately interleaved;
# keep this order unless the dependencies between them have been checked.

# Read the .env file first, before the `app.*` modules are imported.
# NOTE(review): presumably those modules (or init_settings) read environment
# variables such as API keys -- confirm before moving this call.
from dotenv import load_dotenv
load_dotenv()
from app.observability import init_observability
from app.settings import init_settings
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required by llama_index's internal asyncio usage
# during question generation -- confirm.
import nest_asyncio
nest_asyncio.apply()
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.core.evaluation import DatasetGenerator
import json
# Project-specific initialisation from app.settings / app.observability; both
# run before any document is loaded or any question is generated.
init_settings()
init_observability()
# Load every document found in the test-data directory (relative to the
# current working directory).
# The path uses a forward slash: the previous back-slash spelling contained an
# invalid string escape and could only resolve on Windows, whereas "/" is
# accepted on every platform and matches the output path used when the
# results are saved.
documents = SimpleDirectoryReader("backend/data-test").load_data()

# NOTE(review): `splitter` is not referenced anywhere in the visible script;
# it is kept so the module-level name stays available -- confirm whether it
# can be removed or should be passed to the generator.
splitter = SentenceSplitter(chunk_size=512)
# Number of question/answer pairs requested from the LLM for each chunk.
# Shared by the prompt text and the generator argument so they cannot drift.
NUM_QUESTIONS_PER_CHUNK = 5

# Generation prompt (Chinese). It casts the model as a project manager for
# electric-power cost engineering and asks for N question/answer pairs drawn
# strictly from the supplied context, in the form "question?,answer", without
# unrelated, duplicated, overly simple or overly complex questions.
#
# The placeholder is filled in here: a caller-supplied `question_gen_query`
# is handed to the LLM as-is, so leaving "{num_questions_per_chunk}" in the
# text would send the raw placeholder instead of the number.
# NOTE(review): based on DatasetGenerator.__init__ only interpolating its own
# default query -- confirm against the installed llama_index version.
quest_prompt = (
    "你是一个电力造价工程相关的项目经理,现在给你一些上下文信息,"
    "你需要根据现有的上下文信息,来生成{num_questions_per_chunk}个电力造价工程相关的问题和对应的回答,"
    "问题的实例应该是这种类型的:'人工费的费率是多少?,费率是100','前期工作管理费用的金额是多少?,金额是0',"
    "这种类似的问题和答案,生成的问题和答案必须一一对应,要符合文件里的内容,不要生成一些无关的问题,不要生成一些重复的问题,"
    "不要生成一些过于简单的问题,不要生成一些过于复杂的问题。"
).format(num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK)

# Build the generator over the loaded documents with the custom prompt.
question_generator = DatasetGenerator.from_documents(
    documents=documents,
    question_gen_query=quest_prompt,
    num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK,
)

# NOTE(review): the positional 5 is presumably llama_index's `num` argument
# (an upper bound on the total number of generated items) -- confirm.
eval_questions = question_generator.generate_questions_from_nodes(5)
# Convert the generated "question?,answer" lines into JSON-ready records.

# Characters that end the question part (full-width and ASCII question mark).
_QUESTION_MARKS = "??"
# Separator characters that may sit between the question mark and the answer.
_ANSWER_LEADING_CHARS = ",, \t"


def _split_question_answer(text):
    """Split one generated line into a ``(question, answer)`` tuple.

    The line is cut right after its first question mark, which follows the
    "question?,answer" shape requested in the generation prompt. The question
    keeps its question mark; separator commas and whitespace in front of the
    answer are dropped.

    Splitting on an empty separator string is not possible (``str.split``
    raises ``ValueError`` for it), so the question mark is used as the
    boundary instead.
    NOTE(review): if the generator emits a different delimiter, adjust
    ``_QUESTION_MARKS`` / ``_ANSWER_LEADING_CHARS`` accordingly.

    Args:
        text: One raw line produced by the question generator.

    Returns:
        ``(question, answer)`` with surrounding whitespace removed, or
        ``None`` when the line has no question mark or no answer text.
    """
    hits = [pos for pos in map(text.find, _QUESTION_MARKS) if pos != -1]
    if not hits:
        return None
    cut = min(hits) + 1
    question = text[:cut].strip()
    answer = text[cut:].lstrip(_ANSWER_LEADING_CHARS).strip()
    if not answer:
        return None
    return question, answer


qa_pairs = []
for qa in eval_questions:
    parsed = _split_question_answer(qa)
    if parsed is None:
        # Report lines that cannot be split instead of aborting the run.
        print(f"无法处理的问题和答案: {qa}")
        continue
    question, answer = parsed
    qa_pairs.append({"question": question, "answer": answer})
# Persist the collected pairs as pretty-printed UTF-8 JSON; non-ASCII text is
# written verbatim rather than as \u escapes.
output_path = "backend/unit_test/questions_and_answers.json"
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(qa_pairs, ensure_ascii=False, indent=4))