# Script setup: load environment variables, import dependencies, and
# initialise the application's settings and observability hooks.
#
# NOTE: the import order below is intentional-looking and is preserved as-is;
# some statements with side effects run between the import groups.

from dotenv import load_dotenv

# Load variables from a .env file before the app modules are imported.
# Presumably app.settings / app.observability read environment variables;
# confirm before moving this call below those imports.
load_dotenv()

from app.observability import init_observability
from app.settings import init_settings

import nest_asyncio

# Patch asyncio so an event loop can be re-entered while one is already
# running. NOTE(review): likely needed by the async calls made inside the
# question generator — confirm.
nest_asyncio.apply()

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader

from llama_index.core.evaluation import DatasetGenerator

import json

# Initialise project settings first, then observability, in the original order.
init_settings()
init_observability()

# Load every document found under the test-data directory.
# A forward slash is used as the separator: the previous "backend\data-test"
# relied on "\d" being left alone as an invalid escape sequence (a warning on
# current Python versions) and only resolved to a directory on Windows.
documents = SimpleDirectoryReader("backend/data-test").load_data()

# NOTE(review): this splitter is not referenced anywhere else in the visible
# script; it is kept in case other code relies on the module-level name.
splitter = SentenceSplitter(chunk_size=512)

# Number of question/answer pairs requested for each chunk. Used both in the
# prompt text and in the generator's configuration so the two cannot drift.
NUM_QUESTIONS_PER_CHUNK = 5

# Custom generation prompt (in Chinese): asks the model, acting as a project
# manager for electric-power cost engineering, to produce question/answer
# pairs grounded in the supplied context.
#
# The text contains a "{num_questions_per_chunk}" placeholder. It must be
# filled in here: a plain string literal is passed through unchanged, so the
# model would otherwise receive the raw placeholder instead of a number.
quest_prompt = (
    "你是一个电力造价工程相关的项目经理,现在给你一些上下文信息,"
    "你需要根据现有的上下文信息,来生成{num_questions_per_chunk}个电力造价工程相关的问题和对应的回答,"
    "问题的实例应该是这种类型的:'人工费的费率是多少?,费率是100','前期工作管理费用的金额是多少?,金额是0',"
    "这种类似的问题和答案,生成的问题和答案必须一一对应,要符合文件里的内容,不要生成一些无关的问题,不要生成一些重复的问题,"
    "不要生成一些过于简单的问题,不要生成一些过于复杂的问题。"
).format(num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK)

# Build the dataset generator over the loaded documents with the custom prompt.
question_generator = DatasetGenerator.from_documents(
    documents=documents,
    question_gen_query=quest_prompt,
    num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK,  # questions per chunk
)

# Ask the generator for questions. The argument 5 is passed positionally;
# NOTE(review): presumably it caps the total number of generated items — confirm.
eval_questions = question_generator.generate_questions_from_nodes(5)

# Convert each generated "question?,answer" string into a
# {"question": ..., "answer": ...} record ready for JSON serialisation.
qa_pairs = []
for qa in eval_questions:
    # The prompt's examples use the full-width question mark "?", while a
    # model may also answer with the ASCII "?". Split on whichever comes first
    # so neither form is discarded.
    mark_positions = [pos for pos in (qa.find("?"), qa.find("?")) if pos != -1]
    if not mark_positions:
        # No question mark at all: the item cannot be split, so report it.
        print(f"无法处理的问题和答案: {qa}")
        continue

    cut = min(mark_positions)
    question = qa[:cut]
    # The requested format separates question and answer with a comma right
    # after the question mark; drop that separator from the answer text.
    answer = qa[cut + 1:].strip().lstrip(",,")
    qa_pairs.append({
        "question": question.strip(),
        "answer": answer.strip(),
    })

# Save the collected pairs as a JSON file. ensure_ascii=False keeps the
# Chinese text readable in the output instead of \uXXXX escapes.
with open("backend/unit_test/questions_and_answers.json", "w", encoding="utf-8") as f:
    json.dump(qa_pairs, f, ensure_ascii=False, indent=4)