Files
QueryRewrite/rag2_0/dify/GenerateSoftwareWikiLibrary.py

65 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
import re
import sys
from dotenv import load_dotenv
load_dotenv()
from rag2_0.dify.dify_client import DifyApi
# Dataset name (as reported by the Dify API) -> software name used as the key
# of soft_wiki_file_name. Datasets whose name is not listed here are ignored.
soft_name_map = dict(
    (
        ("配网造价软件知识(new)", "配网计价通D3软件"),
        ("西藏造价软件知识(new)", "西藏计价通Z1软件"),
        ("储能C1计价通软件知识(new)", "储能计价通C1软件"),
        ("技改检修工程计价通T1软件知识(new)", "技改检修工程计价通T1软件"),
        ("技改检修清单计价通T1软件知识(new)", "技改检修清单计价通T1软件"),
        ("电力建设计价通(2018)软件知识(new)", "电力建设计价通软件"),
        ("下载安装注册(new)", "下载安装注册"),
    )
)
# Software name -> [output txt file name, list of wiki titles].
# The title list starts empty and is filled in place by get_soft_wiki_titles;
# every entry gets its own list object.
soft_wiki_file_name = {
    software: [txt_name, []]
    for software, txt_name in (
        ("配网计价通D3软件", "配网计价通D3软件.txt"),
        ("西藏计价通Z1软件", "西藏计价通Z1软件.txt"),
        ("储能计价通C1软件", "储能计价通C1软件.txt"),
        ("技改检修工程计价通T1软件", "技改检修工程计价通T1软件.txt"),
        ("技改检修清单计价通T1软件", "技改检修清单计价通T1软件.txt"),
        ("电力建设计价通软件", "电力建设计价通软件.txt"),
        ("下载安装注册", "下载安装注册.txt"),
    )
}
def get_soft_wiki_titles(dify_api, soft_name_map, soft_wiki_file_name):
    """Collect the wiki title list for each software.

    For every dataset whose name is a key of ``soft_name_map``, the names of
    its documents are reduced to a title (last ``/``-separated component,
    minus one leading parenthesised tag) and appended, without duplicates and
    in first-seen order, to the title list of the mapped software.

    Args:
        dify_api: Object providing ``get_all_dataset_list()`` (iterable of
            dicts with ``"name"`` and ``"id"``) and
            ``get_documents(dataset_id=...)`` (mapping whose values are dicts
            with a ``"name"`` entry).
        soft_name_map: Mapping of dataset name to software name.
        soft_wiki_file_name: Mapping of software name to
            ``[txt_file_name, wiki_titles]``; ``wiki_titles`` is extended
            in place.

    Returns:
        The same ``soft_wiki_file_name`` object that was passed in.

    Raises:
        KeyError: If a mapped software name has no entry in
            ``soft_wiki_file_name``.
    """
    # One leading tag wrapped in ASCII parentheses or in full-width
    # parentheses (U+FF08 / U+FF09, written as escapes so the source stays
    # free of look-alike characters). The previous pattern started with the
    # alternative ``^.*?``, which matches the empty string and therefore
    # prevented the parenthesised alternative from ever being tried.
    leading_tag = re.compile(r'^(?:\(.*?\)|\uff08.*?\uff09)')

    for dataset in dify_api.get_all_dataset_list():
        dataset_name = dataset["name"]
        if dataset_name not in soft_name_map:
            continue
        documents = dify_api.get_documents(dataset_id=dataset["id"])
        titles = soft_wiki_file_name[soft_name_map[dataset_name]][1]
        for doc_info in documents.values():
            # Document names may carry a path; only the last component is used.
            wiki_name = doc_info["name"].split("/")[-1]
            wiki_title = leading_tag.sub('', wiki_name)
            if wiki_title not in titles:
                titles.append(wiki_title)
    return soft_wiki_file_name
def save_wiki_titles(soft_wiki_file_name, output_dir="data/wiki_data"):
    """Save each software's wiki title list to its txt file, one title per line.

    Args:
        soft_wiki_file_name: Mapping of software name to a two-item sequence
            ``(txt_file_name, wiki_titles)``.
        output_dir: Directory that receives the files. It is created
            (including parents) when missing. An empty string means the
            current working directory.

    Each file is opened in ``"w"`` mode with UTF-8 encoding, so an existing
    file of the same name is overwritten. A summary line is printed per file.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a named directory.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for soft_name, (txt_file_name, wiki_titles) in soft_wiki_file_name.items():
        output_path = os.path.join(output_dir, txt_file_name)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(title + "\n" for title in wiki_titles)
        print(f"已保存 {soft_name} 的wiki标题列表到 {output_path},共 {len(wiki_titles)} 条")
def main():
    """Collect the wiki titles through a DifyApi client and save them to txt files.

    Uses the module-level ``soft_name_map`` and ``soft_wiki_file_name`` tables
    and the default output directory of ``save_wiki_titles``.
    """
    client = DifyApi()
    collected = get_soft_wiki_titles(client, soft_name_map, soft_wiki_file_name)
    save_wiki_titles(collected)
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()