"""Collect wiki titles per software product from Dify datasets and save them as text files."""
import json
import os
import re
import sys

from dotenv import load_dotenv

# Environment variables are loaded before the project import below.
# NOTE(review): presumably the Dify client reads its settings from the
# environment -- confirm against rag2_0.dify.dify_client.
load_dotenv()
# The project package is resolved relative to the current working directory,
# so the path tweak has to run before the project import.
sys.path.append(os.getcwd())

from rag2_0.dify.dify_client import DifyApi  # noqa: E402

# Dify dataset name -> software display name.
soft_name_map = {
    "配网造价软件知识(new)": "配网计价通D3软件",
    "西藏造价软件知识(new)": "西藏计价通Z1软件",
    "储能C1计价通软件知识(new)": "储能计价通C1软件",
    "技改检修工程计价通T1软件知识(new)": "技改检修工程计价通T1软件",
    "技改检修清单计价通T1软件知识(new)": "技改检修清单计价通T1软件",
    "电力建设计价通(2018)软件知识(new)": "电力建设计价通软件",
    "下载安装注册(new)": "下载安装注册",
}

# Software display name -> [output txt file name, collected wiki titles].
soft_wiki_file_name = {
    "配网计价通D3软件": ["配网计价通D3软件.txt", []],
    "西藏计价通Z1软件": ["西藏计价通Z1软件.txt", []],
    "储能计价通C1软件": ["储能计价通C1软件.txt", []],
    "技改检修工程计价通T1软件": ["技改检修工程计价通T1软件.txt", []],
    "技改检修清单计价通T1软件": ["技改检修清单计价通T1软件.txt", []],
    "电力建设计价通软件": ["电力建设计价通软件.txt", []],
    "下载安装注册": ["下载安装注册.txt", []],
}

# Matches one leading bracketed prefix, in either full-width or ASCII
# parentheses. The previous pattern began with an ASCII capture group
# ``^(.*?)`` that always matched the empty string at position 0, so the
# substitution never removed anything.
_LEADING_BRACKET_PREFIX = re.compile(r"^（.*?）|^\(.*?\)")


def get_soft_wiki_titles(dify_api, soft_name_map, soft_wiki_file_name):
    """Collect the wiki title list of every mapped software product.

    For each dataset returned by ``dify_api.get_all_dataset_list()`` whose
    name is a key of ``soft_name_map``, the dataset's documents are fetched
    and a title is derived from each document name: the last ``/``-separated
    segment with one leading ``(...)`` / ``（...）`` prefix removed. Titles are
    appended, without duplicates and in first-seen order, to the list stored
    in ``soft_wiki_file_name[software_name][1]``.

    Args:
        dify_api: Client exposing ``get_all_dataset_list()`` and
            ``get_documents(dataset_id=...)``.
        soft_name_map: Mapping of dataset name to software name.
        soft_wiki_file_name: Mapping of software name to a two-item list
            ``[txt_file_name, wiki_titles]``. It is mutated in place.

    Returns:
        The same ``soft_wiki_file_name`` object, with its title lists filled.

    Raises:
        KeyError: If a mapped software name is missing from
            ``soft_wiki_file_name``, or a dataset/document record lacks the
            ``"name"`` / ``"id"`` keys read here.
    """
    for dataset in dify_api.get_all_dataset_list():
        dataset_name = dataset["name"]
        if dataset_name not in soft_name_map:
            continue

        documents = dify_api.get_documents(dataset_id=dataset["id"])
        wiki_titles = soft_wiki_file_name[soft_name_map[dataset_name]][1]
        # A set gives O(1) duplicate checks; the list preserves output order.
        seen_titles = set(wiki_titles)

        for doc_info in documents.values():
            wiki_name = doc_info["name"].split("/")[-1]
            wiki_title = _LEADING_BRACKET_PREFIX.sub("", wiki_name)
            if wiki_title not in seen_titles:
                seen_titles.add(wiki_title)
                wiki_titles.append(wiki_title)

    return soft_wiki_file_name


def save_wiki_titles(soft_wiki_file_name, output_dir="data/wiki_data"):
    """Write each software's wiki titles to its own text file.

    The output directory is created if it does not exist. Each file is
    opened in write mode (existing content is replaced), encoded as UTF-8,
    and receives one title per line. A summary line is printed per file.

    Args:
        soft_wiki_file_name: Mapping of software name to
            ``[txt_file_name, wiki_titles]``.
        output_dir: Directory that receives the text files.
    """
    os.makedirs(output_dir, exist_ok=True)
    for soft_name, (txt_file_name, wiki_titles) in soft_wiki_file_name.items():
        output_path = os.path.join(output_dir, txt_file_name)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(title + "\n" for title in wiki_titles)
        print(f"已保存 {soft_name} 的wiki标题列表到 {output_path},共 {len(wiki_titles)} 条")


def main():
    """Fetch the wiki titles from Dify and save them under the default output directory."""
    dify_api = DifyApi()
    wiki_titles = get_soft_wiki_titles(dify_api, soft_name_map, soft_wiki_file_name)
    save_wiki_titles(wiki_titles)


if __name__ == "__main__":
    main()