优化DifyCompareTest和WorkorderToDify模块,调整日志记录格式,修复API密钥获取方式,增强工单处理流程,添加元数据管理功能,改进并发上传逻辑,更新文档处理方式。
This commit is contained in:
+237
-40
@@ -1,50 +1,247 @@
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import logging
|
||||
import concurrent.futures
|
||||
import threading
|
||||
|
||||
# Configure logging: every record goes to the console and to a per-day log file.
# logging.FileHandler does not create missing parent directories, so make sure
# the log directory exists before the handler tries to open its file.
os.makedirs("data/logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(
            f'data/logs/WorkorderToDify_{datetime.datetime.now().strftime("%Y%m%d")}.log',
            encoding='utf-8',
        ),
    ],
)
|
||||
|
||||
sys.path.append(os.getcwd())
|
||||
import rag2_0.dify.dify_client.dify_api as DifyApi
|
||||
|
||||
import pandas as pd
|
||||
pd_data = pd.read_excel("data/excel/工单汇总(给AI)_工单拆分.xlsx")
|
||||
|
||||
|
||||
dify_api = DifyApi.DifyApi()
|
||||
peiwang_dataset_id = dify_api.get_or_create_dataset_by_name("配网工单数据")
|
||||
zhuwang_dataset_id = dify_api.get_or_create_dataset_by_name("主网工单数据")
|
||||
jianga_dataset_id = dify_api.get_or_create_dataset_by_name("技改工单数据")
|
||||
chuneng_dataset_id = dify_api.get_or_create_dataset_by_name("储能工单数据")
|
||||
xizang_dataset_id = dify_api.get_or_create_dataset_by_name("西藏工单数据")
|
||||
|
||||
|
||||
soft_segments_list={}
|
||||
for index, row in pd_data.iterrows():
|
||||
query = row["客户问题"]
|
||||
answer = row["解决方案"]
|
||||
skill_group = row["技能组"]
|
||||
class WorkorderToDify:
    def __init__(self, excel_path="data/excel/2025.1-6月工单(人工整理后).xlsx"):
        """Load the work-order spreadsheet and resolve one Dify dataset per product line.

        Args:
            excel_path: Path of the Excel workbook read with ``pd.read_excel``.
        """
        self.pd_data = pd.read_excel(excel_path)
        self.dify_api = DifyApi.DifyApi()
        self.dataset_ids = {}
        self.skill_group_data = {}
        self.metadata_ids = {}  # cache of metadata IDs, keyed by product line

        # Product line -> name of the dataset that stores its work orders.
        # Datasets are looked up (or created) in this order.
        dataset_names = (
            ("博微配网计价通D3", "配网工单数据"),
            ("博微电力建设计价通软件", "主网工单数据"),
            ("博微技改检修计价通T1软件", "技改工单数据"),
            ("新能源系列", "储能工单数据"),
            ("博微西藏计价通Z1", "西藏工单数据"),
            ("通用", "通用工单数据"),
        )
        for product_line, dataset_name in dataset_names:
            self.dataset_ids[product_line] = self.dify_api.get_or_create_dataset_by_name(dataset_name)
|
||||
|
||||
content = f"问题:{query}\n回答:{answer}"
|
||||
if skill_group not in soft_segments_list:
|
||||
soft_segments_list[skill_group]=[]
|
||||
soft_segments_list[skill_group].append({
|
||||
"content": str(content),
|
||||
"answer": "",
|
||||
"keywords": []
|
||||
})
|
||||
def check_and_create_metadata(self):
    """Ensure each dataset has a ``workorder_time`` metadata field and cache its ID.

    For every entry of ``self.dataset_ids`` the dataset's metadata list is
    fetched. If no field named ``workorder_time`` exists, one of type
    ``"string"`` is created. The resulting ID is stored in
    ``self.metadata_ids[skill_group]["workorder_time"]``; when no ID can be
    obtained an error is logged and nothing is cached for that group.
    """
    field_name = "workorder_time"
    for group_name, ds_id in self.dataset_ids.items():
        # All metadata fields currently defined on this dataset.
        dataset_info = self.dify_api.get_dataset_metadata(ds_id)
        existing_fields = dataset_info['doc_metadata']

        # First field with the expected name, or None when there is none.
        matched = next(
            (field for field in existing_fields if field.get("name") == field_name),
            None,
        )

        if matched is not None:
            field_id = matched.get("id")
        else:
            # The field does not exist yet, so create it.
            created = self.dify_api.create_dataset_metadata(ds_id, "string", field_name)
            field_id = created["id"] if created and "id" in created else None

        if not field_id:
            logging.error(f"无法获取或创建 {group_name} 的 workorder_time 元数据ID")
            continue

        # Cache the ID for the upload step.
        self.metadata_ids[group_name] = {"workorder_time": field_id}
|
||||
|
||||
def classify_workorders(self):
    """Group the rows of ``self.pd_data`` by product line.

    Each valid row is appended to ``self.skill_group_data[<product line>]`` as
    a dict with the keys ``document_name`` (the customer question),
    ``content`` (question plus answer), ``create_time`` (a datetime) and
    ``conversation_id``. Rows whose product line is empty or missing are put
    in the "通用" group.

    A row is logged and skipped, without aborting the run, when a required
    column is missing or its creation time is empty or cannot be parsed.
    """
    logging.info("开始按技能组分类工单")
    total_count = len(self.pd_data)
    processed_count = 0
    error_count = 0

    # Timestamp layouts accepted for string cells, tried in this order.
    time_formats = ("%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M", "%Y/%m/%d")

    for index, row in self.pd_data.iterrows():
        try:
            query = row["客户问题"]
            answer = row["解决方案"]
            skill_group = row["产品线"]
            if skill_group == "" or pd.isna(skill_group):
                skill_group = "通用"

            create_time = row["创建时间"]  # e.g. 2025-07-22 15:00:35
            if isinstance(create_time, str):
                raw_time = create_time.strip()
                for time_format in time_formats:
                    try:
                        create_time = datetime.datetime.strptime(raw_time, time_format)
                        break
                    except ValueError:
                        continue
                else:
                    # None of the known layouts matched.
                    raise ValueError(f"创建时间格式错误: {raw_time}")

            # pd.NaT (an empty timestamp cell) is an instance of datetime, so
            # it must be rejected explicitly; otherwise it only fails later
            # when the timestamp is formatted for upload.
            if not isinstance(create_time, datetime.datetime) or pd.isna(create_time):
                raise ValueError(f"创建时间格式错误: {create_time}")

            conversation_id = row["会话id"]
            content = f"问题:{query}\n回答:{answer}"

            self.skill_group_data.setdefault(skill_group, []).append({
                "document_name": query,
                "content": content,
                "create_time": create_time,
                "conversation_id": conversation_id,
            })

            processed_count += 1
            if processed_count % 100 == 0:
                logging.info(f"已处理 {processed_count}/{total_count} 条工单")

        except Exception as e:
            # Best effort: one bad row must not stop the remaining rows.
            error_count += 1
            logging.error(f"处理第 {index} 行工单时出错: {str(e)}")
            logging.error(f"错误工单内容: {row.to_dict()}")

    logging.info(f"工单分类完成,共处理 {processed_count} 条,错误 {error_count} 条")
    for skill_group, data in self.skill_group_data.items():
        logging.info(f"技能组 {skill_group}: {len(data)} 条工单")
|
||||
|
||||
def deduplicate_workorders(self):
    """Keep only the newest work order per customer question within each group.

    Work orders in ``self.skill_group_data`` that share a ``document_name``
    are collapsed to the one with the greatest ``create_time``; on a tie the
    earlier entry wins. Each group's list is replaced by the surviving
    entries, ordered by the first appearance of their question.
    """
    logging.info("开始对工单进行去重处理")
    for group_name, workorders in self.skill_group_data.items():
        logging.info(f"处理技能组: {group_name}, 去重前工单数量: {len(workorders)}")

        # Question text -> newest work order seen so far for that question.
        newest_by_question = {}
        for candidate in workorders:
            question = candidate["document_name"]
            stamp = candidate["create_time"]
            kept = newest_by_question.get(question)
            if kept is not None and not stamp > kept["create_time"]:
                # An entry that is at least as recent is already stored.
                continue
            newest_by_question[question] = candidate
            logging.debug(f"更新工单: {question}, 时间: {stamp}")

        # Replace the group's list with the deduplicated entries.
        survivors = list(newest_by_question.values())
        self.skill_group_data[group_name] = survivors
        logging.info(f"技能组 {group_name} 去重完成, 去重后工单数量: {len(survivors)}")

    logging.info("所有技能组工单去重处理完成")
|
||||
|
||||
def upload_workorders(self):
    """Upload every classified work order as its own document, concurrently.

    For each group in ``self.skill_group_data`` that has both a dataset ID and
    a cached ``workorder_time`` metadata ID, every work order is uploaded as a
    separate document named after its question. An existing document with the
    same name is deleted first, and the creation time is attached as
    ``workorder_time`` metadata. Groups without a dataset ID or metadata ID
    are logged and skipped. Failures of individual documents are logged and
    counted; they do not stop the remaining uploads.
    """
    logging.info("开始上传工单文档")
    total_docs = sum(len(docs) for docs in self.skill_group_data.values())
    processed_count = 0
    error_count = 0

    # Protects the two counters, which are updated from worker threads.
    lock = threading.Lock()

    # At most 20 threads, or five per CPU core. os.cpu_count() returns None
    # when the core count cannot be determined, so fall back to one core.
    max_workers = min(20, (os.cpu_count() or 1) * 5)
    logging.info(f"创建线程池,最大线程数: {max_workers}")

    def upload_document(args):
        """Upload one work order; return True on success, False on failure."""
        skill_group, doc, dataset_id, workorder_time_id = args
        nonlocal processed_count, error_count

        try:
            # Format the timestamp first: if it is unusable, fail before the
            # existing document is deleted or a new one is created.
            create_time_str = doc["create_time"].strftime("%Y-%m-%d %H:%M")

            document_id = self.dify_api.get_document_id(dataset_id=dataset_id, document_name=doc["document_name"])
            if document_id:
                # Replace an existing document of the same name.
                self.dify_api.del_document_by_id(dataset_id=dataset_id, document_id=document_id)

            document_id = self.dify_api.upload_text_to_document(
                text_name=doc["document_name"],
                text=doc["content"],
                dataset_id=dataset_id
            )
            if not document_id:
                # No ID means nothing was stored; report it as a failure
                # instead of counting it as an uploaded document.
                raise RuntimeError("upload_text_to_document returned no document id")

            # Attach the creation time as metadata of the new document.
            metadata_list = [
                {
                    "id": workorder_time_id,
                    "name": "workorder_time",
                    "value": create_time_str
                }
            ]
            self.dify_api.add_document_metadata(dataset_id, document_id, metadata_list)

            with lock:
                processed_count += 1
                if processed_count % 10 == 0 or processed_count == total_docs:
                    logging.info(f"已上传 {processed_count}/{total_docs} 个文档")

            return True
        except Exception as e:
            with lock:
                error_count += 1
                logging.error(f"上传文档 '{doc['document_name']}' 失败: {str(e)}")
            return False

    # Build the list of upload tasks, one per work order.
    upload_tasks = []
    for skill_group, documents in self.skill_group_data.items():
        if skill_group not in self.dataset_ids:
            logging.warning(f"技能组 '{skill_group}' 没有对应的数据集ID,跳过上传")
            continue

        dataset_id = self.dataset_ids[skill_group]

        # The metadata ID must have been cached beforehand.
        if skill_group not in self.metadata_ids or "workorder_time" not in self.metadata_ids[skill_group]:
            logging.error(f"未找到 {skill_group} 的 workorder_time 元数据ID,跳过上传")
            continue

        workorder_time_id = self.metadata_ids[skill_group]["workorder_time"]

        for doc in documents:
            upload_tasks.append((skill_group, doc, dataset_id, workorder_time_id))

    # Run the uploads on the thread pool and wait for all of them.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(upload_document, upload_tasks))

    success_count = sum(1 for result in results if result)
    logging.info(f"工单上传完成,总计: {total_docs},成功: {success_count},失败: {error_count}")
|
||||
|
||||
def process(self):
    """Run the full pipeline: metadata check, classification, deduplication, upload."""
    pipeline = (
        self.check_and_create_metadata,
        self.classify_workorders,
        self.deduplicate_workorders,
        self.upload_workorders,
    )
    # Stages run strictly in this order; an exception in one stops the rest.
    for stage in pipeline:
        stage()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the whole pipeline on the default spreadsheet.
    WorkorderToDify().process()
|
||||
Reference in New Issue
Block a user