247 lines
11 KiB
Python
247 lines
11 KiB
Python
import os
|
|
import sys
|
|
import datetime
|
|
import logging
|
|
import concurrent.futures
|
|
import threading
|
|
|
|
# 配置日志
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|
datefmt='%Y-%m-%d %H:%M:%S',
|
|
handlers=[
|
|
logging.StreamHandler(),
|
|
logging.FileHandler(f'data/logs/WorkorderToDify_{datetime.datetime.now().strftime("%Y%m%d")}.log', encoding='utf-8')
|
|
]
|
|
)
|
|
|
|
sys.path.append(os.getcwd())
|
|
import rag2_0.dify.dify_client.dify_api as DifyApi
|
|
import pandas as pd
|
|
|
|
|
|
class WorkorderToDify:
|
|
def __init__(self, excel_path="data/excel/2025.1-6月工单(人工整理后).xlsx"):
|
|
self.pd_data = pd.read_excel(excel_path)
|
|
self.dify_api = DifyApi.DifyApi()
|
|
self.dataset_ids = {}
|
|
self.skill_group_data = {}
|
|
self.metadata_ids = {} # 用于缓存元数据ID
|
|
|
|
# 初始化各技能组的数据集ID
|
|
self.dataset_ids["博微配网计价通D3"] = self.dify_api.get_or_create_dataset_by_name("配网工单数据")
|
|
self.dataset_ids["博微电力建设计价通软件"] = self.dify_api.get_or_create_dataset_by_name("主网工单数据")
|
|
self.dataset_ids["博微技改检修计价通T1软件"] = self.dify_api.get_or_create_dataset_by_name("技改工单数据")
|
|
self.dataset_ids["新能源系列"] = self.dify_api.get_or_create_dataset_by_name("储能工单数据")
|
|
self.dataset_ids["博微西藏计价通Z1"] = self.dify_api.get_or_create_dataset_by_name("西藏工单数据")
|
|
self.dataset_ids["通用"] = self.dify_api.get_or_create_dataset_by_name("通用工单数据")
|
|
|
|
def check_and_create_metadata(self):
|
|
"""检查并创建workorder_time元数据,同时缓存元数据ID"""
|
|
for skill_group, dataset_id in self.dataset_ids.items():
|
|
# 获取当前数据集的所有元数据
|
|
metadata_info = self.dify_api.get_dataset_metadata(dataset_id)
|
|
metadata_list = metadata_info['doc_metadata']
|
|
|
|
# 查找workorder_time元数据
|
|
workorder_time_id = None
|
|
has_workorder_time = False
|
|
|
|
for metadata in metadata_list:
|
|
if metadata.get("name") == "workorder_time":
|
|
has_workorder_time = True
|
|
workorder_time_id = metadata.get("id")
|
|
break
|
|
|
|
# 如果不存在,则创建
|
|
if not has_workorder_time:
|
|
metadata = self.dify_api.create_dataset_metadata(dataset_id, "string", "workorder_time")
|
|
if metadata and "id" in metadata:
|
|
workorder_time_id = metadata["id"]
|
|
|
|
# 缓存元数据ID
|
|
if workorder_time_id:
|
|
self.metadata_ids[skill_group] = {"workorder_time": workorder_time_id}
|
|
else:
|
|
logging.error(f"无法获取或创建 {skill_group} 的 workorder_time 元数据ID")
|
|
|
|
def classify_workorders(self):
|
|
"""按技能组分类工单"""
|
|
logging.info("开始按技能组分类工单")
|
|
total_count = len(self.pd_data)
|
|
processed_count = 0
|
|
error_count = 0
|
|
|
|
for index, row in self.pd_data.iterrows():
|
|
try:
|
|
query = row["客户问题"]
|
|
answer = row["解决方案"]
|
|
skill_group = row["产品线"]
|
|
if skill_group=="" or pd.isna(skill_group):
|
|
skill_group="通用"
|
|
create_time = row["创建时间"] # 2025-07-22 15:00:35
|
|
if isinstance(create_time, str):
|
|
try:
|
|
# 尝试原始格式 %Y-%m-%d %H:%M:%S
|
|
create_time = datetime.datetime.strptime(create_time, "%Y-%m-%d %H:%M:%S")
|
|
except ValueError:
|
|
try:
|
|
# 尝试格式 %Y/%m/%d %H:%M
|
|
create_time = datetime.datetime.strptime(create_time, "%Y/%m/%d %H:%M")
|
|
except ValueError:
|
|
# 如果仍然失败,记录错误并尝试其他可能的格式
|
|
try:
|
|
# 尝试格式 %Y/%m/%d
|
|
create_time = datetime.datetime.strptime(create_time, "%Y/%m/%d")
|
|
except ValueError:
|
|
raise ValueError(f"创建时间格式错误: {create_time}")
|
|
|
|
if not isinstance(create_time, datetime.datetime):
|
|
raise ValueError(f"创建时间格式错误: {create_time}")
|
|
conversation_id = row["会话id"]
|
|
|
|
content = f"问题:{query}\n回答:{answer}"
|
|
|
|
if skill_group not in self.skill_group_data:
|
|
self.skill_group_data[skill_group] = []
|
|
|
|
self.skill_group_data[skill_group].append({
|
|
"document_name": query,
|
|
"content": content,
|
|
"create_time": create_time,
|
|
"conversation_id": conversation_id
|
|
})
|
|
|
|
processed_count += 1
|
|
if processed_count % 100 == 0:
|
|
logging.info(f"已处理 {processed_count}/{total_count} 条工单")
|
|
|
|
except Exception as e:
|
|
error_count += 1
|
|
logging.error(f"处理第 {index} 行工单时出错: {str(e)}")
|
|
logging.error(f"错误工单内容: {row.to_dict()}")
|
|
continue
|
|
|
|
logging.info(f"工单分类完成,共处理 {processed_count} 条,错误 {error_count} 条")
|
|
for skill_group, data in self.skill_group_data.items():
|
|
logging.info(f"技能组 {skill_group}: {len(data)} 条工单")
|
|
|
|
def deduplicate_workorders(self):
|
|
"""对每个技能组内的工单进行去重,保留时间最新的"""
|
|
logging.info("开始对工单进行去重处理")
|
|
for skill_group in self.skill_group_data:
|
|
logging.info(f"处理技能组: {skill_group}, 去重前工单数量: {len(self.skill_group_data[skill_group])}")
|
|
# 创建一个临时字典,用于存储每个客户问题的最新工单
|
|
latest_workorders = {}
|
|
|
|
for workorder in self.skill_group_data[skill_group]:
|
|
query = workorder["document_name"]
|
|
create_time = workorder["create_time"]
|
|
|
|
# 如果该问题尚未在字典中或当前工单的时间比已有的更新
|
|
if query not in latest_workorders or create_time > latest_workorders[query]["datetime"]:
|
|
latest_workorders[query] = {
|
|
"workorder": workorder,
|
|
"datetime": create_time
|
|
}
|
|
logging.debug(f"更新工单: {query}, 时间: {create_time}")
|
|
|
|
# 用去重后的工单列表替换原列表
|
|
self.skill_group_data[skill_group] = [item["workorder"] for item in latest_workorders.values()]
|
|
logging.info(f"技能组 {skill_group} 去重完成, 去重后工单数量: {len(self.skill_group_data[skill_group])}")
|
|
|
|
logging.info("所有技能组工单去重处理完成")
|
|
|
|
def upload_workorders(self):
|
|
"""上传每个技能组的工单作为独立文档"""
|
|
logging.info("开始上传工单文档")
|
|
total_docs = sum(len(docs) for docs in self.skill_group_data.values())
|
|
processed_count = 0
|
|
error_count = 0
|
|
|
|
# 创建线程锁,用于保护计数器更新
|
|
lock = threading.Lock()
|
|
|
|
# 创建一个线程池
|
|
max_workers = min(20, os.cpu_count() * 5) # 最多20个线程,或者CPU核心数的5倍
|
|
logging.info(f"创建线程池,最大线程数: {max_workers}")
|
|
|
|
def upload_document(args):
|
|
skill_group, doc, dataset_id, workorder_time_id = args
|
|
nonlocal processed_count, error_count
|
|
|
|
try:
|
|
document_id = self.dify_api.get_document_id(dataset_id=dataset_id, document_name=doc["document_name"])
|
|
if document_id:
|
|
# 如果文档已存在,先删除
|
|
self.dify_api.del_document_by_id(dataset_id=dataset_id, document_id=document_id)
|
|
|
|
# 上传文档
|
|
document_id = self.dify_api.upload_text_to_document(
|
|
text_name=doc["document_name"],
|
|
text=doc["content"],
|
|
dataset_id=dataset_id
|
|
)
|
|
create_time_str = doc["create_time"].strftime("%Y-%m-%d %H:%M")
|
|
# 上传成功后,添加创建时间作为元数据
|
|
if document_id:
|
|
metadata_list = [
|
|
{
|
|
"id": workorder_time_id,
|
|
"name": "workorder_time",
|
|
"value": create_time_str
|
|
}
|
|
]
|
|
self.dify_api.add_document_metadata(dataset_id, document_id, metadata_list)
|
|
|
|
with lock:
|
|
processed_count += 1
|
|
if processed_count % 10 == 0 or processed_count == total_docs:
|
|
logging.info(f"已上传 {processed_count}/{total_docs} 个文档")
|
|
|
|
return True
|
|
except Exception as e:
|
|
with lock:
|
|
error_count += 1
|
|
logging.error(f"上传文档 '{doc['document_name']}' 失败: {str(e)}")
|
|
return False
|
|
|
|
# 准备上传任务列表
|
|
upload_tasks = []
|
|
for skill_group, documents in self.skill_group_data.items():
|
|
if skill_group not in self.dataset_ids:
|
|
logging.warning(f"技能组 '{skill_group}' 没有对应的数据集ID,跳过上传")
|
|
continue
|
|
|
|
dataset_id = self.dataset_ids[skill_group]
|
|
|
|
# 检查是否有缓存的元数据ID
|
|
if skill_group not in self.metadata_ids or "workorder_time" not in self.metadata_ids[skill_group]:
|
|
logging.error(f"未找到 {skill_group} 的 workorder_time 元数据ID,跳过上传")
|
|
continue
|
|
|
|
workorder_time_id = self.metadata_ids[skill_group]["workorder_time"]
|
|
|
|
# 为每个工单创建独立文档
|
|
for doc in documents:
|
|
upload_tasks.append((skill_group, doc, dataset_id, workorder_time_id))
|
|
|
|
# 使用线程池并发执行上传任务
|
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
results = list(executor.map(upload_document, upload_tasks))
|
|
|
|
success_count = sum(1 for result in results if result)
|
|
logging.info(f"工单上传完成,总计: {total_docs},成功: {success_count},失败: {error_count}")
|
|
|
|
def process(self):
|
|
"""执行完整的工单处理流程"""
|
|
self.check_and_create_metadata()
|
|
self.classify_workorders()
|
|
self.deduplicate_workorders()
|
|
self.upload_workorders()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
processor = WorkorderToDify()
|
|
processor.process() |