# Method of class WorkorderToDify (rag2_0/dify/WorkorderToDify.py), shown
# dedented because only this hunk of the file is visible here.
def deduplicate_workorders(self):
    """Merge work orders that share the same question within each skill group.

    For every skill group in ``self.skill_group_data`` the work orders are
    grouped by ``document_name``. The first work order seen for a question
    is the base record; the ``content`` of every later work order with the
    same question is appended to it, in encounter order, separated by
    ``"\\n\\n---\\n\\n"``. When a later duplicate has a greater
    ``create_time``, the base record's ``create_time`` and
    ``conversation_id`` are replaced with that duplicate's values.

    The list stored under each skill group is replaced by the merged list
    (first-occurrence order). The input work-order dicts are not modified;
    the resulting records are shallow copies.

    Raises:
        KeyError: if a work order lacks ``document_name``, ``create_time``
            or ``content``, or if a newer duplicate lacks
            ``conversation_id``.
    """
    logging.info("开始对工单进行去重处理")

    for skill_group in self.skill_group_data:
        logging.info(f"处理技能组: {skill_group}, 去重前工单数量: {len(self.skill_group_data[skill_group])}")

        # question -> {"workorder": base record, "datetime": newest
        # create_time seen so far, "contents": every content in order}
        merged_workorders = {}

        for workorder in self.skill_group_data[skill_group]:
            query = workorder["document_name"]
            create_time = workorder["create_time"]
            content = workorder["content"]

            entry = merged_workorders.get(query)
            if entry is None:
                # Copy up front so the later in-place update() never
                # touches the caller's original dict.
                merged_workorders[query] = {
                    "workorder": workorder.copy(),
                    "datetime": create_time,
                    "contents": [content],
                }
                continue

            # Question already seen: keep its content, and if this work
            # order is newer, carry over its time and conversation id.
            entry["contents"].append(content)
            if create_time > entry["datetime"]:
                entry["datetime"] = create_time
                entry["workorder"].update({
                    "create_time": create_time,
                    "conversation_id": workorder["conversation_id"],
                })

        # Build the merged list; only questions that actually had
        # duplicates get their content rewritten.
        result_workorders = []
        for data in merged_workorders.values():
            workorder = data["workorder"]
            if len(data["contents"]) > 1:
                workorder["content"] = "\n\n---\n\n".join(data["contents"])
            result_workorders.append(workorder)

        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over self.skill_group_data.
        self.skill_group_data[skill_group] = result_workorders
        logging.info(f"技能组 {skill_group} 去重完成, 去重后工单数量: {len(self.skill_group_data[skill_group])}")

    logging.info("所有技能组工单去重处理完成")