更新 pyproject.toml 和 uv.lock 文件，新增 ijson 和 langfuse 依赖；在对话到工单的分析流程中添加时间范围过滤功能（通过 start_date / end_date 参数按“创建时间”列过滤会话数据），移除仅处理前 2000 个会话的限制，并优化日志记录。新增获取工作流运行信息的方法，并更新意图识别 API 以支持使用 jieba 分词。

This commit is contained in:
2025-07-12 13:05:57 +08:00
parent fbe11486cb
commit a100a9a106
9 changed files with 226 additions and 31 deletions
+40 -16
View File
@@ -499,7 +499,7 @@ class DialogueToWorkorder:
return workorder_list
def analyze_conversation_data(self, conversation_excel_path, product_detail_excel_path, max_workers=10):
def analyze_conversation_data(self, conversation_excel_path, product_detail_excel_path, max_workers=10, start_date=None, end_date=None):
"""分析会话数据主流程,使用多线程并发处理"""
# 读取Excel文件
df = pd.read_excel(conversation_excel_path)
@@ -511,21 +511,29 @@ class DialogueToWorkorder:
# 解析产品详情
product_detail_dict = self.parse_product_detail_excel(product_detail_excel_path)
# 如果指定了时间范围,则过滤数据
if start_date or end_date:
# 确保创建时间列为日期时间类型
if '创建时间' in df.columns:
df['创建时间'] = pd.to_datetime(df['创建时间'], errors='coerce')
# 按时间范围过滤
if start_date:
start_date = pd.to_datetime(start_date)
df = df[df['创建时间'] >= start_date]
logger.info(f"过滤开始时间 {start_date},剩余数据行数: {len(df)}")
if end_date:
end_date = pd.to_datetime(end_date)
df = df[df['创建时间'] <= end_date]
logger.info(f"过滤结束时间 {end_date},剩余数据行数: {len(df)}")
else:
logger.warning("数据中没有'创建时间'列,无法按时间范围过滤")
# 按会话ID分组
conversation_dict = self.group_conversations_by_id(df)
# 限制处理的会话数量为前2000个
if len(conversation_dict) > 2000:
logger.info(f"会话总数为 {len(conversation_dict)},限制处理前2000个会话")
# 获取所有会话ID
conversation_ids = list(conversation_dict.keys())
# 只保留前2000个会话
limited_conversation_dict = {
conversation_id: conversation_dict[conversation_id]
for conversation_id in conversation_ids[:2000]
}
conversation_dict = limited_conversation_dict
else:
logger.info(f"会话总数为 {len(conversation_dict)},处理全部会话")
logger.info(f"会话总数为 {len(conversation_dict)},处理全部会话")
# 使用线程池处理每个会话
workorder_dict_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -629,6 +637,10 @@ def parse_arguments():
help='产品详情Excel文件路径')
parser.add_argument('--max_workers', type=int, default=16,
help='并发处理线程数,默认为16')
parser.add_argument('--start_date', type=str, required=False,
help='开始日期,格式为YYYY-MM-DD')
parser.add_argument('--end_date', type=str, required=False,
help='结束日期,格式为YYYY-MM-DD')
return parser.parse_args()
@@ -649,9 +661,21 @@ def main():
workorder_dict_list = processor.analyze_conversation_data(
conversation_excel_path,
product_detail_excel_path,
max_workers=args.max_workers
max_workers=args.max_workers,
start_date=args.start_date,
end_date=args.end_date
)
output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx')
# 生成输出文件名
if args.start_date and args.end_date:
output_file = conversation_excel_path.replace('.xlsx', f'_{args.start_date}{args.end_date}_转工单.xlsx')
elif args.start_date:
output_file = conversation_excel_path.replace('.xlsx', f'_从{args.start_date}起_转工单.xlsx')
elif args.end_date:
output_file = conversation_excel_path.replace('.xlsx', f'_至{args.end_date}_转工单.xlsx')
else:
output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx')
# 保存结果
processor.save_results_to_excel(workorder_dict_list, output_file)