更新 pyproject.toml 和 uv.lock 文件,新增 ijson 和 langfuse 依赖;同时在对话到工单的分析流程中添加时间范围过滤功能(支持按时间范围过滤会话数据),并优化日志记录。新增获取工作流运行信息的方法,并更新意图识别 API 以支持使用 jieba 分词。
This commit is contained in:
@@ -499,7 +499,7 @@ class DialogueToWorkorder:
|
||||
|
||||
return workorder_list
|
||||
|
||||
def analyze_conversation_data(self, conversation_excel_path, product_detail_excel_path, max_workers=10):
|
||||
def analyze_conversation_data(self, conversation_excel_path, product_detail_excel_path, max_workers=10, start_date=None, end_date=None):
|
||||
"""分析会话数据主流程,使用多线程并发处理"""
|
||||
# 读取Excel文件
|
||||
df = pd.read_excel(conversation_excel_path)
|
||||
@@ -511,21 +511,29 @@ class DialogueToWorkorder:
|
||||
# 解析产品详情
|
||||
product_detail_dict = self.parse_product_detail_excel(product_detail_excel_path)
|
||||
|
||||
# 如果指定了时间范围,则过滤数据
|
||||
if start_date or end_date:
|
||||
# 确保创建时间列为日期时间类型
|
||||
if '创建时间' in df.columns:
|
||||
df['创建时间'] = pd.to_datetime(df['创建时间'], errors='coerce')
|
||||
|
||||
# 按时间范围过滤
|
||||
if start_date:
|
||||
start_date = pd.to_datetime(start_date)
|
||||
df = df[df['创建时间'] >= start_date]
|
||||
logger.info(f"过滤开始时间 {start_date},剩余数据行数: {len(df)}")
|
||||
|
||||
if end_date:
|
||||
end_date = pd.to_datetime(end_date)
|
||||
df = df[df['创建时间'] <= end_date]
|
||||
logger.info(f"过滤结束时间 {end_date},剩余数据行数: {len(df)}")
|
||||
else:
|
||||
logger.warning("数据中没有'创建时间'列,无法按时间范围过滤")
|
||||
|
||||
# 按会话ID分组
|
||||
conversation_dict = self.group_conversations_by_id(df)
|
||||
# 限制处理的会话数量为前2000个
|
||||
if len(conversation_dict) > 2000:
|
||||
logger.info(f"会话总数为 {len(conversation_dict)},限制处理前2000个会话")
|
||||
# 获取所有会话ID
|
||||
conversation_ids = list(conversation_dict.keys())
|
||||
# 只保留前2000个会话
|
||||
limited_conversation_dict = {
|
||||
conversation_id: conversation_dict[conversation_id]
|
||||
for conversation_id in conversation_ids[:2000]
|
||||
}
|
||||
conversation_dict = limited_conversation_dict
|
||||
else:
|
||||
logger.info(f"会话总数为 {len(conversation_dict)},处理全部会话")
|
||||
logger.info(f"会话总数为 {len(conversation_dict)},处理全部会话")
|
||||
|
||||
# 使用线程池处理每个会话
|
||||
workorder_dict_list = []
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
@@ -629,6 +637,10 @@ def parse_arguments():
|
||||
help='产品详情Excel文件路径')
|
||||
parser.add_argument('--max_workers', type=int, default=16,
|
||||
help='并发处理线程数,默认为16')
|
||||
parser.add_argument('--start_date', type=str, required=False,
|
||||
help='开始日期,格式为YYYY-MM-DD')
|
||||
parser.add_argument('--end_date', type=str, required=False,
|
||||
help='结束日期,格式为YYYY-MM-DD')
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -649,9 +661,21 @@ def main():
|
||||
workorder_dict_list = processor.analyze_conversation_data(
|
||||
conversation_excel_path,
|
||||
product_detail_excel_path,
|
||||
max_workers=args.max_workers
|
||||
max_workers=args.max_workers,
|
||||
start_date=args.start_date,
|
||||
end_date=args.end_date
|
||||
)
|
||||
output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx')
|
||||
|
||||
# 生成输出文件名
|
||||
if args.start_date and args.end_date:
|
||||
output_file = conversation_excel_path.replace('.xlsx', f'_{args.start_date}至{args.end_date}_转工单.xlsx')
|
||||
elif args.start_date:
|
||||
output_file = conversation_excel_path.replace('.xlsx', f'_从{args.start_date}起_转工单.xlsx')
|
||||
elif args.end_date:
|
||||
output_file = conversation_excel_path.replace('.xlsx', f'_至{args.end_date}_转工单.xlsx')
|
||||
else:
|
||||
output_file = conversation_excel_path.replace('.xlsx', '_转工单.xlsx')
|
||||
|
||||
# 保存结果
|
||||
processor.save_results_to_excel(workorder_dict_list, output_file)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user