优化对话到工单的处理逻辑,新增进度条显示,调整日期参数的默认值,并修复日志目录创建的冗余代码。同时,更新DifyExporter类以支持按日期范围过滤消息,重构查询日志加载逻辑,新增备注提取功能,提升代码可读性和可维护性。
This commit is contained in:
@@ -14,6 +14,7 @@ import httpx
|
||||
import traceback
|
||||
import re
|
||||
import logging
|
||||
from tqdm import tqdm
|
||||
|
||||
# 将项目根目录添加到Python路径
|
||||
sys.path.append(os.getcwd())
|
||||
@@ -469,9 +470,10 @@ class DialogueToWorkorder:
|
||||
"客户问题": user_question_str,
|
||||
"问题类型": problem_type,
|
||||
"是否抱怨": "是" if is_dissatisfaction else '否',
|
||||
"抱怨内容": dissatisfaction_reasoning if is_dissatisfaction else '',
|
||||
"抱怨级别": dissatisfaction_level if is_dissatisfaction else '',
|
||||
"是否投诉": "是" if is_complaint else '否',
|
||||
"解决方案": (solution_str + '\n存在抱怨:' + dissatisfaction_reasoning) if is_dissatisfaction else solution_str
|
||||
"解决方案": solution_str
|
||||
})
|
||||
workorder_list.append(base_workorder_dict)
|
||||
# for user_question in user_question_list:
|
||||
@@ -544,7 +546,7 @@ class DialogueToWorkorder:
|
||||
}
|
||||
|
||||
# 获取结果
|
||||
for future in concurrent.futures.as_completed(future_to_conversation):
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="处理会话进度"):
|
||||
conversation_id = future_to_conversation[future]
|
||||
try:
|
||||
result_workorders = future.result()
|
||||
@@ -637,9 +639,9 @@ def parse_arguments():
|
||||
help='产品详情Excel文件路径')
|
||||
parser.add_argument('--max_workers', type=int, default=16,
|
||||
help='并发处理线程数,默认为16')
|
||||
parser.add_argument('--start_date', type=str, required=False,
|
||||
parser.add_argument('--start_date', type=str, required=False,default="2025-05-25 00:00:00",
|
||||
help='开始日期,格式为YYYY-MM-DD')
|
||||
parser.add_argument('--end_date', type=str, required=False,
|
||||
parser.add_argument('--end_date', type=str, required=False,default="2025-05-30 15:54",
|
||||
help='结束日期,格式为YYYY-MM-DD')
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -23,6 +23,7 @@ from tqdm import tqdm
|
||||
import concurrent.futures
|
||||
import sys
|
||||
|
||||
os.makedirs('./data/log', exist_ok=True)
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -33,7 +34,7 @@ logging.basicConfig(
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
os.makedirs('./data/log', exist_ok=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatabaseConfig:
|
||||
@@ -492,8 +493,8 @@ def main() -> None:
|
||||
# 创建数据库客户端
|
||||
with MariaDBClient(config, max_connections=12) as db_client:
|
||||
# 查询会话数据
|
||||
start_date = '2025-01-01 00:00:00'
|
||||
end_date = '2025-06-12 00:00:00'
|
||||
start_date = '2025-06-12 00:00:00'
|
||||
end_date = '2025-07-01 00:00:00'
|
||||
|
||||
logger.info(f"查询时间范围: {start_date} 到 {end_date}")
|
||||
# 创建会话处理器
|
||||
|
||||
@@ -402,7 +402,8 @@ class DifyApi:
|
||||
content: str,
|
||||
answer: str,
|
||||
keywords: List[str],
|
||||
enabled: bool
|
||||
enabled: bool,
|
||||
regenerate_child_chunks: bool = True
|
||||
) -> Dict:
|
||||
"""
|
||||
更新指定文档的某个分段信息。
|
||||
@@ -430,7 +431,7 @@ class DifyApi:
|
||||
"answer": answer,
|
||||
"keywords": keywords,
|
||||
"enabled": enabled,
|
||||
"regenerate_child_chunks": True
|
||||
"regenerate_child_chunks": regenerate_child_chunks
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,9 +37,9 @@ class DifyExporter:
|
||||
self.query_log_dir = os.path.join(os.getcwd(), "data", "query_logs")
|
||||
self.query_log_file = query_log_file or os.path.join(self.query_log_dir, "answer_type_logs.json")
|
||||
|
||||
# 设置日期过滤
|
||||
self.start_date = start_date
|
||||
self.end_date = end_date
|
||||
# 设置日期过滤,转换为datetime对象
|
||||
self.start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d %H") if start_date else None
|
||||
self.end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d %H") if end_date else None
|
||||
|
||||
# 初始化工具类
|
||||
self.dify_pgsql = PgSql()
|
||||
@@ -49,22 +49,22 @@ class DifyExporter:
|
||||
self.message_info_list = []
|
||||
self.query_logs = {}
|
||||
|
||||
def load_query_logs(self):
|
||||
def load_query_logs(self,path):
|
||||
"""
|
||||
从文件加载查询日志
|
||||
"""
|
||||
try:
|
||||
with open(self.query_log_file, 'r', encoding='utf-8') as f:
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
query_logs_list = json.load(f)
|
||||
# 创建字典来存储每个查询的最新记录
|
||||
# 创建字典来存储每个查询的最新记录workflow_run_id
|
||||
for record in query_logs_list:
|
||||
query = record['query']
|
||||
workflow_run_id = record['workflow_run_id']
|
||||
timestamp = record.get('timestamp')
|
||||
# 如果查询不在字典中或者当前记录的时间戳更新,则更新字典
|
||||
if query not in self.query_logs or (timestamp and self.query_logs.get(query, {}).get('timestamp') and
|
||||
if workflow_run_id not in self.query_logs or (timestamp and self.query_logs.get(workflow_run_id, {}).get('timestamp') and
|
||||
datetime.datetime.fromisoformat(timestamp) >
|
||||
datetime.datetime.fromisoformat(self.query_logs[query]['timestamp'])):
|
||||
self.query_logs[query] = record
|
||||
datetime.datetime.fromisoformat(self.query_logs[workflow_run_id]['timestamp'])):
|
||||
self.query_logs[workflow_run_id] = record
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"加载查询日志失败: {e}")
|
||||
@@ -103,6 +103,27 @@ class DifyExporter:
|
||||
message_chain_new.append(msg)
|
||||
return message_chain_new
|
||||
|
||||
def get_remark(self, msg_debug_info):
    """
    Derive the remark text for a message from its workflow debug info.

    Finds the first node execution whose title is "意图识别结果解析" and
    inspects the classification fields stored in its ``outputs``.

    Args:
        msg_debug_info: Mapping that carries a ``workflow_node_executions_info``
            list; each entry is a mapping with ``title`` and ``outputs``.
            ``outputs`` may be a JSON string or an already-decoded dict.

    Returns:
        "使用固定话术" when ``vertical_classification`` is "固定话术类";
        "固定引导至博微软件助手中操作" when ``sub_classification`` is "软件锁类";
        otherwise an empty string (also when the node is absent or its
        outputs are missing, empty, or not a parsable JSON object).
    """
    node_executions = msg_debug_info.get("workflow_node_executions_info") or []

    # Only the first matching node is considered, as before.
    intent_node = next(
        (node for node in node_executions if node.get("title") == "意图识别结果解析"),
        None,
    )
    if intent_node is None:
        return ""

    outputs = intent_node.get("outputs")
    if not outputs:
        # Covers None, "" and {} - nothing to classify.
        return ""

    # The payload may arrive as raw JSON text or pre-decoded by the DB layer;
    # decoding an already-decoded dict would raise TypeError.
    if isinstance(outputs, (str, bytes, bytearray)):
        try:
            outputs = json.loads(outputs)
        except ValueError:
            # Malformed JSON: the remark is auxiliary, so do not abort the export.
            return ""
    if not isinstance(outputs, dict):
        return ""

    if outputs.get("vertical_classification", "") == "固定话术类":
        return "使用固定话术"

    if outputs.get("sub_classification", "") == "软件锁类":
        return "固定引导至博微软件助手中操作"

    return ""
|
||||
|
||||
def extract_message_info(self, message):
|
||||
"""
|
||||
从消息中提取信息
|
||||
@@ -121,7 +142,7 @@ class DifyExporter:
|
||||
user_name = msg_inputs.get("user_name", "")
|
||||
msg_query = message["query"]
|
||||
msg_answer = message["answer"]
|
||||
|
||||
msg_answer = msg_answer.split("----------------------------------------")[0]
|
||||
# 将UTC+0时间转换为UTC+8时间
|
||||
created_at_utc = message['created_at']
|
||||
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
||||
@@ -143,13 +164,17 @@ class DifyExporter:
|
||||
document_name = knowledge['metadata']['document_name']
|
||||
wiki_list.append(document_name.split("/")[-1])
|
||||
|
||||
# 获取备注
|
||||
remark = self.get_remark(msg_debug_info)
|
||||
|
||||
wiki_list = list(set(wiki_list))
|
||||
wiki_list_str = "\n".join(wiki_list)
|
||||
if wiki_list_str == "":
|
||||
wiki_list_str = "无"
|
||||
rating = self.dify_pgsql.get_message_rating(msg_id)
|
||||
# 直接通过字典键获取query_type
|
||||
query_type = self.query_logs.get(msg_query, {}).get('query_type', "")
|
||||
workflow_run_id = message['workflow_run_id']
|
||||
query_type = self.query_logs.get(workflow_run_id, {}).get('query_type', "")
|
||||
|
||||
return {
|
||||
"msg_id": msg_id,
|
||||
@@ -159,7 +184,8 @@ class DifyExporter:
|
||||
"提问时间": created_at,
|
||||
"评价": rating,
|
||||
"问题分类": query_type,
|
||||
"检索到的词条": wiki_list_str
|
||||
"检索到的词条": wiki_list_str,
|
||||
"备注": remark
|
||||
}
|
||||
|
||||
def process_conversations(self):
|
||||
@@ -184,13 +210,10 @@ class DifyExporter:
|
||||
created_at_utc = message['created_at']
|
||||
created_at_utc8 = created_at_utc + datetime.timedelta(hours=8)
|
||||
|
||||
# 提取消息的创建日期时间,精确到小时
|
||||
created_at_hour = created_at_utc8.strftime("%Y-%m-%d %H")
|
||||
|
||||
# 应用日期时间过滤
|
||||
if self.start_date and created_at_hour < self.start_date:
|
||||
if self.start_date and created_at_utc8 < self.start_date:
|
||||
continue
|
||||
if self.end_date and created_at_hour > self.end_date:
|
||||
if self.end_date and created_at_utc8 > self.end_date:
|
||||
continue
|
||||
|
||||
message_info = self.extract_message_info(message)
|
||||
@@ -281,7 +304,8 @@ class DifyExporter:
|
||||
数据库中的时间是UTC+0时区,会自动转换为UTC+8时区进行过滤和显示
|
||||
"""
|
||||
# 加载查询日志
|
||||
self.load_query_logs()
|
||||
self.load_query_logs(self.query_log_file)
|
||||
self.load_query_logs("data/query_logs/answer_type_logs_071409.json")
|
||||
|
||||
# 处理会话数据
|
||||
self.process_conversations()
|
||||
@@ -294,12 +318,12 @@ class DifyExporter:
|
||||
# 如果指定了日期范围,则在文件名中体现
|
||||
date_suffix = ""
|
||||
if self.start_date:
|
||||
# 将空格替换为下划线,使文件名更规范
|
||||
formatted_start = self.start_date.replace(" ", "_")
|
||||
# 格式化日期对象为字符串
|
||||
formatted_start = self.start_date.strftime("%Y-%m-%d_%H")
|
||||
date_suffix += f"_from_{formatted_start}"
|
||||
if self.end_date:
|
||||
# 将空格替换为下划线,使文件名更规范
|
||||
formatted_end = self.end_date.replace(" ", "_")
|
||||
# 格式化日期对象为字符串
|
||||
formatted_end = self.end_date.strftime("%Y-%m-%d_%H")
|
||||
date_suffix += f"_to_{formatted_end}"
|
||||
output_file = os.path.join(os.getcwd(), "data", "excel", f"dify_export{date_suffix}_{timestamp}.xlsx")
|
||||
|
||||
@@ -321,9 +345,9 @@ if __name__ == "__main__":
|
||||
help='Dify应用ID')
|
||||
parser.add_argument('--query_log_file', '-q', type=str, default="data/query_logs/answer_type_logs.json",
|
||||
help='查询日志文件路径')
|
||||
parser.add_argument('--start_date', '-s', type=str, default="2025-07-09 13",
|
||||
parser.add_argument('--start_date', '-s', type=str, default="2025-07-14 00",
|
||||
help='开始日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 14表示2025年7月8日14时(UTC+8时区)')
|
||||
parser.add_argument('--end_date', '-e', type=str, default=None,
|
||||
parser.add_argument('--end_date', '-e', type=str, default="2025-07-14 15",
|
||||
help='结束日期时间,格式为YYYY-MM-DD HH,例如2025-07-08 18表示2025年7月8日18时(UTC+8时区)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -288,8 +288,10 @@ class AsyncIntentRecognizer:
|
||||
# 步骤2: 使用向量检索找到相似的专业名词
|
||||
try:
|
||||
vector_start_time = time.time()
|
||||
# 对matched_terms中的每个关键字进行向量检索
|
||||
for current_key in query_keys:
|
||||
|
||||
# 创建并行任务列表
|
||||
async def process_single_keyword(current_key: str) -> List[Term]:
|
||||
"""处理单个关键词的向量检索和重排序"""
|
||||
vector_results = await self._noun_retriever.query_async(current_key, top_k=5, use_intersection=False)
|
||||
current_key_terms = set()
|
||||
# 添加向量检索结果
|
||||
@@ -304,7 +306,17 @@ class AsyncIntentRecognizer:
|
||||
current_key_terms.add(term)
|
||||
if len(current_key_terms) > 0:
|
||||
reranked_terms = await self._rerank_matched_terms_async(current_key, current_key_terms)
|
||||
matched_terms.extend(reranked_terms)
|
||||
return reranked_terms
|
||||
return []
|
||||
|
||||
# 并行处理所有关键词
|
||||
keyword_tasks = [process_single_keyword(current_key) for current_key in query_keys]
|
||||
keyword_results = await asyncio.gather(*keyword_tasks)
|
||||
|
||||
# 合并所有结果
|
||||
for result in keyword_results:
|
||||
matched_terms.extend(result)
|
||||
|
||||
vector_end_time = time.time()
|
||||
vector_time = vector_end_time - vector_start_time
|
||||
except Exception as e:
|
||||
@@ -649,7 +661,7 @@ class AsyncIntentRecognizer:
|
||||
formatted_prompt = step_back_prompt.format(
|
||||
query=query,
|
||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||
conversation_context=conversation_context,
|
||||
# conversation_context=conversation_context,
|
||||
output_format=step_back_parser.get_format_instructions()
|
||||
)
|
||||
|
||||
@@ -688,7 +700,7 @@ class AsyncIntentRecognizer:
|
||||
formatted_prompt = follow_up_questions_prompt.format(
|
||||
query=query,
|
||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||
conversation_context=conversation_context,
|
||||
# conversation_context=conversation_context,
|
||||
output_format=follow_up_parser.get_format_instructions()
|
||||
)
|
||||
|
||||
@@ -727,7 +739,7 @@ class AsyncIntentRecognizer:
|
||||
formatted_prompt = hyde_prompt.format(
|
||||
query=query,
|
||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||
conversation_context=conversation_context,
|
||||
# conversation_context=conversation_context,
|
||||
output_format=hyde_parser.get_format_instructions()
|
||||
)
|
||||
|
||||
@@ -766,7 +778,7 @@ class AsyncIntentRecognizer:
|
||||
formatted_prompt = multi_questions_prompt.format(
|
||||
query=query,
|
||||
chat_history=json.dumps(chat_history, ensure_ascii=False) if chat_history else "[]",
|
||||
conversation_context=conversation_context,
|
||||
# conversation_context=conversation_context,
|
||||
output_format=multi_questions_parser.get_format_instructions()
|
||||
)
|
||||
|
||||
|
||||
@@ -226,30 +226,30 @@ step_back_prompt = """
|
||||
- 涵盖原始问题的核心主题
|
||||
- 去除过于具体的限制条件(如时间、地点、特定版本、特定工程等)
|
||||
- 保持在同一领域和主题范围内
|
||||
- 依次移除问题中的限定词或者修饰词
|
||||
|
||||
## 输入
|
||||
用户原始问题: {query}
|
||||
历史对话记录: {chat_history}
|
||||
会话背景: {conversation_context}
|
||||
|
||||
## 输出格式
|
||||
{output_format}
|
||||
|
||||
## 示例
|
||||
原始问题: "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?"
|
||||
原始问题: "2023版本如何在Windows 11系统上导入单位工程量清单?"
|
||||
后退问题:
|
||||
{{
|
||||
"original_query": "配网D3软件2023版本如何在Windows 11系统上导入单位工程量清单?",
|
||||
"original_query": "2023版本如何在Windows 11系统上导入单位工程量清单?",
|
||||
"can_use_back_prompt": True,
|
||||
"step_back_query": ["配网D3软件如何导入工程量清单?", "如何导入单位工程量清单?"]
|
||||
"step_back_query": ["如何在Windows 11系统上导入单位工程量清单?", "如何导入单位工程量清单?"]
|
||||
}}
|
||||
|
||||
原始问题: "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?"
|
||||
原始问题: "某个设备更换后,如何在系统中更新对应的定额?"
|
||||
后退问题:
|
||||
{{
|
||||
"original_query": "技改T1软件中的某个设备更换后,如何在系统中更新对应的定额?",
|
||||
"original_query": "某个设备更换后,如何在系统中更新对应的定额?",
|
||||
"can_use_back_prompt": True,
|
||||
"step_back_query": ["技改T1软件中如何更新设备对应的定额?", "如何更新设备对应的定额?"]
|
||||
"step_back_query": ["如何更新设备对应的定额?", "如何更新定额?"]
|
||||
}}
|
||||
|
||||
"""
|
||||
@@ -271,7 +271,6 @@ follow_up_questions_prompt = """
|
||||
## 输入
|
||||
历史对话记录: {chat_history}
|
||||
当前用户问题: {query}
|
||||
会话背景: {conversation_context}
|
||||
|
||||
## 输出格式
|
||||
{output_format}
|
||||
@@ -308,7 +307,6 @@ hyde_prompt = """
|
||||
## 输入
|
||||
用户问题: {query}
|
||||
历史对话记录: {chat_history}
|
||||
会话背景: {conversation_context}
|
||||
|
||||
## 输出格式
|
||||
{output_format}
|
||||
@@ -343,7 +341,6 @@ multi_questions_prompt = """
|
||||
## 输入
|
||||
用户原始问题: {query}
|
||||
历史对话记录: {chat_history}
|
||||
会话背景: {conversation_context}
|
||||
|
||||
## 输出格式
|
||||
{output_format}
|
||||
|
||||
Reference in New Issue
Block a user