# File: QueryRewrite/rag2_0/dify/export_new_dify.py
# (Python, 303 lines, 11 KiB)

from dotenv import load_dotenv
import os
import json
import datetime
import pandas as pd
import sys
sys.path.append(os.getcwd())
from rag2_0.dify.dify_tool import PgSql, DifyTool
class DifyExporter:
    """
    Export conversation and message data of a Dify application to Excel.

    The exporter reads conversations and messages through ``PgSql``, fetches
    per-message debug information through ``DifyTool``, enriches each message
    with a query type taken from a JSON query log, and writes the result to
    an ``.xlsx`` file.
    """

    def __init__(self, app_id=None, query_log_file=None):
        """
        Initialize the exporter.

        Args:
            app_id: Dify application ID. A built-in default ID is used when
                this is ``None`` or empty.
            query_log_file: Path to the query-log JSON file. Defaults to
                ``<cwd>/data/query_logs/answer_type_logs.json``.
        """
        self.app_id = app_id or "72d03c7d-8bea-42f9-9e8d-cdfb9480f372"

        # Location of the query log used to look up the type of each query.
        self.query_log_dir = os.path.join(os.getcwd(), "data", "query_logs")
        self.query_log_file = query_log_file or os.path.join(
            self.query_log_dir, "answer_type_logs.json"
        )

        # Project helpers used for database access and debug-info retrieval.
        self.dify_pgsql = PgSql()
        self.dify_tool = DifyTool()

        # Accumulated export rows and the query -> latest log record mapping.
        self.message_info_list = []
        self.query_logs = {}

    @staticmethod
    def _is_newer(candidate_ts, current_ts):
        """Return True only if both ISO timestamps parse and candidate is later."""
        if not candidate_ts or not current_ts:
            return False
        try:
            candidate = datetime.datetime.fromisoformat(candidate_ts)
            current = datetime.datetime.fromisoformat(current_ts)
            return candidate > current
        except (TypeError, ValueError):
            # Non-string / unparsable values, or naive-vs-aware comparison.
            return False

    @staticmethod
    def _column_letter(index):
        """Convert a 0-based column index to an Excel column name (0 -> A, 26 -> AA)."""
        letters = ""
        number = index + 1
        while number > 0:
            number, remainder = divmod(number - 1, 26)
            letters = chr(65 + remainder) + letters
        return letters

    def load_query_logs(self):
        """
        Load the query log file into ``self.query_logs``.

        The file is expected to contain a JSON array of records. For every
        distinct ``query`` value the first record seen is stored; it is
        replaced by a later record only when both records carry a parsable
        ISO ``timestamp`` and the later record's timestamp is newer.
        Records that are not objects, lack a ``query`` key, or have an
        unhashable ``query`` are skipped instead of aborting the whole load.

        Returns:
            True if the file was read and processed, False if it could not
            be opened, decoded, or does not contain a JSON array.
        """
        try:
            with open(self.query_log_file, 'r', encoding='utf-8') as f:
                records = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"加载查询日志失败: {e}")
            return False

        if not isinstance(records, list):
            print(f"加载查询日志失败: 顶层应为JSON数组,实际为 {type(records).__name__}")
            return False

        for record in records:
            if not isinstance(record, dict) or 'query' not in record:
                continue
            query = record['query']
            try:
                existing = self.query_logs.get(query)
            except TypeError:
                # An unhashable query value cannot be used as a dictionary key.
                continue
            if existing is None or self._is_newer(
                record.get('timestamp'), existing.get('timestamp')
            ):
                self.query_logs[query] = record
        return True

    def process_message_chain(self, messages):
        """
        Rebuild the effective message chain of a conversation.

        Starting from the messages whose ``parent_message_id`` is ``None``,
        the chain is followed parent -> child; whenever several messages
        share the same parent, the one with the greatest ``created_at`` is
        chosen.

        Args:
            messages: Iterable of message mappings with the keys ``id``,
                ``parent_message_id`` and ``created_at``.

        Returns:
            List of the selected messages, ordered from root to leaf.
        """
        children_by_parent = {}
        for message in messages:
            children_by_parent.setdefault(message["parent_message_id"], []).append(message)

        chain = []
        visited_ids = set()  # guards against cycles in the parent links
        current_id = None
        while current_id not in visited_ids:
            candidates = children_by_parent.get(current_id)
            if not candidates:
                break
            visited_ids.add(current_id)
            latest = max(candidates, key=lambda m: m["created_at"])
            chain.append(latest)
            current_id = latest["id"]
        return chain

    def extract_message_info(self, message):
        """
        Build one export row for a message.

        Args:
            message: Message mapping with the keys ``id``, ``inputs``,
                ``query``, ``answer`` and ``created_at`` (an object providing
                ``strftime``).

        Returns:
            Dictionary describing the message, or ``None`` when no debug
            information is available for it.
        """
        msg_id = message["id"]
        # ``inputs`` may be missing a user name, or be empty altogether.
        user_name = (message["inputs"] or {}).get("user_name", "")
        msg_query = message["query"]
        created_at = message['created_at'].strftime("%Y-%m-%d")

        msg_debug_info = self.dify_tool.get_message_debug_info_by_id(msg_id)
        if not msg_debug_info:
            return None

        wiki_list = []
        for node_execution in msg_debug_info['workflow_node_executions_info']:
            if node_execution["title"] != "提取处理后的知识":
                continue
            outputs = node_execution["outputs"]
            if not outputs:
                # Node produced no output; nothing to collect.
                continue
            if isinstance(outputs, (str, bytes, bytearray)):
                outputs = json.loads(outputs)
            for knowledge in outputs.get("knowledge_list_metadata") or []:
                document_name = (knowledge.get("metadata") or {}).get("document_name")
                if document_name:
                    # Keep only the last path component of the document name.
                    wiki_list.append(document_name.split("/")[-1])

        # De-duplicate while preserving first-seen order so output is stable.
        wiki_list_str = "\n".join(dict.fromkeys(wiki_list)) or "无"

        rating = self.dify_pgsql.get_message_rating(msg_id)
        # The query text itself is the key into the loaded query logs.
        query_type = self.query_logs.get(msg_query, {}).get('query_type', "")

        return {
            "msg_id": msg_id,
            "提问": msg_query,
            "回答": message["answer"],
            "提问人": user_name,
            "提问时间": created_at,
            "评价": rating,
            "问题分类": query_type,
            "检索到的词条": wiki_list_str
        }

    def process_conversations(self):
        """
        Collect export rows for every conversation of the application.

        Rows are appended to ``self.message_info_list``; messages for which
        ``extract_message_info`` returns ``None`` are skipped.

        Returns:
            ``self.message_info_list`` after processing.
        """
        conversations = self.dify_pgsql.get_app_conversations(appid=self.app_id)
        for conversation in conversations:
            messages = self.dify_pgsql.get_conversation_messages(
                conversation_id=conversation['conversation_id']
            )
            for message in self.process_message_chain(messages):
                message_info = self.extract_message_info(message)
                if message_info:
                    self.message_info_list.append(message_info)
        return self.message_info_list

    def save_to_excel(self, message_info_list, output_file):
        """
        Write the export rows to an Excel workbook.

        Args:
            message_info_list: List of row dictionaries.
            output_file: Destination path of the ``.xlsx`` file. Its parent
                directory is created when the path contains one.

        Returns:
            ``output_file``.
        """
        # Column name -> width; insertion order defines the column order.
        column_widths = {
            "msg_id": 15,
            "提问": 40,
            "回答": 60,
            "提问人": 15,
            "提问时间": 15,
            "评价": 10,
            "问题分类": 20,
            "检索到的词条": 40
        }
        columns_order = list(column_widths)

        df = pd.DataFrame(message_info_list)
        # Make sure every expected column exists, then fix the column order.
        for col in columns_order:
            if col not in df.columns:
                df[col] = None
        df = df[columns_order]

        # A bare filename has an empty dirname, which os.makedirs rejects.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name='Dify对话记录')
            worksheet = writer.sheets['Dify对话记录']

            # Uniform row height of 20 for every written row.
            for row in worksheet.iter_rows():
                worksheet.row_dimensions[row[0].row].height = 20

            for i, column in enumerate(columns_order):
                col_letter = self._column_letter(i)
                worksheet.column_dimensions[col_letter].width = column_widths[column]

        print(f"结果已保存到 {output_file}")
        return output_file

    def export(self, output_file=None):
        """
        Run the full export: load query logs, collect rows, write Excel.

        The workbook is written when ``output_file`` is given or when at
        least one row was collected. Without ``output_file`` a timestamped
        file under ``<cwd>/data/excel`` is used.

        Args:
            output_file: Destination path, or ``None`` to auto-generate one.

        Returns:
            The list of collected row dictionaries.
        """
        # Best effort: a missing query log only leaves the query type empty.
        self.load_query_logs()
        self.process_conversations()

        if output_file or self.message_info_list:
            if output_file is None:
                timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                output_file = os.path.join(
                    os.getcwd(), "data", "excel", f"dify_export_{timestamp}.xlsx"
                )
            self.save_to_excel(self.message_info_list, output_file)
        return self.message_info_list
# Example usage
if __name__ == "__main__":
    import argparse

    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description='Dify数据导出工具')
    parser.add_argument('--output', '-o', type=str, default="data/excel/dify_export.xlsx",
                        help='输出Excel文件路径')
    parser.add_argument('--app_id', '-a', type=str, default=None,
                        help='Dify应用ID')
    parser.add_argument('--query_log_file', '-q', type=str, default=None,
                        help='查询日志文件路径')
    args = parser.parse_args()

    load_dotenv()

    # Fallback connection settings. They are applied with setdefault so that
    # values coming from the real environment or from the .env file loaded
    # above take precedence instead of being silently overwritten.
    # NOTE(review): a database password is hard-coded here; it should be
    # moved to .env / a secret store and removed from source control.
    default_pg_settings = {
        "DIFY_PG_HOST": "10.1.16.39",
        "DIFY_PG_PORT": "5432",
        "DIFY_PG_USER": "postgres",
        "DIFY_PG_PASSWORD": "difyai123456",
        "DIFY_PG_DATABASE": "dify",
    }
    for env_name, env_value in default_pg_settings.items():
        os.environ.setdefault(env_name, env_value)

    # The exporter is created only after the environment is prepared.
    exporter = DifyExporter(
        app_id=args.app_id,
        query_log_file=args.query_log_file
    )

    # Run the export and report how many rows were produced.
    results = exporter.export(output_file=args.output)
    print(f"导出了 {len(results)} 条消息信息")