更新.gitignore以忽略临时文件,修改api_key文件,重构合并名词的逻辑,删除不再使用的脚本,优化对话到工单的处理流程,添加会话结果保存为JSON的功能,调整API调用参数,修复部分代码中的错误。
This commit is contained in:
@@ -1,187 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
File: deduplicate_json.py
|
||||
Description: 对指定JSON文件进行去重并重新保存
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from tqdm import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dotenv import load_dotenv
|
||||
from rag2_0.tool.ModelTool import OpenAiLLM
|
||||
from langchain.output_parsers import PydanticOutputParser
|
||||
from pydantic import BaseModel, Field
|
||||
from rag2_0.intent_recognition.DataModels import Term
|
||||
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
|
||||
class JsonDeduplicator:
    """Deduplicate the records of a JSON file by a key field.

    Records that share the same key are merged into a single record by an
    LLM; when the merge fails, the first record of the group is kept.
    """

    def __init__(self, input_path=None, output_path=None, key_field="name", max_workers=3):
        """Store the configuration and build the LLM client.

        Args:
            input_path: Path of the JSON file to read.
            output_path: Path of the deduplicated output file. When omitted,
                it is derived from ``input_path`` by appending
                ``_deduplicated`` to the file stem.
            key_field: Name of the record field used to detect duplicates.
            max_workers: Maximum number of worker threads used for merging.
        """
        self.INPUT_PATH = input_path
        if output_path:
            self.OUTPUT_PATH = output_path
        elif input_path:
            # splitext only touches the final extension, so directory names
            # containing ".json" are left alone and an input without a
            # ".json" suffix no longer maps onto itself.
            stem, ext = os.path.splitext(input_path)
            self.OUTPUT_PATH = f"{stem}_deduplicated{ext or '.json'}"
        else:
            # Nothing to derive a path from; loading will fail and
            # deduplicate() returns before any write is attempted.
            self.OUTPUT_PATH = None
        self.KEY_FIELD = key_field
        self.MAX_WORKERS = max_workers
        self.item_parser = PydanticOutputParser(pydantic_object=Term)
        self.MERGE_PROMPT = '''
请将以下多个描述相同名词"{name}"的条目合并为一个,合并时请:
- 同义词(synonymous)去重合并
- 描述(description)合并为更完整、简明的描述
- 保持输出格式为:
{output_format}
原始条目:
{items}
'''
        # Configure the LLM from environment variables; api_key / base_url
        # are only passed through when they are set.
        model_name = os.getenv("MODEL_NAME", "gpt-3.5-turbo")
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENAI_API_BASE")
        llm_params = {"temperature": 0.3, "model": model_name}
        if api_key:
            llm_params["api_key"] = api_key
        if base_url:
            llm_params["base_url"] = base_url
        self.llm = OpenAiLLM(**llm_params)

        # Configure logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def load_json_data(self):
        """Read ``INPUT_PATH`` and return the parsed JSON.

        Returns:
            The parsed JSON content, or an empty list when the file cannot
            be read or parsed (the error is logged).
        """
        try:
            with open(self.INPUT_PATH, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logging.info(f"从{self.INPUT_PATH}加载了{len(data)}条记录")
            return data
        except Exception as e:
            logging.error(f"读取{self.INPUT_PATH}失败: {e}", exc_info=True)
            return []

    def group_items_by_key(self, items):
        """Group records by the value of ``KEY_FIELD``.

        String keys are stripped of surrounding whitespace. Records that are
        not dicts, or whose key is missing, ``None`` or empty, are skipped
        and counted in a warning.

        Returns:
            A ``defaultdict(list)`` mapping each key to its records.
        """
        key_to_items = defaultdict(list)
        skipped = 0
        for item in items:
            raw_key = item.get(self.KEY_FIELD) if isinstance(item, dict) else None
            # Only strings can be stripped; other hashable keys (e.g. ints)
            # are used unchanged instead of raising AttributeError.
            key = raw_key.strip() if isinstance(raw_key, str) else raw_key
            if key is None or key == '':
                skipped += 1
                continue
            key_to_items[key].append(item)
        if skipped:
            logging.warning(f"有{skipped}条记录缺少键字段 {self.KEY_FIELD},已跳过")
        return key_to_items

    def merge_items_with_llm(self, key, item_list):
        """Ask the LLM to merge the records sharing ``key``.

        The call is attempted up to three times, sleeping ``5 * attempt``
        seconds after a failed attempt.

        Returns:
            A dict with ``name``, ``synonymous`` and ``description`` taken
            from the parsed LLM output, or ``None`` when every attempt fails.
        """
        # Kept local so this method does not rely on a module-level import.
        import time

        items = json.dumps(item_list, ensure_ascii=False)
        prompt = self.MERGE_PROMPT.format(
            name=key,
            items=items,
            output_format=self.item_parser.get_format_instructions()
        )

        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                response = self.llm.invoke(prompt, False)
                parsed_output = self.item_parser.parse(response.content)
                return {"name": parsed_output.name, "synonymous": parsed_output.synonymous, "description": parsed_output.description}
            except Exception as e:
                if attempt == max_retries:
                    logging.warning(f"解析LLM合并结果失败: {e}")
                    return None
                time.sleep(5 * attempt)
        return None

    def process_item(self, key_items_tuple):
        """Resolve one ``(key, records)`` group to a single record.

        Used as the thread-pool task. A single-record group is returned
        unchanged; otherwise the LLM merge result is returned, falling back
        to the first record when the merge fails or raises.
        """
        key, item_list = key_items_tuple
        try:
            if len(item_list) == 1:
                return item_list[0]

            merged = self.merge_items_with_llm(key, item_list)
            # Fall back to the first record when the merge produced nothing.
            return merged if merged else item_list[0]
        except Exception as e:
            logging.error(f"处理键 {key} 时出错: {e}", exc_info=True)
            return item_list[0]

    def deduplicate(self):
        """Load, group, merge and save the records.

        Returns:
            The list of deduplicated records (single-record groups first,
            then merged groups), or an empty list when nothing was loaded.
        """
        # 1. Load the JSON data
        all_items = self.load_json_data()
        if not all_items:
            return []

        # 2. Group by the key field
        key_to_items = self.group_items_by_key(all_items)
        logging.info(f"共{len(key_to_items)}个唯一键")

        # 3. Split groups: singletons need no merge, the rest go to the pool
        deduplicated_items = []
        items_to_process = []
        for key, item_list in key_to_items.items():
            if len(item_list) == 1:
                deduplicated_items.append(item_list[0])
            else:
                items_to_process.append((key, item_list))

        logging.info(f"共{len(deduplicated_items)}个单一项目,{len(items_to_process)}个需要合并的项目")

        if items_to_process:
            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
                # tqdm displays progress over the mapped results
                for result in tqdm(executor.map(self.process_item, items_to_process), total=len(items_to_process)):
                    deduplicated_items.append(result)

        # 4. Save the result. os.makedirs('') raises, so the directory is
        # only created when the output path actually has one.
        output_dir = os.path.dirname(self.OUTPUT_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(deduplicated_items, f, ensure_ascii=False, indent=2)
        logging.info(f"去重后结果已保存到: {self.OUTPUT_PATH}")

        return deduplicated_items
|
||||
|
||||
|
||||
def main():
    """Parse the command-line arguments and run the deduplication.

    Options: ``-i/--input`` (input JSON path), ``-o/--output`` (output JSON
    path), ``-k/--key`` (field used to detect duplicates) and
    ``-w/--workers`` (thread-pool size).
    """
    parser = argparse.ArgumentParser(description='对JSON文件进行去重')
    input_path = 'data/wiki_extracted_nouns/技改检修计价通_nouns.json'

    parser.add_argument('-i', '--input', default=input_path, help='输入JSON文件路径')
    parser.add_argument('-o', '--output', help='输出JSON文件路径')
    parser.add_argument('-k', '--key', default='name', help='用于去重的键字段名,默认为"name"')
    # The help text previously claimed a default of 2 while the real default
    # is 30; %(default)s keeps the message in sync with the value.
    parser.add_argument('-w', '--workers', type=int, default=30, help='线程池最大工作线程数,默认为%(default)s')

    args = parser.parse_args()

    deduplicator = JsonDeduplicator(
        input_path=args.input,
        output_path=args.output,
        key_field=args.key,
        max_workers=args.workers
    )
    deduplicator.deduplicate()
|
||||
|
||||
if __name__ == "__main__":
    # Raise the log threshold of the httpx and openai loggers to WARNING
    # before running the script entry point.
    for noisy_logger in ('httpx', 'openai'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    main()
|
||||
@@ -15,6 +15,8 @@ import traceback
|
||||
import re
|
||||
import logging
|
||||
from tqdm import tqdm
|
||||
import glob
|
||||
import shutil
|
||||
|
||||
# 将项目根目录添加到Python路径
|
||||
sys.path.append(os.getcwd())
|
||||
@@ -84,13 +86,6 @@ class IsComplaint(BaseModel):
|
||||
dissatisfaction_reasoning: str = Field(description="抱怨原因")
|
||||
is_complaint: bool = Field(description="是否明确/暗示将进行投诉")
|
||||
|
||||
# Pydantic model with a product name and a module name; it is handed to a
# PydanticOutputParser elsewhere in this file.
class ProductNameAndModuleName(BaseModel):
    # The description strings are runtime text (Chinese for "product name"
    # and "module name") and must stay unchanged.
    product_name: str = Field(description="产品名称")
    module_name: str = Field(description="模块名称")
|
||||
|
||||
# Pydantic model with a single product-line field; it is handed to a
# PydanticOutputParser elsewhere in this file.
class ProductLine(BaseModel):
    # The description string is runtime text (Chinese for "product line")
    # and must stay unchanged.
    product_line: str = Field(description="产品线")
|
||||
|
||||
# ================ 工具函数 ================
|
||||
def retry_llm_call(max_retries=3, delay=2):
|
||||
"""
|
||||
@@ -138,8 +133,6 @@ class DialogueToWorkorder:
|
||||
self.user_question_and_solution_list_parser = PydanticOutputParser(pydantic_object=UserQuestionAndSolutionList)
|
||||
self.question_type_parser = PydanticOutputParser(pydantic_object=QuestionType)
|
||||
self.is_complaint_parser = PydanticOutputParser(pydantic_object=IsComplaint)
|
||||
self.product_name_and_module_name_parser = PydanticOutputParser(pydantic_object=ProductNameAndModuleName)
|
||||
self.product_line_parser = PydanticOutputParser(pydantic_object=ProductLine)
|
||||
# 初始化LLM模型
|
||||
self.llm_params = llm_params or {
|
||||
"temperature": 0.2,
|
||||
@@ -158,6 +151,10 @@ class DialogueToWorkorder:
|
||||
# "timeout": httpx.Timeout(600.0)
|
||||
# }
|
||||
self.llm = self._get_llm_instance()
|
||||
|
||||
# 创建工单JSON文件目录
|
||||
self.workorder_json_dir = "data/temp_workorder_json"
|
||||
os.makedirs(self.workorder_json_dir, exist_ok=True)
|
||||
|
||||
def _get_llm_instance(self):
|
||||
"""获取LLM实例"""
|
||||
@@ -483,6 +480,66 @@ class DialogueToWorkorder:
|
||||
is_complaint.dissatisfaction_reasoning,
|
||||
is_complaint.is_complaint)
|
||||
|
||||
def save_conversation_to_json(self, conversation_id, workorder_list):
    """
    Save the work orders produced for one conversation as a JSON file.

    The file is written to ``<workorder_json_dir>/<conversation_id>.json``.
    Top-level ``datetime`` values of each work order are converted to
    ``"%Y-%m-%d %H:%M:%S"`` strings; all other values are passed to
    ``json.dump`` unchanged.

    Args:
        conversation_id: Conversation ID, used as the file stem.
        workorder_list: List of work-order dicts.
    """
    # Make sure the target directory exists
    os.makedirs(self.workorder_json_dir, exist_ok=True)

    file_path = os.path.join(self.workorder_json_dir, f"{conversation_id}.json")

    # Convert each work order into a JSON-serializable dict
    serializable_workorder_list = [
        {
            key: value.strftime("%Y-%m-%d %H:%M:%S") if isinstance(value, datetime) else value
            for key, value in workorder.items()
        }
        for workorder in workorder_list
    ]

    # Other code in this file treats the mere existence of
    # "<conversation_id>.json" as "already processed", so a half-written file
    # must never appear under the final name. Write to a sibling ".tmp" file
    # (ignored by the ".json" directory scan) and rename it into place.
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(serializable_workorder_list, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, file_path)
    except Exception:
        # Do not leave a stale temporary file behind; re-raise so the caller
        # sees the same failure it would have seen before.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    logger.info(f"会话ID: {conversation_id} 的处理结果已保存到 {file_path}")
|
||||
|
||||
def load_conversation_from_json(self, conversation_id):
    """
    Load the saved work orders of one conversation from its JSON file.

    Args:
        conversation_id: Conversation ID, used as the file stem.

    Returns:
        The parsed JSON content of
        ``<workorder_json_dir>/<conversation_id>.json``, or ``None`` when the
        file does not exist or cannot be loaded (the error is logged).
    """
    file_path = os.path.join(self.workorder_json_dir, f"{conversation_id}.json")

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                loaded_workorders = json.load(handle)

            logger.info(f"已从 {file_path} 加载会话ID: {conversation_id} 的处理结果")
            return loaded_workorders
        except Exception as e:
            logger.error(f"加载会话ID: {conversation_id} 的处理结果时发生错误: {e}")

    # Missing file or failed load
    return None
|
||||
|
||||
def process_conversation(self, conversation_id, conversation_rows):
|
||||
"""处理单个会话的函数,用于多线程并发"""
|
||||
# if conversation_id!="b157aa91-3acb-11f0-a191-4fb224ef4b40":
|
||||
@@ -534,13 +591,16 @@ class DialogueToWorkorder:
|
||||
# 将工单添加到列表中
|
||||
workorder_list.append(workorder_dict)
|
||||
|
||||
# 将处理结果保存为JSON文件
|
||||
self.save_conversation_to_json(conversation_id, workorder_list)
|
||||
|
||||
return workorder_list
|
||||
except Exception as e:
|
||||
logger.error(f"处理会话ID: {conversation_id} 时发生错误: {e}")
|
||||
return []
|
||||
|
||||
def analyze_conversation_data(self, conversation_excel_path, max_workers=10, start_date=None, end_date=None):
|
||||
"""分析会话数据主流程,使用多线程并发处理"""
|
||||
"""分析会话数据主流程,使用多线程并发处理,支持失败重试和JSON合并"""
|
||||
# 读取Excel文件
|
||||
df = pd.read_excel(conversation_excel_path)
|
||||
|
||||
@@ -575,37 +635,123 @@ class DialogueToWorkorder:
|
||||
conversation_dict = new_conversation_dict
|
||||
|
||||
logger.info(f"会话总数为 {len(conversation_dict)},处理全部会话")
|
||||
|
||||
# ========== 新增:扫描已存在的JSON文件 ==========
|
||||
existing_json_files = set()
|
||||
workorder_json_dir = self.workorder_json_dir
|
||||
if not os.path.exists(workorder_json_dir):
|
||||
os.makedirs(workorder_json_dir, exist_ok=True)
|
||||
for fname in os.listdir(workorder_json_dir):
|
||||
if fname.endswith('.json'):
|
||||
conversation_id = fname[:-5]
|
||||
existing_json_files.add(conversation_id)
|
||||
|
||||
# 本次新生成的JSON文件
|
||||
newly_generated_json_files = set()
|
||||
# 本次未重新生成但已存在的JSON文件
|
||||
reused_json_files = set()
|
||||
|
||||
# ========== 线程池处理会话 ==========
|
||||
successful_conversations = set()
|
||||
failed_conversations = set()
|
||||
import threading
|
||||
lock = threading.Lock()
|
||||
|
||||
def process_wrapper(conversation_id, conversation_rows):
|
||||
json_file_path = os.path.join(workorder_json_dir, f"{conversation_id}.json")
|
||||
if conversation_id in existing_json_files and os.path.exists(json_file_path):
|
||||
# 已存在,直接复用
|
||||
with lock:
|
||||
reused_json_files.add(conversation_id)
|
||||
return None # 不处理
|
||||
# 否则正常处理
|
||||
result = self.process_conversation(conversation_id, conversation_rows)
|
||||
if result:
|
||||
with lock:
|
||||
newly_generated_json_files.add(conversation_id)
|
||||
return result
|
||||
|
||||
# 使用线程池处理每个会话
|
||||
workorder_dict_list = []
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# 创建任务
|
||||
future_to_conversation = {
|
||||
executor.submit(self.process_conversation, conversation_id, conversation_rows): conversation_id
|
||||
executor.submit(process_wrapper, conversation_id, conversation_rows): conversation_id
|
||||
for conversation_id, conversation_rows in conversation_dict.items()
|
||||
}
|
||||
|
||||
# 获取结果
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="处理会话进度"):
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="第一轮处理会话进度"):
|
||||
conversation_id = future_to_conversation[future]
|
||||
try:
|
||||
result_workorders = future.result()
|
||||
if result_workorders:
|
||||
# 将每个会话的所有工单添加到总列表中
|
||||
workorder_dict_list.extend(result_workorders)
|
||||
successful_conversations.add(conversation_id)
|
||||
logger.info(f"完成处理会话ID: {conversation_id},生成工单数量: {len(result_workorders)}")
|
||||
elif conversation_id in reused_json_files:
|
||||
successful_conversations.add(conversation_id)
|
||||
logger.info(f"跳过已存在JSON,会话ID: {conversation_id}")
|
||||
else:
|
||||
failed_conversations.add(conversation_id)
|
||||
logger.warning(f"会话ID: {conversation_id} 处理可能失败,将在第二轮重试")
|
||||
except Exception as exc:
|
||||
failed_conversations.add(conversation_id)
|
||||
logger.error(f"处理会话ID: {conversation_id} 时发生错误: {exc}")
|
||||
|
||||
|
||||
# 检查哪些会话没有成功生成JSON文件
|
||||
all_conversation_ids = set(conversation_dict.keys())
|
||||
for conversation_id in all_conversation_ids:
|
||||
json_file_path = os.path.join(workorder_json_dir, f"{conversation_id}.json")
|
||||
if not os.path.exists(json_file_path):
|
||||
failed_conversations.add(conversation_id)
|
||||
if conversation_id in successful_conversations:
|
||||
successful_conversations.remove(conversation_id)
|
||||
|
||||
# ========== 第二轮重试 ==========
|
||||
if failed_conversations:
|
||||
logger.info(f"第一轮处理后有 {len(failed_conversations)} 个会话需要重试")
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, max_workers // 2)) as executor:
|
||||
future_to_conversation = {
|
||||
executor.submit(process_wrapper, conversation_id, conversation_dict[conversation_id]): conversation_id
|
||||
for conversation_id in failed_conversations
|
||||
}
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_conversation), total=len(future_to_conversation), desc="第二轮重试处理进度"):
|
||||
conversation_id = future_to_conversation[future]
|
||||
try:
|
||||
result_workorders = future.result()
|
||||
if result_workorders:
|
||||
successful_conversations.add(conversation_id)
|
||||
newly_generated_json_files.add(conversation_id)
|
||||
failed_conversations.remove(conversation_id)
|
||||
logger.info(f"重试成功: 会话ID: {conversation_id},生成工单数量: {len(result_workorders)}")
|
||||
elif conversation_id in reused_json_files:
|
||||
successful_conversations.add(conversation_id)
|
||||
failed_conversations.remove(conversation_id)
|
||||
logger.info(f"重试跳过已存在JSON,会话ID: {conversation_id}")
|
||||
except Exception as exc:
|
||||
logger.error(f"重试处理会话ID: {conversation_id} 时仍然发生错误: {exc}")
|
||||
|
||||
# ========== 合并本次所有成功的JSON文件 ==========
|
||||
logger.info(f"开始合并JSON文件结果,成功处理会话数: {len(successful_conversations)},失败会话数: {len(failed_conversations)}")
|
||||
workorder_dict_list = []
|
||||
# 只合并本次新生成和本次未重新生成但已存在的JSON
|
||||
all_json_ids_to_merge = newly_generated_json_files.union(reused_json_files)
|
||||
json_files = [os.path.join(workorder_json_dir, f"{cid}.json") for cid in all_json_ids_to_merge if os.path.exists(os.path.join(workorder_json_dir, f"{cid}.json"))]
|
||||
for json_file in tqdm(json_files, desc="合并JSON文件"):
|
||||
conversation_id = os.path.basename(json_file).replace(".json", "")
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
workorders = json.load(f)
|
||||
workorder_dict_list.extend(workorders)
|
||||
except Exception as e:
|
||||
logger.error(f"加载JSON文件 {json_file} 时发生错误: {e}")
|
||||
logger.info(f"处理完成,成功处理会话数: {len(successful_conversations)},失败会话数: {len(failed_conversations)}")
|
||||
if failed_conversations:
|
||||
logger.warning(f"以下会话处理失败: {failed_conversations}")
|
||||
return workorder_dict_list
|
||||
|
||||
def save_results_to_excel(self, workorder_dict_list, output_file=None):
|
||||
"""将结果保存到Excel文件"""
|
||||
"""将结果保存到Excel文件,并清理JSON文件"""
|
||||
result_df = pd.DataFrame(workorder_dict_list)
|
||||
|
||||
# 按照指定的列顺序重新排列DataFrame的列
|
||||
columns_order = [
|
||||
'工单编号', '产品线', '产品名称', '模块名称', '问题类型',
|
||||
'工单编号', '产品线', '产品名称', '问题类型',
|
||||
'客户问题', '解决方案', '是否抱怨', "抱怨内容", '是否投诉', '抱怨级别',
|
||||
'会话id', '访客昵称', '处理坐席', "处理人", "处理技能组",'创建时间'
|
||||
]
|
||||
@@ -645,7 +791,6 @@ class DialogueToWorkorder:
|
||||
'工单编号': 15,
|
||||
'产品线': 24,
|
||||
'产品名称': 40,
|
||||
'模块名称': 40,
|
||||
'问题类型': 9,
|
||||
'客户问题': 20,
|
||||
'解决方案': 30,
|
||||
@@ -668,8 +813,7 @@ class DialogueToWorkorder:
|
||||
col_letter = chr(64 + i // 26) + chr(65 + i % 26)
|
||||
worksheet.column_dimensions[col_letter].width = column_widths[column]
|
||||
|
||||
logger.info(f"结果已保存到 {output_file}")
|
||||
|
||||
logger.info(f"结果已保存到 {output_file}")
|
||||
return output_file
|
||||
|
||||
# ================ 参数解析 ================
|
||||
|
||||
@@ -23,7 +23,7 @@ from tqdm import tqdm
|
||||
import concurrent.futures
|
||||
import sys
|
||||
|
||||
os.makedirs('./data/log', exist_ok=True)
|
||||
os.makedirs('./data/logs', exist_ok=True)
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
|
||||
@@ -214,20 +214,7 @@ class QueryRewriteProcessor:
|
||||
|
||||
# 根据enable_retrieval参数决定是否进行文档检索
|
||||
retrieved_doc = None
|
||||
if enable_retrieval:
|
||||
retrieved_doc = self.dify_query_retrieval.retrieve(original_query, query_list, classification_obj, current_softname)
|
||||
|
||||
# 判断检索文档是否相关
|
||||
relevance_result = {}
|
||||
if retrieved_doc:
|
||||
# 判断文档相关性
|
||||
relevance_result = self.is_retrieved_doc_relevant(query, retrieved_doc)
|
||||
else:
|
||||
relevance_result = {
|
||||
"is_relevant": False,
|
||||
"explanation": "没有检索到文档" if enable_retrieval else "文档检索功能未启用",
|
||||
"relevance_score": 0.0
|
||||
}
|
||||
|
||||
|
||||
retrieved_doc_titles=[]
|
||||
if retrieved_doc:
|
||||
@@ -251,8 +238,6 @@ class QueryRewriteProcessor:
|
||||
"槽位信息": slot_filling_str,
|
||||
"检索的文档": "\n".join(retrieved_doc_titles),
|
||||
"检索的内容": json.dumps(retrieved_doc, ensure_ascii=False, indent=2) if retrieved_doc else "",
|
||||
"文档能否解决问题": "能" if relevance_result["is_relevant"] else "不能",
|
||||
"文档相关性解释": relevance_result["explanation"]
|
||||
}
|
||||
except Exception as e:
|
||||
logging.error(f"处理问题 '{query}' 时出错: ",exc_info=True)
|
||||
|
||||
@@ -1,201 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
File: merge_nouns_with_llm.py
|
||||
Description: 合并多个nouns.json中的同名专业名词,利用LLM生成唯一合并结果
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from collections import defaultdict
|
||||
from dotenv import load_dotenv
|
||||
from rag2_0.tool.ModelTool import OpenAiLLM
|
||||
from rag2_0.intent_recognition.DataModels import Term
|
||||
import logging
|
||||
from langchain.output_parsers import PydanticOutputParser
|
||||
from tqdm import tqdm
|
||||
import time
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
class TermMerger:
    """Merge same-named terms collected from several ``*_nouns.json`` files.

    Terms sharing a name are merged into one entry by an LLM; when the merge
    fails, the first entry of the group is kept.
    """

    def __init__(self, input_dir=None, output_path=None, max_workers=3):
        """Store the configuration and build the LLM client.

        Args:
            input_dir: Directory containing the ``*_nouns.json`` files.
            output_path: Path of the merged output file.
            max_workers: Maximum number of worker threads used for merging.
        """
        self.EXTRACTED_NOUNS_DIR = input_dir
        self.OUTPUT_PATH = output_path
        self.MAX_WORKERS = max_workers
        self.terms_parser = PydanticOutputParser(pydantic_object=Term)
        self.MERGE_PROMPT = '''
请将以下多个描述相同名词"{name}"的条目合并为一个,合并时请:
- 同义词(synonymous)去重合并
- 描述(description)合并为更完整、简明的描述
- 保持输出格式为:
{output_format}
原始条目:
{items}
'''
        # Configure the LLM from environment variables; api_key / base_url
        # are only passed through when they are set.
        model_name = os.getenv("MODEL_NAME", "gpt-3.5-turbo")
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENAI_API_BASE")
        llm_params = {"temperature": 0.3, "model": model_name}
        if api_key:
            llm_params["api_key"] = api_key
        if base_url:
            llm_params["base_url"] = base_url
        self.llm = OpenAiLLM(**llm_params)

        # Configure logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def load_all_terms(self):
        """Read every ``*_nouns.json`` file in the input directory.

        Each entry is reduced to ``name`` (upper-cased), ``synonymous`` and
        ``description``. A file that cannot be opened or parsed, or that
        contains an entry missing one of those keys, is skipped as a whole
        with a warning.

        Returns:
            A list of term dicts from all readable files.
        """
        all_terms = []
        # Sorted so that "first entry wins" fallbacks do not depend on the
        # arbitrary order in which glob lists files.
        for file in sorted(glob.glob(os.path.join(self.EXTRACTED_NOUNS_DIR, '*_nouns.json'))):
            try:
                # open() is inside the try so one unreadable file does not
                # abort the whole run.
                with open(file, 'r', encoding='utf-8') as f:
                    file_terms = json.load(f)
                new_terms = [{"name": term["name"].upper(), "synonymous": term["synonymous"], "description": term["description"]} for term in file_terms]
                all_terms.extend(new_terms)
                logging.info(f"加载{file},共{len(new_terms)}条")
            except Exception as e:
                logging.warning(f"读取{file}失败: {e}")

        # NOTE: the original code carried a commented-out loader for a
        # suffix_keywords.json file here; it was never executed.
        return all_terms

    def group_terms_by_name(self, terms):
        """Group term dicts by their stripped ``name``; nameless terms are skipped."""
        name2terms = defaultdict(list)
        for term in terms:
            name = term.get('name', '').strip()
            if name:
                name2terms[name].append(term)
        return name2terms

    def merge_terms_with_llm(self, name, term_list):
        """Ask the LLM to merge the entries sharing ``name``.

        The call is attempted up to three times, sleeping ``10 * attempt``
        seconds after a failed attempt.

        Returns:
            A dict with ``name``, ``synonymous`` and ``description`` taken
            from the parsed LLM output, or ``None`` when every attempt fails.
        """
        items = json.dumps(term_list, ensure_ascii=False)
        prompt = self.MERGE_PROMPT.format(name=name, items=items, output_format=self.terms_parser.get_format_instructions())

        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                response = self.llm.invoke(prompt, False)
                parsed_output = self.terms_parser.parse(response.content)
                return {"name": parsed_output.name, "synonymous": parsed_output.synonymous, "description": parsed_output.description}
            except Exception as e:
                if attempt == max_retries:
                    logging.warning(f"解析LLM合并结果失败: {e}")
                    return None
                time.sleep(10 * attempt)
        return None

    def process_term(self, name_terms_tuple):
        """Resolve one ``(name, entries)`` group to a single entry.

        Used as the thread-pool task. Returns the LLM merge result, or the
        first entry of the group when the merge fails or raises.
        """
        name, term_list = name_terms_tuple
        try:
            merged = self.merge_terms_with_llm(name, term_list)
            return merged if merged else term_list[0]
        except Exception as e:
            logging.error(f"处理词条 {name} 时出错: {e}", exc_info=True)
            return term_list[0]

    def merge(self):
        """Load, group, merge, deduplicate and save all terms.

        Returns:
            The final list of term dicts that was written to ``OUTPUT_PATH``.
        """
        # 1. Load all terms
        all_terms = self.load_all_terms()
        logging.info(f"共加载{len(all_terms)}条术语")

        # 2. Group by name
        name2terms = self.group_terms_by_name(all_terms)
        logging.info(f"共{len(name2terms)}个唯一名词")

        # 3. Split groups: singletons need no merge, the rest go to the pool
        merged_terms = []
        items_to_process = []
        for name, term_list in name2terms.items():
            if len(term_list) == 1:
                merged_terms.append(term_list[0])
            else:
                items_to_process.append((name, term_list))

        logging.info(f"共{len(merged_terms)}个单一条目,{len(items_to_process)}个需要合并的条目")

        if items_to_process:
            with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
                # tqdm displays progress over the mapped results
                for result in tqdm(executor.map(self.process_term, items_to_process), total=len(items_to_process)):
                    merged_terms.append(result)

        # 4. Deduplicate names and cross-referencing synonyms
        merged_terms = self.deduplicate_synonymous_name(merged_terms)

        # 5. Save the result. os.makedirs('') raises, so the directory is
        # only created when the output path actually has one.
        output_dir = os.path.dirname(self.OUTPUT_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(merged_terms, f, ensure_ascii=False, indent=2)
        logging.info(f"合并后结果已保存到: {self.OUTPUT_PATH}")

        return merged_terms

    def deduplicate_synonymous_name(self, terms):
        """Drop duplicate names and synonyms that are themselves term names.

        Step 1 keeps only the first entry for each ``name``. Step 2 removes,
        from every kept entry's ``synonymous`` list, any value that equals the
        ``name`` of a kept entry (including its own). Entries are modified in
        place.

        Returns:
            The list of kept entries, in their original order.
        """
        # 1. Keep the first entry per name
        unique_names = set()
        unique_data = []
        for item in terms:
            if item["name"] not in unique_names:
                unique_names.add(item["name"])
                unique_data.append(item)

        # 2. Filter synonyms that collide with a term name
        for item in unique_data:
            synonyms = item.get("synonymous")
            # Only list/tuple values are filtered. Iterating a plain string
            # would split it into single characters, so other types are left
            # untouched. NOTE(review): the Term model is not visible here;
            # confirm that `synonymous` is expected to be a list of strings.
            if isinstance(synonyms, (list, tuple)):
                item["synonymous"] = [syn for syn in synonyms if syn not in unique_names]

        return unique_data
|
||||
|
||||
def main():
    """Build a TermMerger for the project's data directories and run the merge.

    Input and output locations are resolved relative to this file's
    directory; the merger runs with 20 worker threads.
    """
    script_dir = os.path.dirname(__file__)
    nouns_dir = os.path.abspath(os.path.join(script_dir, '../../data/wiki_extracted_nouns'))
    merged_file = os.path.join(script_dir, "..", "..", "data", "nouns", 'merged_nouns.json')
    TermMerger(input_dir=nouns_dir, output_path=merged_file, max_workers=20).merge()
|
||||
|
||||
if __name__ == "__main__":
    # Raise the log threshold of the httpx and openai loggers to WARNING
    # before running the script entry point.
    for noisy_logger in ('httpx', 'openai'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    main()
|
||||
Reference in New Issue
Block a user