优化对话转工单处理逻辑,调整LLM参数,增强用户问题和解决方案的提取功能,添加槽位填充支持,提升代码结构和可读性。
This commit is contained in:
@@ -15,6 +15,7 @@ import json
|
||||
import concurrent.futures
|
||||
from tqdm import tqdm
|
||||
import time
|
||||
import sys
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
@@ -43,7 +44,7 @@ def load_questions_from_excel(file_path=None):
|
||||
|
||||
def process_query(recognizer, query):
|
||||
"""
|
||||
处理单个查询,支持重试机制
|
||||
处理单个查询,支持重试机制,并包含槽位填充
|
||||
|
||||
Args:
|
||||
recognizer: 意图识别器实例
|
||||
@@ -57,32 +58,48 @@ def process_query(recognizer, query):
|
||||
|
||||
while retry_count <= max_retries:
|
||||
try:
|
||||
# 如果是重试,添加重试信息到日志
|
||||
classification, keywords, rewrite, query_keys = recognizer.process_query(query)
|
||||
# 使用新的process_query_with_slots方法处理查询
|
||||
result = recognizer.process_query_with_slots(query)
|
||||
|
||||
# 将keywords对象转换为字符串
|
||||
# 提取分类信息
|
||||
classification = result["classification"]
|
||||
|
||||
# 提取关键词信息
|
||||
keywords = result["keywords"]
|
||||
keywords_str = ""
|
||||
if keywords and keywords.terms:
|
||||
if keywords and keywords.get("terms"):
|
||||
term_details = []
|
||||
for term in keywords.terms:
|
||||
for term in keywords["terms"]:
|
||||
term_info = {
|
||||
"名称": term.name,
|
||||
"同义词": ";".join(term.synonymous) if term.synonymous else "",
|
||||
"描述": term.description
|
||||
"名称": term["name"],
|
||||
"同义词": ";".join(term["synonymous"]) if term["synonymous"] else "",
|
||||
"描述": term["description"]
|
||||
}
|
||||
term_details.append(term_info)
|
||||
|
||||
# 将term_details转换为JSON字符串,确保中文正确显示
|
||||
keywords_str = json.dumps(term_details, ensure_ascii=False, indent=2)
|
||||
|
||||
# 提取槽位填充信息
|
||||
slot_filling = result.get("slot_filling", {})
|
||||
slot_filling_str = ""
|
||||
if slot_filling and "filled_data" in slot_filling:
|
||||
# 格式化槽位填充结果
|
||||
slot_filling_str = json.dumps({
|
||||
"是否完整": slot_filling.get("is_complete", False),
|
||||
"缺失槽位": slot_filling.get("missing_slots", {}),
|
||||
"填充数据": slot_filling.get("filled_data", {})
|
||||
}, ensure_ascii=False, indent=2)
|
||||
|
||||
# 处理成功,返回结果
|
||||
return {
|
||||
"提问": query,
|
||||
"问题拆解": query_keys,
|
||||
"一级分类": classification.vertical_classification,
|
||||
"二级分类": classification.sub_classification,
|
||||
"问题改写": rewrite.rewrite,
|
||||
"检索的关键词": keywords_str
|
||||
"问题拆解": result["query_keys"],
|
||||
"一级分类": classification["vertical_classification"],
|
||||
"二级分类": classification["sub_classification"],
|
||||
"问题改写": result["rewrite"]["rewrite"],
|
||||
"检索的关键词": keywords_str,
|
||||
"槽位填充": slot_filling_str
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -96,13 +113,15 @@ def process_query(recognizer, query):
|
||||
"一级分类": "处理出错",
|
||||
"二级分类": "处理出错",
|
||||
"问题改写": "处理出错",
|
||||
"检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}"
|
||||
"检索的关键词": f"重试 {max_retries} 次后失败: {str(e)}",
|
||||
"槽位填充": "处理出错"
|
||||
}
|
||||
else:
|
||||
# 可以在这里添加延迟,避免过快重试
|
||||
time.sleep(10 * retry_count)
|
||||
|
||||
examples_query = """下载软件在哪下载?"""
|
||||
# 示例查询
|
||||
examples_query = """这个安全文明费费率在哪里调"""
|
||||
|
||||
def main():
|
||||
"""
|
||||
@@ -119,56 +138,67 @@ def main():
|
||||
|
||||
# 读取提问数据
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
data_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据.xlsx")
|
||||
examples = load_questions_from_excel(data_file)
|
||||
# examples = examples_query.split("\n")
|
||||
max_workers = 20
|
||||
logging.info(f"共有 {len(examples)} 个问题需要处理,使用 {max_workers} 个并发线程")
|
||||
data_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据.xlsx")
|
||||
|
||||
# 创建一个与输入顺序相同的结果列表
|
||||
results = [None] * len(examples)
|
||||
# 检测是否为调试模式,调试模式下使用examples_query,否则从Excel读取
|
||||
|
||||
is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
|
||||
if is_debug:
|
||||
examples = examples_query.strip().split("\n")
|
||||
else:
|
||||
examples = load_questions_from_excel(data_file)
|
||||
|
||||
# 使用线程池进行并发处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# 提交所有任务并记录它们的索引
|
||||
future_to_index = {}
|
||||
if not is_debug:
|
||||
|
||||
max_workers = 5 # 减少并发数以避免API限制
|
||||
logging.info(f"共有 {len(examples)} 个问题需要处理,使用 {max_workers} 个并发线程")
|
||||
# 创建一个与输入顺序相同的结果列表
|
||||
results = [None] * len(examples)
|
||||
# 使用线程池进行并发处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# 提交所有任务并记录它们的索引
|
||||
future_to_index = {}
|
||||
for idx, query in enumerate(examples):
|
||||
future = executor.submit(process_query, recognizer, query)
|
||||
future_to_index[future] = idx
|
||||
|
||||
# 使用tqdm显示进度条
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
|
||||
idx = future_to_index[future]
|
||||
result = future.result()
|
||||
# 将结果放在与输入相同的位置
|
||||
results[idx] = result
|
||||
|
||||
# 将结果保存到Excel文件
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx")
|
||||
|
||||
# 使用ExcelWriter设置格式
|
||||
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
|
||||
results_df.to_excel(writer, index=False, sheet_name='Sheet1')
|
||||
|
||||
# 获取工作簿和工作表对象
|
||||
workbook = writer.book
|
||||
worksheet = writer.sheets['Sheet1']
|
||||
|
||||
# Set column widths (xlsxwriter's set_column takes character-width units, not pixels)
|
||||
# 定义列宽(厘米转为Excel单位,1cm约等于4.7个Excel单位)
|
||||
worksheet.set_column('A:A', 60) # 提问列 60个Excel单位
|
||||
worksheet.set_column('B:B', 20) # 问题拆解 20个Excel单位
|
||||
worksheet.set_column('C:C', 20) # 一级分类 20个Excel单位
|
||||
worksheet.set_column('D:D', 20) # 二级分类 20个Excel单位
|
||||
worksheet.set_column('E:E', 60) # 问题改写 60个Excel单位
|
||||
worksheet.set_column('F:F', 60) # 检索到的关键词 60个Excel单位
|
||||
worksheet.set_column('G:G', 80) # 槽位填充 80个Excel单位
|
||||
|
||||
# 设置所有行高为20磅
|
||||
for i in range(len(results_df) + 1): # +1 是为了包括表头
|
||||
worksheet.set_row(i, 20)
|
||||
else:
|
||||
for idx, query in enumerate(examples):
|
||||
future = executor.submit(process_query, recognizer, query)
|
||||
future_to_index[future] = idx
|
||||
process_query(recognizer, query)
|
||||
|
||||
# 使用tqdm显示进度条
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
|
||||
idx = future_to_index[future]
|
||||
result = future.result()
|
||||
# 将结果放在与输入相同的位置
|
||||
results[idx] = result
|
||||
|
||||
# 将结果保存到Excel文件
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
output_file = os.path.join(current_dir, "..", "..", "data", "excel", "200条提问数据_重写结果.xlsx")
|
||||
|
||||
# 使用ExcelWriter设置格式
|
||||
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
|
||||
results_df.to_excel(writer, index=False, sheet_name='Sheet1')
|
||||
|
||||
# 获取工作簿和工作表对象
|
||||
workbook = writer.book
|
||||
worksheet = writer.sheets['Sheet1']
|
||||
|
||||
# Set column widths (xlsxwriter's set_column takes character-width units, not pixels)
|
||||
# 定义列宽(厘米转为Excel单位,1cm约等于4.7个Excel单位)
|
||||
worksheet.set_column('A:A', 60) # 提问列 60个Excel单位
|
||||
worksheet.set_column('B:B', 20) # 问题拆解 20个Excel单位
|
||||
worksheet.set_column('C:C', 20) # 一级分类 20个Excel单位
|
||||
worksheet.set_column('D:D', 20) # 二级分类 20个Excel单位
|
||||
worksheet.set_column('E:E', 60) # 问题改写 60个Excel单位
|
||||
worksheet.set_column('F:F', 60) # 检索到的关键词 60个Excel单位
|
||||
|
||||
# 设置所有行高为20磅
|
||||
for i in range(len(results_df) + 1): # +1 是为了包括表头
|
||||
worksheet.set_row(i, 20)
|
||||
|
||||
logging.info(f"处理完成,结果已保存至: {output_file}")
|
||||
|
||||
def setup_logging():
|
||||
|
||||
Reference in New Issue
Block a user