新增对话处理功能,优化意图识别逻辑,添加结果保存至Excel的功能,更新依赖项以支持新的数据库驱动和ORM,重构代码以提高可读性和维护性,删除冗余文件以简化项目结构。
This commit is contained in:
@@ -118,10 +118,65 @@ def process_query(recognizer, query):
|
||||
}
|
||||
else:
|
||||
# 可以在这里添加延迟,避免过快重试
|
||||
time.sleep(10 * retry_count)
|
||||
time.sleep(10)
|
||||
|
||||
def save_results_to_excel(results, output_file, is_final=False):
    """Write the non-None entries of ``results`` to a formatted Excel sheet.

    Args:
        results: List of result records. ``None`` entries (placeholders for
            results that are not available) are dropped before writing.
        output_file: Path of the final output workbook.
        is_final: When True, write to ``output_file`` itself. When False,
            write to a sibling file with ``_temp`` inserted before the
            extension, so an intermediate snapshot never overwrites the
            final file.

    Returns:
        None. Nothing is written when no non-None result is present.
    """
    # Drop placeholders so partially filled result lists can be saved.
    valid_results = [r for r in results if r is not None]
    if not valid_results:
        logging.warning("没有有效结果可保存")
        return

    results_df = pd.DataFrame(valid_results)

    # Intermediate snapshots go to "<name>_temp<ext>"; the final save uses
    # the requested path unchanged.
    if is_final:
        target_file = output_file
    else:
        file_name, file_ext = os.path.splitext(output_file)
        target_file = f"{file_name}_temp{file_ext}"

    # Column widths are in XlsxWriter's column-width units (approximately
    # characters of the default font), not pixels.
    # NOTE(review): the per-column meanings below assume the records keep
    # this key order -- confirm against the producer of `results`.
    column_widths = (
        ('A:A', 60),  # question
        ('B:B', 20),  # question breakdown
        ('C:C', 20),  # first-level category
        ('D:D', 20),  # second-level category
        ('E:E', 60),  # rewritten question
        ('F:F', 60),  # retrieved keywords
        ('G:G', 80),  # slot filling
    )

    with pd.ExcelWriter(target_file, engine='xlsxwriter') as writer:
        results_df.to_excel(writer, index=False, sheet_name='Sheet1')
        worksheet = writer.sheets['Sheet1']

        for col_range, width in column_widths:
            worksheet.set_column(col_range, width)

        # Same height for every row; +1 accounts for the header row.
        for row in range(len(results_df) + 1):
            worksheet.set_row(row, 20)

    logging.info(f"已保存{len(valid_results)}条结果至: {target_file}")
|
||||
|
||||
# 示例查询
|
||||
examples_query = """储能软件组合件界面,点击隐藏空项目划分后界面没有任何变化"""
|
||||
examples_query = """"锁标签号:811621005858, 注册单位:惠州电力勘察设计院有限公司,软件名称:广东迁改导则2022, 注册号:BW278-83834-58155-58339.迁改导则是要另外下载安装软件吗?"
|
||||
|
||||
"""
|
||||
|
||||
def main():
|
||||
"""
|
||||
@@ -138,10 +193,10 @@ def main():
|
||||
|
||||
# 读取提问数据
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
data_file = os.path.join(current_dir, "..", "..", "data", "excel", "400条提问意图分类数据-原始.xlsx")
|
||||
data_file = os.path.join(current_dir, "..", "..", "data", "excel", "历史提问数据(dislike)_提问明确.xlsx")
|
||||
output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx")
|
||||
|
||||
# 检测是否为调试模式,调试模式下使用examples_query,否则从Excel读取
|
||||
|
||||
is_debug = hasattr(sys, 'gettrace') and sys.gettrace() is not None
|
||||
if is_debug:
|
||||
examples = examples_query.strip().split("\n")
|
||||
@@ -149,11 +204,13 @@ def main():
|
||||
examples = load_questions_from_excel(data_file)
|
||||
|
||||
if not is_debug:
|
||||
|
||||
max_workers = 10 # 减少并发数以避免API限制
|
||||
max_workers = 20 # 减少并发数以避免API限制
|
||||
logging.info(f"共有 {len(examples)} 个问题需要处理,使用 {max_workers} 个并发线程")
|
||||
|
||||
# 创建一个与输入顺序相同的结果列表
|
||||
results = [None] * len(examples)
|
||||
batch_size = 100 # 每100条保存一次
|
||||
|
||||
# 使用线程池进行并发处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# 提交所有任务并记录它们的索引
|
||||
@@ -163,43 +220,27 @@ def main():
|
||||
future_to_index[future] = idx
|
||||
|
||||
# 使用tqdm显示进度条
|
||||
completed = 0
|
||||
for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(examples), desc="处理进度"):
|
||||
idx = future_to_index[future]
|
||||
result = future.result()
|
||||
# 将结果放在与输入相同的位置
|
||||
results[idx] = result
|
||||
|
||||
completed += 1
|
||||
# 每处理batch_size条数据保存一次
|
||||
if completed % batch_size == 0:
|
||||
logging.info(f"已完成 {completed}/{len(examples)} 条,保存中间结果...")
|
||||
save_results_to_excel(results, output_file, is_final=False)
|
||||
|
||||
# 将结果保存到Excel文件
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
output_file = os.path.join(current_dir, "..", "..", "data", "excel", "测试提问数据_槽位填充结果.xlsx")
|
||||
|
||||
# 使用ExcelWriter设置格式
|
||||
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
|
||||
results_df.to_excel(writer, index=False, sheet_name='Sheet1')
|
||||
|
||||
# 获取工作簿和工作表对象
|
||||
workbook = writer.book
|
||||
worksheet = writer.sheets['Sheet1']
|
||||
|
||||
# 设置列宽(单位:像素)
|
||||
# 定义列宽(厘米转为Excel单位,1cm约等于4.7个Excel单位)
|
||||
worksheet.set_column('A:A', 60) # 提问列 60个Excel单位
|
||||
worksheet.set_column('B:B', 20) # 问题拆解 20个Excel单位
|
||||
worksheet.set_column('C:C', 20) # 一级分类 20个Excel单位
|
||||
worksheet.set_column('D:D', 20) # 二级分类 20个Excel单位
|
||||
worksheet.set_column('E:E', 60) # 问题改写 60个Excel单位
|
||||
worksheet.set_column('F:F', 60) # 检索到的关键词 60个Excel单位
|
||||
worksheet.set_column('G:G', 80) # 槽位填充 80个Excel单位
|
||||
|
||||
# 设置所有行高为20磅
|
||||
for i in range(len(results_df) + 1): # +1 是为了包括表头
|
||||
worksheet.set_row(i, 20)
|
||||
# 处理完所有数据后,保存最终结果
|
||||
save_results_to_excel(results, output_file, is_final=True)
|
||||
logging.info(f"所有处理完成,最终结果已保存至: {output_file}")
|
||||
else:
|
||||
for idx, query in enumerate(examples):
|
||||
if query.strip() == "":
|
||||
continue
|
||||
process_query(recognizer, query)
|
||||
|
||||
logging.info(f"处理完成,结果已保存至: {output_file}")
|
||||
|
||||
def setup_logging():
|
||||
# 配置日志输出到控制台
|
||||
|
||||
Reference in New Issue
Block a user