3.31 上传 dm rewrite

This commit is contained in:
Zdao032
2025-03-31 15:17:47 +08:00
commit b444310280
430 changed files with 39039 additions and 0 deletions
+947
View File
@@ -0,0 +1,947 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d53385f4-0763-4d6a-a3de-4269a044115d",
"metadata": {},
"source": [
"# 1. 数据读取"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "88be1d08-21a6-4ad5-bdce-d77821cd790c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def read_specified_rows(file_path, row_index):\n",
"    \"\"\"Return the first CSV column, from ``row_index`` onward, as a list of strings.\n",
"\n",
"    Args:\n",
"        file_path: Path to a CSV file; the ``.csv`` suffix is matched case-insensitively.\n",
"        row_index: Zero-based position of the first row to keep. The file is read\n",
"            without a header row, so position 0 is the first line of the file.\n",
"\n",
"    Returns:\n",
"        list[str]: Column-0 values of every row from ``row_index`` to the end.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``file_path`` does not end in ``.csv``.\n",
"    \"\"\"\n",
"    if not file_path.lower().endswith('.csv'):\n",
"        raise ValueError(\"仅支持 CSV 文件\")\n",
"\n",
"    # 'utf-8-sig' still reads plain UTF-8, but also strips a leading BOM so it\n",
"    # cannot end up glued to the first cell of the file.\n",
"    df = pd.read_csv(file_path, header=None, encoding='utf-8-sig')\n",
"\n",
"    # Keep only the first column and coerce every value to str.\n",
"    return df.iloc[row_index:, 0].astype(str).tolist()\n",
"\n",
"# Example usage\n",
"file_path = \"D:/博微知识助手400问分类_2.13.16.33.csv\" # must be a .csv file: read_specified_rows raises ValueError for any other extension\n",
"row_index = 2 # zero-based position of the first row to keep; every row from here to the end is read\n",
"\n",
"result_list = read_specified_rows(file_path, row_index)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6f208b20-778d-4923-a463-4283bed21160",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('多个工程需要统一修改定额中材料的单价是否可以呢', '可以导入多个投标报价新建全口径预算工程吗', 444)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_list[0], result_list[-1], len(result_list)"
]
},
{
"cell_type": "markdown",
"id": "dd23a946-ae58-4ff7-b611-a37c61ca1504",
"metadata": {},
"source": [
"# 2.prefix_re"
]
},
{
"cell_type": "markdown",
"id": "1ae7e16e-2870-450f-a0b2-8ba2925e1cdd",
"metadata": {},
"source": [
"## 2.1 检测后缀名"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b7866697-0c75-4abb-aeb2-f467b3eca50f",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Field names to look for (duplicates removed; matching is case-insensitive).\n",
"_FIELD_NAMES = (\n",
"    \"xzwb\", \"bxqd2\", \"bpz17\", \"zwqd\", \"bwpw\", \"BJGX\", \"bt2\", \"BDQ3\",\n",
"    \"gec5\", \"BDY3\", \"dwg\", \"bwpwz\", \"BDD3\", \"bt1\", \"bphq18\", \"zwzj\",\n",
"    \"bczc2\", \"BPQ\", \"BPY\", \"SXZB23\", \"SXZ\", \"xzwb2\",\n",
")\n",
"\n",
"# Compiled once instead of on every call. No \\b anchors, so a field may appear\n",
"# inside a longer token; an optional leading '.' needs no special handling because\n",
"# an unanchored search already matches the name with or without it.\n",
"_FIELD_PATTERN = re.compile('|'.join(re.escape(name) for name in _FIELD_NAMES), re.IGNORECASE)\n",
"\n",
"\n",
"def detect_fields(input_str):\n",
"    \"\"\"Return True when ``input_str`` contains any known field name.\n",
"\n",
"    Matching ignores case and accepts the name anywhere in the string.\n",
"    \"\"\"\n",
"    return _FIELD_PATTERN.search(input_str) is not None\n",
"\n",
"# Positions in result_list of the questions that mention a known field name.\n",
"index1 = [pos for pos, question in enumerate(result_list) if detect_fields(question)]"
]
},
{
"cell_type": "markdown",
"id": "5854667e-f149-4eb3-b97b-5daa4056a19c",
"metadata": {},
"source": [
"## 2.2 检测关键词(含“锁”)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c3be0a91-2a50-4669-95b4-925d84bbad0a",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"软件\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index2 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c137ffce-aebe-41bc-afb6-fe31788b35c2",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"工程\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index3 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b6a68005-ce58-48e3-88d7-dd9564d7a81c",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"计价\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index4 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1c053ae1-dff0-4ff8-9039-c64d8246e402",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"配网\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index5 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "de72b59d-137e-4507-b91d-776f23f2cd1b",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"清单\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index6 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "80704f93-65b3-439e-b688-d8ea74217e9d",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"定额\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index7 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ff0213ff-eddb-4d25-90b6-d009edf94bfc",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"施工\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index8 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "5f08eedc-d642-45bb-82ec-3526f9616d68",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"技改\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index9 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "789bd94a-7d21-4382-8c01-0f0373a66455",
"metadata": {},
"outputs": [],
"source": [
"def word_query(input_str, target_word=\"计算\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Positions in result_list of the questions containing the default keyword.\n",
"index10 = [pos for pos, question in enumerate(result_list) if word_query(question)]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8cd839bc-a72f-4c35-bcf5-1f755bd16740",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"25 .bpz17对应的哪个锁\n",
"79 软件锁怎么自动注册不起了呢\n",
"80 问下锁激活怎么弄?\n",
"102 zwqd请问这是用什么锁做的\n",
"109 网络锁怎么登录\n",
"151 19-029758帮查下一下锁号\n",
"164 怎么激活锁保险\n",
"172 打开工程提示需要检修高级版锁\n",
"194 清单锁住了,怎么解锁\n",
"210 我想知道这个锁有问题吗,为什么激活不了呢\n",
"214 你好,显示我的锁号注册失败是什么原因\n",
"246 查询锁许可证号锁号19-079728,19-079718\n",
"249 我的清单锁不好用了,总是找不到可用许可证\n",
"251 配网设计锁不好使\n",
"254 2009年版的软件,插上了锁,怎么打不开\n",
"262 配网2017软件读不到锁,打不开软件\n",
"264 软件锁激活\n",
"297 插了电建的锁 识别不出来,打开软件首行灰色\n",
"303 软件是识别不到锁\n",
"310 主网造价国网清单的锁,接入结算后,费用和投标时候的投标价格不一样\n",
"319 想调定额量,怎么解锁\n",
"332 新的软件锁插入显示锁中未找到本软件可用的许可证,请问应该怎么解决呢\n",
"351 可以用网络锁登陆吗\n",
"360 清单怎么解锁\n",
"377 锁怎么激活呢\n",
"392 我有个新的锁,怎么注册?\n",
"419 请问清单如何解锁\n",
"425 我想咨询这我个工程后缀是zwzj 要用什么锁打开\n"
]
}
],
"source": [
"import re\n",
"\n",
"def word_query(input_str, target_word=\"锁\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str\n",
"\n",
"# Collect, and print for inspection, every question containing the default keyword.\n",
"index11 = []\n",
"for pos, question in enumerate(result_list):\n",
"    if word_query(question):\n",
"        print(pos, question)\n",
"        index11.append(pos)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "54e94090-3acb-4550-80bc-01c2f8191ec2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"306"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Merge the per-rule hit lists; duplicates are kept here and collapsed by set() below.\n",
"index = [\n",
"    *index1, *index2, *index3, *index4, *index5, *index6,\n",
"    *index7, *index8, *index9, *index10, *index11,\n",
"]\n",
"len(set(index))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "1b8b8559-ea36-454f-915d-da2122b8a620",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Build the lookup set once; the original rebuilt set(index) on every iteration.\n",
"matched_positions = set(index)\n",
"\n",
"# Questions that none of the keyword/field rules matched.\n",
"# NOTE(review): the +3 offset presumably turns the list position into the 1-based\n",
"# row number of the source CSV (reading starts at row_index = 2) - confirm.\n",
"other_index = []\n",
"other_content = []\n",
"for i, question in enumerate(result_list):\n",
"    if i not in matched_positions:\n",
"        other_index.append(i + 3)\n",
"        other_content.append(question)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "09634a84-0df0-4bf7-9926-3fccf30b588f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"138"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(other_index)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "1229f23d-17de-4f1e-a48c-6bd8f2b77eeb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10, '怎样新增取费表')"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"other_index[0], other_content[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7dfaabd3-7956-4842-882c-16f2566563b0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab78d90d-deaa-46e4-9629-212b1be5991c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b0ce7ae-b9dd-4713-b39a-6ddd820914e3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d0517bb-106f-410b-b34c-891597440219",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1166e47b-741f-44a3-a341-14440e015309",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"\n",
"# Model ids noted for this endpoint:\n",
"#   Qwen/Qwen2.5-72B-Instruct\n",
"#   deepseek-ai/DeepSeek-R1\n",
"#   deepseek-ai/DeepSeek-V3\n",
"#\n",
"# The API key is read from the SILICONFLOW_API_KEY environment variable so that it is\n",
"# never stored in the notebook or in version control; a KeyError here means the\n",
"# variable is not set. Any key that was previously committed should be revoked.\n",
"qwen_llm = ChatOpenAI(\n",
"    openai_api_base=\"https://api.siliconflow.cn/v1\",\n",
"    model_name=\"Qwen/Qwen2.5-72B-Instruct\",\n",
"    openai_api_key=os.environ[\"SILICONFLOW_API_KEY\"],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "03871a43-8859-47bb-bc67-6b137b3a7205",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.prompts import ChatPromptTemplate\n",
"from langchain_core.output_parsers import StrOutputParser"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "85179eaa-303e-44a1-aa3e-f7d84ac8be08",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████| 144/144 [03:48<00:00, 1.59s/it]\n"
]
}
],
"source": [
"PromptTemplate2 = \"\"\"\n",
"你是博微公司的电力造价员专家,需要将后续用户输入的对于多款软件产品使用和业务方面的咨询问题转化。\n",
"请站在电力造价领域将用户的问题转为书面化咨询语句,不要假设上下文,更不要尝试回答问题\n",
"\n",
"# 用户输入\n",
"{query}\n",
"\n",
"# 注意,不要扩展礼貌用词等等\n",
"\"\"\"\n",
"\n",
"Prompt2 = ChatPromptTemplate.from_template(PromptTemplate2)\n",
"\n",
"# Pipeline: prompt template -> chat model -> plain string.\n",
"Chain2 = Prompt2 | qwen_llm | StrOutputParser()\n",
"\n",
"\n",
"from tqdm import tqdm\n",
"\n",
"# One model call per question, starting at list position 300. Results are appended\n",
"# as they arrive, so an interrupted run keeps what was already produced.\n",
"id_info2 = []\n",
"for question in tqdm(result_list[300:]):\n",
"    rewritten = Chain2.invoke({\"query\": question})\n",
"    id_info2.append(rewritten)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "6247506d-f131-4b7d-b04b-79e24f078989",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"144"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(id_info2)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "eccc683c-98b0-4132-8c85-5beeb871a640",
"metadata": {},
"outputs": [],
"source": [
"def save_list_to_txt(filename, data_list):\n",
"    \"\"\"Write the strings in ``data_list`` to ``filename`` as UTF-8, separated by newlines.\n",
"\n",
"    The file is overwritten, and no newline is added after the last element.\n",
"    \"\"\"\n",
"    with open(filename, mode=\"w\", encoding=\"utf-8\") as out_file:\n",
"        joined = \"\\n\".join(data_list)\n",
"        out_file.write(joined)\n",
"\n",
"save_list_to_txt(\"ceshi100.txt\", id_info2)"
]
},
{
"cell_type": "markdown",
"id": "4be940c3-e2ee-4efb-87e3-328cfaf59602",
"metadata": {},
"source": [
"# prompt"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "33c374c9-0ac7-460d-90ce-2a617a20e081",
"metadata": {},
"outputs": [],
"source": [
"PromptTemplate1 = \"\"\"\n",
"你是博微公司的电力造价员专家,需要将后续用户输入的对博微公司多款软件产品使用和业务方面的咨询问题进行意图分类。\n",
"并且站在电力造价领域角度上,将用户的问题理解意图后采用以下指定槽位结构填充将用户问题转化为JSON格式输出。\n",
"如果问题中没有给出对应槽位的值则为未知,而不要假设上下文,更不要尝试回答问题。\n",
"\n",
"# 用户输入\n",
"{query}\n",
"\n",
"# 一级意图\n",
"[操作指南, 规范解读, 费用构成, 其他]\n",
"\n",
"# 二级意图\n",
"## 操作指南\n",
"下载安装注册(系统环境要求、安装包的下载和安装步骤、激活码获取与绑定,离线激活流程、版本冲突、操作系统适配性等)\n",
"软件使用操作(如 新建/打开工程, 数据相关操作, 报表生成与导出、版本兼容性处理等;以及计价通软件、造价软件相关操作咨询)\n",
"数据管理(如 数据备份与恢复,版本兼容性处理,多人协作权限设置)\n",
"\n",
"## 规范解读\n",
"国家规范(如 《电力建设工程概预算编制规定》, 《电网工程建设预算编制与计算标准》,预估相关)\n",
"行业标准(如 变电工程定额应用, 线路工程取费规则,主网和配网和技改检修的定额)\n",
"地方政策(如 地区人工费调整系数, 特殊材料价差处理)\n",
"行业知识查询(行业知识的解读查询)\n",
"\n",
"## 费用构成\n",
"费用类别解析(如 建筑工程费, 安装工程费, 设备购置费)\n",
"费用计算逻辑(如 直接费(人工、材料、机械), 间接费(企业管理费、规费), 利润与税金)\n",
"特殊场景费用(如 临时设施费, 冬季施工增加费)\n",
"\n",
"# 意图类别的槽位结构:\n",
"\n",
"## 操作指南 \n",
"一级意图: \n",
"二级意图:\n",
"software: 用户提到的具体软件产品名称。\n",
"functionality: 用户询问的具体功能或操作步骤。\n",
"specifics: 关于问题的具体描述或背景信息(如果有提及)。\n",
"version: 软件版本号(如果有提及)。\n",
"standard: 具体的电力造价规范或标准(如果有提及)。\n",
"context: 问题的上下文或应用场景,例如特定项目、合同条款等。\n",
"\n",
"## 规范解读\n",
"一级意图:\n",
"二级意图: \n",
"standard: 具体的电力造价规范或标准名称,例如《电力工程建设预算编制与计算规定》等等。\n",
"section: 规范中的具体章节或条款编号。\n",
"interpretation: 用户希望解读的具体内容或条款,例如某一条款的具体含义、适用范围等。\n",
"context: 问题的具体上下文或应用场景,例如某个项目、合同条款、特定工程阶段等。\n",
"software: 如果涉及软件操作,具体使用的博微公司软件产品名称(如果有提及)。\n",
"version: 软件版本号(如果有提及)。\n",
"specifics: 关于问题的具体描述或背景信息。\n",
"example: 是否需要具体的示例来帮助理解条款的应用。\n",
"\n",
"## 费用构成\n",
"一级意图: \n",
"二级意图:\n",
"software: 用户提到的具体软件产品名称(隐含为博微公司的某款电力造价软件)。\n",
"functionality: 用户询问的具体功能或操作步骤(如果适用)。\n",
"specifics: 关于问题的具体描述或背景信息。\n",
"version: 软件版本号(如果有提及)。\n",
"fee_type: 费用类型,如设计费、施工图预算编制费、竣工图文件编制费等。\n",
"cost_component: 具体的成本组成部分,例如直接成本、间接成本、管理费等。\n",
"inclusion: 是否包含特定费用项,例如是否包含某项费用或是否需要单独计列。\n",
"basis: 计算依据或标准,例如按工程造价的百分比、固定金额等。\n",
"context: 问题的上下文或应用场景,例如某个项目、合同条款、特定工程阶段等。\n",
"standard: 涉及的具体电力造价规范或标准,例如《电力工程建设预算编制与计算规定》等等。\n",
"\n",
"## 其他\n",
"一级意图: \n",
"二级意图:\n",
"qa: 问题咨询\n",
"ty: 闲聊\n",
"\n",
"# 注意:\n",
"1. 请按JSON格式返回,未知字段填'未知'\n",
"2. json的keys,一定含有'一级意图'、'二级意图',且无论用户输入上下文多少,输出json只有一个\n",
"\n",
"\"\"\"\n",
"\n",
"# Intent-classification chain: fills PromptTemplate1 with the user query, sends it to\n",
"# qwen_llm and returns the reply as a plain string.\n",
"Prompt1 = ChatPromptTemplate.from_template(PromptTemplate1)\n",
"\n",
"Chain1 = Prompt1 | qwen_llm | StrOutputParser()\n",
"\n",
"####################################################################\n",
"\n",
"PromptTemplate2 = \"\"\"\n",
"请在电力造价领域角度上,对用户的输入进行指定槽位的填充,并转换为JSON结构输出。\n",
"如果问题中没有给出对应槽位的值则为未知,而不要假设上下文,更不要尝试回答问题。\n",
"\n",
"# 用户输入\n",
"{query}\n",
"\n",
"# 槽位结构:\n",
"\n",
"## 操作指南 \n",
"一级意图: 操作指南\n",
"二级意图:下载安装注册\n",
"software: 用户提到的具体软件产品名称(如果适用)。\n",
"functionality: 用户询问的具体功能或操作步骤(如果适用)。\n",
"issueType: 问题的具体类型,如“咨询费用包含内容”、“操作方法不明”等。\n",
"specifics: 关于问题的具体描述或背景信息。\n",
"version: 软件版本号(如果适用)。\n",
"standard: 具体的电力造价规范或标准(如果有提及)。\n",
"context: 问题的上下文或应用场景,例如特定项目、合同条款等。\n",
"\n",
"# 注意:\n",
"1. 请按JSON格式返回,未知字段填'未知'\n",
"2. 一级意图: 操作指南 和 二级意图:下载安装注册 是固定不变的\n",
"3. 无论用户输入上下文多少,输出json只有一个\n",
"\n",
"\"\"\"\n",
"# Slot-filling chain built from PromptTemplate2, whose two intent fields are fixed by\n",
"# the prompt text. NOTE: this rebinds Prompt2 and Chain2, names that the earlier\n",
"# question-rewriting cell also assigns, so whichever cell ran last wins.\n",
"Prompt2 = ChatPromptTemplate.from_template(PromptTemplate2)\n",
"\n",
"Chain2 = Prompt2 | qwen_llm | StrOutputParser()\n",
"\n",
"\n",
"# query = \"多个工程要修改建筑的材料和机械价格\"\n",
"# result1 = Chain1.invoke({\"query\":query})\n",
"# print(result1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35f36068-b2a9-491a-8b35-292565aa0542",
"metadata": {},
"outputs": [],
"source": [
"Chain1.invoke(\"\")"
]
},
{
"cell_type": "markdown",
"id": "7c197c26-3a5c-43b4-9940-0cdbfcc69622",
"metadata": {},
"source": [
"## pipe"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d06537d9-6cd2-45bf-8fd5-a0620d95db86",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Field names to look for (duplicates removed; matching is case-insensitive).\n",
"_FIELD_NAMES = (\n",
"    \"xzwb\", \"bxqd2\", \"bpz17\", \"zwqd\", \"bwpw\", \"BJGX\", \"bt2\", \"BDQ3\",\n",
"    \"gec5\", \"BDY3\", \"dwg\", \"bwpwz\", \"BDD3\", \"bt1\", \"bphq18\", \"zwzj\",\n",
"    \"bczc2\", \"BPQ\", \"BPY\", \"SXZB23\", \"SXZ\", \"xzwb2\",\n",
")\n",
"\n",
"# Compiled once instead of on every call. No \\b anchors, so a field may appear\n",
"# inside a longer token; an optional leading '.' needs no special handling because\n",
"# an unanchored search already matches the name with or without it.\n",
"_FIELD_PATTERN = re.compile('|'.join(re.escape(name) for name in _FIELD_NAMES), re.IGNORECASE)\n",
"\n",
"\n",
"def detect_fields(input_str):\n",
"    \"\"\"Return True when ``input_str`` contains any known field name.\n",
"\n",
"    Matching ignores case and accepts the name anywhere in the string.\n",
"    \"\"\"\n",
"    return _FIELD_PATTERN.search(input_str) is not None\n",
"\n",
"def word_query(input_str, target_word=\"锁\"):\n",
"    \"\"\"Return True when ``target_word`` occurs anywhere in ``input_str``.\n",
"\n",
"    The keyword is treated as literal text (plain substring test), so regex\n",
"    metacharacters in ``target_word`` cannot change what is matched.\n",
"    \"\"\"\n",
"    return target_word in input_str"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "c836c925-e143-4c49-8366-da265fe70dd9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94/94 [14:30<00:00, 9.26s/it]\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"\n",
"id_info1 = []\n",
"for question in tqdm(result_list[350:]):\n",
"    # Questions that name a known field or contain \"锁\" go to the fixed-intent\n",
"    # slot-filling chain (Chain2); everything else goes to the classifier (Chain1).\n",
"    # The keyword is passed explicitly because word_query is redefined with a\n",
"    # different default in several cells, so its default depends on execution order.\n",
"    if detect_fields(question) or word_query(question, \"锁\"):\n",
"        chain = Chain2\n",
"    else:\n",
"        chain = Chain1\n",
"    id_info1.append(chain.invoke({\"query\": question}))"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "dfdbdb26-0c81-481a-8694-ca7339781ff5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(94, 94)"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"# Regex capturing the values of the \"一级意图\" and \"二级意图\" keys in a model reply.\n",
"pattern = r'\"(一级意图|二级意图)\"\\s*:\\s*\"([^\"]+)\"'\n",
"\n",
"list1 = []  # first-level intent, exactly one entry per reply\n",
"list2 = []  # second-level intent, exactly one entry per reply\n",
"\n",
"for reply in id_info1:\n",
"    # A reply can omit a key or repeat it. Keep the first value per key and fall back\n",
"    # to '未知' so both lists stay aligned, position by position, with id_info1.\n",
"    found = {}\n",
"    for key, value in re.findall(pattern, reply):\n",
"        found.setdefault(key, value)\n",
"    list1.append(found.get(\"一级意图\", \"未知\"))\n",
"    list2.append(found.get(\"二级意图\", \"未知\"))\n",
"\n",
"len(list1), len(list2)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "5ecc2d23-bb2c-4d40-bb8b-9e0e6f7f88e3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'我在取费标修改完 报表输出没有变化'"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_list[349]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "0054ca5b-0464-462e-be10-43a419d401af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"detect_fields(result_list[332]) "
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d8dae57a-3303-49d8-b7bb-396c83449bde",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"```json\n",
"{\n",
" \"操作指南\": \"操作指南\",\n",
" \"二级意图\": \"下载安装注册\",\n",
" \"software\": \"bphq18\",\n",
" \"functionality\": \"未知\",\n",
" \"issueType\": \"咨询费用包含内容\",\n",
" \"specifics\": \"后缀是什么软件\",\n",
" \"version\": \"未知\",\n",
" \"standard\": \"未知\",\n",
" \"context\": \"未知\"\n",
"}\n",
"```\n"
]
}
],
"source": [
"print(id_info1[5])"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "2a1767a1-0578-4645-ad2b-032ae22073fc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"文件已保存:list1.txt 和 list2.txt\n"
]
}
],
"source": [
"# 保存列表数据到 txt 文件\n",
"def save_list_to_txt(filename, data_list):\n",
"    \"\"\"Write the strings in ``data_list`` to ``filename`` as UTF-8, separated by newlines.\n",
"\n",
"    The file is overwritten, and no newline is added after the last element.\n",
"    \"\"\"\n",
"    with open(filename, mode=\"w\", encoding=\"utf-8\") as out_file:\n",
"        joined = \"\\n\".join(data_list)\n",
"        out_file.write(joined)\n",
"\n",
"# 保存到 txt 文件\n",
"save_list_to_txt(\"list1.txt\", list1)\n",
"save_list_to_txt(\"list2.txt\", list2)\n",
"\n",
"print(\"文件已保存:list1.txt 和 list2.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "d93f91c7-0116-4408-99bd-f8a06ccfc50d",
"metadata": {},
"outputs": [],
"source": [
"def save_list_to_txt(filename, data_list):\n",
"    \"\"\"Write the strings in ``data_list`` to ``filename`` as UTF-8, separated by newlines.\n",
"\n",
"    The file is overwritten, and no newline is added after the last element.\n",
"    \"\"\"\n",
"    with open(filename, mode=\"w\", encoding=\"utf-8\") as out_file:\n",
"        joined = \"\\n\".join(data_list)\n",
"        out_file.write(joined)\n",
"\n",
"save_list_to_txt(\"ceshi100.txt\", id_info1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e06817a-18d1-451d-99a0-a9be3df1a46e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dify_lab",
"language": "python",
"name": "dify_lab"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}