Files
DM_rewrite_3.31/booway_kg_api/lab.ipynb
T
2025-03-31 15:17:47 +08:00

296 lines
8.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e2b31c11-d818-483d-b99d-6b9c8477dacd",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Input CSV and output Markdown file names.\n",
"csv_filename = \"info_dify.csv\"\n",
"md_filename = \"output.md\"\n",
"\n",
"# Nested dict of path segments, filled in further down: {segment: {child_segment: {...}}}.\n",
"tree = {}\n",
"\n",
"# Load the table and trim leading/trailing '/' from every value in the 'path' column.\n",
"df = pd.read_csv(csv_filename)\n",
"df = df.assign(path=df['path'].str.strip('/'))\n",
"\n",
"def insert_path(tree, levels):\n",
"    \"\"\"Walk `levels` down from `tree`, creating an empty dict for each missing key.\n",
"\n",
"    Existing keys are reused, so paths sharing a prefix share the same branch.\n",
"    The tree is modified in place and nothing is returned.\n",
"    \"\"\"\n",
"    node = tree\n",
"    for key in levels:\n",
"        node = node.setdefault(key, {})\n",
"\n",
"# Insert every path into the tree. Missing values are dropped (NaN has no\n",
"# .split) and empty segments, such as those produced by 'a//b' or an empty\n",
"# path, are skipped so they do not become blank headings.\n",
"for path in df['path'].dropna():\n",
"    levels = [segment for segment in str(path).split('/') if segment]\n",
"    insert_path(tree, levels)\n",
"\n",
"def generate_md(tree, level=1):\n",
"    \"\"\"Return Markdown heading lines for `tree`, visiting keys in sorted order.\n",
"\n",
"    Each key becomes a heading with `level` '#' characters; its children follow\n",
"    immediately, one level deeper. An empty tree gives an empty list.\n",
"    \"\"\"\n",
"    heading_marks = '#' * level\n",
"    lines = []\n",
"    for name in sorted(tree):\n",
"        lines.append(f\"{heading_marks} {name}\")\n",
"        lines += generate_md(tree[name], level + 1)\n",
"    return lines\n",
"\n",
"# Render the tree as heading lines and join them with newlines.\n",
"md_content = generate_md(tree)\n",
"md_document = \"\\n\".join(md_content)\n",
"\n",
"# Write the document as UTF-8, replacing any existing file.\n",
"with open(md_filename, mode='w', encoding='utf-8') as md_file:\n",
"    md_file.write(md_document)\n",
"\n",
"print(f\"Markdown 文件已保存为 {md_filename}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "796cc4fe-4912-4cdc-9b80-f217f46b5487",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"\n",
"# Source CSV and the destination for the cleaned copy.\n",
"input_file = 'info_data.csv'\n",
"output_file = 'info_data_cleaned.csv'\n",
"\n",
"df = pd.read_csv(filepath_or_buffer=input_file, encoding='utf-8')\n",
"\n",
"# Fail early when the column this cell cleans is absent.\n",
"title_missing = 'title' not in df.columns\n",
"if title_missing:\n",
"    raise ValueError(\"CSV 文件中没有找到 'title' 列,请检查文件内容\")\n",
"\n",
"# Matches an opening bracket, the shortest run of text after it, and the next\n",
"# closing bracket. Both ASCII brackets and full-width ones (U+FF08 / U+FF09) are\n",
"# accepted; the full-width ones are written as \\uXXXX escapes, which `re` resolves.\n",
"# The opening class holds only opening brackets and the closing class only closing\n",
"# ones, so text such as ')x(' is no longer removed as if it were bracketed.\n",
"pattern = re.compile(r'[(\\uFF08].*?[)\\uFF09]')\n",
"\n",
"def clean_title(title):\n",
"    \"\"\"Return `title` with bracketed segments removed and outer whitespace stripped.\n",
"\n",
"    Missing values (None / NaN) are returned unchanged instead of being turned\n",
"    into the literal string 'nan'. Any other value is converted with str() first.\n",
"    \"\"\"\n",
"    if pd.isna(title):\n",
"        return title\n",
"    return pattern.sub('', str(title)).strip()\n",
"\n",
"# Run the cleaner over each title and put the result back in the same column.\n",
"cleaned_titles = df['title'].map(clean_title)\n",
"df['title'] = cleaned_titles\n",
"\n",
"# Write the cleaned table without the index column.\n",
"df.to_csv(output_file, index=False, encoding='utf-8')\n",
"\n",
"print(f\"处理完成!已保存到:{output_file}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf63d138-4d16-496a-ad3c-5d67008412f6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Input for this stage: the cleaned CSV written by the previous cell.\n",
"file_path = 'info_data_cleaned.csv'\n",
"df = pd.read_csv(file_path, encoding='utf-8')\n",
"\n",
"# The split below needs a 'path' column; stop with a clear error otherwise.\n",
"path_present = 'path' in df.columns\n",
"if not path_present:\n",
"    raise ValueError(\"CSV 文件中未找到 'path' 列,请检查文件内容。\")\n",
"\n",
"# One column per '/'-separated segment, labelled title1, title2, ... by position.\n",
"# NOTE(review): unlike the first cell, 'path' is not stripped of leading/trailing\n",
"# '/' here, so such paths would yield empty segments - confirm this is intended.\n",
"split_columns = df['path'].str.split('/', expand=True)\n",
"split_columns.columns = [f'title{position}' for position in range(1, split_columns.shape[1] + 1)]\n",
"\n",
"# Append the segment columns to the right of the original ones.\n",
"df = pd.concat([df, split_columns], axis=1)\n",
"\n",
"output_file = 'info_data_cleaned_split.csv'\n",
"df.to_csv(output_file, index=False, encoding='utf-8')\n",
"\n",
"print(f\"处理完成,结果已保存到 {output_file}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7520603c-90cf-4982-9273-2f130614ca96",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"\n",
"# File names for this stage: read the split CSV, write a second cleaned copy.\n",
"input_file = 'info_data_cleaned_split.csv'\n",
"output_file = 'info_data_cleaned_split2.csv'\n",
"\n",
"df = pd.read_csv(filepath_or_buffer=input_file, encoding='utf-8')\n",
"\n",
"# Matches an opening bracket, the shortest run of text after it, and the next\n",
"# closing bracket. Both ASCII brackets and full-width ones (U+FF08 / U+FF09) are\n",
"# accepted; the full-width ones are written as \\uXXXX escapes, which `re` resolves.\n",
"# The opening class holds only opening brackets and the closing class only closing\n",
"# ones, so text such as ')x(' is no longer removed as if it were bracketed.\n",
"pattern = re.compile(r'[(\\uFF08].*?[)\\uFF09]')\n",
"\n",
"def clean_text(text):\n",
"    \"\"\"Return `text` with bracketed segments removed and outer whitespace stripped.\n",
"\n",
"    Missing values (None / NaN) are returned unchanged instead of being turned\n",
"    into the literal string 'nan'. Any other value is converted with str() first.\n",
"    \"\"\"\n",
"    if pd.isna(text):\n",
"        return text\n",
"    return pattern.sub('', str(text)).strip()\n",
"\n",
"# Clean every column except the first one, one column at a time.\n",
"columns_to_clean = df.columns[1:]\n",
"for col in columns_to_clean:\n",
"    df[col] = df[col].map(clean_text)\n",
"\n",
"# Write the cleaned table without the index column.\n",
"df.to_csv(output_file, index=False, encoding='utf-8')\n",
"\n",
"print(f\"处理完成!已保存到:{output_file}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f03c96de-8bff-4b41-933a-9d9775bb7ad8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Load the CSV written by the previous cleaning cell.\n",
"file_path = 'info_data_cleaned_split2.csv'  # change to your own file path if needed\n",
"df = pd.read_csv(file_path, encoding='utf-8')\n",
"\n",
"# Name of the column that receives the previous row's content.\n",
"new_column = 'Previous_Row'\n",
"\n",
"# Snapshot the data as loaded, before the new column exists. Checking for\n",
"# missing values on the live frame would always succeed, because the new\n",
"# column starts out as None (which isna() reports as missing), and copying\n",
"# the live previous row would embed that row's own Previous_Row dict,\n",
"# nesting one level deeper on every row.\n",
"source_rows = df.astype(object)  # object dtype so a row is not upcast to a single numeric type\n",
"row_has_missing = source_rows.isna().any(axis=1)\n",
"\n",
"df[new_column] = None\n",
"\n",
"# For every row after the first that has a missing value in the loaded\n",
"# columns, store the previous row's loaded columns as a dict.\n",
"for i in range(1, len(df)):\n",
"    if row_has_missing.iloc[i]:\n",
"        df.at[i, new_column] = source_rows.iloc[i - 1].to_dict()\n",
"\n",
"print(df)\n",
"\n",
"# Save the result to a new CSV file.\n",
"df.to_csv('info_data_cleaned_split3.csv', index=False, encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5838e3fb-0546-4433-aa82-0ed367ad05e0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "5f91c155-92b7-4c82-863f-6c942e8df58a",
"metadata": {},
"source": [
"## 2.20"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c708560b-13c7-494e-b6fa-96c038ae36f5",
"metadata": {},
"outputs": [],
"source": [
"# json is not imported by any other cell in this notebook, so import it here;\n",
"# without it this cell raises NameError on a freshly started kernel.\n",
"import json\n",
"\n",
"# Read the synonym table from a UTF-8 JSON file.\n",
"synonyms_file = \"synonyms.json\"\n",
"with open(synonyms_file, \"r\", encoding=\"utf-8\") as f:\n",
"    synonyms = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "2f6e0cc6-7d6e-46cd-a2bc-561fb193ae92",
"metadata": {},
"outputs": [],
"source": [
"# Reverse lookup: every synonym points at its standard term, e.g. { \"表单\": \"报表\" }.\n",
"# A synonym listed under several standard terms keeps the last one visited.\n",
"synonym_to_standard = {}\n",
"for standard, values in synonyms.items():\n",
"    synonym_to_standard.update((value, standard) for value in values)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4199840f-1d36-4328-969c-f973103b437e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "122c3629-ffb1-4deb-8df1-4be71baf82d0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e88830a-663e-44b6-9b4d-452ac174e046",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5273065-8470-4a77-8e5c-2cca33cf2e5f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dify_lab",
"language": "python",
"name": "dify_lab"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}