{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2b31c11-d818-483d-b99d-6b9c8477dacd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Input CSV (needs a 'path' column) and the Markdown file to write.\n",
    "csv_filename = \"info_dify.csv\"\n",
    "md_filename = \"output.md\"\n",
    "\n",
    "df = pd.read_csv(csv_filename)\n",
    "\n",
    "# Strip leading/trailing '/' so splitting yields no empty edge segments.\n",
    "df['path'] = df['path'].str.strip('/')\n",
    "\n",
    "\n",
    "def insert_path(tree, levels):\n",
    "    \"\"\"Insert one path into the nested-dict tree, merging shared prefixes.\n",
    "\n",
    "    tree   -- dict mapping a segment to the dict of its children (mutated in place).\n",
    "    levels -- path segments ordered from outermost to innermost.\n",
    "    \"\"\"\n",
    "    node = tree\n",
    "    for key in levels:\n",
    "        node = node.setdefault(key, {})\n",
    "\n",
    "\n",
    "def generate_md(tree, level=1):\n",
    "    \"\"\"Return the Markdown heading lines for the tree.\n",
    "\n",
    "    Keys are emitted in sorted order. A key at nesting depth `level` is\n",
    "    prefixed with `level` '#' characters and followed by its children's lines.\n",
    "    \"\"\"\n",
    "    md_text = []\n",
    "    for key in sorted(tree):\n",
    "        md_text.append(f\"{'#' * level} {key}\")\n",
    "        md_text.extend(generate_md(tree[key], level + 1))\n",
    "    return md_text\n",
    "\n",
    "\n",
    "# Build the tree. Rows with a missing path are skipped (NaN has no .split),\n",
    "# and empty segments (from '' or '//') are dropped so no blank heading is written.\n",
    "tree = {}\n",
    "for path in df['path'].dropna():\n",
    "    levels = [segment for segment in path.split('/') if segment]\n",
    "    insert_path(tree, levels)\n",
    "\n",
    "md_content = generate_md(tree)\n",
    "\n",
    "with open(md_filename, mode='w', encoding='utf-8') as md_file:\n",
    "    md_file.write(\"\\n\".join(md_content))\n",
    "\n",
    "print(f\"Markdown 文件已保存为 {md_filename}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "796cc4fe-4912-4cdc-9b80-f217f46b5487",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "input_file = 'info_data.csv'  # raw CSV to read\n",
    "output_file = 'info_data_cleaned.csv'  # cleaned CSV to write\n",
    "\n",
    "df = pd.read_csv(input_file, encoding='utf-8')\n",
    "\n",
    "# Fail early with a clear message when the expected column is absent.\n",
    "if 'title' not in df.columns:\n",
    "    raise ValueError(\"CSV 文件中没有找到 'title' 列,请检查文件内容\")\n",
    "\n",
    "# An opening bracket, the shortest run of text, then a closing bracket.\n",
    "# Openers and closers live in separate classes so that a stray ')' followed\n",
    "# later by '(' is not mistaken for a bracketed span. U+FF08 / U+FF09 are the\n",
    "# full-width parentheses.\n",
    "pattern = re.compile(r'[(\\uFF08].*?[)\\uFF09]')\n",
    "\n",
    "\n",
    "def clean_title(title):\n",
    "    \"\"\"Return `title` with bracketed text removed and outer whitespace stripped.\n",
    "\n",
    "    Missing values (NaN/None) are returned unchanged instead of being turned\n",
    "    into the literal string 'nan'.\n",
    "    \"\"\"\n",
    "    if pd.isna(title):\n",
    "        return title\n",
    "    return pattern.sub('', str(title)).strip()\n",
    "\n",
    "\n",
    "df['title'] = df['title'].apply(clean_title)\n",
    "\n",
    "df.to_csv(output_file, index=False, encoding='utf-8')\n",
    "\n",
    "print(f\"处理完成!已保存到:{output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf63d138-4d16-496a-ad3c-5d67008412f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "file_path = 'info_data_cleaned.csv'  # CSV to read; replace with your own path\n",
    "df = pd.read_csv(file_path, encoding='utf-8')\n",
    "\n",
    "# Fail early with a clear message when the expected column is absent.\n",
    "if 'path' not in df.columns:\n",
    "    raise ValueError(\"CSV 文件中未找到 'path' 列,请检查文件内容。\")\n",
    "\n",
    "# One column per '/'-separated segment of 'path'.\n",
    "split_columns = df['path'].str.split('/', expand=True)\n",
    "\n",
    "# Name the segment columns title1, title2, ..., titleN.\n",
    "split_columns.columns = [f'title{i+1}' for i in range(split_columns.shape[1])]\n",
    "\n",
    "# Append the segment columns to the right of the original columns.\n",
    "df = pd.concat([df, split_columns], axis=1)\n",
    "\n",
    "output_file = 'info_data_cleaned_split.csv'\n",
    "df.to_csv(output_file, index=False, encoding='utf-8')\n",
    "\n",
    "print(f\"处理完成,结果已保存到 {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7520603c-90cf-4982-9273-2f130614ca96",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "input_file = 'info_data_cleaned_split.csv'  # CSV to read\n",
    "output_file = 'info_data_cleaned_split2.csv'  # cleaned CSV to write\n",
    "\n",
    "df = pd.read_csv(input_file, encoding='utf-8')\n",
    "\n",
    "# An opening bracket, the shortest run of text, then a closing bracket.\n",
    "# Openers and closers live in separate classes so that a stray ')' followed\n",
    "# later by '(' is not mistaken for a bracketed span. U+FF08 / U+FF09 are the\n",
    "# full-width parentheses.\n",
    "pattern = re.compile(r'[(\\uFF08].*?[)\\uFF09]')\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Return `text` as a string with bracketed spans removed and outer whitespace stripped.\n",
    "\n",
    "    Missing values (NaN/None) are returned unchanged instead of being turned\n",
    "    into the literal string 'nan'.\n",
    "    \"\"\"\n",
    "    if pd.isna(text):\n",
    "        return text\n",
    "    return pattern.sub('', str(text)).strip()\n",
    "\n",
    "\n",
    "# Clean every column except the first one.\n",
    "for col in df.columns[1:]:\n",
    "    df[col] = df[col].apply(clean_text)\n",
    "\n",
    "df.to_csv(output_file, index=False, encoding='utf-8')\n",
    "\n",
    "print(f\"处理完成!已保存到:{output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f03c96de-8bff-4b41-933a-9d9775bb7ad8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "file_path = 'info_data_cleaned_split2.csv'  # CSV to read; replace with your own path\n",
    "df = pd.read_csv(file_path, encoding='utf-8')\n",
    "\n",
    "new_column = 'Previous_Row'\n",
    "\n",
    "# Decide which rows contain a missing value BEFORE the new column exists:\n",
    "# the new column starts out empty, so testing afterwards would flag every row.\n",
    "has_missing = df.isna().any(axis=1).tolist()\n",
    "\n",
    "# For each flagged row (the first row has no predecessor) keep the previous\n",
    "# row's original columns as a dict. Building the dicts before the new column\n",
    "# is attached keeps them from nesting earlier Previous_Row values.\n",
    "previous_rows = [None] * len(df)\n",
    "for i in range(1, len(df)):\n",
    "    if has_missing[i]:\n",
    "        previous_rows[i] = df.iloc[i - 1].to_dict()\n",
    "\n",
    "df[new_column] = pd.Series(previous_rows, index=df.index, dtype=object)\n",
    "\n",
    "df.to_csv('info_data_cleaned_split3.csv', index=False, encoding='utf-8')\n",
    "\n",
    "# Preview the first rows rather than printing the whole frame.\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f91c155-92b7-4c82-863f-6c942e8df58a",
   "metadata": {},
   "source": [
    "## 2.20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c708560b-13c7-494e-b6fa-96c038ae36f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the synonyms file; the next cell iterates it as {standard: iterable of synonyms}.\n",
    "synonyms_file = \"synonyms.json\"\n",
    "with open(synonyms_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    synonyms = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f6e0cc6-7d6e-46cd-a2bc-561fb193ae92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse mapping synonym -> standard term, e.g. {\"表单\": \"报表\"}.\n",
    "# If a synonym is listed under several standard terms, the last one wins.\n",
    "synonym_to_standard = {\n",
    "    value: standard\n",
    "    for standard, values in synonyms.items()\n",
    "    for value in values\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dify_lab",
   "language": "python",
   "name": "dify_lab"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}