3.31 上传 dm rewrite

2025-03-31 15:17:47 +08:00
commit b444310280
430 changed files with 39039 additions and 0 deletions
@@ -0,0 +1,183 @@
+# import pandas as pd
+#
+# # 读取 CSV 文件
+# csv_filename = "info_dify.csv"
+# md_filename = "output.md"
+#
+# # 读取数据
+# df = pd.read_csv(csv_filename)
+#
+# # 处理 path 列，去除前后 /
+# df['path'] = df['path'].str.strip('/')
+#
+# # 构建树结构
+# tree = {}
+#
+# def insert_path(tree, levels):
+#     """ 递归插入路径到树形结构，确保相同层级合并 """
+#     if not levels:
+#         return
+#     key = levels[0]
+#     if key not in tree:
+#         tree[key] = {}
+#     insert_path(tree[key], levels[1:])
+#
+# # 遍历 DataFrame 的 path 列
+# for path in df['path']:
+#     levels = path.split('/')  # 拆分层级
+#     insert_path(tree, levels)  # 插入到树结构
+#
+# def generate_md(tree, level=1):
+#     """ 递归生成 Markdown 文本，合并相同路径 """
+#     md_text = []
+#     for key in sorted(tree.keys()):  # 确保有序输出
+#         md_text.append(f"{'#' * level} {key}")  # 根据层级添加 `#`
+#         md_text.extend(generate_md(tree[key], level + 1))  # 递归生成子项
+#     return md_text
+#
+# # 生成 Markdown 内容
+# md_content = generate_md(tree)
+#
+# # 保存到 Markdown 文件
+# with open(md_filename, mode='w', encoding='utf-8') as md_file:
+#     md_file.write("\n".join(md_content))
+#
+# print(f"Markdown 文件已保存为 {md_filename}")
+
+###################################################################################################################
+
+# import re
+# import pandas as pd
+#
+# # 读取 CSV 文件
+# input_file = 'info_data.csv'      # 原始 CSV 文件路径
+# output_file = 'info_data_cleaned.csv'  # 处理后保存的 CSV 文件路径
+#
+# # 加载 CSV 到 DataFrame
+# df = pd.read_csv(input_file, encoding='utf-8')
+#
+# # 检查 'title' 列是否存在
+# if 'title' not in df.columns:
+#     raise ValueError("CSV 文件中没有找到 'title' 列，请检查文件内容")
+#
+# # 定义正则表达式：匹配括号及其中的内容
+# pattern = re.compile(r'[（）()].*?[（）()]')
+#
+# # 遍历每一行，处理 'title' 列
+# def clean_title(title):
+#     # 转换为 str 并删除括号内容
+#     cleaned_title = re.sub(pattern, '', str(title))
+#     # 去除多余空格
+#     return cleaned_title.strip()
+#
+# # 更新 'title' 列
+# df['title'] = df['title'].apply(clean_title)
+#
+# # 保存到新的 CSV 文件
+# df.to_csv(output_file, index=False, encoding='utf-8')
+#
+# print(f"处理完成！已保存到：{output_file}")
+
+
+##################################################################################################################
+
+# import pandas as pd
+#
+# # 读取 CSV 文件
+# file_path = 'info_data_cleaned.csv'  # 请替换为你的 CSV 文件路径
+# df = pd.read_csv(file_path, encoding='utf-8')
+#
+# # 检查是否包含 'path' 列
+# if 'path' not in df.columns:
+#     raise ValueError("CSV 文件中未找到 'path' 列，请检查文件内容。")
+#
+# # 将 'path' 列按 '/' 分割，并展开为多列
+# split_columns = df['path'].str.split('/', expand=True)
+#
+# # 重命名列名为 title1, title2, ..., titlen
+# split_columns.columns = [f'title{i+1}' for i in range(split_columns.shape[1])]
+#
+# # 合并原 DataFrame 和新拆分的列
+# df = pd.concat([df, split_columns], axis=1)
+#
+# # 保存结果到新 CSV 文件
+# output_file = 'info_data_cleaned_split.csv'
+# df.to_csv(output_file, index=False, encoding='utf-8')
+#
+# print(f"处理完成，结果已保存到 {output_file}")
+
+##################################################################################################################
+
+# import re
+# import pandas as pd
+#
+# # 读取 CSV 文件
+# input_file = 'info_data_cleaned_split.csv'      # 原始 CSV 文件路径
+# output_file = 'info_data_cleaned_split2.csv'  # 处理后保存的 CSV 文件路径
+#
+# # 加载 CSV 到 DataFrame
+# df = pd.read_csv(input_file, encoding='utf-8')
+#
+# # 定义正则表达式：匹配括号及其中的内容
+# pattern = re.compile(r'[（）()].*?[（）()]')
+#
+# # 清洗函数：删除括号及其中内容，并去除多余空格
+# def clean_text(text):
+#     # 转换为 str 并删除括号内容
+#     cleaned_text = re.sub(pattern, '', str(text))
+#     # 去除多余空格
+#     return cleaned_text.strip()
+#
+# # 从第2列开始遍历并清洗
+# for col in df.columns[1:]:
+#     df[col] = df[col].apply(clean_text)
+#
+# # 保存到新的 CSV 文件
+# df.to_csv(output_file, index=False, encoding='utf-8')
+#
+# print(f"处理完成！已保存到：{output_file}")
+
+
+#####################################################################################################################
+
+# import pandas as pd
+#
+# # 加载CSV文件
+# file_path = 'info_data_cleaned_split2.csv'  # 请修改为你的文件路径
+# df = pd.read_csv(file_path, encoding='utf-8')
+#
+# # 定义新列名
+# new_column = 'Previous_Row'
+#
+# # 初始化新列
+# df[new_column] = None
+#
+# # 遍历每一行
+# for i in range(1, len(df)):
+#     # Check whether any column in this row is NaN
+#     if df.iloc[i].isna().any():
+#         # 将前一行的内容放入新列中
+#         df.at[i, new_column] = df.iloc[i-1].to_dict()
+#
+# # 输出处理后的 DataFrame
+# print(df)
+#
+# # 如果需要保存结果到新CSV
+# df.to_csv('info_data_cleaned_split3.csv', index=False, encoding='utf-8')
+
+import pandas as pd
+
+# 读取 CSV 文件
+input_file = "info_dify_mini.csv"  # 原始 CSV 文件
+output_file = "info_dify_mini_2.csv"  # 过滤后的新 CSV 文件
+
+# 读取数据
+df = pd.read_csv(input_file, encoding='utf-8')
+
+# 过滤包含'费'字的行（假设'描述类'是列名）
+filtered_df = df[df['描述类'].astype(str).str.contains('费', na=False)]
+
+# 保存到新 CSV 文件
+filtered_df.to_csv(output_file, index=False, encoding='utf-8')
+
+print(f"筛选完成，共找到 {len(filtered_df)} 行数据，并已保存到 {output_file}")