3.31 上传 dm rewrite
This commit is contained in:
@@ -0,0 +1,183 @@
|
||||
# import pandas as pd
|
||||
#
|
||||
# # 读取 CSV 文件
|
||||
# csv_filename = "info_dify.csv"
|
||||
# md_filename = "output.md"
|
||||
#
|
||||
# # 读取数据
|
||||
# df = pd.read_csv(csv_filename)
|
||||
#
|
||||
# # 处理 path 列,去除前后 /
|
||||
# df['path'] = df['path'].str.strip('/')
|
||||
#
|
||||
# # 构建树结构
|
||||
# tree = {}
|
||||
#
|
||||
# def insert_path(tree, levels):
|
||||
# """ 递归插入路径到树形结构,确保相同层级合并 """
|
||||
# if not levels:
|
||||
# return
|
||||
# key = levels[0]
|
||||
# if key not in tree:
|
||||
# tree[key] = {}
|
||||
# insert_path(tree[key], levels[1:])
|
||||
#
|
||||
# # 遍历 DataFrame 的 path 列
|
||||
# for path in df['path']:
|
||||
# levels = path.split('/') # 拆分层级
|
||||
# insert_path(tree, levels) # 插入到树结构
|
||||
#
|
||||
# def generate_md(tree, level=1):
|
||||
# """ 递归生成 Markdown 文本,合并相同路径 """
|
||||
# md_text = []
|
||||
# for key in sorted(tree.keys()): # 确保有序输出
|
||||
# md_text.append(f"{'#' * level} {key}") # 根据层级添加 `#`
|
||||
# md_text.extend(generate_md(tree[key], level + 1)) # 递归生成子项
|
||||
# return md_text
|
||||
#
|
||||
# # 生成 Markdown 内容
|
||||
# md_content = generate_md(tree)
|
||||
#
|
||||
# # 保存到 Markdown 文件
|
||||
# with open(md_filename, mode='w', encoding='utf-8') as md_file:
|
||||
# md_file.write("\n".join(md_content))
|
||||
#
|
||||
# print(f"Markdown 文件已保存为 {md_filename}")
|
||||
|
||||
###################################################################################################################
|
||||
|
||||
# import re
|
||||
# import pandas as pd
|
||||
#
|
||||
# # 读取 CSV 文件
|
||||
# input_file = 'info_data.csv' # 原始 CSV 文件路径
|
||||
# output_file = 'info_data_cleaned.csv' # 处理后保存的 CSV 文件路径
|
||||
#
|
||||
# # 加载 CSV 到 DataFrame
|
||||
# df = pd.read_csv(input_file, encoding='utf-8')
|
||||
#
|
||||
# # 检查 'title' 列是否存在
|
||||
# if 'title' not in df.columns:
|
||||
# raise ValueError("CSV 文件中没有找到 'title' 列,请检查文件内容")
|
||||
#
|
||||
# # 定义正则表达式:匹配括号及其中的内容
|
||||
# pattern = re.compile(r'[()()].*?[()()]')
|
||||
#
|
||||
# # 遍历每一行,处理 'title' 列
|
||||
# def clean_title(title):
|
||||
# # 转换为 str 并删除括号内容
|
||||
# cleaned_title = re.sub(pattern, '', str(title))
|
||||
# # 去除多余空格
|
||||
# return cleaned_title.strip()
|
||||
#
|
||||
# # 更新 'title' 列
|
||||
# df['title'] = df['title'].apply(clean_title)
|
||||
#
|
||||
# # 保存到新的 CSV 文件
|
||||
# df.to_csv(output_file, index=False, encoding='utf-8')
|
||||
#
|
||||
# print(f"处理完成!已保存到:{output_file}")
|
||||
|
||||
|
||||
##################################################################################################################
|
||||
|
||||
# import pandas as pd
|
||||
#
|
||||
# # 读取 CSV 文件
|
||||
# file_path = 'info_data_cleaned.csv' # 请替换为你的 CSV 文件路径
|
||||
# df = pd.read_csv(file_path, encoding='utf-8')
|
||||
#
|
||||
# # 检查是否包含 'path' 列
|
||||
# if 'path' not in df.columns:
|
||||
# raise ValueError("CSV 文件中未找到 'path' 列,请检查文件内容。")
|
||||
#
|
||||
# # 将 'path' 列按 '/' 分割,并展开为多列
|
||||
# split_columns = df['path'].str.split('/', expand=True)
|
||||
#
|
||||
# # 重命名列名为 title1, title2, ..., titlen
|
||||
# split_columns.columns = [f'title{i+1}' for i in range(split_columns.shape[1])]
|
||||
#
|
||||
# # 合并原 DataFrame 和新拆分的列
|
||||
# df = pd.concat([df, split_columns], axis=1)
|
||||
#
|
||||
# # 保存结果到新 CSV 文件
|
||||
# output_file = 'info_data_cleaned_split.csv'
|
||||
# df.to_csv(output_file, index=False, encoding='utf-8')
|
||||
#
|
||||
# print(f"处理完成,结果已保存到 {output_file}")
|
||||
|
||||
##################################################################################################################
|
||||
|
||||
# import re
|
||||
# import pandas as pd
|
||||
#
|
||||
# # 读取 CSV 文件
|
||||
# input_file = 'info_data_cleaned_split.csv' # 原始 CSV 文件路径
|
||||
# output_file = 'info_data_cleaned_split2.csv' # 处理后保存的 CSV 文件路径
|
||||
#
|
||||
# # 加载 CSV 到 DataFrame
|
||||
# df = pd.read_csv(input_file, encoding='utf-8')
|
||||
#
|
||||
# # 定义正则表达式:匹配括号及其中的内容
|
||||
# pattern = re.compile(r'[()()].*?[()()]')
|
||||
#
|
||||
# # 清洗函数:删除括号及其中内容,并去除多余空格
|
||||
# def clean_text(text):
|
||||
# # 转换为 str 并删除括号内容
|
||||
# cleaned_text = re.sub(pattern, '', str(text))
|
||||
# # 去除多余空格
|
||||
# return cleaned_text.strip()
|
||||
#
|
||||
# # 从第2列开始遍历并清洗
|
||||
# for col in df.columns[1:]:
|
||||
# df[col] = df[col].apply(clean_text)
|
||||
#
|
||||
# # 保存到新的 CSV 文件
|
||||
# df.to_csv(output_file, index=False, encoding='utf-8')
|
||||
#
|
||||
# print(f"处理完成!已保存到:{output_file}")
|
||||
|
||||
|
||||
#####################################################################################################################
|
||||
|
||||
# import pandas as pd
|
||||
#
|
||||
# # 加载CSV文件
|
||||
# file_path = 'info_data_cleaned_split2.csv' # 请修改为你的文件路径
|
||||
# df = pd.read_csv(file_path, encoding='utf-8')
|
||||
#
|
||||
# # 定义新列名
|
||||
# new_column = 'Previous_Row'
|
||||
#
|
||||
# # 初始化新列
|
||||
# df[new_column] = None
|
||||
#
|
||||
# # 遍历每一行
|
||||
# for i in range(1, len(df)):
|
||||
# # 检查每一列是否为 NaN
|
||||
# if df.iloc[i].isna().any():
|
||||
# # 将前一行的内容放入新列中
|
||||
# df.at[i, new_column] = df.iloc[i-1].to_dict()
|
||||
#
|
||||
# # 输出处理后的 DataFrame
|
||||
# print(df)
|
||||
#
|
||||
# # 如果需要保存结果到新CSV
|
||||
# df.to_csv('info_data_cleaned_split3.csv', index=False, encoding='utf-8')
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Filter script: keep only the rows of the input CSV whose text in
# `filter_column` contains `keyword`, and write them to a new CSV file.

# Input and output locations
input_file = "info_dify_mini.csv"  # source CSV file
output_file = "info_dify_mini_2.csv"  # new CSV file holding the filtered rows

# Column to search and the literal text a row must contain to be kept
filter_column = '描述类'
keyword = '费'

# Load the data
df = pd.read_csv(input_file, encoding='utf-8')

# The column name is an assumption about the input file; fail with a clear
# message rather than an opaque KeyError when it is absent.
if filter_column not in df.columns:
    raise ValueError(f"CSV 文件中未找到 '{filter_column}' 列,请检查文件内容。")

# Cast to str so non-string cells can be searched as text, and match the
# keyword literally (regex=False) so it is never interpreted as a pattern.
matches = df[filter_column].astype(str).str.contains(keyword, regex=False, na=False)
filtered_df = df[matches]

# Save the filtered rows to the new CSV file (without the index column)
filtered_df.to_csv(output_file, index=False, encoding='utf-8')

print(f"筛选完成,共找到 {len(filtered_df)} 行数据,并已保存到 {output_file}")
|
||||
Reference in New Issue
Block a user