上传文件

This commit is contained in:
chentianrui
2025-08-01 15:31:56 +08:00
commit 9609bb67b4
805 changed files with 982256 additions and 0 deletions
+131
View File
@@ -0,0 +1,131 @@
import json
import os
import csv
from collections import defaultdict
def analyze_project_division(json_file_path):
    """Collect the attribute names used by each node type in a project file.

    Loads the JSON document at *json_file_path*, walks everything found under
    ``projectData.projectDivision`` and, for every JSON object encountered,
    records its key names under that object's node type.  The node type is
    the value of the ``"type"`` key, otherwise the value of the ``"类型"``
    key, otherwise the placeholder ``"未知类型"``.  A type value that is itself
    a JSON object or array cannot serve as a label, so such nodes are also
    filed under the placeholder.

    Args:
        json_file_path: Path of the JSON file to analyse.

    Returns:
        A mapping from node type to the set of attribute names seen on nodes
        of that type.  An empty dict is returned when the document has no
        ``projectData.projectDivision`` section or when the file cannot be
        read or processed; the reason is printed instead of being raised.
    """
    unknown_type = "未知类型"
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Guard against documents whose top level or projectData entry is
        # not an object, so they are reported as "section missing" rather
        # than surfacing as an unrelated TypeError.
        project_data = data.get("projectData") if isinstance(data, dict) else None
        if not isinstance(project_data, dict) or "projectDivision" not in project_data:
            print(f"文件 {json_file_path} 中不包含projectData.projectDivision数据")
            return {}

        project_division = project_data["projectDivision"]
        type_attributes = defaultdict(set)

        def traverse_node(node):
            """Record the keys of *node* and recurse into nested containers."""
            if isinstance(node, list):
                for item in node:
                    traverse_node(item)
                return
            if not isinstance(node, dict):
                return  # scalars carry no attributes

            node_type = node.get("type", node.get("类型", unknown_type))
            if isinstance(node_type, (dict, list)):
                # Containers are unhashable and cannot be used as a dict key.
                node_type = unknown_type

            # Skip empty objects so they do not register a type with no
            # attributes.
            if node:
                type_attributes[node_type].update(node)

            # Children are visited first so node types are discovered in the
            # same order as before; a single child given as an object (not a
            # list) is visited as well.
            children = node.get("children")
            if isinstance(children, (dict, list)):
                traverse_node(children)

            # Any other nested object or array may also contain nodes.
            for key, value in node.items():
                if key != "children" and isinstance(value, (dict, list)):
                    traverse_node(value)

        if isinstance(project_division, dict):
            # The top-level keys are only groupings, not node attributes;
            # just their values are walked.
            for value in project_division.values():
                traverse_node(value)
        elif isinstance(project_division, list):
            traverse_node(project_division)

        return type_attributes
    except Exception as e:
        # Best-effort batch processing: report the failure and let the
        # caller continue with the remaining files.
        print(f"处理文件 {json_file_path} 时出错: {e}")
        import traceback

        traceback.print_exc()
        return {}
def save_to_csv(type_attributes, output_file="node_attributes.csv"):
    """Write the per-type attribute names to a two-column CSV file.

    The file starts with the header row ``节点类型, 属性名`` followed by one
    row per (node type, attribute name) pair.  Node types appear in the
    mapping's iteration order; the attribute names of each type are sorted.

    Args:
        type_attributes: Mapping from node type to a collection of attribute
            names.
        output_file: Path of the CSV file to create or overwrite.
    """
    header = ["节点类型", "属性名"]
    with open(output_file, "w", encoding="utf-8", newline="") as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        for node_type, attr_names in type_attributes.items():
            # Emit all rows of one node type at once, names in sorted order.
            out.writerows([node_type, name] for name in sorted(attr_names))
def main():
    """Analyse one JSON file or a directory tree of them and export a CSV.

    When the configured path is a directory, every ``.json`` file below it is
    analysed and the per-type attribute sets are merged; otherwise the path
    is analysed as a single file.  The result is written with
    ``save_to_csv`` using its default output file name.
    """
    # Input location: either a single JSON file or a directory of JSON
    # files; adjust as needed.
    source_path = "dataset/json/配网清单/2022行业招标3.1.12_readable.json"

    if not os.path.isdir(source_path):
        # Single file: its result is used directly.
        print(f"正在处理: {source_path}")
        merged = analyze_project_division(source_path)
    else:
        # Directory: accumulate the attribute sets of every JSON file found.
        merged = defaultdict(set)
        for folder, _, names in os.walk(source_path):
            for name in names:
                if not name.endswith(".json"):
                    continue
                json_path = os.path.join(folder, name)
                print(f"正在处理: {json_path}")
                per_file = analyze_project_division(json_path)
                for node_type, attrs in per_file.items():
                    merged[node_type].update(attrs)

    save_to_csv(merged)
    print("统计结果已保存到 node_attributes.csv")
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()