增加了知识图谱导出excel
This commit is contained in:
+293
-125
@@ -1,141 +1,283 @@
|
||||
"""
|
||||
第四步:实现项目划分和清单节点费用预览向上汇总
|
||||
第三步:向上汇总费用预览
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from typing import Dict, List, Any, Tuple, Optional
|
||||
import copy
|
||||
import re
|
||||
|
||||
|
||||
class ExpenseProcessor:
|
||||
def __init__(self):
    """Create a processor; this constructor sets no instance attributes."""
    pass
|
||||
|
||||
@staticmethod
|
||||
def normalize_guid(guid: str) -> str:
    """
    Normalise a GUID string so it is wrapped in exactly one pair of curly braces.

    :param guid: raw GUID string, with or without surrounding braces
    :return: the GUID wrapped in a single ``{...}`` pair; a falsy input
        (``None`` or ``""``) is returned unchanged
    """
    if not guid:
        return guid
    # str.strip removes every leading/trailing brace, so "{{x}}" collapses to "x"
    # before the single pair is added back.
    return "{" + guid.strip("{}") + "}"
|
||||
|
||||
@staticmethod
|
||||
def is_cost_item(obj: Any) -> bool:
    """
    Tell whether ``obj`` is a bare cost entry: a dict holding exactly the keys
    ``id`` and ``cost``.

    :param obj: any value, typically an element of a node's ``children`` list
    :return: ``True`` only for a two-key ``{"id": ..., "cost": ...}`` dict
    """
    # Both keys are required, so the size bound can only be satisfied by a dict
    # with exactly these two keys; any extra field disqualifies the object.
    return isinstance(obj, dict) and "id" in obj and "cost" in obj and len(obj) == 2
|
||||
|
||||
@staticmethod
|
||||
def extract_costs_from_children(node: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect the bare cost items stored directly in ``node["children"]``.

    :param node: node whose ``children`` list may hold cost items
    :return: deep copies of every child accepted by ``is_cost_item``; an empty
        list when ``children`` is missing or is not a list
    """
    if "children" not in node or not isinstance(node["children"], list):
        return []
    # Deep copies let callers rewrite the costs without touching the input tree.
    return [copy.deepcopy(child) for child in node["children"] if ExpenseProcessor.is_cost_item(child)]
|
||||
|
||||
@staticmethod
|
||||
def calculate_parent_costs(node: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Total the costs visible at ``node`` into one entry per cost id.

    Three sources are added together, in this order:

    1. the node's own ``sum`` entries;
    2. bare cost items sitting directly in ``children``;
    3. the ``sum`` entries of every structured (non cost-item) child.

    Children are not recursed into here: only each child's existing ``sum``
    list is read, so it must already be filled in (``process_node`` processes
    children before calling this).

    :param node: expense-preview node
    :return: list of ``{"id": ..., "cost": ...}`` dicts, ``cost`` being the
        float total rendered with ``str``; ids keep first-seen order
    """
    totals: Dict[Any, float] = {}

    def _accumulate(cost_item: Dict[str, Any]) -> None:
        # Register the id even when its cost is unusable, so it still appears
        # in the result (with whatever total it has so far, 0.0 at minimum).
        item_id = cost_item["id"]
        totals.setdefault(item_id, 0.0)
        try:
            totals[item_id] += float(cost_item["cost"])
        except (ValueError, TypeError):
            pass  # best effort: an unparsable cost contributes nothing

    def _accumulate_sum(owner: Any) -> None:
        # Add every well-formed entry of owner["sum"]; anything else is skipped.
        if "sum" in owner and isinstance(owner["sum"], list):
            for cost_item in owner["sum"]:
                if "id" in cost_item and "cost" in cost_item:
                    _accumulate(cost_item)

    # 1. the node's own sum entries
    _accumulate_sum(node)

    if "children" in node and isinstance(node["children"], list):
        children = node["children"]
        # 2. cost items stored directly among the children (leaf-style node)
        for child in children:
            if ExpenseProcessor.is_cost_item(child):
                _accumulate(child)
        # 3. already-computed sums of the structured children
        for child in children:
            if not ExpenseProcessor.is_cost_item(child):
                _accumulate_sum(child)

    return [{"id": item_id, "cost": str(total_cost)} for item_id, total_cost in totals.items()]
|
||||
|
||||
@staticmethod
|
||||
def process_node(node: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def find_guid_quantity(project_data: Optional[Dict[str, Any]], guid: str) -> float:
    """
    Look up the quantity (key ``数量``) of the node with the given GUID inside
    ``project_data["projectDivision"]``.

    The tree is walked depth-first through every nested dict and list. The
    first node whose ``GUID`` matches (surrounding braces ignored) and whose
    quantity is truthy and convertible to ``float`` wins. Matching nodes with a
    missing, falsy or unparsable quantity are skipped.

    :param project_data: project data expected to contain ``projectDivision``
    :param guid: GUID to look for, with or without curly braces
    :return: the quantity found, or ``1.0`` when there is no usable match
    """
    if not project_data or "projectDivision" not in project_data:
        return 1.0
    if not isinstance(guid, str):
        return 1.0

    # Compare without braces so "{X}" and "X" are treated as the same GUID.
    wanted = guid.strip("{}")

    def _search(node: Any) -> Optional[float]:
        # None means "nothing found below this node". A dedicated sentinel keeps
        # a genuine quantity of 1.0 distinguishable from a miss.
        if isinstance(node, dict):
            node_guid = node.get("GUID")
            # Non-string GUID values (e.g. None) simply never match.
            if isinstance(node_guid, str) and node_guid.strip("{}") == wanted:
                quantity = node.get("数量")
                if quantity:
                    try:
                        return float(quantity)
                    except (ValueError, TypeError):
                        pass  # unusable quantity: keep looking elsewhere
            candidates = node.values()
        elif isinstance(node, list):
            candidates = node
        else:
            return None

        for value in candidates:
            if isinstance(value, (dict, list)):
                found = _search(value)
                if found is not None:
                    return found
        return None

    found = _search(project_data["projectDivision"])
    return 1.0 if found is None else found
|
||||
|
||||
@staticmethod
|
||||
def process_node(
    node: Dict[str, Any], project_data: Optional[Dict[str, Any]] = None, is_bill_engineering: Optional[bool] = None
) -> Dict[str, Any]:
    """
    Process one expense-preview node: normalise it and rebuild its ``sum`` list.

    The input is never modified; a deep copy is processed and returned.

    * Leaf-style node (``children`` holds bare cost items): the cost items are
      moved into ``sum`` and ``children`` is emptied. For a bill-of-quantities
      project with ``project_data`` and a ``GUID``, each cost is first
      multiplied by the quantity found for that GUID.
    * Any other node: every non cost-item child is processed recursively, then
      ``sum`` is recomputed from the node and its processed children.

    :param node: expense-preview node
    :param project_data: project data used to look up the quantity for a GUID
    :param is_bill_engineering: whether this is a bill-of-quantities project;
        ``None`` is treated as ``False``
    :return: the processed copy of the node
    """
    result = copy.deepcopy(node)

    # Normalise the GUID format
    if "GUID" in result:
        result["GUID"] = ExpenseProcessor.normalize_guid(result["GUID"])

    # Make sure the key fields exist
    if "sum" not in result:
        result["sum"] = []
    if "rcj" not in result:
        result["rcj"] = []
    if "children" not in result:
        result["children"] = []

    if is_bill_engineering is None:
        is_bill_engineering = False

    # === Special case: children holds cost items (leaf node) ===
    direct_costs = ExpenseProcessor.extract_costs_from_children(result)
    if direct_costs:
        # Bill-of-quantities project with project data: scale costs by the GUID's quantity
        if is_bill_engineering and project_data and "GUID" in result:
            guid = result["GUID"]
            quantity = ExpenseProcessor.find_guid_quantity(project_data, guid)

            for cost_item in direct_costs:
                try:
                    original_cost = float(cost_item["cost"])
                    adjusted_cost = original_cost * quantity
                    cost_item["cost"] = str(adjusted_cost)
                except (ValueError, TypeError):
                    pass  # leave an unparsable cost untouched

        # Move the direct cost items into sum and drop them from children.
        # NOTE(review): any structured children next to the cost items are
        # dropped here as well - confirm that mixed nodes cannot occur.
        result["sum"] = direct_costs
        result["children"] = []
        return result

    # === Regular node: children is a list of sub-nodes ===
    processed_children = []
    if result["children"]:
        for child in node["children"]:
            if not ExpenseProcessor.is_cost_item(child):
                processed_child = ExpenseProcessor.process_node(child, project_data, is_bill_engineering)
                processed_children.append(processed_child)

    result["children"] = processed_children

    # Total from result (which now carries the processed children), not from
    # the original node, so the children's freshly computed sums are used.
    total_costs = ExpenseProcessor.calculate_parent_costs(result)
    result["sum"] = total_costs

    return result
|
||||
|
||||
@staticmethod
|
||||
def process_expense_preview(expense_preview: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def process_expense_preview(
    expense_preview: Dict[str, Any],
    project_data: Optional[Dict[str, Any]] = None,
    is_bill_engineering: Optional[bool] = None,
) -> Dict[str, Any]:
    """
    Process the whole expense-preview structure.

    Each top-level value is handled by type: in a dict, every list-valued entry
    has its items run through ``process_node``; a list has its items processed
    directly; anything else is copied through unchanged.

    :param expense_preview: expense-preview data
    :param project_data: project data used to look up the quantity for a GUID
    :param is_bill_engineering: whether this is a bill-of-quantities project;
        ``None`` is treated as ``False``
    :return: processed deep copy of the expense-preview data
    """
    if is_bill_engineering is None:
        is_bill_engineering = False

    result = copy.deepcopy(expense_preview)
    for category_key, category_value in expense_preview.items():
        if isinstance(category_value, dict):
            for subcategory_key, subcategory_value in category_value.items():
                if isinstance(subcategory_value, list):
                    result[category_key][subcategory_key] = [
                        ExpenseProcessor.process_node(item, project_data, is_bill_engineering)
                        for item in subcategory_value
                    ]
        elif isinstance(category_value, list):
            result[category_key] = [
                ExpenseProcessor.process_node(item, project_data, is_bill_engineering) for item in category_value
            ]
    return result
|
||||
|
||||
# 以下方法保持不变
|
||||
@classmethod
|
||||
def load_and_process_from_file(cls, input_path: str, output_path: str = None) -> Dict[str, Any]:
|
||||
"""
|
||||
从文件加载 JSON 并处理
|
||||
:param input_path: 输入文件路径
|
||||
:param output_path: 输出文件路径(可选)
|
||||
:return: 处理后的完整数据
|
||||
"""
|
||||
def load_and_process_from_file(
|
||||
cls, input_path: str, output_path: str | None = None, is_bill_engineering: Optional[bool] = None
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
with open(input_path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
if "projectData" in data and "expensePreview" in data["projectData"]:
|
||||
# 如果没有指定工程类型,则自动判断
|
||||
if is_bill_engineering is None:
|
||||
project_type = _determine_project_type(data)
|
||||
is_bill_engineering = project_type == "inventory"
|
||||
print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")
|
||||
|
||||
processed_data = copy.deepcopy(data)
|
||||
processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
|
||||
data["projectData"]["expensePreview"]
|
||||
data["projectData"]["expensePreview"],
|
||||
data["projectData"] if is_bill_engineering else None,
|
||||
is_bill_engineering,
|
||||
)
|
||||
|
||||
if output_path:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(processed_data, f, ensure_ascii=False, indent=4)
|
||||
@@ -149,81 +291,107 @@ class ExpenseProcessor:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def process_raw_data(cls, raw_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
直接处理原始数据(不涉及文件读写)
|
||||
:param raw_data: 原始数据,格式应包含 projectData.expensePreview
|
||||
:return: 处理后的数据
|
||||
"""
|
||||
def process_raw_data(cls, raw_data: Dict[str, Any], is_bill_engineering: Optional[bool] = None) -> Dict[str, Any]:
    """
    Process in-memory data directly, with no file access.

    :param raw_data: data that must contain ``projectData.expensePreview``
    :param is_bill_engineering: whether this is a bill-of-quantities project;
        when ``None`` the type is detected with ``_determine_project_type``
    :return: deep copy of ``raw_data`` with the expense preview replaced by its
        processed version
    :raises ValueError: if ``projectData.expensePreview`` is missing
    """
    if "projectData" not in raw_data or "expensePreview" not in raw_data["projectData"]:
        raise ValueError("未找到 projectData.expensePreview 路径")

    # Detect the project type automatically when the caller did not specify it
    if is_bill_engineering is None:
        project_type = _determine_project_type(raw_data)
        is_bill_engineering = project_type == "inventory"
        print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")

    processed_data = copy.deepcopy(raw_data)
    processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
        raw_data["projectData"]["expensePreview"],
        # Project data is only handed on for bill-of-quantities projects
        raw_data["projectData"] if is_bill_engineering else None,
        is_bill_engineering,
    )
    return processed_data
|
||||
|
||||
@classmethod
|
||||
def process_directory(cls, input_dir: str, output_dir: str) -> List[Tuple[str, str]]:
|
||||
"""
|
||||
处理指定目录中的所有JSON文件
|
||||
|
||||
:param input_dir: 输入目录路径,包含要处理的JSON文件
|
||||
:param output_dir: 输出目录路径,处理后的JSON文件将保存在这里
|
||||
:return: 成功处理的文件列表,格式为 [(源文件路径, 输出文件路径), ...]
|
||||
"""
|
||||
# 确保输出目录存在
|
||||
def process_directory(
    cls, input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
) -> List[Tuple[str, str]]:
    """
    Process every ``.json`` file found directly inside ``input_dir``.

    :param input_dir: directory holding the JSON files to process
    :param output_dir: directory the processed files are written to, under the
        same file names; created if it does not exist
    :param is_bill_engineering: passed unchanged to
        ``load_and_process_from_file`` for every file
    :return: ``(input path, output path)`` pairs for the files that were
        processed successfully
    """
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so that the
    # processing order and the returned list are deterministic.
    json_files = sorted(name for name in os.listdir(input_dir) if name.lower().endswith(".json"))
    if not json_files:
        print(f"警告: 在目录 {input_dir} 中没有找到JSON文件")
        return []

    successful_files = []
    for file in json_files:
        input_file = os.path.join(input_dir, file)
        output_file = os.path.join(output_dir, file)
        print(f"处理文件: {input_file}")

        processed_data = cls.load_and_process_from_file(input_file, output_file, is_bill_engineering)
        if processed_data:
            successful_files.append((input_file, output_file))
            print(f"✅ 成功处理: {file}")
        else:
            print(f"❌ 处理失败: {file}")
    return successful_files
|
||||
|
||||
|
||||
def costsummary_upwards(input_dir: str, output_dir: str) -> List[Tuple[str, str]]:
|
||||
def _determine_project_type(data):
    """
    Decide the project type, first from the ``division`` field and then from
    the shape of the data.

    ``division`` is split on ``-``; the second segment of a two-part value, or
    the third segment of a longer one, is checked against the keyword lists. A
    value without any ``-`` is checked as a whole. If no keyword matches, the
    project is treated as a bill-of-quantities project when any top-level key
    contains a GUID.

    :param data: project data
    :return: ``'inventory'`` for a bill-of-quantities project (清单工程),
        ``'budget'`` for a budget project (预算工程)
    """
    # Keywords marking a bill-of-quantities project
    inventory_keywords = ["清单", "结算", "招标控制价", "招投标工程", "清单计价"]
    # Keywords marking a budget project
    budget_keywords = ["概预算", "定额", "定额计价", "概算", "概预算工程"]

    division = data["division"] if "division" in data else None

    # Anything that is not a non-empty string falls through to the structural check.
    if division and isinstance(division, str):
        parts = division.split("-")
        # Index 1 for two parts, index 2 for three or more, and index 0 when
        # there is no "-" at all (that case used to raise IndexError).
        division_type = parts[min(len(parts) - 1, 2)].strip()

        # Bill-of-quantities keywords take precedence over budget keywords
        if any(keyword in division_type for keyword in inventory_keywords):
            print(f"根据division字段 '{division}' 判断为清单工程")
            return "inventory"

        if any(keyword in division_type for keyword in budget_keywords):
            print(f"根据division字段 '{division}' 判断为预算工程")
            return "budget"

    # No verdict from division: look for a GUID inside any top-level key
    guid_pattern = re.compile(r"[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}", re.IGNORECASE)
    for key in data.keys():
        if isinstance(key, str) and guid_pattern.search(key):
            print("通过数据结构判断为清单工程")
            return "inventory"

    return "budget"
|
||||
|
||||
|
||||
def costsummary_upwards(
    input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
) -> List[Tuple[str, str]]:
    """
    Module-level entry point: process every JSON file in ``input_dir`` by
    delegating to ``ExpenseProcessor.process_directory``.

    :param input_dir: directory holding the JSON files to process
    :param output_dir: directory the processed JSON files are written to
    :param is_bill_engineering: forwarded unchanged to ``process_directory``
    :return: whatever ``process_directory`` returns, i.e. a list of
        ``(input path, output path)`` pairs
    """
    return ExpenseProcessor.process_directory(input_dir, output_dir, is_bill_engineering)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 使用示例
|
||||
input_directory = "final_outputs" # 输入JSON文件夹路径
|
||||
output_directory = "final_outputs" # 输出JSON文件夹路径
|
||||
|
||||
# 处理整个文件夹
|
||||
input_directory = "project2json/outputs/json"
|
||||
output_directory = "project2json/outputs/merged"
|
||||
# 自动判断工程类型
|
||||
result = costsummary_upwards(input_directory, output_directory)
|
||||
|
||||
# 显示处理结果
|
||||
if result:
|
||||
print(f"\n成功处理了 {len(result)} 个文件:")
|
||||
for src, dst in result:
|
||||
|
||||
Reference in New Issue
Block a user