Files
KG_generation/supplement_kg.py
T
2025-08-18 15:14:37 +08:00

401 lines
16 KiB
Python

"""
第三步:向上汇总费用预览
"""
import json
import os
from typing import Dict, List, Any, Tuple, Optional
import copy
import re
class ExpenseProcessor:
def __init__(self):
pass
@staticmethod
def normalize_guid(guid: str) -> str:
"""
标准化GUID格式,确保只有单中括号
:param guid: 原始GUID字符串
:return: 标准化后的GUID字符串
"""
if not guid:
return guid
# 移除所有中括号,然后添加单中括号
normalized = guid.strip("{}")
return "{" + normalized + "}"
@staticmethod
def is_cost_item(obj: Any) -> bool:
"""
判断一个对象是否为费用项(只有 id 和 cost 字段)
"""
return (
isinstance(obj, dict)
and "id" in obj
and "cost" in obj
and len(obj) <= 2 # 允许有额外字段,但核心是 id 和 cost
)
@staticmethod
def extract_costs_from_children(node: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
从节点的 children 中提取费用项(用于叶子节点)
:param node: 节点
:return: 费用项列表
"""
costs = []
if "children" in node and isinstance(node["children"], list):
for child in node["children"]:
if ExpenseProcessor.is_cost_item(child):
# 深拷贝费用项
costs.append(copy.deepcopy(child))
return costs
@staticmethod
def calculate_parent_costs(node: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
计算节点的汇总费用(包括自身和所有后代)
:param node: 费用预览节点
:return: 汇总后的费用项列表
"""
result_nodes = []
processed_ids = {}
# 1. 收集本节点自身的 sum 费用
if "sum" in node and isinstance(node["sum"], list):
for cost_item in node["sum"]:
if "id" in cost_item and "cost" in cost_item:
item_id = cost_item["id"]
if item_id not in processed_ids:
processed_ids[item_id] = 0.0
try:
processed_ids[item_id] += float(cost_item["cost"])
except (ValueError, TypeError):
pass # 忽略无效 cost
# 2. 检查 children 中是否直接包含费用项(叶子节点)
child_costs = ExpenseProcessor.extract_costs_from_children(node)
for cost_item in child_costs:
item_id = cost_item["id"]
if item_id not in processed_ids:
processed_ids[item_id] = 0.0
try:
processed_ids[item_id] += float(cost_item["cost"])
except (ValueError, TypeError):
pass
# 3. 递归处理子节点(结构化节点)
# 注意:这里我们不需要再递归计算,因为每个子节点已经在process_node中计算了自己的sum
# 我们只需要直接使用子节点的sum即可
if "children" in node and isinstance(node["children"], list):
for child in node["children"]:
# 只处理非费用项的子节点
if not ExpenseProcessor.is_cost_item(child):
# 直接使用子节点的sum
if "sum" in child and isinstance(child["sum"], list):
for cost_item in child["sum"]:
if "id" in cost_item and "cost" in cost_item:
item_id = cost_item["id"]
if item_id not in processed_ids:
processed_ids[item_id] = 0.0
try:
processed_ids[item_id] += float(cost_item["cost"])
except (ValueError, TypeError):
pass
# 构建结果
result_nodes = [{"id": item_id, "cost": str(total_cost)} for item_id, total_cost in processed_ids.items()]
return result_nodes
@staticmethod
def find_guid_quantity(project_data: Optional[Dict[str, Any]], guid: str) -> float:
"""
在projectDivision中查找指定GUID节点的数量
:param project_data: 项目数据
:param guid: 要查找的GUID(带花括号的格式)
:return: 数量值,如果未找到则返回1.0
"""
if not project_data or "projectDivision" not in project_data:
return 1.0
# 移除花括号以便比较
guid_clean = guid.strip("{}")
def search_node_quantity(node):
if isinstance(node, dict):
# 检查当前节点的GUID
node_guid = node.get("GUID", "").strip("{}")
if node_guid == guid_clean:
# 找到匹配的GUID,获取数量
quantity = node.get("数量")
if quantity:
try:
return float(quantity)
except (ValueError, TypeError):
return 1.0
# 递归查找子节点
for key, value in node.items():
if isinstance(value, (dict, list)):
result = search_node_quantity(value)
if result != 1.0: # 找到非默认值
return result
elif isinstance(node, list):
for item in node:
result = search_node_quantity(item)
if result != 1.0: # 找到非默认值
return result
return 1.0 # 默认返回1.0
return search_node_quantity(project_data["projectDivision"])
@staticmethod
def process_node(
node: Dict[str, Any], project_data: Optional[Dict[str, Any]] = None, is_bill_engineering: Optional[bool] = None
) -> Dict[str, Any]:
"""
处理单个节点,计算汇总费用并更新sum数组
:param node: 费用预览节点
:param project_data: 项目数据,用于查找GUID对应的数量
:param is_bill_engineering: 是否为清单工程
:return: 处理后的节点
"""
result = copy.deepcopy(node)
# 标准化GUID格式
if "GUID" in result:
result["GUID"] = ExpenseProcessor.normalize_guid(result["GUID"])
# 确保关键字段存在
if "sum" not in result:
result["sum"] = []
if "rcj" not in result:
result["rcj"] = []
if "children" not in result:
result["children"] = []
# 如果is_bill_engineering为None,默认为False
if is_bill_engineering is None:
is_bill_engineering = False
# === 特殊处理:如果 children 包含的是费用项(叶子节点)===
direct_costs = ExpenseProcessor.extract_costs_from_children(result)
if direct_costs:
# 如果是清单工程且有项目数据,需要根据GUID调整费用
if is_bill_engineering and project_data and "GUID" in result:
guid = result["GUID"]
quantity = ExpenseProcessor.find_guid_quantity(project_data, guid)
# 调整费用值:乘以数量
for cost_item in direct_costs:
try:
original_cost = float(cost_item["cost"])
adjusted_cost = original_cost * quantity
cost_item["cost"] = str(adjusted_cost)
except (ValueError, TypeError):
pass # 忽略无效 cost
# 将直接费用项迁移到 sum
result["sum"] = direct_costs
# 清空 children(因为已经迁移)
result["children"] = []
# 不再递归处理 children
return result
# === 普通节点处理:children 是子节点列表 ===
# 递归处理所有子节点
processed_children = []
if result["children"]:
for child in node["children"]:
if not ExpenseProcessor.is_cost_item(child):
processed_child = ExpenseProcessor.process_node(child, project_data, is_bill_engineering)
processed_children.append(processed_child)
# 更新处理后的子节点
result["children"] = processed_children
# 重要修改:使用处理后的result(包含已处理的子节点)来计算汇总费用
# 而不是使用原始的node
total_costs = ExpenseProcessor.calculate_parent_costs(result)
result["sum"] = total_costs
return result
@staticmethod
def process_expense_preview(
expense_preview: Dict[str, Any],
project_data: Optional[Dict[str, Any]] = None,
is_bill_engineering: Optional[bool] = None,
) -> Dict[str, Any]:
"""
处理整个费用预览结构
:param expense_preview: 费用预览数据
:param project_data: 项目数据,用于查找GUID对应的数量
:param is_bill_engineering: 是否为清单工程
:return: 处理后的费用预览数据
"""
# 如果is_bill_engineering为None,默认为False
if is_bill_engineering is None:
is_bill_engineering = False
result = copy.deepcopy(expense_preview)
for category_key, category_value in expense_preview.items():
if isinstance(category_value, dict):
for subcategory_key, subcategory_value in category_value.items():
if isinstance(subcategory_value, list):
result[category_key][subcategory_key] = [
ExpenseProcessor.process_node(item, project_data, is_bill_engineering)
for item in subcategory_value
]
elif isinstance(category_value, list):
result[category_key] = [
ExpenseProcessor.process_node(item, project_data, is_bill_engineering) for item in category_value
]
return result
# 以下方法保持不变
@classmethod
def load_and_process_from_file(
cls, input_path: str, output_path: str | None = None, is_bill_engineering: Optional[bool] = None
) -> Optional[Dict[str, Any]]:
try:
with open(input_path, "r", encoding="utf-8") as f:
data = json.load(f)
if "projectData" in data and "expensePreview" in data["projectData"]:
# 如果没有指定工程类型,则自动判断
if is_bill_engineering is None:
project_type = _determine_project_type(data)
is_bill_engineering = project_type == "inventory"
print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")
processed_data = copy.deepcopy(data)
processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
data["projectData"]["expensePreview"],
data["projectData"] if is_bill_engineering else None,
is_bill_engineering,
)
if output_path:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(processed_data, f, ensure_ascii=False, indent=4)
print(f"处理完成,结果已保存到 {output_path}")
return processed_data
else:
print(f"警告: 文件 {input_path} 中未找到 projectData.expensePreview 路径")
return None
except Exception as e:
print(f"处理文件 {input_path} 时出错: {str(e)}")
return None
@classmethod
def process_raw_data(cls, raw_data: Dict[str, Any], is_bill_engineering: Optional[bool] = None) -> Dict[str, Any]:
if "projectData" in raw_data and "expensePreview" in raw_data["projectData"]:
# 如果没有指定工程类型,则自动判断
if is_bill_engineering is None:
project_type = _determine_project_type(raw_data)
is_bill_engineering = project_type == "inventory"
print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")
processed_data = copy.deepcopy(raw_data)
processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
raw_data["projectData"]["expensePreview"],
raw_data["projectData"] if is_bill_engineering else None,
is_bill_engineering,
)
return processed_data
else:
raise ValueError("未找到 projectData.expensePreview 路径")
@classmethod
def process_directory(
cls, input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
) -> List[Tuple[str, str]]:
os.makedirs(output_dir, exist_ok=True)
json_files = [f for f in os.listdir(input_dir) if f.lower().endswith(".json")]
if not json_files:
print(f"警告: 在目录 {input_dir} 中没有找到JSON文件")
return []
successful_files = []
for file in json_files:
input_file = os.path.join(input_dir, file)
output_file = os.path.join(output_dir, file)
print(f"处理文件: {input_file}")
processed_data = cls.load_and_process_from_file(input_file, output_file, is_bill_engineering)
if processed_data:
successful_files.append((input_file, output_file))
print(f"✅ 成功处理: {file}")
else:
print(f"❌ 处理失败: {file}")
return successful_files
def _determine_project_type(data):
"""
根据division字段判断工程类型
:param data: 项目数据
:return: 'inventory' 表示清单工程,'budget' 表示预算工程
"""
# 清单工程关键词
inventory_keywords = ["清单", "结算", "招标控制价", "招投标工程", "清单计价"]
# 预算工程关键词
budget_keywords = ["概预算", "定额", "定额计价", "概算", "概预算工程"]
# 尝试从数据中获取division字段
division = None
if "division" in data:
division = data["division"]
parts = division.split("-")
# 如果找到division字段
if division:
# 去掉"主网-"前缀
if len(parts) == 2:
division_type = parts[1].strip()
else:
division_type = parts[2].strip()
# 判断是否为清单工程
for keyword in inventory_keywords:
if keyword in division_type:
print(f"根据division字段 '{division}' 判断为清单工程")
return "inventory"
# 判断是否为预算工程
for keyword in budget_keywords:
if keyword in division_type:
print(f"根据division字段 '{division}' 判断为预算工程")
return "budget"
# 如果无法通过division字段判断,则尝试通过数据结构判断
is_inventory_project = False
for key in data.keys():
if re.search(r"[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}", key, re.IGNORECASE):
is_inventory_project = True
print("通过数据结构判断为清单工程")
break
return "inventory" if is_inventory_project else "budget"
def costsummary_upwards(
input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
) -> List[Tuple[str, str]]:
return ExpenseProcessor.process_directory(input_dir, output_dir, is_bill_engineering)
if __name__ == "__main__":
input_directory = "project2json/outputs/json"
output_directory = "project2json/outputs/merged"
# 自动判断工程类型
result = costsummary_upwards(input_directory, output_directory)
if result:
print(f"\n成功处理了 {len(result)} 个文件:")
for src, dst in result:
print(f" {os.path.basename(src)} -> {os.path.basename(dst)}")
else:
print("\n没有文件被成功处理")