增加了知识图谱导出excel

2025-08-18 15:14:37 +08:00
parent ce2986fbe2
commit 3fd0b2af0c
610 changed files with 6062 additions and 4932473 deletions
@@ -1,141 +1,283 @@
 """
-第四步：实现项目划分和清单节点费用预览向上汇总
+第三步：向上汇总费用预览
 """

 import json
 import os
-import glob
-from typing import Dict, List, Any, Tuple
+from typing import Dict, List, Any, Tuple, Optional
 import copy
+import re


 class ExpenseProcessor:
    def __init__(self):
        pass

+    @staticmethod
+    def normalize_guid(guid: str) -> str:
+        """
+        标准化GUID格式，确保只有单中括号
+        :param guid: 原始GUID字符串
+        :return: 标准化后的GUID字符串
+        """
+        if not guid:
+            return guid
+        # 移除所有中括号，然后添加单中括号
+        normalized = guid.strip("{}")
+        return "{" + normalized + "}"
+
+    @staticmethod
+    def is_cost_item(obj: Any) -> bool:
+        """
+        判断一个对象是否为费用项（只有 id 和 cost 字段）
+        """
+        return (
+            isinstance(obj, dict)
+            and "id" in obj
+            and "cost" in obj
+            and len(obj) <= 2  # 允许有额外字段，但核心是 id 和 cost
+        )
+
+    @staticmethod
+    def extract_costs_from_children(node: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        从节点的 children 中提取费用项（用于叶子节点）
+        :param node: 节点
+        :return: 费用项列表
+        """
+        costs = []
+        if "children" in node and isinstance(node["children"], list):
+            for child in node["children"]:
+                if ExpenseProcessor.is_cost_item(child):
+                    # 深拷贝费用项
+                    costs.append(copy.deepcopy(child))
+        return costs
+
    @staticmethod
    def calculate_parent_costs(node: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
-        计算节点的汇总费用
+        计算节点的汇总费用（包括自身和所有后代）
        :param node: 费用预览节点
        :return: 汇总后的费用项列表
        """
        result_nodes = []
-        processed_ids = set()
+        processed_ids = {}

-        # 首先处理节点自身的sum数组
+        # 1. 收集本节点自身的 sum 费用
        if "sum" in node and isinstance(node["sum"], list):
            for cost_item in node["sum"]:
                if "id" in cost_item and "cost" in cost_item:
-                    result_nodes.append(copy.deepcopy(cost_item))
-                    processed_ids.add(cost_item["id"])
+                    item_id = cost_item["id"]
+                    if item_id not in processed_ids:
+                        processed_ids[item_id] = 0.0
+                    try:
+                        processed_ids[item_id] += float(cost_item["cost"])
+                    except (ValueError, TypeError):
+                        pass  # 忽略无效 cost

-        # 然后处理children节点
+        # 2. 检查 children 中是否直接包含费用项（叶子节点）
+        child_costs = ExpenseProcessor.extract_costs_from_children(node)
+        for cost_item in child_costs:
+            item_id = cost_item["id"]
+            if item_id not in processed_ids:
+                processed_ids[item_id] = 0.0
+            try:
+                processed_ids[item_id] += float(cost_item["cost"])
+            except (ValueError, TypeError):
+                pass
+
+        # 3. 递归处理子节点（结构化节点）
+        # 注意：这里我们不需要再递归计算，因为每个子节点已经在process_node中计算了自己的sum
+        # 我们只需要直接使用子节点的sum即可
        if "children" in node and isinstance(node["children"], list):
            for child in node["children"]:
-                child_costs = ExpenseProcessor.calculate_parent_costs(child)
-
-                # 合并子节点的费用
-                for cost_item in child_costs:
-                    if "id" in cost_item:
-                        # 查找是否已存在相同id的费用项
-                        found = False
-                        for existing in result_nodes:
-                            if "id" in existing and existing["id"] == cost_item["id"]:
-                                # 如果存在，累加cost值
-                                existing["cost"] = str(float(existing["cost"]) + float(cost_item["cost"]))
-                                found = True
-                                break
-
-                        # 如果不存在，添加新的费用项
-                        if not found:
-                            result_nodes.append(copy.deepcopy(cost_item))
-                    else:
-                        # 处理没有id的费用项（不常见）
-                        found = False
-                        for existing in result_nodes:
-                            if "id" not in existing:
-                                existing["cost"] = str(float(existing["cost"]) + float(cost_item["cost"]))
-                                found = True
-                                break
-                        if not found:
-                            result_nodes.append(copy.deepcopy(cost_item))
+                # 只处理非费用项的子节点
+                if not ExpenseProcessor.is_cost_item(child):
+                    # 直接使用子节点的sum
+                    if "sum" in child and isinstance(child["sum"], list):
+                        for cost_item in child["sum"]:
+                            if "id" in cost_item and "cost" in cost_item:
+                                item_id = cost_item["id"]
+                                if item_id not in processed_ids:
+                                    processed_ids[item_id] = 0.0
+                                try:
+                                    processed_ids[item_id] += float(cost_item["cost"])
+                                except (ValueError, TypeError):
+                                    pass

+        # 构建结果
+        result_nodes = [{"id": item_id, "cost": str(total_cost)} for item_id, total_cost in processed_ids.items()]
        return result_nodes

    @staticmethod
-    def process_node(node: Dict[str, Any]) -> Dict[str, Any]:
+    def find_guid_quantity(project_data: Optional[Dict[str, Any]], guid: str) -> float:
+        """
+        在projectDivision中查找指定GUID节点的数量
+        :param project_data: 项目数据
+        :param guid: 要查找的GUID（带花括号的格式）
+        :return: 数量值，如果未找到则返回1.0
+        """
+        if not project_data or "projectDivision" not in project_data:
+            return 1.0
+
+        # 移除花括号以便比较
+        guid_clean = guid.strip("{}")
+
+        def search_node_quantity(node):
+            if isinstance(node, dict):
+                # 检查当前节点的GUID
+                node_guid = node.get("GUID", "").strip("{}")
+                if node_guid == guid_clean:
+                    # 找到匹配的GUID，获取数量
+                    quantity = node.get("数量")
+                    if quantity:
+                        try:
+                            return float(quantity)
+                        except (ValueError, TypeError):
+                            return 1.0
+
+                # 递归查找子节点
+                for key, value in node.items():
+                    if isinstance(value, (dict, list)):
+                        result = search_node_quantity(value)
+                        if result != 1.0:  # 找到非默认值
+                            return result
+
+            elif isinstance(node, list):
+                for item in node:
+                    result = search_node_quantity(item)
+                    if result != 1.0:  # 找到非默认值
+                        return result
+
+            return 1.0  # 默认返回1.0
+
+        return search_node_quantity(project_data["projectDivision"])
+
+    @staticmethod
+    def process_node(
+        node: Dict[str, Any], project_data: Optional[Dict[str, Any]] = None, is_bill_engineering: Optional[bool] = None
+    ) -> Dict[str, Any]:
        """
        处理单个节点，计算汇总费用并更新sum数组
        :param node: 费用预览节点
+        :param project_data: 项目数据，用于查找GUID对应的数量
+        :param is_bill_engineering: 是否为清单工程
        :return: 处理后的节点
        """
        result = copy.deepcopy(node)

-        # 如果没有children，则不需要汇总
-        if "children" not in node or not node["children"]:
-            # 确保节点有标准格式
-            if "sum" not in result:
-                result["sum"] = []
-            if "children" not in result:
-                result["children"] = []
-            if "rcj" not in result:
-                result["rcj"] = []
-            return result
+        # 标准化GUID格式
+        if "GUID" in result:
+            result["GUID"] = ExpenseProcessor.normalize_guid(result["GUID"])

-        # 计算汇总费用
-        cost_items = ExpenseProcessor.calculate_parent_costs(node)
-
-        # 更新sum数组
-        if cost_items:
-            # 确保只保留id和cost两个属性
-            result["sum"] = [{"id": item["id"], "cost": item["cost"]} for item in cost_items if "id" in item]
-        else:
+        # 确保关键字段存在
+        if "sum" not in result:
            result["sum"] = []
-
-        # 递归处理子节点
-        result["children"] = [ExpenseProcessor.process_node(child) for child in node["children"]]
-
-        # 确保rcj数组存在
        if "rcj" not in result:
            result["rcj"] = []
+        if "children" not in result:
+            result["children"] = []
+
+        # 如果is_bill_engineering为None，默认为False
+        if is_bill_engineering is None:
+            is_bill_engineering = False
+
+        # === 特殊处理：如果 children 包含的是费用项（叶子节点）===
+        direct_costs = ExpenseProcessor.extract_costs_from_children(result)
+        if direct_costs:
+            # 如果是清单工程且有项目数据，需要根据GUID调整费用
+            if is_bill_engineering and project_data and "GUID" in result:
+                guid = result["GUID"]
+                quantity = ExpenseProcessor.find_guid_quantity(project_data, guid)
+
+                # 调整费用值：乘以数量
+                for cost_item in direct_costs:
+                    try:
+                        original_cost = float(cost_item["cost"])
+                        adjusted_cost = original_cost * quantity
+                        cost_item["cost"] = str(adjusted_cost)
+                    except (ValueError, TypeError):
+                        pass  # 忽略无效 cost
+
+            # 将直接费用项迁移到 sum
+            result["sum"] = direct_costs
+            # 清空 children（因为已经迁移）
+            result["children"] = []
+            # 不再递归处理 children
+            return result
+
+        # === 普通节点处理：children 是子节点列表 ===
+        # 递归处理所有子节点
+        processed_children = []
+        if result["children"]:
+            for child in node["children"]:
+                if not ExpenseProcessor.is_cost_item(child):
+                    processed_child = ExpenseProcessor.process_node(child, project_data, is_bill_engineering)
+                    processed_children.append(processed_child)
+
+            # 更新处理后的子节点
+            result["children"] = processed_children
+
+        # 重要修改：使用处理后的result（包含已处理的子节点）来计算汇总费用
+        # 而不是使用原始的node
+        total_costs = ExpenseProcessor.calculate_parent_costs(result)
+        result["sum"] = total_costs

        return result

    @staticmethod
-    def process_expense_preview(expense_preview: Dict[str, Any]) -> Dict[str, Any]:
+    def process_expense_preview(
+        expense_preview: Dict[str, Any],
+        project_data: Optional[Dict[str, Any]] = None,
+        is_bill_engineering: Optional[bool] = None,
+    ) -> Dict[str, Any]:
        """
        处理整个费用预览结构
-        :param expense_preview: 费用预览结构
-        :return: 处理后的费用预览结构
+        :param expense_preview: 费用预览数据
+        :param project_data: 项目数据，用于查找GUID对应的数量
+        :param is_bill_engineering: 是否为清单工程
+        :return: 处理后的费用预览数据
        """
+        # 如果is_bill_engineering为None，默认为False
+        if is_bill_engineering is None:
+            is_bill_engineering = False
+
        result = copy.deepcopy(expense_preview)
        for category_key, category_value in expense_preview.items():
-            for subcategory_key, subcategory_value in category_value.items():
-                if isinstance(subcategory_value, list):
-                    result[category_key][subcategory_key] = [
-                        ExpenseProcessor.process_node(item) for item in subcategory_value
-                    ]
+            if isinstance(category_value, dict):
+                for subcategory_key, subcategory_value in category_value.items():
+                    if isinstance(subcategory_value, list):
+                        result[category_key][subcategory_key] = [
+                            ExpenseProcessor.process_node(item, project_data, is_bill_engineering)
+                            for item in subcategory_value
+                        ]
+            elif isinstance(category_value, list):
+                result[category_key] = [
+                    ExpenseProcessor.process_node(item, project_data, is_bill_engineering) for item in category_value
+                ]
        return result

+    # 以下方法保持不变
    @classmethod
-    def load_and_process_from_file(cls, input_path: str, output_path: str = None) -> Dict[str, Any]:
-        """
-        从文件加载 JSON 并处理
-        :param input_path: 输入文件路径
-        :param output_path: 输出文件路径（可选）
-        :return: 处理后的完整数据
-        """
+    def load_and_process_from_file(
+        cls, input_path: str, output_path: str | None = None, is_bill_engineering: Optional[bool] = None
+    ) -> Optional[Dict[str, Any]]:
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
-
            if "projectData" in data and "expensePreview" in data["projectData"]:
+                # 如果没有指定工程类型，则自动判断
+                if is_bill_engineering is None:
+                    project_type = _determine_project_type(data)
+                    is_bill_engineering = project_type == "inventory"
+                    print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")
+
                processed_data = copy.deepcopy(data)
                processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
-                    data["projectData"]["expensePreview"]
+                    data["projectData"]["expensePreview"],
+                    data["projectData"] if is_bill_engineering else None,
+                    is_bill_engineering,
                )
-
                if output_path:
                    with open(output_path, "w", encoding="utf-8") as f:
                        json.dump(processed_data, f, ensure_ascii=False, indent=4)
@@ -149,81 +291,107 @@ class ExpenseProcessor:
            return None

    @classmethod
-    def process_raw_data(cls, raw_data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        直接处理原始数据（不涉及文件读写）
-        :param raw_data: 原始数据，格式应包含 projectData.expensePreview
-        :return: 处理后的数据
-        """
+    def process_raw_data(cls, raw_data: Dict[str, Any], is_bill_engineering: Optional[bool] = None) -> Dict[str, Any]:
        if "projectData" in raw_data and "expensePreview" in raw_data["projectData"]:
+            # 如果没有指定工程类型，则自动判断
+            if is_bill_engineering is None:
+                project_type = _determine_project_type(raw_data)
+                is_bill_engineering = project_type == "inventory"
+                print(f"自动判断工程类型: {'清单工程' if is_bill_engineering else '预算工程'}")
+
            processed_data = copy.deepcopy(raw_data)
            processed_data["projectData"]["expensePreview"] = cls.process_expense_preview(
-                raw_data["projectData"]["expensePreview"]
+                raw_data["projectData"]["expensePreview"],
+                raw_data["projectData"] if is_bill_engineering else None,
+                is_bill_engineering,
            )
            return processed_data
        else:
            raise ValueError("未找到 projectData.expensePreview 路径")

    @classmethod
-    def process_directory(cls, input_dir: str, output_dir: str) -> List[Tuple[str, str]]:
-        """
-        处理指定目录中的所有JSON文件
-
-        :param input_dir: 输入目录路径，包含要处理的JSON文件
-        :param output_dir: 输出目录路径，处理后的JSON文件将保存在这里
-        :return: 成功处理的文件列表，格式为 [(源文件路径, 输出文件路径), ...]
-        """
-        # 确保输出目录存在
+    def process_directory(
+        cls, input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
+    ) -> List[Tuple[str, str]]:
        os.makedirs(output_dir, exist_ok=True)
-
-        # 查找所有JSON文件
-        json_files = []
-        for file in os.listdir(input_dir):
-            if file.lower().endswith(".json"):
-                json_files.append(os.path.join(input_dir, file))
-
+        json_files = [f for f in os.listdir(input_dir) if f.lower().endswith(".json")]
        if not json_files:
            print(f"警告: 在目录 {input_dir} 中没有找到JSON文件")
            return []

-        # 处理每个JSON文件
        successful_files = []
-        for input_file in json_files:
-            file_name = os.path.basename(input_file)
-            output_file = os.path.join(output_dir, file_name)
-
+        for file in json_files:
+            input_file = os.path.join(input_dir, file)
+            output_file = os.path.join(output_dir, file)
            print(f"处理文件: {input_file}")
-            processed_data = cls.load_and_process_from_file(input_file, output_file)
-
+            processed_data = cls.load_and_process_from_file(input_file, output_file, is_bill_engineering)
            if processed_data:
                successful_files.append((input_file, output_file))
-                print(f"✅ 成功处理: {file_name}")
+                print(f"✅ 成功处理: {file}")
            else:
-                print(f"❌ 处理失败: {file_name}")
-
+                print(f"❌ 处理失败: {file}")
        return successful_files


-def costsummary_upwards(input_dir: str, output_dir: str) -> List[Tuple[str, str]]:
+def _determine_project_type(data):
    """
-    处理指定目录中的所有JSON文件，实现项目划分和清单节点费用预览向上汇总
+    根据division字段判断工程类型
+    :param data: 项目数据
+    :return: 'inventory' 表示清单工程，'budget' 表示预算工程
+    """
+    # 清单工程关键词
+    inventory_keywords = ["清单", "结算", "招标控制价", "招投标工程", "清单计价"]
+    # 预算工程关键词
+    budget_keywords = ["概预算", "定额", "定额计价", "概算", "概预算工程"]

-    :param input_dir: 输入目录路径，包含要处理的JSON文件
-    :param output_dir: 输出目录路径，处理后的JSON文件将保存在这里
-    :return: 成功处理的文件列表，格式为 [(源文件路径, 输出文件路径), ...]
-    """
-    return ExpenseProcessor.process_directory(input_dir, output_dir)
+    # 尝试从数据中获取division字段
+    division = None
+    if "division" in data:
+        division = data["division"]
+        parts = division.split("-")
+
+    # 如果找到division字段
+    if division:
+        # 去掉"主网-"前缀
+        if len(parts) == 2:
+            division_type = parts[1].strip()
+        else:
+            division_type = parts[2].strip()
+
+        # 判断是否为清单工程
+        for keyword in inventory_keywords:
+            if keyword in division_type:
+                print(f"根据division字段 '{division}' 判断为清单工程")
+                return "inventory"
+
+        # 判断是否为预算工程
+        for keyword in budget_keywords:
+            if keyword in division_type:
+                print(f"根据division字段 '{division}' 判断为预算工程")
+                return "budget"
+
+    # 如果无法通过division字段判断，则尝试通过数据结构判断
+    is_inventory_project = False
+    for key in data.keys():
+        if re.search(r"[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}", key, re.IGNORECASE):
+            is_inventory_project = True
+            print("通过数据结构判断为清单工程")
+            break
+
+    return "inventory" if is_inventory_project else "budget"
+
+
+def costsummary_upwards(
+    input_dir: str, output_dir: str, is_bill_engineering: Optional[bool] = None
+) -> List[Tuple[str, str]]:
+    return ExpenseProcessor.process_directory(input_dir, output_dir, is_bill_engineering)


 if __name__ == "__main__":
-    # 使用示例
-    input_directory = "final_outputs"  # 输入JSON文件夹路径
-    output_directory = "final_outputs"  # 输出JSON文件夹路径
-
-    # 处理整个文件夹
+    input_directory = "project2json/outputs/json"
+    output_directory = "project2json/outputs/merged"
+    # 自动判断工程类型
    result = costsummary_upwards(input_directory, output_directory)
-
-    # 显示处理结果
    if result:
        print(f"\n成功处理了 {len(result)} 个文件:")
        for src, dst in result: