""" 通过本体层文件构建知识图谱 """ from py2neo import Graph, Node, Relationship, NodeMatcher from neo4j import GraphDatabase import json import os import logging import re import configparser import glob import time # 设置日志 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger(__name__) # 全局变量 graph = None def read_config(config_file="config.ini"): """ 读取配置文件 Args: config_file: 配置文件路径 Returns: config: 配置对象 """ config = configparser.ConfigParser() config.read(config_file, encoding="utf-8") return config def connect_to_neo4j(uri, user, password): """ 连接到Neo4j数据库 Args: uri: 数据库URI user: 用户名 password: 密码 Returns: graph: 数据库连接对象 """ global graph try: graph = Graph(uri, auth=(user, password)) logger.info("成功连接到Neo4j数据库") return True except Exception as e: logger.error(f"连接Neo4j数据库失败: {e}") graph = None return False def clear_database(): """ 清空数据库 """ if graph is None: logger.error("数据库连接不可用") return False try: graph.run("MATCH (n) DETACH DELETE n") logger.info("已清空数据库") # 删除所有约束 try: # 获取所有约束 constraints = graph.run("SHOW CONSTRAINTS").data() for constraint in constraints: constraint_name = constraint.get("name") if constraint_name: graph.run(f"DROP CONSTRAINT {constraint_name}") logger.info(f"已删除约束: {constraint_name}") except Exception as e: logger.warning(f"删除约束失败: {e}") return True except Exception as e: logger.error(f"清空数据库失败: {e}") return False # 解析本体层文件,获取实体类型、属性和关系定义 def parse_ontology_file(file_path="Ontology_Layer.txt"): """ 解析本体层文件,获取实体类型、属性和关系定义 Args: file_path: 本体层文件路径 Returns: entity_types: 实体类型和属性的字典 {实体类型名称: {属性名称: 属性类型}} entity_relationships: 实体间关系的列表 [(源实体类型, 关系类型, 目标实体类型)] """ try: with open(file_path, "r", encoding="utf-8") as f: content = f.read() # 分割实体类型和关系部分 parts = content.split("2. 实体间的关系") if len(parts) != 2: logger.error("本体层文件格式错误,无法找到实体类型和关系部分") return {}, [] entity_types_content = parts[0] relationships_content = parts[1] # 解析实体类型和属性 entity_types = {} current_entity = None # 移除 "1.实体类型" 标题行 entity_types_content = entity_types_content.replace("1.实体类型", "").strip() # 按行分割 lines = entity_types_content.strip().split("\n") for line in lines: line = line.strip() if not line: continue # 如果行不包含冒号,视为实体类型 if ":" not in line: current_entity = line entity_types[current_entity] = {} # 否则视为属性定义 else: if current_entity is None: continue parts = line.split(":", 1) if len(parts) == 2: attr_name = parts[0].strip() attr_type = parts[1].strip() entity_types[current_entity][attr_name] = attr_type # 解析实体间的关系 entity_relationships = [] # 匹配关系定义行: (:EntityType)-[:RELATIONSHIP]->(:EntityType) relationship_pattern = r"\(:(\w+)\)-\[:(\w+)\]->\(:(\w+)\)" for line in relationships_content.strip().split("\n"): line = line.strip() if not line: continue match = re.match(relationship_pattern, line) if match: source_entity, relationship_type, target_entity = match.groups() entity_relationships.append((source_entity, relationship_type, target_entity)) logger.info(f"从本体层文件中解析出 {len(entity_types)} 个实体类型和 {len(entity_relationships)} 个关系定义") return entity_types, entity_relationships except Exception as e: logger.error(f"解析本体层文件失败: {e}") return {}, [] # 创建约束和索引以提高性能 - 现在不创建任何约束 def create_constraints_and_indexes(): # 不创建任何约束 logger.info("跳过创建约束") pass # 获取实体之间的关系类型 def get_relationship_type(source_entity_type, target_entity_type, entity_relationships): """ 根据源实体类型和目标实体类型获取关系类型 Args: source_entity_type: 源实体类型 target_entity_type: 目标实体类型 entity_relationships: 实体间关系的列表 Returns: relationship_type: 关系类型,如果没找到则返回None """ for source, rel_type, target in entity_relationships: if source == source_entity_type and target == target_entity_type: return rel_type # 如果没有找到匹配的关系,返回None logger.warning(f"未找到从 {source_entity_type} 到 {target_entity_type} 的关系定义,不创建关系") return None # 创建根节点 def create_root_node(): root = Node("EngineeringData", name="工程") graph.create(root) logger.info("创建根节点: 工程") return root # --------------------------*项目划分*-------------------------- # 处理ProjectDivisionSet def process_project_division_set(data, parent_node, entity_relationships): # 根据您提供的JSON结构,正确访问projectDivision数据 if "projectData" in data and "projectDivision" in data["projectData"]: project_division = data["projectData"]["projectDivision"] elif "projectDivision" in data: project_division = data["projectDivision"] else: logger.warning("JSON中未找到projectDivision数据") logger.info(f"JSON顶层键: {list(data.keys())}") return logger.info(f"开始处理projectDivision,包含 {len(project_division)} 个顶级项目") # 创建新的ProjectDivisionSet节点 - 项目划分集 division_set = Node("ProjectDivisionSet", name="项目划分集") graph.create(division_set) # 添加sortid属性,固定为"1" division_set["sortid"] = "1" graph.push(division_set) # 从本体层获取关系类型 relationship_type = get_relationship_type("EngineeringData", "ProjectDivisionSet", entity_relationships) if relationship_type: # 只有当关系类型不为None时才创建关系 graph.create(Relationship(parent_node, relationship_type, division_set)) # 处理ProjectDivisionTree tree_idx = 1 # 用于跟踪ProjectDivisionTree的序号 for first_level_name, first_level_content in project_division.items(): # 处理第一层下的内容,直接创建合并后的ProjectDivisionTree节点 if isinstance(first_level_content, dict): # 处理一级名称,去掉"工程"字样 processed_first_level = first_level_name.replace("工程", "") for second_level_name, second_level_content in first_level_content.items(): # 确定最终节点名称 if second_level_name == processed_first_level: # 如果二级名称与处理后的一级名称相同,直接使用二级名称 final_name = second_level_name else: # 否则组合二级名称和处理后的一级名称 final_name = f"{second_level_name}{processed_first_level}" # 创建ProjectDivisionTree节点 division_tree = Node("ProjectDivisionTree", name=final_name) # 设置sortid属性,格式为"1.树索引",确保每个树有唯一的sortid current_sort_path = [1, tree_idx] division_tree["sortid"] = ".".join(map(str, current_sort_path)) tree_idx += 1 # 增加树索引,确保下一个树有不同的sortid # 保存原始名称作为属性 division_tree["original_first_level"] = first_level_name division_tree["original_second_level"] = second_level_name # 如果有GUID,添加到节点属性 guid = None if isinstance(first_level_content, dict) and "GUID" in first_level_content: division_tree["first_level_GUID"] = first_level_content["GUID"] guid = first_level_content["GUID"] graph.create(division_tree) graph.push(division_tree) # 保存属性 # 从本体层获取关系类型 relationship_type = get_relationship_type( "ProjectDivisionSet", "ProjectDivisionTree", entity_relationships ) if relationship_type: graph.create(Relationship(division_set, relationship_type, division_tree)) # 移除GUID关系建立代码,避免重复创建 # 关系将在establish_relationships函数中批量创建 # 处理第二层下的ProjectDivisionItem列表 if isinstance(second_level_content, list): for item_idx, item in enumerate(second_level_content, start=1): # 传递当前排序路径,为子项添加第三级索引 process_project_division_item( item, division_tree, entity_relationships, current_sort_path + [item_idx] ) else: logger.warning(f"ProjectDivisionTree {final_name} 的内容类型未知: {type(second_level_content)}") else: logger.warning(f"第一层 {first_level_name} 的内容类型未知: {type(first_level_content)}") # 处理ProjectDivisionItem def process_project_division_item(item, parent_node, entity_relationships, current_sort_path=None): # 提取必要属性 guid = item.get("GUID", item.get("guid", "")) name = item.get("项目名称", "") if not guid and not name: logger.warning("ProjectDivisionItem缺少GUID和项目名称") return # 创建ProjectDivisionItem节点 item_node = Node("ProjectDivisionItem", GUID=guid, name=name) # 设置sortid属性 if current_sort_path: item_node["sortid"] = ".".join(map(str, current_sort_path)) # 添加path属性,表示从ProjectDivisionItem到ProjectDivisionTree的路径,不包含节点类型 if isinstance(parent_node, Node) and "ProjectDivisionTree" in parent_node.labels: # 如果父节点是ProjectDivisionTree,使用"父节点名称/当前节点名称"作为路径 item_node["path"] = f"{parent_node['name']}/{name}" # logger.info(f"为ProjectDivisionItem {name} 设置path: {item_node['path']}") else: # 如果父节点是ProjectDivisionItem,使用"父节点path/当前节点名称"作为路径 parent_path = parent_node.get("path", "") if parent_path: item_node["path"] = f"{parent_path}/{name}" else: # 如果父节点没有path属性(不应该发生,但为了健壮性) item_node["path"] = name # logger.info(f"为ProjectDivisionItem {name} 设置path: {item_node['path']}") # 添加其他属性 for key, value in item.items(): if key not in ["GUID", "项目名称", "children"] and value is not None: # 检查是否为资源库列表 if key == "资源库列表" and isinstance(value, list): # 将资源库列表转换为分号分隔的字符串 resource_names = [] for resource in value: if isinstance(resource, dict) and "资源库名称" in resource: resource_names.append(resource["资源库名称"]) item_node["资源库名称"] = ";".join(resource_names) # logger.info(f"将资源库列表转换为字符串: {item_node['资源库名称']}") # 检查值是否为基本类型 elif isinstance(value, (str, int, float, bool)): item_node[key] = value # 如果是列表,尝试转换为分号分隔的字符串 elif isinstance(value, list): try: if all(isinstance(x, (str, int, float, bool)) for x in value): item_node[key] = ";".join(str(x) for x in value) else: # 对于包含复杂对象的列表,尝试提取关键信息 extracted_values = [] for item_in_list in value: if isinstance(item_in_list, dict): # 尝试提取字典中的名称或标识符 for name_key in ["名称", "name", "标识", "id", "ID"]: if name_key in item_in_list: extracted_values.append(str(item_in_list[name_key])) break else: # 如果没有找到名称键,使用第一个键值对 if item_in_list: first_key = next(iter(item_in_list)) extracted_values.append(f"{first_key}:{item_in_list[first_key]}") else: extracted_values.append(str(item_in_list)) item_node[key] = ";".join(extracted_values) except Exception as e: logger.warning(f"无法将列表属性 {key} 转换为字符串: {e}") # 如果是字典,尝试转换为字符串 elif isinstance(value, dict): try: # 提取字典中的关键信息 extracted_info = [] for dict_key, dict_value in value.items(): if isinstance(dict_value, (str, int, float, bool)): extracted_info.append(f"{dict_key}:{dict_value}") item_node[key] = ";".join(extracted_info) except Exception as e: logger.warning(f"无法将字典属性 {key} 转换为字符串: {e}") graph.create(item_node) graph.push(item_node) # 保存属性 # logger.info(f"创建ProjectDivisionItem节点: {name} (GUID: {guid})") # 创建与父节点的关系 if isinstance(parent_node, Node) and "ProjectDivisionTree" in parent_node.labels: # 从本体层获取关系类型 relationship_type = get_relationship_type("ProjectDivisionTree", "ProjectDivisionItem", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, item_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {item_node['name']}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {item_node['name']}") else: # 从本体层获取关系类型 relationship_type = get_relationship_type("ProjectDivisionItem", "ProjectDivisionItem", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, item_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {item_node['name']}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {item_node['name']}") # 移除GUID关系建立代码,避免重复创建 # 关系将在establish_relationships函数中批量创建 # 处理子项 if "children" in item and item["children"]: children = item["children"] # logger.info(f"ProjectDivisionItem {name} 有 {len(children)} 个子项") for child_idx, child in enumerate(children, start=1): child_type = child.get("type", child.get("类型", "")) # 创建子节点的排序路径 child_sort_path = current_sort_path + [child_idx] if current_sort_path else None if child_type == "项目划分": # 递归处理子ProjectDivisionItem process_project_division_item(child, item_node, entity_relationships, child_sort_path) elif child_type == "8" or child_type == "清单": # 处理List类型节点 process_list_item(child, item_node, entity_relationships, child_sort_path) else: # 处理ProjectQuantity及其子类 process_project_quantity(child, item_node, entity_relationships, child_sort_path) # 处理List及其子类 def process_list_item(list_item, parent_node, entity_relationships, current_sort_path=None): """处理清单类型的节点""" # 提取必要属性 guid = list_item.get("GUID", list_item.get("guid", "")) list_name = list_item.get("清单名称") list_type = list_item.get("类型", "") # 创建List节点 list_node = Node("List", guid=guid, name=list_name, type=list_type) # 设置sortid属性 if current_sort_path: list_node["sortid"] = ".".join(map(str, current_sort_path)) # 添加path属性,包含节点类型 parent_path = parent_node.get("path", "") if parent_path: list_node["path"] = f"{parent_path}/{list_name}(清单)" else: # 如果父节点没有path属性(不应该发生,但为了健壮性) parent_name = parent_node.get("name", "") list_node["path"] = f"{parent_name}/{list_name}(清单)" # logger.info(f"为List节点 {list_name} 设置path: {list_node['path']}") # 添加其他属性 for key, value in list_item.items(): if key not in ["清单名称", "类型", "guid", "children"] and value is not None: # 检查是否为资源库列表 if key == "资源库列表" and isinstance(value, list): # 将资源库列表转换为分号分隔的字符串 resource_names = [] for resource in value: if isinstance(resource, dict) and "资源库名称" in resource: resource_names.append(resource["资源库名称"]) list_node["资源库名称"] = ";".join(resource_names) # logger.info(f"将资源库列表转换为字符串: {list_node['资源库名称']}") # 检查值是否为基本类型 elif isinstance(value, (str, int, float, bool)): list_node[key] = value # 如果是列表,尝试转换为分号分隔的字符串 elif isinstance(value, list): try: if all(isinstance(x, (str, int, float, bool)) for x in value): list_node[key] = ";".join(str(x) for x in value) else: # 对于包含复杂对象的列表,尝试提取关键信息 extracted_values = [] for item_in_list in value: if isinstance(item_in_list, dict): # 尝试提取字典中的名称或标识符 for name_key in ["名称", "name", "标识", "id", "ID"]: if name_key in item_in_list: extracted_values.append(str(item_in_list[name_key])) break else: # 如果没有找到名称键,使用第一个键值对 if item_in_list: first_key = next(iter(item_in_list)) extracted_values.append(f"{first_key}:{item_in_list[first_key]}") else: extracted_values.append(str(item_in_list)) list_node[key] = ";".join(extracted_values) except Exception as e: logger.warning(f"无法将列表属性 {key} 转换为字符串: {e}") # 如果是字典,尝试转换为字符串 elif isinstance(value, dict): try: # 提取字典中的关键信息 extracted_info = [] for dict_key, dict_value in value.items(): if isinstance(dict_value, (str, int, float, bool)): extracted_info.append(f"{dict_key}:{dict_value}") list_node[key] = ";".join(extracted_info) except Exception as e: logger.warning(f"无法将字典属性 {key} 转换为字符串: {e}") graph.create(list_node) graph.push(list_node) # 保存属性 # logger.info(f"创建List节点: {list_name} (类型: {list_type})") # 创建与父节点的关系 # 从本体层获取关系类型 relationship_type = get_relationship_type(list(parent_node.labels)[0], "List", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, list_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {list_name}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {list_name}") # 如果有GUID,尝试建立与CostSet的关系 if guid: # 查找对应的CostSet节点 cost_set_query = f""" MATCH (c:CostSet) WHERE c.GUID = '{guid}' RETURN c """ cost_set_nodes = list(graph.run(cost_set_query)) if cost_set_nodes: cost_set_node = cost_set_nodes[0]["c"] relationship_type = get_relationship_type("List", "CostSet", entity_relationships) if relationship_type: graph.create(Relationship(list_node, relationship_type, cost_set_node)) # logger.info(f"创建关系: {list_name} {relationship_type} CostSet (GUID: {guid})") # else: # logger.info(f"不创建关系: {list_name} 到 CostSet (GUID: {guid})") # 处理子项 if "children" in list_item and list_item["children"]: children = list_item["children"] # logger.info(f"List {list_name} 有 {len(children)} 个子项") for child_idx, child in enumerate(children, start=1): # 确定子项类型 child_type = child.get("type", child.get("类型", "")) # 创建子节点的排序路径 child_sort_path = current_sort_path + [child_idx] if current_sort_path else [1, child_idx] if child_type == "项目划分": # 递归处理子ProjectDivisionItem process_project_division_item(child, list_node, entity_relationships, child_sort_path) elif child_type == "8" or child_type == "清单": # 递归处理子List process_list_item(child, list_node, entity_relationships, child_sort_path) else: # 处理ProjectQuantity及其子类 process_project_quantity(child, list_node, entity_relationships, child_sort_path) # 处理ProjectQuantity及其子类 def process_project_quantity(quantity, parent_node, entity_relationships, current_sort_path=None): # 确定具体类型 quantity_type = quantity.get("type", quantity.get("类型", "")) labels = ["ProjectQuantity"] type_name = "ProjectQuantity" # 支持数字和文本类型 if quantity_type == "0" or quantity_type == "定额": labels.append("Quota") type_name = "定额" elif quantity_type == "1" or quantity_type == "主材": labels.append("MainMaterial") type_name = "主材" elif quantity_type == "5" or quantity_type == "设备": labels.append("Equipment") type_name = "设备" elif quantity_type == "配件": # 特殊处理:当类型为“配件”时,读取“配件类型”字段 accessory_type = quantity.get("配件类型", "") # 默认为“配件” if accessory_type == "主材": labels.append("MainMaterial") type_name = "主材" else: labels.append("Accessory") type_name = "配件" else: # 可选:处理未知类型 pass # 创建节点 quantity_name = quantity.get("项目名称", quantity.get("名称", "")) quantity_node = Node(*labels, name=quantity_name) # 设置sortid属性 if current_sort_path: quantity_node["sortid"] = ".".join(map(str, current_sort_path)) # 添加path属性,包含节点类型 parent_path = parent_node.get("path", "") if parent_path: quantity_node["path"] = f"{parent_path}/{quantity_name}({type_name})" else: # 如果父节点没有path属性(不应该发生,但为了健壮性) parent_name = parent_node.get("name", "") quantity_node["path"] = f"{parent_name}/{quantity_name}({type_name})" # logger.info(f"为ProjectQuantity节点 {quantity_name} 设置path: {quantity_node['path']}") # 添加其他属性 for key, value in quantity.items(): if key not in ["名称", "项目名称", "材机列表", "children"] and value is not None: # 检查是否为资源库列表 if key == "资源库列表" and isinstance(value, list): # 将资源库列表转换为分号分隔的字符串 resource_names = [] for resource in value: if isinstance(resource, dict) and "资源库名称" in resource: resource_names.append(resource["资源库名称"]) quantity_node["资源库名称"] = ";".join(resource_names) # logger.info(f"将资源库列表转换为字符串: {quantity_node['资源库名称']}") # 检查值是否为基本类型 elif isinstance(value, (str, int, float, bool)): quantity_node[key] = value # 如果是列表,尝试转换为分号分隔的字符串 elif isinstance(value, list): try: if all(isinstance(x, (str, int, float, bool)) for x in value): quantity_node[key] = ";".join(str(x) for x in value) else: # 对于包含复杂对象的列表,尝试提取关键信息 extracted_values = [] for item_in_list in value: if isinstance(item_in_list, dict): # 尝试提取字典中的名称或标识符 for name_key in ["名称", "name", "标识", "id", "ID"]: if name_key in item_in_list: extracted_values.append(str(item_in_list[name_key])) break else: # 如果没有找到名称键,使用第一个键值对 if item_in_list: first_key = next(iter(item_in_list)) extracted_values.append(f"{first_key}:{item_in_list[first_key]}") else: extracted_values.append(str(item_in_list)) quantity_node[key] = ";".join(extracted_values) except Exception as e: logger.warning(f"无法将列表属性 {key} 转换为字符串: {e}") # 如果是字典,尝试转换为字符串 elif isinstance(value, dict): try: # 提取字典中的关键信息 extracted_info = [] for dict_key, dict_value in value.items(): if isinstance(dict_value, (str, int, float, bool)): extracted_info.append(f"{dict_key}:{dict_value}") quantity_node[key] = ";".join(extracted_info) except Exception as e: logger.warning(f"无法将字典属性 {key} 转换为字符串: {e}") graph.create(quantity_node) graph.push(quantity_node) # 保存属性 # logger.info(f"创建ProjectQuantity节点: {quantity_name} (id: {quantity_id}, 类型: {quantity_type})") # 创建与父节点的关系 # 从本体层获取关系类型 parent_label = list(parent_node.labels)[0] # 获取父节点的第一个标签 relationship_type = get_relationship_type(parent_label, "ProjectQuantity", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, quantity_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {quantity_name}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {quantity_name}") # 移除GUID关系建立代码,避免重复创建 # 关系将在establish_relationships函数中批量创建 # 处理材机列表或children materials = None # 先检查是否有材机列表 if "材机列表" in quantity and quantity["材机列表"]: materials = quantity["材机列表"] # logger.info(f"ProjectQuantity {quantity_name} 有 {len(materials)} 个材机项") for mat_idx, material in enumerate(materials, start=1): # 创建子节点的排序路径 mat_sort_path = current_sort_path + [mat_idx] if current_sort_path else [1, mat_idx] process_material_or_equipment(material, quantity_node, entity_relationships, mat_sort_path) # 如果没有材机列表,则检查是否有children elif "children" in quantity and quantity["children"]: children = quantity["children"] # logger.info(f"ProjectQuantity {quantity_name} 有 {len(children)} 个子项") for child_idx, child in enumerate(children, start=1): child_type = child.get("类型", child.get("type", "")) # 创建子节点的排序路径 child_sort_path = current_sort_path + [child_idx] if current_sort_path else [1, child_idx] # 如果子项类型为人工、材料或机械,则视为MaterialOrEquipment if child_type in ["人工", "材料", "机械", "2", "3", "4"]: process_material_or_equipment(child, quantity_node, entity_relationships, child_sort_path) # 如果子项类型为主材、设备或定额,则递归处理为ProjectQuantity elif child_type in ["1", "主材", "5", "设备", "0", "定额"]: process_project_quantity(child, quantity_node, entity_relationships, child_sort_path) # 处理MaterialOrEquipment def process_material_or_equipment(material, parent_node, entity_relationships, current_sort_path=None): material_id = material.get("id", material.get("ID", "")) material_name = material.get("名称", "") material_type = material.get("类型", material.get("type", "")) if not material_id and not material_name: logger.warning("MaterialOrEquipment缺少id和名称") return # 创建唯一标识,结合父节点的ID和当前项的id parent_id = parent_node.get("id", parent_node.get("GUID", "")) unique_id = f"{parent_id}_{material_id}" if parent_id else material_id # 直接创建新节点,不检查是否已存在 material_node = Node( "MaterialOrEquipment", id=material_id, unique_id=unique_id, name=material_name, type=material_type ) # 设置sortid属性 if current_sort_path: material_node["sortid"] = ".".join(map(str, current_sort_path)) # 添加其他属性 for key, value in material.items(): if key not in ["id", "ID", "名称", "类型", "type"] and value is not None: # 检查是否为资源库列表 if key == "资源库列表" and isinstance(value, list): # 将资源库列表转换为分号分隔的字符串 resource_names = [] for resource in value: if isinstance(resource, dict) and "资源库名称" in resource: resource_names.append(resource["资源库名称"]) material_node["资源库名称"] = ";".join(resource_names) # logger.info(f"将资源库列表转换为字符串: {material_node['资源库名称']}") # 检查值是否为基本类型 elif isinstance(value, (str, int, float, bool)): material_node[key] = value # 如果是列表,尝试转换为分号分隔的字符串 elif isinstance(value, list): try: if all(isinstance(x, (str, int, float, bool)) for x in value): material_node[key] = ";".join(str(x) for x in value) else: # 对于包含复杂对象的列表,尝试提取关键信息 extracted_values = [] for item_in_list in value: if isinstance(item_in_list, dict): # 尝试提取字典中的名称或标识符 for name_key in ["名称", "name", "标识", "id", "ID"]: if name_key in item_in_list: extracted_values.append(str(item_in_list[name_key])) break else: # 如果没有找到名称键,使用第一个键值对 if item_in_list: first_key = next(iter(item_in_list)) extracted_values.append(f"{first_key}:{item_in_list[first_key]}") else: extracted_values.append(str(item_in_list)) material_node[key] = ";".join(extracted_values) except Exception as e: logger.warning(f"无法将列表属性 {key} 转换为字符串: {e}") # 如果是字典,尝试转换为字符串 elif isinstance(value, dict): try: # 提取字典中的关键信息 extracted_info = [] for dict_key, dict_value in value.items(): if isinstance(dict_value, (str, int, float, bool)): extracted_info.append(f"{dict_key}:{dict_value}") material_node[key] = ";".join(extracted_info) except Exception as e: logger.warning(f"无法将字典属性 {key} 转换为字符串: {e}") graph.create(material_node) graph.push(material_node) # 保存属性 # logger.info(f"创建MaterialOrEquipment节点: {material_name} (id: {material_id}, 类型: {material_type})") # 创建与父节点的关系 # 从本体层获取关系类型 relationship_type = get_relationship_type("ProjectQuantity", "MaterialOrEquipment", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, material_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {material_name}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {material_name}") # --------------------------*费用预览*-------------------------- # 处理CostSet def process_cost_set(data, root_node, entity_relationships): """处理费用预览,为每个具有GUID的节点创建CostSet节点""" # 根据JSON结构,访问expensePreview数据 if "projectData" in data and "expensePreview" in data["projectData"]: expense_preview = data["projectData"]["expensePreview"] elif "expensePreview" in data: expense_preview = data["expensePreview"] else: logger.warning("JSON中未找到expensePreview数据") logger.info(f"JSON顶层键: {list(data.keys())}") return # logger.info("开始处理expensePreview") # 用于跟踪已处理的GUID,避免重复处理 processed_guids = set() # 用于跟踪费用预览节点的序号 cost_set_counter = 1 # 递归函数,用于遍历expensePreview中的所有节点 def traverse_expense_preview(node, path=""): nonlocal cost_set_counter if isinstance(node, dict): # 如果节点包含GUID,则创建CostSet节点 if "GUID" in node: guid = node["GUID"] # 标准化GUID格式 guid = guid.strip("{}") guid = "{" + guid.upper() + "}" if guid not in processed_guids: processed_guids.add(guid) # 创建CostSet节点 cost_set_node = Node("CostSet", GUID=guid, name="费用预览集") # 设置sortid属性,每个CostSet都从1开始 cost_set_node["sortid"] = "1" try: graph.create(cost_set_node) graph.push(cost_set_node) # 保存属性 # logger.info(f"创建CostSet节点: 费用预览集 (GUID: {guid}, 路径: {path})") except Exception as e: logger.error(f"创建CostSet节点失败: {e}") return # 创建与根节点的关系 relationship_type = get_relationship_type("EngineeringData", "CostSet", entity_relationships) if relationship_type: try: graph.create(Relationship(root_node, relationship_type, cost_set_node)) # logger.info(f"创建关系: {root_node['name']} {relationship_type} 费用预览集") except Exception as e: logger.error(f"创建关系失败: {e}") # 处理sum数组中的CostItem if "sum" in node and isinstance(node["sum"], list) and node["sum"]: for item_idx, item in enumerate(node["sum"], start=1): try: # 为每个CostItem创建独立的排序路径,从1.1开始 item_sort_path = [1, item_idx] process_cost_item(item, cost_set_node, entity_relationships, item_sort_path) except Exception as e: logger.error(f"处理CostItem时出错: {e}") # 处理rcj数组中的MaterialandmachineCostItem if "rcj" in node and isinstance(node["rcj"], list) and node["rcj"]: for item_idx, item in enumerate(node["rcj"], start=1): try: # 为每个MaterialandmachineCostItem创建独立的排序路径,从2.1开始 item_sort_path = [2, item_idx] process_material_machine_cost_item( item, cost_set_node, entity_relationships, item_sort_path ) except Exception as e: logger.error(f"处理MaterialandmachineCostItem时出错: {e}") # 增加计数器,为下一个CostSet准备 cost_set_counter += 1 # 递归处理子节点 for key, value in node.items(): if key == "children" and isinstance(value, list): for i, child in enumerate(value, start=1): traverse_expense_preview(child, f"{path}.children[{i}]") elif isinstance(node, list): for i, item in enumerate(node, start=1): traverse_expense_preview(item, f"{path}[{i}]") # 遍历expensePreview中的所有类别 for category_idx, (category_name, category) in enumerate(expense_preview.items(), start=1): # 处理category是字典的情况 if isinstance(category, dict): for subcategory_idx, (subcategory_name, subcategory) in enumerate(category.items(), start=1): if isinstance(subcategory, list): for i, item in enumerate(subcategory, start=1): traverse_expense_preview(item, f"expensePreview.{category_name}.{subcategory_name}[{i}]") # 处理category是列表的情况 elif isinstance(category, list): for i, item in enumerate(category, start=1): traverse_expense_preview(item, f"expensePreview.{category_name}[{i}]") # logger.info(f"共处理了 {len(processed_guids)} 个CostSet节点") # 处理CostItem - 简化处理逻辑,确保正确创建节点 def process_cost_item(item, parent_node, entity_relationships, current_sort_path=None): # 确保item是字典 if not isinstance(item, dict): logger.error(f"CostItem不是字典类型: {item}") return item_id = item.get("id", "") cost = item.get("cost", "") if not item_id: logger.warning(f"跳过没有id的CostItem: {item}") return # 使用id作为名称 name = item_id # 创建唯一标识,结合父节点的GUID和当前项的id parent_guid = parent_node.get("GUID", "") unique_id = f"{parent_guid}_{item_id}" if parent_guid else item_id # 创建CostItem节点 - 简化属性,只保留关键属性 item_node = Node("CostItem", id=item_id, unique_id=unique_id, cost=cost, name=name) # 设置sortid属性 if current_sort_path: item_node["sortid"] = ".".join(map(str, current_sort_path)) # 添加其他属性 - 简化属性处理 for key, value in item.items(): if key not in ["id", "cost"] and value is not None: if isinstance(value, (str, int, float, bool)): item_node[key] = value graph.create(item_node) graph.push(item_node) # 保存属性 # logger.info(f"创建CostItem节点: {name} (id: {item_id}, cost: {cost})") # 创建与父节点的关系 relationship_type = get_relationship_type("CostSet", "CostItem", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, item_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {name}") # else: # logger.info(f"不创建关系: {parent_node['name']} 到 {name}") # 处理人材机合价项(MaterialandmachineCostItem) def process_material_machine_cost_item(item, parent_node, entity_relationships, current_sort_path=None): """处理人材机合价项""" # 提取必要属性 item_type = item.get("type", "") name = item.get("名称", "") code = item.get("编码", "") unit = item.get("单位", "") supplier = item.get("供货方", "") budget_price = item.get("预算价不含税", "") market_price = item.get("市场价不含税", "") budget_total = item.get("预算价合价", "") market_total = item.get("市场价合价", "") price_diff = item.get("价差", "") quantity = item.get("数量", "") if not name: logger.warning("MaterialandmachineCostItem缺少名称") return # 创建唯一标识,结合父节点的GUID和当前项的编码 parent_guid = parent_node.get("GUID", "") unique_id = f"{parent_guid}_{code}" if parent_guid and code else (parent_guid or code) # 创建人材机合价项节点,确保所有属性值都是原始类型 properties = {"type": str(item_type), "name": str(name), "unique_id": str(unique_id)} # 设置sortid属性 if current_sort_path: properties["sortid"] = ".".join(map(str, current_sort_path)) # 添加其他属性,确保都是字符串类型 if supplier: properties["供货方"] = str(supplier) if code: properties["编码"] = str(code) if unit: properties["单位"] = str(unit) if budget_price: properties["预算价不含税"] = str(budget_price) if market_price: properties["市场价不含税"] = str(market_price) if budget_total: properties["预算价合价"] = str(budget_total) if market_total: properties["市场价合价"] = str(market_total) if price_diff: properties["价差"] = str(price_diff) if quantity: properties["数量"] = str(quantity) # 创建节点 item_node = Node("MaterialandmachineCostItem", **properties) try: graph.create(item_node) graph.push(item_node) # 保存属性 # logger.info(f"创建MaterialandmachineCostItem节点: {name} (类型: {item_type}, 编码: {code})") except Exception as e: logger.error(f"创建MaterialandmachineCostItem节点失败: {e}") # 打印详细的节点属性,帮助调试 for key, value in properties.items(): logger.error(f"属性 {key}: {type(value)} = {value}") return # 创建与父节点的关系 relationship_type = get_relationship_type("CostSet", "MaterialandmachineCostItem", entity_relationships) if relationship_type: try: graph.create(Relationship(parent_node, relationship_type, item_node)) # logger.info(f"创建关系: {parent_node['name']} {relationship_type} {name}") except Exception as e: logger.error(f"创建关系失败: {e}") else: logger.info(f"不创建关系: {parent_node['name']} 到 {name}") # 修改establish_relationships函数,添加项目划分与人材机合价集合的关系 def establish_relationships(entity_relationships): # 首先检查数据库中的节点情况 logger.info("检查数据库中的节点情况...") # 检查ProjectDivisionItem节点 pdi_query = """ MATCH (pdi:ProjectDivisionItem) RETURN count(pdi) as count, collect(distinct pdi.GUID)[..10] as sample_guids """ pdi_result = graph.run(pdi_query).data()[0] logger.info(f"数据库中有 {pdi_result['count']} 个ProjectDivisionItem节点") logger.info(f"ProjectDivisionItem节点GUID样本: {pdi_result['sample_guids']}") # 检查ProjectQuantity节点 pq_query = """ MATCH (pq:ProjectQuantity) RETURN count(pq) as count, collect(distinct pq.id)[..10] as sample_ids """ pq_result = graph.run(pq_query).data()[0] logger.info(f"数据库中有 {pq_result['count']} 个ProjectQuantity节点") logger.info(f"ProjectQuantity节点ID样本: {pq_result['sample_ids']}") # 检查CostSet节点 cs_query = """ MATCH (cs:CostSet) RETURN count(cs) as count, collect(distinct cs.GUID)[..10] as sample_guids """ cs_result = graph.run(cs_query).data()[0] logger.info(f"数据库中有 {cs_result['count']} 个CostSet节点") logger.info(f"CostSet节点GUID样本: {cs_result['sample_guids']}") # 获取从ProjectDivisionItem到CostSet的关系类型 relationship_type = get_relationship_type("ProjectDivisionItem", "CostSet", entity_relationships) if relationship_type: # 只有当关系类型不为None时才创建关系 # 建立ProjectDivisionItem与CostSet的关系,使用更宽松的匹配条件,并避免重复创建 query_division_item = f""" MATCH (pdi:ProjectDivisionItem), (cs:CostSet) WHERE toUpper(replace(replace(pdi.GUID, '{{', ''), '}}', '')) = toUpper(replace(replace(cs.GUID, '{{', ''), '}}', '')) AND pdi.GUID <> "" AND NOT EXISTS((pdi)-[:{relationship_type}]->(cs)) CREATE (pdi)-[:{relationship_type}]->(cs) RETURN count(*) as count """ try: result = graph.run(query_division_item) count = result.data()[0]["count"] logger.info(f"创建了 {count} 个 ProjectDivisionItem {relationship_type} CostSet 关系") except Exception as e: logger.error(f"创建ProjectDivisionItem与CostSet关系失败: {e}") else: logger.info("本体层中未定义ProjectDivisionItem到CostSet的关系,跳过创建") # 获取从ProjectQuantity到CostSet的关系类型 relationship_type = get_relationship_type("ProjectQuantity", "CostSet", entity_relationships) if relationship_type: # 只有当关系类型不为None时才创建关系 # 建立ProjectQuantity与CostSet的关系,使用更宽松的匹配条件,并避免重复创建 query_quantity = f""" MATCH (pq:ProjectQuantity), (cs:CostSet) WHERE toUpper(replace(replace(coalesce(pq.GUID, pq.guid, ''), '{{', ''), '}}', '')) = toUpper(replace(replace(coalesce(cs.GUID, cs.guid, ''), '{{', ''), '}}', '')) AND coalesce(pq.GUID, pq.guid, '') <> '' AND coalesce(cs.GUID, cs.guid, '') <> '' AND NOT EXISTS((pq)-[:{relationship_type}]->(cs)) CREATE (pq)-[:{relationship_type}]->(cs) RETURN count(*) as count """ try: result = graph.run(query_quantity) count = result.data()[0]["count"] logger.info(f"创建了 {count} 个 ProjectQuantity {relationship_type} CostSet 关系") except Exception as e: logger.error(f"创建ProjectQuantity与CostSet关系失败: {e}") else: logger.info("本体层中未定义ProjectQuantity到CostSet的关系,跳过创建") # 检查最终的关系数量 # 检查ProjectDivisionItem到CostSet的关系 pdi_cs_query = """ MATCH (pdi:ProjectDivisionItem)-[r]->(cs:CostSet) RETURN count(r) as count """ pdi_cs_result = graph.run(pdi_cs_query).data()[0] logger.info(f"数据库中最终有 {pdi_cs_result['count']} 个ProjectDivisionItem到CostSet的关系") # 检查ProjectQuantity到CostSet的关系 pq_cs_query = """ MATCH (pq:ProjectQuantity)-[r]->(cs:CostSet) RETURN count(r) as count """ pq_cs_result = graph.run(pq_cs_query).data()[0] logger.info(f"数据库中最终有 {pq_cs_result['count']} 个ProjectQuantity到CostSet的关系") # --------------------------*费用表*-------------------------- # 处理取费表模板集(FeeTableTemplateSet) def process_fee_table_template_set(data, parent_node, entity_relationships): if "projectData" in data and "costSetting" in data["projectData"]: cost_setting = data["projectData"]["costSetting"] elif "costSetting" in data: cost_setting = data["costSetting"] else: logger.warning("JSON中未找到costSetting数据") logger.info(f"JSON顶层键: {list(data.keys())}") return # 创建取费表模板集节点 fee_template_set_node = Node("FeeTableTemplateSet", name="取费表模板集") graph.create(fee_template_set_node) fee_template_set_node["sortid"] = "1" # 固定为1 graph.push(fee_template_set_node) # 更新节点属性 relationship_type = get_relationship_type("EngineeringData", "FeeTableTemplateSet", entity_relationships) graph.create(Relationship(parent_node, relationship_type, fee_template_set_node)) # 开始处理子节点,从编号 [1] 开始 idx = 1 for template_set_name, template_set_content in cost_setting.items(): if "tables" in template_set_content and isinstance(template_set_content["tables"], list): for template_item in template_set_content["tables"]: process_fee_table_template_item(template_item, fee_template_set_node, entity_relationships, [1, idx]) idx += 1 # 同级模板项顺序递增 # 处理取费表模板项(FeeTableTemplateItem) def process_fee_table_template_item(template_item, parent_node, entity_relationships, current_sort_path): # 只获取必须的name属性 name = template_item.get("name", "") if not name: logger.warning("FeeTableTemplateItem缺少name") return # 创建基础节点(只设置name) template_item_node = Node("FeeTableTemplateItem", name=name) # 动态添加所有其他属性(排除name和children) for key, value in template_item.items(): if key not in ["name", "children"] and value is not None: if isinstance(value, (str, int, float, bool)): template_item_node[key] = value # 完全原样存储 else: logger.warning(f"跳过非基本类型属性: {key}={value}(类型:{type(value)})") # 设置系统属性 template_item_node["sortid"] = ".".join(map(str, current_sort_path)) # 保存节点 try: graph.create(template_item_node) graph.push(template_item_node) except Exception as e: logger.error(f"创建模板项节点失败: {str(e)}") return # 创建关系 relationship_type = get_relationship_type("FeeTableTemplateSet", "FeeTableTemplateItem", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, template_item_node)) # 处理子费用项(保持原有逻辑) if "children" in template_item and isinstance(template_item["children"], list): for i, fee_item in enumerate(template_item["children"], start=1): process_fee(fee_item, template_item_node, entity_relationships, current_sort_path + [i]) # 处理取费(FeeCollection) def process_fee(fee_item, parent_node, entity_relationships, current_sort_path): # 只获取费用名称(必须字段) fee_name = fee_item.get("费用名称", "") if not fee_name: logger.warning("Fee缺少费用名称") return # 创建节点(只设置name属性) fee_node = Node("FeeCollection", name=fee_name) # 动态添加所有其他属性(排除children和费用名称) for key, value in fee_item.items(): if key not in ["费用名称", "children"] and value is not None: # 只允许基本数据类型(Neo4j不支持复杂类型) if isinstance(value, (str, int, float, bool)): fee_node[key] = value # 完全原样存储,不做任何映射 else: logger.warning(f"跳过非基本类型属性: {key}={value}(类型:{type(value)})") # 设置系统属性 fee_node["sortid"] = ".".join(map(str, current_sort_path)) # 排序路径 # 设置path(保持原有逻辑) if "FeeCollection" in parent_node.labels: parent_path = parent_node.get("path", "") fee_node["path"] = f"{parent_path}/{fee_name}" else: parent_name = parent_node.get("name", "") fee_node["path"] = f"{parent_name}/{fee_name}" # 保存节点 try: graph.create(fee_node) graph.push(fee_node) except Exception as e: logger.error(f"创建节点失败: {str(e)}") return # 创建关系 parent_label = list(parent_node.labels)[0] relationship_type = get_relationship_type(parent_label, "FeeCollection", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, fee_node)) # 递归处理子费用 if "children" in fee_item and isinstance(fee_item["children"], list): for i, child_fee in enumerate(fee_item["children"], start=1): process_fee(child_fee, fee_node, entity_relationships, current_sort_path + [i]) # --------------------------*工程费用*-------------------------- # 处理费用表集(FeeScheduleSet) def process_fee_schedule_set(data, parent_node, entity_relationships): """处理费用表集、费用表项和费用""" if "projectData" in data and "projectCost" in data["projectData"]: project_cost = data["projectData"]["projectCost"] elif "projectCost" in data: project_cost = data["projectCost"] else: logger.warning("JSON中未找到projectCost数据") logger.info(f"JSON顶层键: {list(data.keys())}") return # 创建 FeeScheduleSet 节点:工程费用 fee_schedule_set = Node("FeeScheduleSet", name="工程费用") graph.create(fee_schedule_set) # 设置 sortid = "1" fee_schedule_set["sortid"] = "1" graph.push(fee_schedule_set) # 保存属性 # 创建关系 relationship_type = get_relationship_type("EngineeringData", "FeeScheduleSet", entity_relationships) graph.create(Relationship(parent_node, relationship_type, fee_schedule_set)) # 开始处理子项,从编号 1 开始 for idx, (fee_table_name, fee_table_content) in enumerate(project_cost.items(), start=1): # 传入当前路径 [1, idx],因为 FeeScheduleSet 是 1 process_fee_schedule_item(fee_table_content, fee_table_name, fee_schedule_set, entity_relationships, [1, idx]) # 处理费用表项(FeeScheduleItem) def process_fee_schedule_item(fee_table_content, item_name, parent_node, entity_relationships, current_sort_path): """处理费用表项(FeeScheduleItem)""" # 创建 FeeScheduleItem 节点 fee_schedule_item = Node("FeeScheduleItem", name=item_name) fee_schedule_item["sortid"] = ".".join(map(str, current_sort_path)) graph.create(fee_schedule_item) graph.push(fee_schedule_item) # 保存 sortid # 创建与父节点的关系 relationship_type = get_relationship_type("FeeScheduleSet", "FeeScheduleItem", entity_relationships) graph.create(Relationship(parent_node, relationship_type, fee_schedule_item)) # 处理费用项列表 if isinstance(fee_table_content, list): # 原有格式:直接是费用项列表 for i, fee_item in enumerate(fee_table_content, start=1): process_fee_item(fee_item, fee_schedule_item, entity_relationships, current_sort_path + [i]) elif isinstance(fee_table_content, dict) and "children" in fee_table_content: # 新格式:包含children字段的字典 if isinstance(fee_table_content["children"], list): for i, fee_item in enumerate(fee_table_content["children"], start=1): process_fee_item(fee_item, fee_schedule_item, entity_relationships, current_sort_path + [i]) else: logger.warning( f"FeeScheduleItem {item_name} 的children不是列表,类型: {type(fee_table_content['children'])}" ) else: logger.warning(f"FeeScheduleItem {item_name} 的内容类型未知: {type(fee_table_content)}") # 处理费用项(Fee) def process_fee_item(fee, parent_node, entity_relationships, current_sort_path=None): """ 处理费用项(Fee) current_sort_path: 当前层级编号路径,如 [1, 2, 1] """ if current_sort_path is None: current_sort_path = [1] # 尝试获取"费用名称",如果不存在,则尝试获取"项目名称" name = fee.get("费用名称", fee.get("项目名称", fee.get("项目划分名称", fee.get("名称", "")))) if not name: logger.warning(f"Fee缺少费用名称或项目名称,当前对象: {fee}") return # 创建基础节点,只设置name属性 fee_node = Node("Fee", name=name) # 动态添加所有其他属性(排除"费用名称"、"项目名称"和"children") for key, value in fee.items(): # 只处理基本数据类型,跳过children和复杂对象 if key not in ["费用名称", "项目名称", "children"] and value is not None: if isinstance(value, (str, int, float, bool)): fee_node[key] = value elif isinstance(value, (list, dict)): logger.warning(f"跳过复杂属性: {key} (类型: {type(value)})") # 设置 sortid fee_node["sortid"] = ".".join(map(str, current_sort_path)) # 设置 path 属性 if "Fee" in parent_node.labels: parent_path = parent_node.get("path", "") fee_node["path"] = f"{parent_path}/{name}" else: parent_name = parent_node.get("name", "") fee_node["path"] = f"{parent_name}/{name}" graph.create(fee_node) graph.push(fee_node) # 创建关系 parent_label = list(parent_node.labels)[0] relationship_type = get_relationship_type(parent_label, "Fee", entity_relationships) if relationship_type: graph.create(Relationship(parent_node, relationship_type, fee_node)) # 递归处理子费用(从children获取) if "children" in fee and isinstance(fee["children"], list): for i, child in enumerate(fee["children"], start=1): process_fee_item(child, fee_node, entity_relationships, current_sort_path + [i]) # --------------------------*工程属性*-------------------------- # 处理工程属性集(ProjectPropertySet) def process_project_property_set(data, parent_node, entity_relationships): # 检查projectInfo是否存在 if "projectData" in data and "projectInfo" in data["projectData"]: project_info = data["projectData"]["projectInfo"] elif "projectInfo" in data: project_info = data["projectInfo"] else: logger.warning("JSON中未找到projectInfo数据") logger.info(f"JSON顶层键: {list(data.keys())}") return # 创建工程属性集节点 property_set_node = Node("ProjectPropertySet", name="工程属性") property_set_node["sortid"] = "1" # 固定根编号 graph.create(property_set_node) graph.push(property_set_node) # 保存属性 # 创建关系 relationship_type = get_relationship_type("EngineeringData", "ProjectPropertySet", entity_relationships) graph.create(Relationship(parent_node, relationship_type, property_set_node)) # 添加上传时间属性 upload_date = time.strftime("%Y/%m/%d") current_sort_path = [1, 0] # 确保排在最前面 create_property_node("上传时间", upload_date, property_set_node, entity_relationships, current_sort_path) # 处理每个属性,从编号 1 开始 for idx, (key, value) in enumerate(project_info.items(), start=1): if value is not None: current_sort_path = [1, idx] # 从 1.x 开始 create_property_node(key, value, property_set_node, entity_relationships, current_sort_path) # 处理工程属性(ProjectProperty) def create_property_node(key, value, parent_node, entity_relationships, current_sort_path): """递归创建 ProjectProperty 节点,支持嵌套结构""" # 创建当前节点 property_node = Node("ProjectProperty", name=key) property_node["sortid"] = ".".join(map(str, current_sort_path)) # 设置 value 属性(基本类型或摘要) if isinstance(value, (dict, list)): type_name = "字典" if isinstance(value, dict) else "列表" count = len(value) property_node["value"] = f"{type_name}({count}项)" else: property_node["value"] = str(value) graph.create(property_node) graph.push(property_node) # 创建与父节点的关系 rel_type = ( get_relationship_type("ProjectPropertySet", "ProjectProperty", entity_relationships) if "ProjectPropertySet" in parent_node.labels else get_relationship_type("ProjectProperty", "ProjectProperty", entity_relationships) ) graph.create(Relationship(parent_node, rel_type, property_node)) # 递归处理嵌套结构 if isinstance(value, dict): for sub_key, sub_value in value.items(): if sub_value is not None: sub_full_name = f"{key}.{sub_key}" create_property_node( sub_full_name, sub_value, property_node, entity_relationships, current_sort_path + [1], # 暂用 1,后面可优化为真实顺序 ) elif isinstance(value, list): for i, item in enumerate(value): list_node_name = f"{key}[{i}]" list_node = Node("ProjectProperty", name=list_node_name, value="列表项") list_sort_path = current_sort_path + [i + 1] list_node["sortid"] = ".".join(map(str, list_sort_path)) graph.create(list_node) graph.push(list_node) rel_type = get_relationship_type("ProjectProperty", "ProjectProperty", entity_relationships) graph.create(Relationship(property_node, rel_type, list_node)) if isinstance(item, dict): for sub_key, sub_value in item.items(): if sub_value is not None: sub_full_name = f"{key}[{i}].{sub_key}" create_property_node( sub_full_name, sub_value, list_node, entity_relationships, list_sort_path + [1] ) else: # 基本类型列表项 item_node = Node("ProjectProperty", name=f"{key}[{i}]", value=str(item)) item_node["sortid"] = ".".join(map(str, list_sort_path)) graph.create(item_node) graph.push(item_node) graph.create(Relationship(list_node, rel_type, item_node)) # 将创建知识图谱的功能封装为函数 def create_KG(json_file_path, ontology_file_path="Ontology_Layer.txt"): """ 创建知识图谱 Args: json_file_path: JSON文件路径 ontology_file_path: 本体层文件路径 Returns: success: 是否成功创建知识图谱 """ try: # 解析本体层文件 entity_types, entity_relationships = parse_ontology_file(ontology_file_path) if not entity_types or not entity_relationships: logger.error("解析本体层文件失败") return False # 读取JSON文件 try: with open(json_file_path, "r", encoding="utf-8") as f: data = json.load(f) except Exception as e: logger.error(f"读取JSON文件失败: {e}") return False # 获取文件名作为工程名称 file_name = os.path.basename(json_file_path) project_name = os.path.splitext(file_name)[0] # 为每个JSON文件创建一个新的EngineeringData节点 root_node = Node("EngineeringData", name=project_name) graph.create(root_node) logger.info(f"创建根节点: {project_name}") # 处理费用预览,创建CostSet节点及其子节点 process_cost_set(data, root_node, entity_relationships) # 处理项目划分,创建ProjectDivisionSet和ProjectDivisionItem节点 process_project_division_set(data, root_node, entity_relationships) # 处理取费表模板集 process_fee_table_template_set(data, root_node, entity_relationships) # 处理费用表集 process_fee_schedule_set(data, root_node, entity_relationships) # 处理工程属性集 process_project_property_set(data, root_node, entity_relationships) # 建立实体间的关系 establish_relationships(entity_relationships) logger.info(f"成功创建知识图谱: {json_file_path}") return True except Exception as e: logger.error(f"创建知识图谱失败: {e}") import traceback traceback.print_exc() return False # 添加一个新的函数,用于处理文件夹中的多个JSON文件 def create_KGs_from_folder(input_folder, ontology_file_path="Ontology_Layer.txt"): """ 从文件夹中创建多个知识图谱 Args: input_folder: 输入文件夹路径,包含多个JSON文件 ontology_file_path: 本体层文件路径 Returns: success_count: 成功处理的文件数量 total_count: 总文件数量 deleted_projects: 被删除的同名工程列表 """ # 获取输入文件夹中的所有JSON文件 json_files = [] deleted_projects = [] # 如果输入是文件夹,则获取文件夹中的所有JSON文件 if os.path.isdir(input_folder): json_files = glob.glob(os.path.join(input_folder, "*.json")) # 如果输入是文件,则直接使用该文件 elif os.path.isfile(input_folder) and input_folder.endswith(".json"): json_files = [input_folder] else: logger.error(f"输入路径无效: {input_folder}") return 0, 0, [] if not json_files: logger.error(f"未找到JSON文件: {input_folder}") return 0, 0, [] # 连接到Neo4j数据库 config = read_config() if not connect_to_neo4j(config["neo4j"]["uri"], config["neo4j"]["user"], config["neo4j"]["password"]): return 0, 0, [] # 处理每个JSON文件 success_count = 0 for json_file in json_files: logger.info(f"处理文件: {json_file}") # 获取文件名作为工程名称 file_name = os.path.basename(json_file) project_name = os.path.splitext(file_name)[0] # 读取JSON数据用于比较 try: with open(json_file, "r", encoding="utf-8") as f: project_data = json.load(f) except Exception as e: logger.error(f"读取JSON文件失败: {e}") continue # 检查是否存在同名工程,如果存在则删除 if check_and_delete_duplicate_project(project_name, project_data): deleted_projects.append(project_name) logger.info(f"删除同名工程: {project_name}") # 创建知识图谱 if create_KG(json_file, ontology_file_path): success_count += 1 logger.info(f"知识图谱构建完成,成功处理 {success_count}/{len(json_files)} 个文件") return success_count, len(json_files), deleted_projects # 检查是否存在同名工程,如果存在则删除 def check_and_delete_duplicate_project(project_name, project_data): """ 检查是否存在同名工程,如果存在则删除其整个连通子图(包括所有出边/入边连接的节点) Args: project_name: 工程名称(对应 EngineeringData 的 name 属性) project_data: 工程数据,用于进一步比较内部的“工程名称” Returns: deleted: 是否删除了至少一个工程 """ try: # 查询是否存在同名的 EngineeringData 节点 query = """ MATCH (ed:EngineeringData {name: $name}) RETURN id(ed) AS node_id """ config = read_config() with GraphDatabase.driver( config["neo4j"]["uri"], auth=(config["neo4j"]["user"], config["neo4j"]["password"]) ) as driver: deleted_count = 0 with driver.session() as session: result = session.run(query, name=project_name) records = list(result) if not records: return False # 没有找到同名工程 logger.info(f"找到 {len(records)} 个同名工程节点: {project_name}") # ====== 是否需要根据内部“工程名称”进行精确匹配?====== need_exact_match = ( project_data and "projectData" in project_data and "projectInfo" in project_data["projectData"] ) new_project_name = ( project_data["projectData"]["projectInfo"].get("工程名称", "") if need_exact_match else "" ) # 遍历每个同名的 EngineeringData 节点 for record in records: node_id = record["node_id"] should_delete = False if need_exact_match and new_project_name: # 需要精确匹配内部“工程名称” matched = False for rel_type in ["HAS_CHILD", "USE"]: prop_query = f""" MATCH (ed:EngineeringData)-[:{rel_type}*1..3]->(pp:ProjectProperty {{name: "工程名称"}}) WHERE id(ed) = $node_id RETURN pp.value AS project_name """ prop_result = session.run(prop_query, node_id=node_id) prop_record = prop_result.single() if prop_record and prop_record["project_name"] == new_project_name: matched = True break should_delete = matched else: # 不需要精确匹配,直接删除所有同名工程 should_delete = True # ====== 执行删除:删除从该 EngineeringData 可达的所有节点(任意关系、任意方向)====== if should_delete: delete_query = """ MATCH (ed:EngineeringData) WHERE id(ed) = $node_id // 从 ed 出发,匹配所有通过任意关系、任意方向、任意深度连接的节点(包括 ed 自己) MATCH (ed)-[*0..]-(n) // 删除所有匹配到的节点及其关系 DETACH DELETE n """ session.run(delete_query, node_id=node_id) logger.info(f"已递归删除工程及其所有关联节点: {project_name} (Node ID: {node_id})") deleted_count += 1 return deleted_count > 0 except Exception as e: logger.error(f"检查并删除同名工程失败: {e}") import traceback traceback.print_exc() return False # if __name__ == "__main__": # input_folder = "project2json/outputs/GPRB1MJL/final" # create_KGs_from_folder(input_folder)