Files
KG_generation/build_kg_ontolo.py
T
chentianrui 9609bb67b4 上传文件
2025-08-01 15:31:56 +08:00

1489 lines
65 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
通过本体层文件构建知识图谱
"""
from py2neo import Graph, Node, Relationship, NodeMatcher
import json
import os
import logging
import re
import configparser
import glob
# 设置日志
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# 全局变量
graph = None
def read_config(config_file="config.ini"):
    """Load an INI configuration file.

    Args:
        config_file: Path of the configuration file to read (UTF-8).

    Returns:
        A ``configparser.ConfigParser`` populated from ``config_file``.
        ``ConfigParser.read`` ignores files it cannot open, so a missing
        file yields a parser with no sections rather than an exception.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file, encoding="utf-8")
    return parser
def connect_to_neo4j(uri, user, password):
    """Open a connection to Neo4j and store it in the module-level ``graph``.

    Args:
        uri: Database URI.
        user: User name.
        password: Password.

    Returns:
        True when the ``Graph`` object was created, False when creating it
        raised (the error is logged, not propagated).
    """
    global graph
    try:
        graph = Graph(uri, auth=(user, password))
    except Exception as e:
        logger.error(f"连接Neo4j数据库失败: {e}")
        return False
    logger.info("成功连接到Neo4j数据库")
    return True
def clear_database():
    """Delete every node and relationship, then drop all constraints.

    Dropping constraints is best-effort: a failure there is logged as a
    warning and does not change the return value.

    Returns:
        True when the data was deleted, False when the delete itself failed.
    """
    try:
        graph.run("MATCH (n) DETACH DELETE n")
        logger.info("已清空数据库")
        # Drop every constraint reported by the server.
        try:
            for constraint in graph.run("SHOW CONSTRAINTS").data():
                constraint_name = constraint.get("name")
                if not constraint_name:
                    continue
                # Backtick-quote the name so constraint names that are not
                # plain identifiers (spaces, dashes, ...) still produce valid
                # Cypher; embedded backticks are escaped by doubling them.
                quoted_name = "`" + str(constraint_name).replace("`", "``") + "`"
                graph.run(f"DROP CONSTRAINT {quoted_name}")
                logger.info(f"已删除约束: {constraint_name}")
        except Exception as e:
            logger.warning(f"删除约束失败: {e}")
        return True
    except Exception as e:
        logger.error(f"清空数据库失败: {e}")
        return False
# 解析本体层文件,获取实体类型、属性和关系定义
def parse_ontology_file(file_path="Ontology_Layer.txt"):
    """Parse the ontology-layer file into entity types and relationships.

    The file is expected to contain a "1.实体类型" section followed by a
    "2. 实体间的关系" section. Whitespace after the section number is
    optional for both headers.

    In the first section a non-empty line without ``:`` starts a new entity
    type, and a line with ``:`` is an ``attribute: type`` pair belonging to
    the most recent entity type. In the second section each line of the form
    ``(:Source)-[:REL]->(:Target)`` defines one relationship; other lines
    are ignored.

    Args:
        file_path: Path of the ontology-layer file (UTF-8).

    Returns:
        A tuple ``(entity_types, entity_relationships)`` where
        ``entity_types`` maps entity type name to ``{attribute: type}`` and
        ``entity_relationships`` is a list of
        ``(source_type, relationship_type, target_type)`` tuples.
        ``({}, [])`` is returned when the file cannot be read or the section
        split does not yield exactly two parts.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Split into the entity-type part and the relationship part.
        sections = re.split(r"2\.\s*实体间的关系", content)
        if len(sections) != 2:
            logger.error("本体层文件格式错误,无法找到实体类型和关系部分")
            return {}, []
        entity_section, relationship_section = sections

        # Remove the section-1 header so it is not mistaken for an entity
        # type; accept it with or without whitespace after "1.".
        entity_section = re.sub(r"1\.\s*实体类型", "", entity_section)

        entity_types = {}
        current_entity = None
        for raw_line in entity_section.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if ":" not in line:
                # No colon: this line names a new entity type.
                current_entity = line
                entity_types[current_entity] = {}
                continue
            if current_entity is None:
                # Attribute line before any entity type: nothing to attach to.
                continue
            attr_name, _, attr_type = line.partition(":")
            entity_types[current_entity][attr_name.strip()] = attr_type.strip()

        # Relationship lines look like (:EntityType)-[:RELATIONSHIP]->(:EntityType)
        relationship_pattern = re.compile(r"\(:(\w+)\)-\[:(\w+)\]->\(:(\w+)\)")
        entity_relationships = []
        for raw_line in relationship_section.strip().split("\n"):
            match = relationship_pattern.match(raw_line.strip())
            if match:
                entity_relationships.append(match.groups())

        logger.info(f"从本体层文件中解析出 {len(entity_types)} 个实体类型和 {len(entity_relationships)} 个关系定义")
        return entity_types, entity_relationships
    except Exception as e:
        logger.error(f"解析本体层文件失败: {e}")
        return {}, []
# 创建约束和索引以提高性能 - 现在不创建任何约束
def create_constraints_and_indexes():
    """Placeholder: no constraints or indexes are created; only logs that the step is skipped."""
    logger.info("跳过创建约束")
# 获取实体之间的关系类型
def get_relationship_type(source_entity_type, target_entity_type, entity_relationships):
    """Look up the relationship type defined between two entity types.

    Args:
        source_entity_type: Source entity type name.
        target_entity_type: Target entity type name.
        entity_relationships: Iterable of
            ``(source_type, relationship_type, target_type)`` tuples.

    Returns:
        The relationship type of the first matching definition, or None
        (after logging a warning) when no definition matches.
    """
    for source, rel_type, target in entity_relationships:
        if source == source_entity_type and target == target_entity_type:
            return rel_type
    # No matching definition: callers treat None as "do not create a relationship".
    logger.warning(f"未找到从 {source_entity_type} 到 {target_entity_type} 的关系定义,不创建关系")
    return None
# 创建根节点
def create_root_node():
    """Create the ``EngineeringData`` root node (name "工程") and return it."""
    root_node = Node("EngineeringData", name="工程")
    graph.create(root_node)
    logger.info("创建根节点: 工程")
    return root_node
# 处理ProjectDivisionSet
def process_project_division_set(data, parent_node, entity_relationships):
    """Create the ProjectDivisionSet node and its ProjectDivisionTree children.

    ``projectDivision`` is read from ``data["projectData"]`` when present,
    otherwise from the top level of ``data``. Each first-level entry that is
    a dict contributes one ProjectDivisionTree node per second-level key;
    each second-level value that is a list is handed item by item to
    ``process_project_division_item``.

    Args:
        data: Parsed project JSON.
        parent_node: Node the ProjectDivisionSet is attached to.
        entity_relationships: Ontology relationship definitions as
            ``(source_type, relationship_type, target_type)`` tuples.
    """
    if "projectData" in data and "projectDivision" in data["projectData"]:
        project_division = data["projectData"]["projectDivision"]
    elif "projectDivision" in data:
        project_division = data["projectDivision"]
    else:
        logger.warning("JSON中未找到projectDivision数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    logger.info(f"开始处理projectDivision,包含 {len(project_division)} 个顶级项目")

    # One ProjectDivisionSet node groups all division trees.
    division_set = Node("ProjectDivisionSet", name="项目划分集")
    graph.create(division_set)
    relationship_type = get_relationship_type("EngineeringData", "ProjectDivisionSet", entity_relationships)
    if relationship_type:  # only link when the ontology defines the relationship
        graph.create(Relationship(parent_node, relationship_type, division_set))

    for first_level_name, first_level_content in project_division.items():
        if not isinstance(first_level_content, dict):
            logger.warning(f"第一层 {first_level_name} 的内容类型未知: {type(first_level_content)}")
            continue

        # The first-level name with "工程" removed is used as a suffix.
        processed_first_level = first_level_name.replace("工程", "")
        has_first_level_guid = "GUID" in first_level_content

        for second_level_name, second_level_content in first_level_content.items():
            if second_level_name == "GUID":
                # "GUID" is metadata of the first-level entry (copied onto
                # each tree node below), not a second-level division.
                continue

            # Use the second-level name alone when it already equals the
            # processed first-level name; otherwise concatenate the two.
            if second_level_name == processed_first_level:
                final_name = second_level_name
            else:
                final_name = f"{second_level_name}{processed_first_level}"

            division_tree = Node("ProjectDivisionTree", name=final_name)
            # Keep the original names for traceability.
            division_tree["original_first_level"] = first_level_name
            division_tree["original_second_level"] = second_level_name
            if has_first_level_guid:
                division_tree["first_level_GUID"] = first_level_content["GUID"]
            graph.create(division_tree)

            relationship_type = get_relationship_type(
                "ProjectDivisionSet", "ProjectDivisionTree", entity_relationships
            )
            if relationship_type:
                graph.create(Relationship(division_set, relationship_type, division_tree))

            # GUID-based links are not created here; per the original note
            # they are created in bulk by establish_relationships.
            if isinstance(second_level_content, list):
                for item in second_level_content:
                    process_project_division_item(item, division_tree, entity_relationships)
            else:
                logger.warning(f"ProjectDivisionTree {final_name} 的内容类型未知: {type(second_level_content)}")
# 处理ProjectDivisionItem
# Scalar types that are stored on a node as-is.
_PRIMITIVE_TYPES = (str, int, float, bool)
# Keys tried, in order, to summarise a dict found inside a list value.
_NAME_KEYS = ("名称", "name", "标识", "id", "ID")
# Separator used when a list or dict value is flattened into one string.
_VALUE_SEPARATOR = ";"
# Maps the "类型" value of a quantity to (extra node label, type name used in path).
_QUANTITY_SUBTYPES = {
    "0": ("Quota", "定额"),
    "定额": ("Quota", "定额"),
    "1": ("MainMaterial", "主材"),
    "主材": ("MainMaterial", "主材"),
    "5": ("Equipment", "设备"),
    "设备": ("Equipment", "设备"),
}
# "类型" values of children that are treated as MaterialOrEquipment.
_MATERIAL_CHILD_TYPES = ("人工", "材料", "机械", "2", "3", "4")
# "类型" values of children that are treated as nested ProjectQuantity.
_QUANTITY_CHILD_TYPES = ("1", "主材", "5", "设备", "0", "定额")


def _flatten_property_value(value):
    """Return a scalar representation of ``value``, or None if unsupported.

    Primitives are returned unchanged. A list of primitives is joined with
    ``_VALUE_SEPARATOR``. In a mixed list each dict contributes the value of
    its first key found in ``_NAME_KEYS`` (or ``first_key:value`` when none
    is present, or nothing when the dict is empty) and every other element
    contributes ``str(element)``. A dict is flattened to ``key:value`` pairs
    of its primitive values.
    """
    if isinstance(value, _PRIMITIVE_TYPES):
        return value
    if isinstance(value, list):
        if all(isinstance(x, _PRIMITIVE_TYPES) for x in value):
            return _VALUE_SEPARATOR.join(str(x) for x in value)
        parts = []
        for element in value:
            if not isinstance(element, dict):
                parts.append(str(element))
                continue
            name_key = next((k for k in _NAME_KEYS if k in element), None)
            if name_key is not None:
                parts.append(str(element[name_key]))
            elif element:
                first_key = next(iter(element))
                parts.append(f"{first_key}:{element[first_key]}")
        return _VALUE_SEPARATOR.join(parts)
    if isinstance(value, dict):
        return _VALUE_SEPARATOR.join(
            f"{k}:{v}" for k, v in value.items() if isinstance(v, _PRIMITIVE_TYPES)
        )
    return None


def _copy_extra_properties(node, source, skip_keys):
    """Copy the entries of ``source`` onto ``node`` as flattened properties.

    Keys in ``skip_keys`` and None values are skipped. A list under
    "资源库列表" is stored as the joined "资源库名称" property instead of
    under its own key.
    """
    for key, value in source.items():
        if key in skip_keys or value is None:
            continue
        if key == "资源库列表" and isinstance(value, list):
            node["资源库名称"] = _VALUE_SEPARATOR.join(
                str(resource["资源库名称"])
                for resource in value
                if isinstance(resource, dict) and "资源库名称" in resource
            )
            continue
        try:
            flattened = _flatten_property_value(value)
        except Exception as e:
            # Best effort: one bad property must not abort the whole node.
            kind = "字典" if isinstance(value, dict) else "列表"
            logger.warning(f"无法将{kind}属性 {key} 转换为字符串: {e}")
            continue
        if flattened is not None:
            node[key] = flattened


def _parent_relationship_type(parent_node, child_type, entity_relationships):
    """Resolve the relationship type from ``parent_node`` to ``child_type``.

    A node may carry several labels (e.g. ProjectQuantity plus Quota), so the
    labels are tried in sorted order and the first one for which the ontology
    defines a relationship to ``child_type`` wins. When none matches, the
    lookup is delegated to ``get_relationship_type`` with the first label so
    the usual warning is logged. Returns None when nothing is defined.
    """
    labels = sorted(parent_node.labels)
    if not labels:
        return None
    defined_sources = {source for source, _rel, target in entity_relationships if target == child_type}
    for label in labels:
        if label in defined_sources:
            return get_relationship_type(label, child_type, entity_relationships)
    return get_relationship_type(labels[0], child_type, entity_relationships)


def _process_division_child(child, parent_node, entity_relationships):
    """Dispatch one child of a division item or list by its type field."""
    child_type = child.get("type", child.get("类型", ""))
    if child_type == "项目划分":
        process_project_division_item(child, parent_node, entity_relationships)
    elif child_type == "8" or child_type == "清单":
        process_list_item(child, parent_node, entity_relationships)
    else:
        # Everything else is treated as a ProjectQuantity (or subclass).
        process_project_quantity(child, parent_node, entity_relationships)


def process_project_division_item(item, parent_node, entity_relationships):
    """Create a ProjectDivisionItem node, link it to its parent, and recurse.

    Args:
        item: Dict describing the division item ("GUID", "项目名称",
            optional "children" and further attributes).
        parent_node: ProjectDivisionTree node, or another node carrying a
            "path" property.
        entity_relationships: Ontology relationship definitions.
    """
    guid = item.get("GUID", "")
    name = item.get("项目名称", "")
    if not guid and not name:
        logger.warning("ProjectDivisionItem缺少GUID和项目名称")
        return

    item_node = Node("ProjectDivisionItem", GUID=guid, name=name)

    # path is "<tree name>/<name>" directly under a tree, otherwise
    # "<parent path>/<name>" (or just the name if the parent has no path).
    parent_is_tree = isinstance(parent_node, Node) and "ProjectDivisionTree" in parent_node.labels
    if parent_is_tree:
        item_node["path"] = f"{parent_node['name']}/{name}"
    else:
        parent_path = parent_node.get("path", "")
        item_node["path"] = f"{parent_path}/{name}" if parent_path else name

    _copy_extra_properties(item_node, item, ("GUID", "项目名称", "children"))
    graph.create(item_node)

    # Any parent that is not a tree is looked up as a ProjectDivisionItem.
    source_type = "ProjectDivisionTree" if parent_is_tree else "ProjectDivisionItem"
    relationship_type = get_relationship_type(source_type, "ProjectDivisionItem", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, item_node))

    # GUID-based links are not created here; per the original note they are
    # created in bulk by establish_relationships.
    for child in item.get("children") or []:
        _process_division_child(child, item_node, entity_relationships)


# Handle List nodes and their children
def process_list_item(list_item, parent_node, entity_relationships):
    """Create a List node, link it to its parent and CostSet, and recurse.

    Args:
        list_item: Dict describing the list ("GUID", "清单名称", "类型",
            optional "children" and further attributes).
        parent_node: Node the list hangs under.
        entity_relationships: Ontology relationship definitions.
    """
    guid = list_item.get("GUID", "")
    list_name = list_item.get("清单名称")
    list_type = list_item.get("类型", "")

    list_node = Node("List", guid=guid, name=list_name, type=list_type)

    # path includes the node kind; fall back to the parent's name when the
    # parent has no path of its own.
    path_prefix = parent_node.get("path", "") or parent_node.get("name", "")
    list_node["path"] = f"{path_prefix}/{list_name}(清单)"

    _copy_extra_properties(list_node, list_item, ("清单名称", "类型", "guid", "children"))
    graph.create(list_node)

    relationship_type = _parent_relationship_type(parent_node, "List", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, list_node))

    if guid:
        # process_cost_set stores CostSet GUIDs as "{UPPERCASE}", so match on
        # that form as well as on the raw value. The value is passed as a
        # query parameter instead of being interpolated into the Cypher text.
        normalized_guid = "{" + str(guid).strip("{}").upper() + "}"
        cost_set_node = graph.run(
            "MATCH (c:CostSet) WHERE c.GUID IN $guids RETURN c LIMIT 1",
            guids=[guid, normalized_guid],
        ).evaluate()
        if cost_set_node is not None:
            relationship_type = get_relationship_type("List", "CostSet", entity_relationships)
            if relationship_type:
                graph.create(Relationship(list_node, relationship_type, cost_set_node))

    for child in list_item.get("children") or []:
        _process_division_child(child, list_node, entity_relationships)


# Handle ProjectQuantity nodes and their subclasses
def process_project_quantity(quantity, parent_node, entity_relationships):
    """Create a ProjectQuantity node (optionally sub-typed) and its children.

    The "类型" field selects an additional label: Quota ("0"/"定额"),
    MainMaterial ("1"/"主材") or Equipment ("5"/"设备").

    Args:
        quantity: Dict describing the quantity.
        parent_node: Node the quantity hangs under.
        entity_relationships: Ontology relationship definitions.
    """
    quantity_type = quantity.get("类型", "")
    labels = ["ProjectQuantity"]
    type_name = "ProjectQuantity"
    subtype = _QUANTITY_SUBTYPES.get(quantity_type) if isinstance(quantity_type, str) else None
    if subtype is not None:
        extra_label, type_name = subtype
        labels.append(extra_label)

    quantity_id = quantity.get("id", "")
    quantity_name = quantity.get("项目名称", quantity.get("名称", ""))
    quantity_node = Node(*labels, id=quantity_id, name=quantity_name)

    # path includes the node kind; fall back to the parent's name when the
    # parent has no path of its own.
    path_prefix = parent_node.get("path", "") or parent_node.get("name", "")
    quantity_node["path"] = f"{path_prefix}/{quantity_name}({type_name})"

    _copy_extra_properties(quantity_node, quantity, ("id", "名称", "项目名称", "材机列表", "children"))
    graph.create(quantity_node)

    relationship_type = _parent_relationship_type(parent_node, "ProjectQuantity", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, quantity_node))

    # GUID-based links are not created here; per the original note they are
    # created in bulk by establish_relationships.
    # "材机列表" takes precedence; "children" is only read when it is absent or empty.
    if quantity.get("材机列表"):
        for material in quantity["材机列表"]:
            process_material_or_equipment(material, quantity_node, entity_relationships)
    elif quantity.get("children"):
        for child in quantity["children"]:
            child_type = child.get("类型", child.get("type", ""))
            if child_type in _MATERIAL_CHILD_TYPES:
                process_material_or_equipment(child, quantity_node, entity_relationships)
            elif child_type in _QUANTITY_CHILD_TYPES:
                process_project_quantity(child, quantity_node, entity_relationships)
            # Children of any other type are ignored.


# Handle MaterialOrEquipment nodes
def process_material_or_equipment(material, parent_node, entity_relationships):
    """Create a MaterialOrEquipment node under a ProjectQuantity node.

    A new node is always created (no de-duplication); ``unique_id`` combines
    the parent's id (or GUID) with the item's own id.

    Args:
        material: Dict describing the labour/material/machine item.
        parent_node: The owning quantity node.
        entity_relationships: Ontology relationship definitions.
    """
    material_id = material.get("id", material.get("ID", ""))
    material_name = material.get("名称", "")
    material_type = material.get("类型", material.get("type", ""))
    if not material_id and not material_name:
        logger.warning("MaterialOrEquipment缺少id和名称")
        return

    parent_id = parent_node.get("id", parent_node.get("GUID", ""))
    unique_id = f"{parent_id}_{material_id}" if parent_id else material_id

    material_node = Node(
        "MaterialOrEquipment", id=material_id, unique_id=unique_id, name=material_name, type=material_type
    )
    _copy_extra_properties(material_node, material, ("id", "ID", "名称", "类型", "type"))
    graph.create(material_node)

    relationship_type = get_relationship_type("ProjectQuantity", "MaterialOrEquipment", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, material_node))
# 处理CostSet
def process_cost_set(data, root_node, entity_relationships):
    """Create one CostSet node per distinct GUID found in ``expensePreview``.

    ``expensePreview`` is read from ``data["projectData"]`` when present,
    otherwise from the top level of ``data``. Every list found two levels
    down (category -> subcategory) is walked recursively through
    "children". For each dict carrying a not-yet-seen GUID a CostSet node is
    created and linked to ``root_node``; its "sum" entries go to
    ``process_cost_item`` and its "rcj" entries to
    ``process_material_machine_cost_item``.

    Args:
        data: Parsed project JSON.
        root_node: Node every CostSet is attached to.
        entity_relationships: Ontology relationship definitions.
    """
    if "projectData" in data and "expensePreview" in data["projectData"]:
        expense_preview = data["projectData"]["expensePreview"]
    elif "expensePreview" in data:
        expense_preview = data["expensePreview"]
    else:
        logger.warning("JSON中未找到expensePreview数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    # GUIDs already turned into a CostSet, to avoid duplicates.
    processed_guids = set()

    def create_cost_set(node, path):
        """Create the CostSet for ``node``; return False if creation failed."""
        raw_guid = node["GUID"]
        if not isinstance(raw_guid, str):
            logger.warning(f"跳过非字符串类型的GUID: {raw_guid!r} (路径: {path})")
            return True
        # Normalise to "{UPPERCASE}" so differently formatted GUIDs compare equal.
        guid = "{" + raw_guid.strip("{}").upper() + "}"
        if guid in processed_guids:
            return True
        processed_guids.add(guid)

        cost_set_node = Node("CostSet", GUID=guid, name="费用预览集")
        try:
            graph.create(cost_set_node)
        except Exception as e:
            logger.error(f"创建CostSet节点失败: {e}")
            return False

        relationship_type = get_relationship_type("EngineeringData", "CostSet", entity_relationships)
        if relationship_type:
            try:
                graph.create(Relationship(root_node, relationship_type, cost_set_node))
            except Exception as e:
                logger.error(f"创建关系失败: {e}")

        # "sum" holds CostItem entries.
        sum_items = node.get("sum")
        if isinstance(sum_items, list):
            for item in sum_items:
                try:
                    process_cost_item(item, cost_set_node, entity_relationships)
                except Exception as e:
                    logger.error(f"处理CostItem时出错: {e}")

        # "rcj" holds MaterialandmachineCostItem entries.
        rcj_items = node.get("rcj")
        if isinstance(rcj_items, list):
            for item in rcj_items:
                try:
                    process_material_machine_cost_item(item, cost_set_node, entity_relationships)
                except Exception as e:
                    logger.error(f"处理MaterialandmachineCostItem时出错: {e}")
        return True

    def traverse_expense_preview(node, path=""):
        """Walk ``node`` depth-first, creating CostSets along the way."""
        if isinstance(node, list):
            for i, item in enumerate(node):
                traverse_expense_preview(item, f"{path}[{i}]")
            return
        if not isinstance(node, dict):
            return
        if "GUID" in node and not create_cost_set(node, path):
            # A failed CostSet creation stops processing of this subtree.
            return
        children = node.get("children")
        if isinstance(children, list):
            for i, child in enumerate(children):
                traverse_expense_preview(child, f"{path}.children[{i}]")

    for category_name, category in expense_preview.items():
        if not isinstance(category, dict):
            logger.warning(f"expensePreview.{category_name} 不是字典类型,已跳过")
            continue
        for subcategory_name, subcategory in category.items():
            if isinstance(subcategory, list):
                for i, item in enumerate(subcategory):
                    traverse_expense_preview(item, f"expensePreview.{category_name}.{subcategory_name}[{i}]")
# 处理CostItem - 简化处理逻辑,确保正确创建节点
def process_cost_item(item, parent_node, entity_relationships):
    """Create a CostItem node under a CostSet node.

    Entries that are not dicts or have no "id" are logged and skipped. The
    node's name is its id, and ``unique_id`` is "<parent GUID>_<id>" when
    the parent has a GUID, else just the id.

    Args:
        item: Dict describing the cost item ("id", "cost", other scalars).
        parent_node: The owning CostSet node.
        entity_relationships: Ontology relationship definitions.
    """
    if not isinstance(item, dict):
        logger.error(f"CostItem不是字典类型: {item}")
        return

    item_id = item.get("id", "")
    cost = item.get("cost", "")
    if not item_id:
        logger.warning(f"跳过没有id的CostItem: {item}")
        return

    owner_guid = parent_node.get("GUID", "")
    if owner_guid:
        unique_id = f"{owner_guid}_{item_id}"
    else:
        unique_id = item_id

    # The id doubles as the display name.
    cost_item_node = Node("CostItem", id=item_id, unique_id=unique_id, cost=cost, name=item_id)

    # Copy the remaining scalar attributes; nested values are dropped.
    for attr_name, attr_value in item.items():
        if attr_name in ("id", "cost") or attr_value is None:
            continue
        if isinstance(attr_value, (str, int, float, bool)):
            cost_item_node[attr_name] = attr_value

    graph.create(cost_item_node)

    rel_type = get_relationship_type("CostSet", "CostItem", entity_relationships)
    if rel_type:
        graph.create(Relationship(parent_node, rel_type, cost_item_node))
# 处理人材机合价项(MaterialandmachineCostItem)
def process_material_machine_cost_item(item, parent_node, entity_relationships):
    """Create a MaterialandmachineCostItem node under a CostSet node.

    All stored property values are converted to strings. Optional fields are
    stored only when present; numeric zero counts as present so that a zero
    quantity or price difference is not silently dropped.

    Args:
        item: Dict describing the labour/material/machine price entry.
        parent_node: The owning CostSet node.
        entity_relationships: Ontology relationship definitions.
    """
    item_type = item.get("type", "")
    name = item.get("名称", "")
    code = item.get("编码", "")
    if not name:
        logger.warning("MaterialandmachineCostItem缺少名称")
        return

    # unique_id combines the parent's GUID with the item's code when both
    # exist, otherwise whichever of the two is available.
    parent_guid = parent_node.get("GUID", "")
    unique_id = f"{parent_guid}_{code}" if parent_guid and code else (parent_guid or code)

    properties = {"type": str(item_type), "name": str(name), "unique_id": str(unique_id)}
    optional_fields = (
        "供货方",
        "编码",
        "单位",
        "预算价不含税",
        "市场价不含税",
        "预算价合价",
        "市场价合价",
        "价差",
        "数量",
    )
    for field in optional_fields:
        value = item.get(field, "")
        is_numeric = isinstance(value, (int, float)) and not isinstance(value, bool)
        # Keep truthy values and numeric zeros; skip "", None and other empties.
        if value or is_numeric:
            properties[field] = str(value)

    item_node = Node("MaterialandmachineCostItem", **properties)
    try:
        graph.create(item_node)
    except Exception as e:
        logger.error(f"创建MaterialandmachineCostItem节点失败: {e}")
        # Dump each property with its type to help debugging.
        for key, value in properties.items():
            logger.error(f"属性 {key}: {type(value)} = {value}")
        return

    relationship_type = get_relationship_type("CostSet", "MaterialandmachineCostItem", entity_relationships)
    if relationship_type:
        try:
            graph.create(Relationship(parent_node, relationship_type, item_node))
        except Exception as e:
            logger.error(f"创建关系失败: {e}")
    else:
        logger.info(f"不创建关系: {parent_node['name']} 到 {name}")
# 修改establish_relationships函数,添加项目划分与人材机合价集合的关系
def establish_relationships(entity_relationships):
    """Link ProjectDivisionItem and ProjectQuantity nodes to CostSet by GUID.

    Logs node counts and sample identifiers first, then for each of the two
    source labels creates the ontology-defined relationship to every CostSet
    whose GUID matches case-insensitively with braces ignored, skipping
    pairs that are already linked. Finally logs the resulting relationship
    counts.

    Args:
        entity_relationships: Ontology relationship definitions.
    """
    logger.info("检查数据库中的节点情况...")

    # (alias, label, sampled property, result column, caption used in the log)
    inventory = (
        ("pdi", "ProjectDivisionItem", "GUID", "sample_guids", "GUID样本"),
        ("pq", "ProjectQuantity", "id", "sample_ids", "ID样本"),
        ("cs", "CostSet", "GUID", "sample_guids", "GUID样本"),
    )
    for alias, label, prop, column, caption in inventory:
        stats_query = f"""
        MATCH ({alias}:{label})
        RETURN count({alias}) as count, collect(distinct {alias}.{prop})[..10] as {column}
        """
        stats = graph.run(stats_query).data()[0]
        logger.info(f"数据库中有 {stats['count']} 个{label}节点")
        logger.info(f"{label}节点{caption}: {stats[column]}")

    # Labels whose nodes are linked to CostSet, with the alias used in Cypher.
    linked_sources = (("pdi", "ProjectDivisionItem"), ("pq", "ProjectQuantity"))

    for alias, label in linked_sources:
        relationship_type = get_relationship_type(label, "CostSet", entity_relationships)
        if not relationship_type:
            logger.info(f"本体层中未定义{label}到CostSet的关系,跳过创建")
            continue
        # Loose GUID match (case and braces ignored); NOT EXISTS prevents
        # creating the same relationship twice.
        link_query = f"""
        MATCH ({alias}:{label}), (cs:CostSet)
        WHERE toUpper(replace(replace({alias}.GUID, '{{', ''), '}}', '')) = toUpper(replace(replace(cs.GUID, '{{', ''), '}}', ''))
        AND {alias}.GUID <> ""
        AND NOT EXISTS(({alias})-[:{relationship_type}]->(cs))
        CREATE ({alias})-[:{relationship_type}]->(cs)
        RETURN count(*) as count
        """
        try:
            created = graph.run(link_query).data()[0]["count"]
            logger.info(f"创建了 {created} 个 {label} {relationship_type} CostSet 关系")
        except Exception as e:
            logger.error(f"创建{label}与CostSet关系失败: {e}")

    # Report the final number of relationships into CostSet.
    for alias, label in linked_sources:
        total_query = f"""
        MATCH ({alias}:{label})-[r]->(cs:CostSet)
        RETURN count(r) as count
        """
        total = graph.run(total_query).data()[0]
        logger.info(f"数据库中最终有 {total['count']} 个{label}到CostSet的关系")
# 处理取费表模板集(FeeTableTemplateSet)
def process_fee_table_template_set(data, parent_node, entity_relationships):
    """Create the single FeeTableTemplateSet node and its template items.

    ``costSetting`` is read from ``data["projectData"]`` when present,
    otherwise from the top level of ``data``. Every entry of each template
    set's "tables" list is passed to ``process_fee_table_template_item``.

    Args:
        data: Parsed project JSON.
        parent_node: Node the FeeTableTemplateSet is attached to.
        entity_relationships: Ontology relationship definitions.
    """
    if "projectData" in data and "costSetting" in data["projectData"]:
        cost_setting = data["projectData"]["costSetting"]
    elif "costSetting" in data:
        cost_setting = data["costSetting"]
    else:
        logger.warning("JSON中未找到costSetting数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    # Only one FeeTableTemplateSet node is created for all template sets.
    fee_template_set_node = Node("FeeTableTemplateSet", name="取费表模板集")
    graph.create(fee_template_set_node)

    relationship_type = get_relationship_type("EngineeringData", "FeeTableTemplateSet", entity_relationships)
    if relationship_type:
        # As elsewhere in this module, skip the link when the ontology does
        # not define it instead of passing None as the relationship type.
        graph.create(Relationship(parent_node, relationship_type, fee_template_set_node))

    for template_set_content in cost_setting.values():
        if not isinstance(template_set_content, dict):
            continue
        tables = template_set_content.get("tables")
        if isinstance(tables, list):
            for template_item in tables:
                process_fee_table_template_item(template_item, fee_template_set_node, entity_relationships)
# 处理取费表模板项(FeeTableTemplateItem)
def process_fee_table_template_item(template_item, parent_node, entity_relationships):
    """Create one FeeTableTemplateItem node and process its fee entries.

    Args:
        template_item: Mapping describing the template item; the keys read
            are ``name``, ``OutlayID``, ``类型``, ``专业`` and ``children``.
        parent_node: FeeTableTemplateSet node the item is linked to.
        entity_relationships: Ontology relationship definitions, forwarded to
            ``get_relationship_type``.

    Returns:
        None. An item without a name is skipped with a warning.
    """
    item_name = template_item.get("name", "")
    if not item_name:
        logger.warning("FeeTableTemplateItem缺少name")
        return

    # Node properties, missing source keys default to an empty string.
    properties = {
        "name": item_name,
        "outlayID": template_item.get("OutlayID", ""),
        "type": template_item.get("类型", ""),
        "profession": template_item.get("专业", ""),
    }
    item_node = Node("FeeTableTemplateItem", **properties)
    graph.create(item_node)

    # Link to the parent using the relationship type from the ontology layer.
    link_type = get_relationship_type("FeeTableTemplateSet", "FeeTableTemplateItem", entity_relationships)
    graph.create(Relationship(parent_node, link_type, item_node))

    # Child fee entries are only processed when they form a list.
    child_fees = template_item.get("children")
    if isinstance(child_fees, list):
        for fee_entry in child_fees:
            process_fee(fee_entry, item_node, entity_relationships)
# 处理取费(FeeCollection)
def process_fee(fee_item, parent_node, entity_relationships):
    """Create a FeeCollection node for one fee entry and recurse into children.

    The node's ``path`` is the parent's ``path`` (when the parent is itself a
    FeeCollection) or the parent's ``name`` (otherwise), followed by ``/`` and
    the fee name.

    Args:
        fee_item: Mapping describing the fee; the keys read are ``序号``,
            ``费用名称``, ``代码``, ``费率(%)``, ``取费基数``, ``备注`` and
            ``children``.
        parent_node: FeeTableTemplateItem or FeeCollection node above this fee.
        entity_relationships: Ontology relationship definitions, forwarded to
            ``get_relationship_type``.

    Returns:
        None. An entry without a fee name is skipped with a warning.
    """
    fee_name = fee_item.get("费用名称", "")
    if not fee_name:
        logger.warning("Fee缺少费用名称")
        return

    fee_node = Node(
        "FeeCollection",
        serialNumber=fee_item.get("序号", ""),
        name=fee_name,
        code=fee_item.get("代码", ""),
    )

    # Nested fees extend the parent's path; top-level fees start from the
    # parent's name.
    prefix_property = "path" if "FeeCollection" in parent_node.labels else "name"
    path_prefix = parent_node.get(prefix_property, "")
    fee_node["path"] = f"{path_prefix}/{fee_name}"

    # Optional properties are only stored when the source value is truthy.
    optional_properties = (
        ("rate", "费率(%)"),
        ("base", "取费基数"),
        ("remark", "备注"),
    )
    for property_name, source_key in optional_properties:
        source_value = fee_item.get(source_key, "")
        if source_value:
            fee_node[property_name] = source_value

    graph.create(fee_node)

    # The relationship is created only when the ontology lookup yields a
    # truthy type for this parent label.
    parent_label = list(parent_node.labels)[0]
    link_type = get_relationship_type(parent_label, "FeeCollection", entity_relationships)
    if link_type:
        graph.create(Relationship(parent_node, link_type, fee_node))

    # Recurse into child fees when they are given as a list.
    sub_fees = fee_item.get("children")
    if isinstance(sub_fees, list):
        for sub_fee in sub_fees:
            process_fee(sub_fee, fee_node, entity_relationships)
# 处理费用表集(FeeScheduleSet)
def process_fee_schedule_set(data, parent_node, entity_relationships):
    """Create the FeeScheduleSet node, its FeeScheduleItem nodes and their fees.

    ``projectCost`` is read from ``data["projectData"]`` when present there,
    otherwise from the top level of ``data``. Each key of ``projectCost``
    becomes a FeeScheduleItem node; when its value is a list, every element
    is passed to ``process_fee_item``.

    Args:
        data: Parsed JSON document of one project.
        parent_node: Node the fee schedule set is attached to.
        entity_relationships: Ontology relationship definitions, forwarded to
            ``get_relationship_type``.

    Returns:
        None. When no ``projectCost`` data is found a warning is logged and
        nothing is created.
    """
    # Prefer the copy nested under projectData, fall back to the top level.
    nested = data["projectData"] if "projectData" in data else {}
    if "projectCost" in nested:
        project_cost = nested["projectCost"]
    elif "projectCost" in data:
        project_cost = data["projectCost"]
    else:
        logger.warning("JSON中未找到projectCost数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    # One FeeScheduleSet node per project, linked to the parent.
    set_node = Node("FeeScheduleSet", name="工程费用")
    graph.create(set_node)
    set_link = get_relationship_type("EngineeringData", "FeeScheduleSet", entity_relationships)
    graph.create(Relationship(parent_node, set_link, set_node))

    for table_name, table_rows in project_cost.items():
        item_node = Node("FeeScheduleItem", name=table_name)
        graph.create(item_node)
        item_link = get_relationship_type("FeeScheduleSet", "FeeScheduleItem", entity_relationships)
        graph.create(Relationship(set_node, item_link, item_node))

        # Only list-valued tables carry fee rows; anything else is reported.
        if not isinstance(table_rows, list):
            logger.warning(f"FeeScheduleItem {table_name} 的内容类型未知: {type(table_rows)}")
            continue
        for row in table_rows:
            process_fee_item(row, item_node, entity_relationships)
# 处理费用项(Fee)
def process_fee_item(fee, parent_node, entity_relationships):
    """Create a Fee node for one fee entry and recurse into its children.

    The node's ``path`` is the parent's ``path`` (when the parent is itself a
    Fee) or the parent's ``name`` (otherwise), followed by ``/`` and the fee
    name. Besides the named fields, every other scalar entry of ``fee`` is
    copied onto the node under its original key.

    Args:
        fee: Mapping describing the fee; the keys read explicitly are
            ``序号``, ``费用名称``, ``代码``, ``费率(%)``, ``金额`` and
            ``children``.
        parent_node: FeeScheduleItem or Fee node above this fee.
        entity_relationships: Ontology relationship definitions, forwarded to
            ``get_relationship_type``.

    Returns:
        None. Entries that are not mappings, or that lack a fee name, are
        skipped with a warning.
    """
    if not isinstance(fee, dict):
        # Entries come from parsed JSON; a null/scalar/array has no fields to
        # read and would otherwise raise AttributeError on ``.get``.
        logger.warning(f"Fee数据类型未知,已跳过: {type(fee)}")
        return

    serial_number = fee.get("序号", "")
    name = fee.get("费用名称", "")
    code = fee.get("代码", "")
    rate = fee.get("费率(%)", "")
    amount = fee.get("金额", "")
    if not name:
        logger.warning("Fee缺少费用名称")
        return

    fee_node = Node("Fee", serialNumber=serial_number, name=name, code=code)

    # Nested fees extend the parent's path; top-level fees start from the
    # parent's name.
    if "Fee" in parent_node.labels:
        parent_path = parent_node.get("path", "")
        fee_node["path"] = f"{parent_path}/{name}"
    else:
        parent_name = parent_node.get("name", "")
        fee_node["path"] = f"{parent_name}/{name}"

    # NOTE(review): truthiness checks mean a numeric 0 rate/amount is not
    # stored — confirm this is intended.
    if rate:
        fee_node["rate"] = rate
    if amount:
        fee_node["amount"] = amount

    # Copy any remaining scalar fields verbatim; the keys below are already
    # mapped to dedicated properties (or handled as children).
    handled_keys = {"序号", "费用名称", "代码", "费率(%)", "金额", "children"}
    for key, value in fee.items():
        if key in handled_keys or value is None:
            continue
        if isinstance(value, (str, int, float, bool)):
            fee_node[key] = value

    graph.create(fee_node)

    # The relationship is created only when the ontology lookup yields a
    # truthy type for this parent label.
    parent_label = list(parent_node.labels)[0]
    relationship_type = get_relationship_type(parent_label, "Fee", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, fee_node))

    # Recurse only into list-valued children, matching process_fee. Iterating
    # a dict or string here would feed keys/characters into the recursion.
    children = fee.get("children")
    if isinstance(children, list):
        for child in children:
            process_fee_item(child, fee_node, entity_relationships)
    elif children:
        logger.warning(f"Fee {name} 的children类型未知: {type(children)}")
# 处理工程属性集(ProjectPropertySet)和工程属性(ProjectProperty)
def process_project_property_set(data, parent_node, entity_relationships):
    """Create the ProjectPropertySet node and one ProjectProperty per entry.

    ``projectInfo`` is read from ``data["projectData"]`` when present there,
    otherwise from the top level of ``data``. Entries whose value is ``None``
    are skipped. List and dict values get a summary node with their elements
    attached as child ProjectProperty nodes; any other value is stored as its
    string form.

    Args:
        data: Parsed JSON document of one project.
        parent_node: Node the property set is attached to.
        entity_relationships: Ontology relationship definitions, forwarded to
            ``get_relationship_type``.

    Returns:
        None. When no ``projectInfo`` data is found a warning is logged and
        nothing is created.
    """

    def attach_property(owner_node, owner_label, property_name, property_value):
        # Create one ProjectProperty node and link it below ``owner_node``
        # using the ontology relationship type for ``owner_label``.
        node = Node("ProjectProperty", name=property_name, value=property_value)
        graph.create(node)
        link_type = get_relationship_type(owner_label, "ProjectProperty", entity_relationships)
        graph.create(Relationship(owner_node, link_type, node))
        return node

    # Prefer the copy nested under projectData, fall back to the top level.
    nested = data["projectData"] if "projectData" in data else {}
    if "projectInfo" in nested:
        project_info = nested["projectInfo"]
    elif "projectInfo" in data:
        project_info = data["projectInfo"]
    else:
        logger.warning("JSON中未找到projectInfo数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    property_set_node = Node("ProjectPropertySet", name="工程属性")
    graph.create(property_set_node)
    set_link = get_relationship_type("EngineeringData", "ProjectPropertySet", entity_relationships)
    graph.create(Relationship(parent_node, set_link, property_set_node))

    for key, value in project_info.items():
        if value is None:
            continue

        if isinstance(value, list):
            # Summary node for the list, then one child per element/field.
            summary_node = attach_property(
                property_set_node, "ProjectPropertySet", key, f"列表({len(value)}项)"
            )
            for index, element in enumerate(value):
                if not isinstance(element, dict):
                    attach_property(summary_node, "ProjectProperty", f"{key}[{index}]", str(element))
                    continue
                for field, field_value in element.items():
                    if field_value is not None:
                        attach_property(
                            summary_node, "ProjectProperty", f"{key}[{index}].{field}", str(field_value)
                        )
        elif isinstance(value, dict):
            # Summary node for the dict, then one child per non-None field.
            summary_node = attach_property(
                property_set_node, "ProjectPropertySet", key, f"字典({len(value)}项)"
            )
            for field, field_value in value.items():
                if field_value is not None:
                    attach_property(summary_node, "ProjectProperty", f"{key}.{field}", str(field_value))
        else:
            # Scalar values are stored directly as their string form.
            attach_property(property_set_node, "ProjectPropertySet", key, str(value))
# 将创建知识图谱的功能封装为函数
def create_KG(json_file_path, ontology_file_path="Ontology_Layer.txt"):
    """Build the knowledge graph for a single JSON file.

    Args:
        json_file_path: Path of the project JSON file; its base name without
            extension becomes the name of the EngineeringData root node.
        ontology_file_path: Path of the ontology layer file.

    Returns:
        True when the graph was built, False when the ontology could not be
        parsed, the JSON could not be read, or any processing step raised.
    """
    try:
        # Parse the ontology layer; both parts must be non-empty.
        entity_types, entity_relationships = parse_ontology_file(ontology_file_path)
        if not (entity_types and entity_relationships):
            logger.error("解析本体层文件失败")
            return False

        # Load the project JSON.
        try:
            with open(json_file_path, "r", encoding="utf-8") as json_handle:
                data = json.load(json_handle)
        except Exception as e:
            logger.error(f"读取JSON文件失败: {e}")
            return False

        # A fresh EngineeringData root node is created for every JSON file,
        # named after the file.
        project_name = os.path.splitext(os.path.basename(json_file_path))[0]
        root_node = Node("EngineeringData", name=project_name)
        graph.create(root_node)
        logger.info(f"创建根节点: {project_name}")

        # Run the per-section builders in their fixed order: cost set,
        # project division, fee table templates, fee schedules, properties.
        section_builders = (
            process_cost_set,
            process_project_division_set,
            process_fee_table_template_set,
            process_fee_schedule_set,
            process_project_property_set,
        )
        for build_section in section_builders:
            build_section(data, root_node, entity_relationships)

        # Finally create the relationships between entities.
        establish_relationships(entity_relationships)
        logger.info(f"成功创建知识图谱: {json_file_path}")
        return True
    except Exception as e:
        logger.error(f"创建知识图谱失败: {e}")
        import traceback

        traceback.print_exc()
        return False
# 添加一个新的函数,用于处理文件夹中的多个JSON文件
def create_KGs_from_folder(input_folder, ontology_file_path="Ontology_Layer.txt"):
    """Build knowledge graphs for every JSON file under a folder (or one file).

    Args:
        input_folder: Either a directory, whose ``*.json`` files are all
            processed, or the path of a single ``.json`` file.
        ontology_file_path: Path of the ontology layer file.

    Returns:
        A ``(success_count, total_count)`` tuple. ``(0, 0)`` is returned when
        the input path is invalid, no JSON file is found, the Neo4j
        configuration is incomplete, or the database connection fails.
    """
    if os.path.isdir(input_folder):
        # glob yields matches in arbitrary order; sort so that repeated runs
        # process (and log) the files in the same sequence.
        json_files = sorted(glob.glob(os.path.join(input_folder, "*.json")))
    elif os.path.isfile(input_folder) and input_folder.endswith(".json"):
        json_files = [input_folder]
    else:
        logger.error(f"输入路径无效: {input_folder}")
        return 0, 0

    if not json_files:
        logger.error(f"未找到JSON文件: {input_folder}")
        return 0, 0

    # Read the connection settings. A missing section or option raises
    # KeyError, which is reported like any other connection problem instead
    # of escaping as a traceback.
    config = read_config()
    try:
        neo4j_settings = config["neo4j"]
        uri = neo4j_settings["uri"]
        user = neo4j_settings["user"]
        password = neo4j_settings["password"]
    except KeyError as e:
        logger.error(f"配置文件缺少Neo4j连接配置: {e}")
        return 0, 0

    if not connect_to_neo4j(uri, user, password):
        return 0, 0

    # Process each file; a failure in one file does not stop the others.
    success_count = 0
    for json_file in json_files:
        logger.info(f"处理文件: {json_file}")
        if create_KG(json_file, ontology_file_path):
            success_count += 1

    logger.info(f"知识图谱构建完成,成功处理 {success_count}/{len(json_files)} 个文件")
    return success_count, len(json_files)
# 修改main函数,使用命令行参数
def main():
    """Command-line entry point: parse arguments, connect, and build the graph.

    Options:
        --input / -i: input folder or file path (default ``final_outputs``).
        --ontology / -o: ontology layer file path (default
            ``Ontology_Layer.txt``).
        --clear / -c: clear the database before building.

    Returns:
        None. The function returns early when the database connection or the
        requested clearing fails.
    """
    import argparse

    cli = argparse.ArgumentParser(description="构建知识图谱")
    cli.add_argument("--input", "-i", default="final_outputs", help="输入文件夹或文件路径")
    cli.add_argument("--ontology", "-o", default="Ontology_Layer.txt", help="本体层文件路径")
    cli.add_argument("--clear", "-c", action="store_true", help="是否清空数据库")
    options = cli.parse_args()

    # Connect using the settings from the configuration file.
    settings = read_config()
    connected = connect_to_neo4j(
        settings["neo4j"]["uri"],
        settings["neo4j"]["user"],
        settings["neo4j"]["password"],
    )
    if not connected:
        return

    # Clearing is optional; stop when it was requested but failed.
    if options.clear and not clear_database():
        return

    # Process the input folder or file and report the outcome.
    processed, total = create_KGs_from_folder(options.input, options.ontology)
    logger.info(f"知识图谱构建完成,成功处理 {processed}/{total} 个文件")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()