# Source listing metadata: 1489 lines, 65 KiB, Python
"""
|
||
Build a knowledge graph from the ontology-layer file.
|
||
"""
|
||
|
||
from py2neo import Graph, Node, Relationship, NodeMatcher
|
||
import json
|
||
import os
|
||
import logging
|
||
import re
|
||
import configparser
|
||
import glob
|
||
|
||
# Configure logging once at import time; every function in this module
# reports through the module-level ``logger``.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Shared Neo4j connection handle; assigned by connect_to_neo4j().
graph = None
|
||
|
||
|
||
def read_config(config_file="config.ini"):
    """Load an INI-style configuration file.

    Args:
        config_file: Path of the configuration file (read as UTF-8).

    Returns:
        The populated ``configparser.ConfigParser``. ``ConfigParser.read``
        ignores files it cannot open, so a missing file yields a parser
        with no sections rather than an error.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file, encoding="utf-8")
    return parser
|
||
|
||
|
||
def connect_to_neo4j(uri, user, password):
    """Open the shared Neo4j connection used by the rest of the module.

    Args:
        uri: Database URI.
        user: User name.
        password: Password.

    Returns:
        True when the ``Graph`` object was created and stored in the
        module-level ``graph``; False when construction raised (the error
        is logged and ``graph`` is left untouched).
    """
    global graph
    try:
        connection = Graph(uri, auth=(user, password))
    except Exception as e:
        logger.error(f"连接Neo4j数据库失败: {e}")
        return False
    else:
        graph = connection
        logger.info("成功连接到Neo4j数据库")
        return True
|
||
|
||
|
||
def clear_database():
    """Delete every node and relationship, then drop all constraints.

    Constraint removal is best-effort: a failure to list or drop
    constraints is logged as a warning and does not change the result.

    Returns:
        True when the data was deleted, False when the delete itself failed.
    """
    try:
        graph.run("MATCH (n) DETACH DELETE n")
        logger.info("已清空数据库")
    except Exception as e:
        logger.error(f"清空数据库失败: {e}")
        return False

    try:
        constraints = graph.run("SHOW CONSTRAINTS").data()
    except Exception as e:
        logger.warning(f"删除约束失败: {e}")
        return True

    for constraint in constraints:
        constraint_name = constraint.get("name")
        if not constraint_name:
            continue
        # Backtick-quote the name (doubling embedded backticks) so names
        # containing spaces or other special characters stay valid Cypher.
        quoted_name = "`" + str(constraint_name).replace("`", "``") + "`"
        # Drop each constraint independently so one failure does not
        # prevent the remaining constraints from being removed.
        try:
            graph.run(f"DROP CONSTRAINT {quoted_name}")
            logger.info(f"已删除约束: {constraint_name}")
        except Exception as e:
            logger.warning(f"删除约束失败: {e}")

    return True
|
||
|
||
|
||
# Parse the ontology-layer file into entity types, attributes and relationships.
def parse_ontology_file(file_path="Ontology_Layer.txt"):
    """Parse the ontology-layer file.

    The file is split at the "2. 实体间的关系" heading. In the first part a
    line without ":" starts a new entity type and a line with ":" is an
    ``attribute: type`` pair of the current entity. In the second part each
    line starting with ``(:Source)-[:REL]->(:Target)`` defines a relationship.

    Args:
        file_path: Path of the ontology-layer file (read as UTF-8).

    Returns:
        A tuple ``(entity_types, entity_relationships)`` where
        ``entity_types`` maps entity type name to ``{attribute: type}`` and
        ``entity_relationships`` is a list of
        ``(source_type, relationship_type, target_type)`` tuples.
        ``({}, [])`` is returned when the file cannot be read or does not
        contain exactly one relationship heading.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # Accept both "2. 实体间的关系" and "2.实体间的关系" so the two section
        # headings are matched with the same tolerance for spacing.
        sections = re.split(r"2\.[ \t]*实体间的关系", content)
        if len(sections) != 2:
            logger.error("本体层文件格式错误,无法找到实体类型和关系部分")
            return {}, []
        entity_section, relationship_section = sections

        # Remove the section-1 heading (with or without a space after the
        # number); otherwise it would be parsed as an entity type name.
        entity_section = re.sub(r"1\.[ \t]*实体类型", "", entity_section).strip()

        entity_types = {}
        current_entity = None
        for raw_line in entity_section.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            attr_name, separator, attr_type = line.partition(":")
            if not separator:
                # No colon: the line names a new entity type.
                current_entity = line
                entity_types[current_entity] = {}
            elif current_entity is not None:
                # Attribute lines seen before any entity type are ignored.
                entity_types[current_entity][attr_name.strip()] = attr_type.strip()

        # Relationship lines look like (:EntityType)-[:RELATIONSHIP]->(:EntityType)
        relationship_pattern = re.compile(r"\(:(\w+)\)-\[:(\w+)\]->\(:(\w+)\)")
        entity_relationships = []
        for raw_line in relationship_section.strip().split("\n"):
            match = relationship_pattern.match(raw_line.strip())
            if match:
                entity_relationships.append(match.groups())

        logger.info(f"从本体层文件中解析出 {len(entity_types)} 个实体类型和 {len(entity_relationships)} 个关系定义")
        return entity_types, entity_relationships

    except Exception as e:
        # Callers rely on the empty result instead of an exception.
        logger.error(f"解析本体层文件失败: {e}")
        return {}, []
|
||
|
||
|
||
# Constraint/index creation hook - currently creates nothing.
def create_constraints_and_indexes():
    """Log that constraint creation is skipped; no constraints are created."""
    logger.info("跳过创建约束")
|
||
|
||
|
||
# Look up the relationship type between two entity types.
def get_relationship_type(source_entity_type, target_entity_type, entity_relationships):
    """Return the relationship type defined between two entity types.

    Args:
        source_entity_type: Source entity type name.
        target_entity_type: Target entity type name.
        entity_relationships: Iterable of
            ``(source_type, relationship_type, target_type)`` tuples.

    Returns:
        The relationship type of the first matching definition, or None
        (after logging a warning) when no definition matches.
    """
    not_found = object()
    wanted = (source_entity_type, target_entity_type)
    rel_type = next(
        (rel for src, rel, dst in entity_relationships if (src, dst) == wanted),
        not_found,
    )
    if rel_type is not not_found:
        return rel_type

    # No matching definition: the caller is expected to skip the relationship.
    logger.warning(f"未找到从 {source_entity_type} 到 {target_entity_type} 的关系定义,不创建关系")
    return None
|
||
|
||
|
||
# Create the root node of the graph.
def create_root_node():
    """Create and return the ``EngineeringData`` root node named "工程"."""
    root_node = Node("EngineeringData", name="工程")
    graph.create(root_node)
    logger.info("创建根节点: 工程")
    return root_node
|
||
|
||
|
||
# Build the ProjectDivisionSet subtree.
def process_project_division_set(data, parent_node, entity_relationships):
    """Create the ProjectDivisionSet node and its ProjectDivisionTree children.

    ``projectDivision`` is read from ``data["projectData"]`` when present,
    otherwise from the top level of ``data``. Each (first level, second
    level) pair becomes one ProjectDivisionTree node; list content under a
    second-level key is handed to ``process_project_division_item``.

    Args:
        data: Parsed project JSON (a dict).
        parent_node: Node the ProjectDivisionSet is attached to.
        entity_relationships: Relationship definitions from the ontology,
            as ``(source_type, relationship_type, target_type)`` tuples.

    Returns:
        None.
    """
    if "projectData" in data and "projectDivision" in data["projectData"]:
        project_division = data["projectData"]["projectDivision"]
    elif "projectDivision" in data:
        project_division = data["projectDivision"]
    else:
        logger.warning("JSON中未找到projectDivision数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    logger.info(f"开始处理projectDivision,包含 {len(project_division)} 个顶级项目")

    division_set = Node("ProjectDivisionSet", name="项目划分集")
    graph.create(division_set)

    # Relationships are only created when the ontology defines them.
    relationship_type = get_relationship_type("EngineeringData", "ProjectDivisionSet", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, division_set))

    for first_level_name, first_level_content in project_division.items():
        if not isinstance(first_level_content, dict):
            logger.warning(f"第一层 {first_level_name} 的内容类型未知: {type(first_level_content)}")
            continue

        # The tree name drops the "工程" suffix/infix of the first-level name.
        processed_first_level = first_level_name.replace("工程", "")

        for second_level_name, second_level_content in first_level_content.items():
            # "GUID" is metadata of the first-level entry (it is stored on
            # every tree node below), not a second-level division, so it
            # must not become a ProjectDivisionTree node of its own.
            if second_level_name == "GUID":
                continue

            if second_level_name == processed_first_level:
                final_name = second_level_name
            else:
                final_name = f"{second_level_name}{processed_first_level}"

            division_tree = Node("ProjectDivisionTree", name=final_name)
            # Keep the original names so the merged name can be traced back.
            division_tree["original_first_level"] = first_level_name
            division_tree["original_second_level"] = second_level_name
            if "GUID" in first_level_content:
                division_tree["first_level_GUID"] = first_level_content["GUID"]

            graph.create(division_tree)

            relationship_type = get_relationship_type(
                "ProjectDivisionSet", "ProjectDivisionTree", entity_relationships
            )
            if relationship_type:
                graph.create(Relationship(division_set, relationship_type, division_tree))

            # GUID-based relationships are not created here; per the original
            # note they are created in bulk by establish_relationships.
            if isinstance(second_level_content, list):
                for item in second_level_content:
                    process_project_division_item(item, division_tree, entity_relationships)
            else:
                logger.warning(f"ProjectDivisionTree {final_name} 的内容类型未知: {type(second_level_content)}")
|
||
|
||
|
||
# Shared helpers for the node builders below.
_SCALAR_TYPES = (str, int, float, bool)
_NAME_KEYS = ("名称", "name", "标识", "id", "ID")


def _flatten_list(values):
    """Return a ';'-joined string summarising a JSON list."""
    if all(isinstance(entry, _SCALAR_TYPES) for entry in values):
        return ";".join(str(entry) for entry in values)

    parts = []
    for entry in values:
        if not isinstance(entry, dict):
            parts.append(str(entry))
            continue
        # Prefer a name/identifier key; otherwise fall back to the first
        # key/value pair (an empty dict contributes nothing).
        for name_key in _NAME_KEYS:
            if name_key in entry:
                parts.append(str(entry[name_key]))
                break
        else:
            if entry:
                first_key = next(iter(entry))
                parts.append(f"{first_key}:{entry[first_key]}")
    return ";".join(parts)


def _copy_extra_properties(node, source, skip_keys):
    """Copy the remaining JSON fields of ``source`` onto ``node`` as properties.

    Keys in ``skip_keys`` and None values are ignored. Scalars are stored
    as-is; lists and dicts are flattened into ';'-separated strings.
    """
    for key, value in source.items():
        if key in skip_keys or value is None:
            continue
        if key == "资源库列表" and isinstance(value, list):
            # The resource library list is stored as its names only.
            node["资源库名称"] = ";".join(
                str(resource["资源库名称"])
                for resource in value
                if isinstance(resource, dict) and "资源库名称" in resource
            )
        elif isinstance(value, _SCALAR_TYPES):
            node[key] = value
        elif isinstance(value, list):
            try:
                node[key] = _flatten_list(value)
            except Exception as e:
                logger.warning(f"无法将列表属性 {key} 转换为字符串: {e}")
        elif isinstance(value, dict):
            try:
                node[key] = ";".join(
                    f"{dict_key}:{dict_value}"
                    for dict_key, dict_value in value.items()
                    if isinstance(dict_value, _SCALAR_TYPES)
                )
            except Exception as e:
                logger.warning(f"无法将字典属性 {key} 转换为字符串: {e}")


def _primary_label(node):
    """Return the label used to look up relationships that start at ``node``.

    Node labels are an unordered set, so taking "the first" label of a
    multi-label node (e.g. ProjectQuantity + Quota) is not deterministic.
    The ProjectQuantity base label is preferred; otherwise the
    alphabetically first label is used.
    """
    labels = sorted(node.labels)
    if "ProjectQuantity" in labels:
        return "ProjectQuantity"
    return labels[0]


def _process_division_children(children, parent_node, entity_relationships):
    """Dispatch the children of a division item or list by their type field."""
    for child in children:
        child_type = child.get("type", child.get("类型", ""))
        if child_type == "项目划分":
            process_project_division_item(child, parent_node, entity_relationships)
        elif child_type in ("8", "清单"):
            process_list_item(child, parent_node, entity_relationships)
        else:
            # Everything else is treated as ProjectQuantity (or a subclass).
            process_project_quantity(child, parent_node, entity_relationships)


# Build ProjectDivisionItem nodes.
def process_project_division_item(item, parent_node, entity_relationships):
    """Create a ProjectDivisionItem node, link it to its parent and recurse.

    Args:
        item: JSON dict of the division item ("GUID", "项目名称", optional
            "children" and further fields copied as properties).
        parent_node: ProjectDivisionTree node or another already created node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None. Items with neither GUID nor name are skipped with a warning.
    """
    guid = item.get("GUID", "")
    name = item.get("项目名称", "")
    if not guid and not name:
        logger.warning("ProjectDivisionItem缺少GUID和项目名称")
        return

    item_node = Node("ProjectDivisionItem", GUID=guid, name=name)

    # "path" holds node names only (no type suffix) from the tree downwards.
    parent_is_tree = isinstance(parent_node, Node) and "ProjectDivisionTree" in parent_node.labels
    if parent_is_tree:
        item_node["path"] = f"{parent_node['name']}/{name}"
    else:
        parent_path = parent_node.get("path", "")
        # A parent without a path is not expected; fall back to the bare name.
        item_node["path"] = f"{parent_path}/{name}" if parent_path else name

    _copy_extra_properties(item_node, item, ("GUID", "项目名称", "children"))
    graph.create(item_node)

    # Any non-tree parent is looked up as ProjectDivisionItem.
    source_label = "ProjectDivisionTree" if parent_is_tree else "ProjectDivisionItem"
    relationship_type = get_relationship_type(source_label, "ProjectDivisionItem", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, item_node))

    # GUID-based relationships are not created here; per the original note
    # they are created in bulk by establish_relationships.
    children = item.get("children")
    if children:
        _process_division_children(children, item_node, entity_relationships)


# Build List nodes.
def process_list_item(list_item, parent_node, entity_relationships):
    """Create a List node, link it to its parent and CostSet, then recurse.

    Args:
        list_item: JSON dict of the list entry ("GUID", "清单名称", "类型",
            optional "children" and further fields copied as properties).
        parent_node: Already created parent node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None.
    """
    guid = list_item.get("GUID", "")
    list_name = list_item.get("清单名称")
    list_type = list_item.get("类型", "")

    list_node = Node("List", guid=guid, name=list_name, type=list_type)

    # The path includes the node kind as a "(清单)" suffix.
    path_prefix = parent_node.get("path", "") or parent_node.get("name", "")
    list_node["path"] = f"{path_prefix}/{list_name}(清单)"

    # NOTE(review): only lowercase "guid" is skipped, so the JSON "GUID"
    # field is also copied as a separate "GUID" property. Kept as-is.
    _copy_extra_properties(list_node, list_item, ("清单名称", "类型", "guid", "children"))
    graph.create(list_node)

    relationship_type = get_relationship_type(_primary_label(parent_node), "List", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, list_node))

    if guid:
        # Pass the GUID as a query parameter so quotes in the value cannot
        # break or alter the Cypher statement.
        cost_set_nodes = list(
            graph.run("MATCH (c:CostSet) WHERE c.GUID = $guid RETURN c", guid=guid)
        )
        if cost_set_nodes:
            cost_set_node = cost_set_nodes[0]["c"]
            relationship_type = get_relationship_type("List", "CostSet", entity_relationships)
            if relationship_type:
                graph.create(Relationship(list_node, relationship_type, cost_set_node))

    children = list_item.get("children")
    if children:
        _process_division_children(children, list_node, entity_relationships)


# Build ProjectQuantity nodes and their subclasses.
def process_project_quantity(quantity, parent_node, entity_relationships):
    """Create a ProjectQuantity node (optionally sub-typed) and its children.

    The "类型" field selects an extra label: Quota ("0"/"定额"),
    MainMaterial ("1"/"主材") or Equipment ("5"/"设备").

    Args:
        quantity: JSON dict of the quantity entry.
        parent_node: Already created parent node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None.
    """
    quantity_type = quantity.get("类型", "")
    labels = ["ProjectQuantity"]
    type_name = "ProjectQuantity"
    if quantity_type in ("0", "定额"):
        labels.append("Quota")
        type_name = "定额"
    elif quantity_type in ("1", "主材"):
        labels.append("MainMaterial")
        type_name = "主材"
    elif quantity_type in ("5", "设备"):
        labels.append("Equipment")
        type_name = "设备"

    quantity_id = quantity.get("id", "")
    quantity_name = quantity.get("项目名称", quantity.get("名称", ""))
    quantity_node = Node(*labels, id=quantity_id, name=quantity_name)

    # The path includes the node kind as a "(type)" suffix.
    path_prefix = parent_node.get("path", "") or parent_node.get("name", "")
    quantity_node["path"] = f"{path_prefix}/{quantity_name}({type_name})"

    _copy_extra_properties(quantity_node, quantity, ("id", "名称", "项目名称", "材机列表", "children"))
    graph.create(quantity_node)

    relationship_type = get_relationship_type(
        _primary_label(parent_node), "ProjectQuantity", entity_relationships
    )
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, quantity_node))

    # "材机列表" takes precedence; "children" is only used when it is absent/empty.
    materials = quantity.get("材机列表")
    if materials:
        for material in materials:
            process_material_or_equipment(material, quantity_node, entity_relationships)
        return

    for child in quantity.get("children") or ():
        child_type = child.get("类型", child.get("type", ""))
        if child_type in ("人工", "材料", "机械", "2", "3", "4"):
            # Labour, material and machinery entries are MaterialOrEquipment.
            process_material_or_equipment(child, quantity_node, entity_relationships)
        elif child_type in ("1", "主材", "5", "设备", "0", "定额"):
            process_project_quantity(child, quantity_node, entity_relationships)
        # Children of any other type are ignored.


# Build MaterialOrEquipment nodes.
def process_material_or_equipment(material, parent_node, entity_relationships):
    """Create a MaterialOrEquipment node and link it to its parent.

    Args:
        material: JSON dict of the entry ("id"/"ID", "名称", "类型"/"type"
            and further fields copied as properties).
        parent_node: Already created parent node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None. Entries with neither id nor name are skipped with a warning.
    """
    material_id = material.get("id", material.get("ID", ""))
    material_name = material.get("名称", "")
    material_type = material.get("类型", material.get("type", ""))
    if not material_id and not material_name:
        logger.warning("MaterialOrEquipment缺少id和名称")
        return

    # unique_id combines the parent's id (or GUID) with this entry's id.
    parent_id = parent_node.get("id", parent_node.get("GUID", ""))
    unique_id = f"{parent_id}_{material_id}" if parent_id else material_id

    # A new node is always created; existing nodes are not looked up.
    material_node = Node(
        "MaterialOrEquipment", id=material_id, unique_id=unique_id, name=material_name, type=material_type
    )
    _copy_extra_properties(material_node, material, ("id", "ID", "名称", "类型", "type"))
    graph.create(material_node)

    relationship_type = get_relationship_type("ProjectQuantity", "MaterialOrEquipment", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, material_node))
|
||
|
||
|
||
# Build CostSet nodes.
def process_cost_set(data, root_node, entity_relationships):
    """Create a CostSet node for every GUID-bearing node in the expense preview.

    ``expensePreview`` is read from ``data["projectData"]`` when present,
    otherwise from the top level of ``data``. Every list found under
    ``expensePreview[category][subcategory]`` is walked recursively through
    its "children" lists.

    Args:
        data: Parsed project JSON (a dict).
        root_node: Node each CostSet is attached to.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None.
    """
    if "projectData" in data and "expensePreview" in data["projectData"]:
        expense_preview = data["projectData"]["expensePreview"]
    elif "expensePreview" in data:
        expense_preview = data["expensePreview"]
    else:
        logger.warning("JSON中未找到expensePreview数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    # Normalised GUIDs already turned into CostSet nodes.
    processed_guids = set()

    def create_cost_set(guid, node):
        """Create one CostSet with its cost items; return False if creation failed."""
        cost_set_node = Node("CostSet", GUID=guid, name="费用预览集")
        try:
            graph.create(cost_set_node)
        except Exception as e:
            logger.error(f"创建CostSet节点失败: {e}")
            return False

        relationship_type = get_relationship_type("EngineeringData", "CostSet", entity_relationships)
        if relationship_type:
            try:
                graph.create(Relationship(root_node, relationship_type, cost_set_node))
            except Exception as e:
                logger.error(f"创建关系失败: {e}")

        # "sum" holds CostItem entries, "rcj" MaterialandmachineCostItem entries.
        handlers = (
            ("sum", process_cost_item, "CostItem"),
            ("rcj", process_material_machine_cost_item, "MaterialandmachineCostItem"),
        )
        for field, handler, label in handlers:
            entries = node.get(field)
            if not isinstance(entries, list):
                continue
            for entry in entries:
                # One bad entry must not abort the remaining ones.
                try:
                    handler(entry, cost_set_node, entity_relationships)
                except Exception as e:
                    logger.error(f"处理{label}时出错: {e}")
        return True

    def traverse_expense_preview(node, path=""):
        """Walk dicts and lists of the preview, creating CostSets on the way."""
        if isinstance(node, list):
            for i, item in enumerate(node):
                traverse_expense_preview(item, f"{path}[{i}]")
            return
        if not isinstance(node, dict):
            return

        if "GUID" in node:
            raw_guid = node["GUID"]
            if isinstance(raw_guid, str):
                # Normalise to upper case wrapped in braces.
                guid = "{" + raw_guid.strip("{}").upper() + "}"
                if guid not in processed_guids:
                    processed_guids.add(guid)
                    if not create_cost_set(guid, node):
                        # As before, a failed CostSet stops this subtree.
                        return
            else:
                # A non-string GUID cannot be normalised; skip the CostSet
                # but keep walking the children.
                logger.warning(f"跳过GUID不是字符串的费用节点: {path}")

        children = node.get("children")
        if isinstance(children, list):
            for i, child in enumerate(children):
                traverse_expense_preview(child, f"{path}.children[{i}]")

    for category_name, category in expense_preview.items():
        if not isinstance(category, dict):
            # Guard against null/list categories instead of crashing.
            logger.warning(f"expensePreview.{category_name} 的内容类型未知: {type(category)}")
            continue
        for subcategory_name, subcategory in category.items():
            if isinstance(subcategory, list):
                for i, item in enumerate(subcategory):
                    traverse_expense_preview(item, f"expensePreview.{category_name}.{subcategory_name}[{i}]")
|
||
|
||
|
||
# Build CostItem nodes.
def process_cost_item(item, parent_node, entity_relationships):
    """Create a CostItem node under a CostSet.

    Args:
        item: JSON dict with at least "id"; "cost" and any other scalar
            fields are stored as properties.
        parent_node: The owning CostSet node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None. Non-dict items and items without an id are skipped (logged).
    """
    if not isinstance(item, dict):
        logger.error(f"CostItem不是字典类型: {item}")
        return

    item_id = item.get("id", "")
    if not item_id:
        logger.warning(f"跳过没有id的CostItem: {item}")
        return

    # unique_id combines the parent's GUID with this item's id.
    parent_guid = parent_node.get("GUID", "")
    unique_id = f"{parent_guid}_{item_id}" if parent_guid else item_id

    # The id doubles as the node name.
    item_node = Node(
        "CostItem",
        id=item_id,
        unique_id=unique_id,
        cost=item.get("cost", ""),
        name=item_id,
    )

    # Only scalar extras are kept; "id" and "cost" are already set above.
    extras = {
        field: content
        for field, content in item.items()
        if field not in ("id", "cost") and isinstance(content, (str, int, float, bool))
    }
    for field, content in extras.items():
        item_node[field] = content

    graph.create(item_node)

    relationship_type = get_relationship_type("CostSet", "CostItem", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, item_node))
|
||
|
||
|
||
# Build MaterialandmachineCostItem nodes (labour/material/machinery totals).
_MMC_OPTIONAL_FIELDS = (
    "供货方",
    "编码",
    "单位",
    "预算价不含税",
    "市场价不含税",
    "预算价合价",
    "市场价合价",
    "价差",
    "数量",
)


def _mmc_has_value(value):
    """Return True for truthy values and for numeric zero (a real amount)."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return True
    return bool(value)


def process_material_machine_cost_item(item, parent_node, entity_relationships):
    """Create a MaterialandmachineCostItem node under a CostSet.

    All properties are stored as strings. Optional fields are stored when
    they carry a value; numeric zero counts as a value so that, for
    example, a price difference of 0 is not silently dropped.

    Args:
        item: JSON dict with "名称" plus optional "type", "编码", "单位",
            "供货方", price, total and quantity fields.
        parent_node: The owning CostSet node.
        entity_relationships: Relationship definitions from the ontology.

    Returns:
        None. Non-dict items, items without a name and items whose node
        cannot be created are skipped (logged).
    """
    # Same guard as process_cost_item, for a clear message on bad input.
    if not isinstance(item, dict):
        logger.error(f"MaterialandmachineCostItem不是字典类型: {item}")
        return

    name = item.get("名称", "")
    if not name:
        logger.warning("MaterialandmachineCostItem缺少名称")
        return

    # unique_id combines the parent's GUID with this item's code; when only
    # one of them is available that one is used alone.
    code = item.get("编码", "")
    parent_guid = parent_node.get("GUID", "")
    unique_id = f"{parent_guid}_{code}" if parent_guid and code else (parent_guid or code)

    properties = {
        "type": str(item.get("type", "")),
        "name": str(name),
        "unique_id": str(unique_id),
    }
    for field in _MMC_OPTIONAL_FIELDS:
        value = item.get(field, "")
        if _mmc_has_value(value):
            properties[field] = str(value)

    item_node = Node("MaterialandmachineCostItem", **properties)
    try:
        graph.create(item_node)
    except Exception as e:
        logger.error(f"创建MaterialandmachineCostItem节点失败: {e}")
        # Dump every property to help diagnose the failure.
        for key, value in properties.items():
            logger.error(f"属性 {key}: {type(value)} = {value}")
        return

    relationship_type = get_relationship_type("CostSet", "MaterialandmachineCostItem", entity_relationships)
    if relationship_type:
        try:
            graph.create(Relationship(parent_node, relationship_type, item_node))
        except Exception as e:
            logger.error(f"创建关系失败: {e}")
    else:
        logger.info(f"不创建关系: {parent_node['name']} 到 {name}")
|
||
|
||
|
||
# 修改establish_relationships函数,添加项目划分与人材机合价集合的关系
|
||
def establish_relationships(entity_relationships):
    """Link ProjectDivisionItem and ProjectQuantity nodes to CostSet nodes with equal GUIDs.

    Works on the module-level ``graph`` connection in three phases:

    1. Log the node count and up to ten sample identifiers for the
       ProjectDivisionItem, ProjectQuantity and CostSet labels.
    2. For each source label, ask ``get_relationship_type`` for the
       relationship type towards CostSet. When one is returned, run a single
       Cypher statement that creates the relationship between every
       source/CostSet pair whose GUIDs match (case-insensitive, curly braces
       stripped) and that is not already linked. A failure of that statement
       is logged and does not stop the remaining work.
    3. Log how many relationships (of any type) point from each source label
       to CostSet.

    Args:
        entity_relationships: Relationship definitions, passed through
            unchanged to ``get_relationship_type``.
    """
    logger.info("检查数据库中的节点情况...")

    # Each entry: (query, label, result column with the samples, caption in the log line).
    # NOTE(review): the ProjectQuantity probe samples pq.id while the linking
    # statement below matches on pq.GUID - confirm this is intended.
    node_probes = (
        (
            """
            MATCH (pdi:ProjectDivisionItem)
            RETURN count(pdi) as count, collect(distinct pdi.GUID)[..10] as sample_guids
            """,
            "ProjectDivisionItem",
            "sample_guids",
            "GUID",
        ),
        (
            """
            MATCH (pq:ProjectQuantity)
            RETURN count(pq) as count, collect(distinct pq.id)[..10] as sample_ids
            """,
            "ProjectQuantity",
            "sample_ids",
            "ID",
        ),
        (
            """
            MATCH (cs:CostSet)
            RETURN count(cs) as count, collect(distinct cs.GUID)[..10] as sample_guids
            """,
            "CostSet",
            "sample_guids",
            "GUID",
        ),
    )
    for probe_query, label, sample_column, caption in node_probes:
        summary = graph.run(probe_query).data()[0]
        logger.info(f"数据库中有 {summary['count']} 个{label}节点")
        logger.info(f"{label}节点{caption}样本: {summary[sample_column]}")

    # (Cypher alias, node label) of every label that may be linked to CostSet.
    link_sources = (("pdi", "ProjectDivisionItem"), ("pq", "ProjectQuantity"))

    for alias, source_label in link_sources:
        relationship_type = get_relationship_type(source_label, "CostSet", entity_relationships)
        if not relationship_type:
            # Nothing defined for this pair: skip creation entirely.
            logger.info(f"本体层中未定义{source_label}到CostSet的关系,跳过创建")
            continue

        # Doubled braces are f-string escapes for the literal '{' and '}'
        # characters stripped from the GUIDs before comparison.
        link_query = f"""
        MATCH ({alias}:{source_label}), (cs:CostSet)
        WHERE toUpper(replace(replace({alias}.GUID, '{{', ''), '}}', '')) = toUpper(replace(replace(cs.GUID, '{{', ''), '}}', ''))
        AND {alias}.GUID <> ""
        AND NOT EXISTS(({alias})-[:{relationship_type}]->(cs))
        CREATE ({alias})-[:{relationship_type}]->(cs)
        RETURN count(*) as count
        """
        try:
            created = graph.run(link_query).data()[0]["count"]
            logger.info(f"创建了 {created} 个 {source_label} {relationship_type} CostSet 关系")
        except Exception as e:
            logger.error(f"创建{source_label}与CostSet关系失败: {e}")

    # Final tally of relationships from each source label to CostSet.
    for alias, source_label in link_sources:
        tally_query = f"""
        MATCH ({alias}:{source_label})-[r]->(cs:CostSet)
        RETURN count(r) as count
        """
        tally = graph.run(tally_query).data()[0]
        logger.info(f"数据库中最终有 {tally['count']} 个{source_label}到CostSet的关系")
|
||
|
||
|
||
# 处理取费表模板集(FeeTableTemplateSet)
|
||
def process_fee_table_template_set(data, parent_node, entity_relationships):
    """Create the FeeTableTemplateSet node and process every fee table template below it.

    The ``costSetting`` mapping is read from ``data["projectData"]`` when
    present there, otherwise from the top level of ``data``. If it is found in
    neither place a warning is logged and nothing is created.

    Exactly one FeeTableTemplateSet node is created per call. It is linked to
    ``parent_node`` only when ``get_relationship_type`` returns a relationship
    type; other functions in this file treat a falsy result as "not defined in
    the ontology", and this function now does the same instead of building a
    relationship without a type.

    Args:
        data: Parsed JSON document of one project.
        parent_node: Node the template set is attached to (looked up in the
            ontology as "EngineeringData").
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type`` and to the item processor.
    """
    if "projectData" in data and "costSetting" in data["projectData"]:
        cost_setting = data["projectData"]["costSetting"]
    elif "costSetting" in data:
        cost_setting = data["costSetting"]
    else:
        logger.warning("JSON中未找到costSetting数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    # A single set node collects the templates of every costSetting entry.
    fee_template_set_node = Node("FeeTableTemplateSet", name="取费表模板集")
    graph.create(fee_template_set_node)

    relationship_type = get_relationship_type("EngineeringData", "FeeTableTemplateSet", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, fee_template_set_node))
    else:
        logger.info("本体层中未定义EngineeringData到FeeTableTemplateSet的关系,跳过创建")

    # Only the entry contents matter; the entry names are not stored.
    for template_set_content in cost_setting.values():
        # Skip entries that are not mappings: a substring/element test on a
        # str or list would otherwise be mistaken for a key lookup.
        if not isinstance(template_set_content, dict):
            continue
        tables = template_set_content.get("tables")
        if isinstance(tables, list):
            for template_item in tables:
                process_fee_table_template_item(template_item, fee_template_set_node, entity_relationships)
|
||
|
||
|
||
# 处理取费表模板项(FeeTableTemplateItem)
|
||
def process_fee_table_template_item(template_item, parent_node, entity_relationships):
    """Create one FeeTableTemplateItem node and process its child fee entries.

    An item without a ``name`` is skipped with a warning. The node is linked
    to ``parent_node`` only when ``get_relationship_type`` returns a
    relationship type, matching the guard used by ``process_fee``; previously
    the relationship was built even when no type was returned.

    Args:
        template_item: Mapping describing the template; the keys read here are
            "name", "OutlayID", "类型", "专业" and "children".
        parent_node: The FeeTableTemplateSet node this item belongs to.
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type`` and ``process_fee``.
    """
    name = template_item.get("name", "")
    if not name:
        logger.warning("FeeTableTemplateItem缺少name")
        return

    template_item_node = Node(
        "FeeTableTemplateItem",
        name=name,
        outlayID=template_item.get("OutlayID", ""),
        type=template_item.get("类型", ""),
        profession=template_item.get("专业", ""),
    )
    graph.create(template_item_node)

    relationship_type = get_relationship_type("FeeTableTemplateSet", "FeeTableTemplateItem", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, template_item_node))

    # Child fee entries hang below this template item.
    children = template_item.get("children")
    if isinstance(children, list):
        for fee_item in children:
            process_fee(fee_item, template_item_node, entity_relationships)
|
||
|
||
|
||
# 处理取费(FeeCollection)
|
||
def process_fee(fee_item, parent_node, entity_relationships):
    """Create a FeeCollection node for one fee entry and recurse into its children.

    An entry without "费用名称" is skipped with a warning. The node's ``path``
    is the parent's ``path`` (when the parent is itself a FeeCollection) or
    the parent's ``name`` (any other parent), followed by "/" and the fee
    name. The "rate", "base" and "remark" properties are stored only when the
    corresponding source value is truthy. The node is linked to
    ``parent_node`` only when ``get_relationship_type`` returns a type.

    Args:
        fee_item: Mapping describing the fee; the keys read here are "序号",
            "费用名称", "代码", "费率(%)", "取费基数", "备注" and "children".
        parent_node: FeeTableTemplateItem or FeeCollection node above this fee.
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type`` and to the recursive calls.
    """
    fee_name = fee_item.get("费用名称", "")
    if not fee_name:
        logger.warning("Fee缺少费用名称")
        return

    fee_node = Node(
        "FeeCollection",
        serialNumber=fee_item.get("序号", ""),
        name=fee_name,
        code=fee_item.get("代码", ""),
    )

    # Nested fees extend the parent's path; top-level fees start from the
    # name of the template item they belong to.
    prefix_key = "path" if "FeeCollection" in parent_node.labels else "name"
    path_prefix = parent_node.get(prefix_key, "")
    fee_node["path"] = f"{path_prefix}/{fee_name}"

    # Optional properties are written only when the source value is truthy.
    optional_properties = (
        ("rate", fee_item.get("费率(%)", "")),
        ("base", fee_item.get("取费基数", "")),
        ("remark", fee_item.get("备注", "")),
    )
    for property_name, property_value in optional_properties:
        if property_value:
            fee_node[property_name] = property_value

    graph.create(fee_node)

    # The ontology lookup uses the first label of the parent node.
    parent_label = list(parent_node.labels)[0]
    relationship_type = get_relationship_type(parent_label, "FeeCollection", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, fee_node))

    child_fees = fee_item["children"] if "children" in fee_item else None
    if isinstance(child_fees, list):
        for child_fee in child_fees:
            process_fee(child_fee, fee_node, entity_relationships)
|
||
|
||
|
||
# 处理费用表集(FeeScheduleSet)
|
||
def process_fee_schedule_set(data, parent_node, entity_relationships):
    """Create the FeeScheduleSet node, its FeeScheduleItem nodes and their fees.

    The ``projectCost`` mapping is read from ``data["projectData"]`` when
    present there, otherwise from the top level of ``data``. If it is found in
    neither place a warning is logged and nothing is created.

    One FeeScheduleItem node is created per ``projectCost`` entry; when the
    entry's value is a list, every element is handed to ``process_fee_item``.
    Relationships are created only when ``get_relationship_type`` returns a
    type, matching the guard used by ``process_fee_item``; previously they
    were built even when no type was returned.

    Args:
        data: Parsed JSON document of one project.
        parent_node: Node the fee schedule set is attached to (looked up in
            the ontology as "EngineeringData").
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type`` and ``process_fee_item``.
    """
    if "projectData" in data and "projectCost" in data["projectData"]:
        project_cost = data["projectData"]["projectCost"]
    elif "projectCost" in data:
        project_cost = data["projectCost"]
    else:
        logger.warning("JSON中未找到projectCost数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    fee_schedule_set = Node("FeeScheduleSet", name="工程费用")
    graph.create(fee_schedule_set)

    set_relationship = get_relationship_type("EngineeringData", "FeeScheduleSet", entity_relationships)
    if set_relationship:
        graph.create(Relationship(parent_node, set_relationship, fee_schedule_set))
    else:
        logger.info("本体层中未定义EngineeringData到FeeScheduleSet的关系,跳过创建")

    # The set -> item relationship type does not depend on the item, so it is
    # looked up once instead of once per entry.
    # NOTE(review): assumes get_relationship_type has no per-call side effects - confirm.
    item_relationship = get_relationship_type("FeeScheduleSet", "FeeScheduleItem", entity_relationships)
    if not item_relationship:
        logger.info("本体层中未定义FeeScheduleSet到FeeScheduleItem的关系,跳过创建")

    for fee_table_name, fee_table_content in project_cost.items():
        fee_schedule_item = Node("FeeScheduleItem", name=fee_table_name)
        graph.create(fee_schedule_item)
        if item_relationship:
            graph.create(Relationship(fee_schedule_set, item_relationship, fee_schedule_item))

        if isinstance(fee_table_content, list):
            for fee_item in fee_table_content:
                process_fee_item(fee_item, fee_schedule_item, entity_relationships)
        else:
            logger.warning(f"FeeScheduleItem {fee_table_name} 的内容类型未知: {type(fee_table_content)}")
|
||
|
||
|
||
# 处理费用项(Fee)
|
||
def process_fee_item(fee, parent_node, entity_relationships):
    """Create a Fee node for one fee entry and recurse into its children.

    An entry without "费用名称" is skipped with a warning. The node's ``path``
    is the parent's ``path`` (when the parent is itself a Fee) or the parent's
    ``name`` (any other parent), followed by "/" and the fee name. "rate" and
    "amount" are stored only when truthy. Every remaining key of ``fee`` whose
    value is a str, int, float or bool is copied onto the node under its
    original key. The node is linked to ``parent_node`` only when
    ``get_relationship_type`` returns a type.

    Args:
        fee: Mapping describing the fee; the keys handled explicitly are
            "序号", "费用名称", "代码", "费率(%)", "金额" and "children".
        parent_node: FeeScheduleItem or Fee node above this fee.
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type`` and to the recursive calls.
    """
    name = fee.get("费用名称", "")
    if not name:
        logger.warning("Fee缺少费用名称")
        return

    fee_node = Node("Fee", serialNumber=fee.get("序号", ""), name=name, code=fee.get("代码", ""))

    # Nested fees extend the parent's path; top-level fees start from the
    # name of the schedule item they belong to.
    prefix_key = "path" if "Fee" in parent_node.labels else "name"
    path_prefix = parent_node.get(prefix_key, "")
    fee_node["path"] = f"{path_prefix}/{name}"

    # Optional properties are written only when the source value is truthy.
    for property_name, source_key in (("rate", "费率(%)"), ("amount", "金额")):
        source_value = fee.get(source_key, "")
        if source_value:
            fee_node[property_name] = source_value

    # Copy the remaining scalar fields verbatim. bool is a subclass of int and
    # None matches none of the listed types, so this test accepts exactly
    # str/int/float/bool values that are not None.
    handled_keys = ("序号", "费用名称", "代码", "费率(%)", "金额", "children")
    for key, value in fee.items():
        if key in handled_keys:
            continue
        if isinstance(value, (str, int, float)):
            fee_node[key] = value

    graph.create(fee_node)

    # The ontology lookup uses the first label of the parent node.
    parent_label = list(parent_node.labels)[0]
    relationship_type = get_relationship_type(parent_label, "Fee", entity_relationships)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, fee_node))

    # A missing or empty "children" entry means there is nothing to recurse into.
    children = fee["children"] if "children" in fee else None
    if children:
        for child in children:
            process_fee_item(child, fee_node, entity_relationships)
|
||
|
||
|
||
# 处理工程属性集(ProjectPropertySet)和工程属性(ProjectProperty)
|
||
def _attach_project_property(name, value, parent_node, relationship_type):
    """Create one ProjectProperty node and link it below ``parent_node`` when a type is given."""
    property_node = Node("ProjectProperty", name=name, value=value)
    graph.create(property_node)
    if relationship_type:
        graph.create(Relationship(parent_node, relationship_type, property_node))
    return property_node


def process_project_property_set(data, parent_node, entity_relationships):
    """Create the ProjectPropertySet node and one ProjectProperty node per property.

    The ``projectInfo`` mapping is read from ``data["projectData"]`` when
    present there, otherwise from the top level of ``data``. If it is found in
    neither place a warning is logged and nothing is created.

    Each non-None entry of ``projectInfo`` becomes a ProjectProperty node:

    * list value: a summary node ("列表(N项)") plus one child node per element;
      dict elements are expanded into one child per non-None key, named
      ``key[i].sub_key``, other elements are named ``key[i]``.
    * dict value: a summary node ("字典(N项)") plus one child node per
      non-None key, named ``key.sub_key``.
    * any other value: a single node holding ``str(value)``.

    Relationships are created only when ``get_relationship_type`` returns a
    type; previously they were built even when no type was returned. The two
    relationship types used inside the loops are looked up once up front.

    Args:
        data: Parsed JSON document of one project.
        parent_node: Node the property set is attached to (looked up in the
            ontology as "EngineeringData").
        entity_relationships: Relationship definitions, passed through to
            ``get_relationship_type``.
    """
    if "projectData" in data and "projectInfo" in data["projectData"]:
        project_info = data["projectData"]["projectInfo"]
    elif "projectInfo" in data:
        project_info = data["projectInfo"]
    else:
        logger.warning("JSON中未找到projectInfo数据")
        logger.info(f"JSON顶层键: {list(data.keys())}")
        return

    property_set_node = Node("ProjectPropertySet", name="工程属性")
    graph.create(property_set_node)

    set_relationship = get_relationship_type("EngineeringData", "ProjectPropertySet", entity_relationships)
    if set_relationship:
        graph.create(Relationship(parent_node, set_relationship, property_set_node))
    else:
        logger.info("本体层中未定义EngineeringData到ProjectPropertySet的关系,跳过创建")

    # Both lookups are loop-invariant, so they are done once here.
    # NOTE(review): assumes get_relationship_type has no per-call side effects - confirm.
    member_relationship = get_relationship_type("ProjectPropertySet", "ProjectProperty", entity_relationships)
    if not member_relationship:
        logger.info("本体层中未定义ProjectPropertySet到ProjectProperty的关系,跳过创建")
    nested_relationship = get_relationship_type("ProjectProperty", "ProjectProperty", entity_relationships)
    if not nested_relationship:
        logger.info("本体层中未定义ProjectProperty到ProjectProperty的关系,跳过创建")

    for key, value in project_info.items():
        if value is None:
            continue

        if isinstance(value, list):
            # Summary node for the list, then one child per element.
            list_node = _attach_project_property(
                key, f"列表({len(value)}项)", property_set_node, member_relationship
            )
            for i, item in enumerate(value):
                if isinstance(item, dict):
                    for sub_key, sub_value in item.items():
                        if sub_value is not None:
                            _attach_project_property(
                                f"{key}[{i}].{sub_key}", str(sub_value), list_node, nested_relationship
                            )
                else:
                    _attach_project_property(f"{key}[{i}]", str(item), list_node, nested_relationship)
        elif isinstance(value, dict):
            # Summary node for the dict, then one child per non-None entry.
            dict_node = _attach_project_property(
                key, f"字典({len(value)}项)", property_set_node, member_relationship
            )
            for sub_key, sub_value in value.items():
                if sub_value is not None:
                    _attach_project_property(f"{key}.{sub_key}", str(sub_value), dict_node, nested_relationship)
        else:
            _attach_project_property(key, str(value), property_set_node, member_relationship)
|
||
|
||
|
||
# 将创建知识图谱的功能封装为函数
|
||
def create_KG(json_file_path, ontology_file_path="Ontology_Layer.txt"):
    """Build the knowledge graph for one JSON file.

    Parses the ontology file, loads the JSON document, creates an
    EngineeringData root node named after the JSON file (base name without
    extension) and runs every ``process_*`` step followed by
    ``establish_relationships``. Uses the module-level ``graph`` connection.

    Args:
        json_file_path: Path of the JSON file to import.
        ontology_file_path: Path of the ontology layer file.

    Returns:
        bool: True when every step completed; False when the ontology could
        not be parsed, the JSON file could not be read, or any later step
        raised an exception.
    """
    try:
        entity_types, entity_relationships = parse_ontology_file(ontology_file_path)
        if not entity_types or not entity_relationships:
            logger.error("解析本体层文件失败")
            return False

        try:
            with open(json_file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: file missing or unreadable. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"读取JSON文件失败: {e}")
            return False

        # The file's base name (without extension) is used as the project name.
        project_name = os.path.splitext(os.path.basename(json_file_path))[0]

        # Every JSON file gets its own EngineeringData root node.
        root_node = Node("EngineeringData", name=project_name)
        graph.create(root_node)
        logger.info(f"创建根节点: {project_name}")

        process_cost_set(data, root_node, entity_relationships)
        process_project_division_set(data, root_node, entity_relationships)
        process_fee_table_template_set(data, root_node, entity_relationships)
        process_fee_schedule_set(data, root_node, entity_relationships)
        process_project_property_set(data, root_node, entity_relationships)

        # Cross-links between the entities created above.
        establish_relationships(entity_relationships)

        logger.info(f"成功创建知识图谱: {json_file_path}")
        return True

    except Exception as e:
        # Top-level boundary for one file: report the failure with its
        # traceback through logging, so it reaches every configured handler.
        logger.exception(f"创建知识图谱失败: {e}")
        return False
|
||
|
||
|
||
# 添加一个新的函数,用于处理文件夹中的多个JSON文件
|
||
def create_KGs_from_folder(input_folder, ontology_file_path="Ontology_Layer.txt"):
    """Build knowledge graphs from every JSON file in a folder, or from a single JSON file.

    Reads the Neo4j connection settings via ``read_config``, connects, and
    calls ``create_KG`` for each file. Files found in a folder are processed
    in sorted path order so that runs are reproducible.

    Args:
        input_folder: Path of a folder containing ``*.json`` files, or the
            path of a single ``.json`` file.
        ontology_file_path: Path of the ontology layer file.

    Returns:
        tuple: ``(success_count, total_count)``. ``(0, 0)`` is returned when
        the path is invalid, no JSON file is found, the Neo4j settings are
        missing from the configuration, or the connection fails.
    """
    if os.path.isdir(input_folder):
        # glob returns matches in arbitrary order; sort for a stable sequence.
        json_files = sorted(glob.glob(os.path.join(input_folder, "*.json")))
    elif os.path.isfile(input_folder) and input_folder.endswith(".json"):
        json_files = [input_folder]
    else:
        logger.error(f"输入路径无效: {input_folder}")
        return 0, 0

    if not json_files:
        logger.error(f"未找到JSON文件: {input_folder}")
        return 0, 0

    # A missing config file yields an empty configuration rather than an
    # error, so a missing section or option only surfaces here as KeyError.
    config = read_config()
    try:
        neo4j_settings = config["neo4j"]
        uri = neo4j_settings["uri"]
        user = neo4j_settings["user"]
        password = neo4j_settings["password"]
    except KeyError as e:
        logger.error(f"配置文件缺少Neo4j连接配置: {e}")
        return 0, 0

    if not connect_to_neo4j(uri, user, password):
        return 0, 0

    success_count = 0
    for json_file in json_files:
        logger.info(f"处理文件: {json_file}")
        if create_KG(json_file, ontology_file_path):
            success_count += 1

    logger.info(f"知识图谱构建完成,成功处理 {success_count}/{len(json_files)} 个文件")
    return success_count, len(json_files)
|
||
|
||
|
||
# 修改main函数,使用命令行参数
|
||
def main():
    """Command-line entry point: parse arguments, connect to Neo4j and build the graphs.

    Options:
        --input / -i: input folder or file path (default "final_outputs").
        --ontology / -o: ontology layer file path (default "Ontology_Layer.txt").
        --clear / -c: clear the database before importing.

    Returns early, without importing anything, when the Neo4j settings are
    missing from the configuration, the connection fails, or the requested
    database clean-up fails.
    """
    import argparse

    parser = argparse.ArgumentParser(description="构建知识图谱")
    parser.add_argument("--input", "-i", default="final_outputs", help="输入文件夹或文件路径")
    parser.add_argument("--ontology", "-o", default="Ontology_Layer.txt", help="本体层文件路径")
    parser.add_argument("--clear", "-c", action="store_true", help="是否清空数据库")
    args = parser.parse_args()

    # A missing config file yields an empty configuration rather than an
    # error, so a missing section or option only surfaces here as KeyError.
    config = read_config()
    try:
        neo4j_settings = config["neo4j"]
        uri = neo4j_settings["uri"]
        user = neo4j_settings["user"]
        password = neo4j_settings["password"]
    except KeyError as e:
        logger.error(f"配置文件缺少Neo4j连接配置: {e}")
        return

    if not connect_to_neo4j(uri, user, password):
        return

    # Clearing is opt-in; a failed clean-up aborts the run.
    if args.clear and not clear_database():
        return

    success_count, total_count = create_KGs_from_folder(args.input, args.ontology)
    logger.info(f"知识图谱构建完成,成功处理 {success_count}/{total_count} 个文件")
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the command-line entry point only when executed as a script.
    main()
|