280 lines
11 KiB
Python
280 lines
11 KiB
Python
import pandas as pd
|
|
from neo4j import GraphDatabase
|
|
import networkx as nx
|
|
from pyvis.network import Network
|
|
import networkx as nx
|
|
|
|
|
|
|
|
# Bolt URI of the Neo4j server; passed to GraphDatabase.driver by both functions below.
URI = "bolt://10.1.6.34:7687"
|
|
# Credential tuple passed as `auth=` to GraphDatabase.driver (presumably username, password).
# NOTE(review): credentials are hard-coded in source — consider loading them from
# the environment or a config file that is not committed.
AUTH = ("neo4j", "password")
|
|
|
|
# Label and display name of the single root node every module hangs under.
_ROOT_LABEL = "软件"
_ROOT_NAME = "配网D3软件"

# (spreadsheet column, node label) pairs, ordered from the outermost module
# level inwards. Empty cells are skipped, so a row may use any subset of them.
_MODULE_LEVELS = (
    ("一级模块", "页面"),
    ("二级模块", "TAB控件"),
    ("三级模块", "分组控件"),
    ("四级模块", "属性控件"),
)


def _run_write(session, work, *args, **kwargs):
    """Run *work* in a managed write transaction and return its result.

    Uses ``Session.execute_write`` when the installed driver provides it and
    falls back to the older ``Session.write_transaction`` otherwise.
    """
    runner = getattr(session, "execute_write", None)
    if runner is None:
        runner = session.write_transaction
    return runner(work, *args, **kwargs)


def _cell_text(row, column):
    """Return the stripped text of ``row[column]``, or "" if missing or NaN."""
    value = row.get(column, "")
    return "" if pd.isna(value) else str(value).strip()


def _clear_database(tx):
    """Delete every node and relationship in the database."""
    tx.run("MATCH (n) DETACH DELETE n")


def _count_nodes(tx):
    """Return the number of nodes currently stored in the database."""
    return tx.run("MATCH (n) RETURN count(n) as count").single()["count"]


def _merge_node(tx, label, name, properties=None, parent_path=""):
    """MERGE a node and return the unique id stored in its ``name`` property.

    The unique id is ``parent_path + "|" + name`` (or just ``name`` when there
    is no parent path), so equally named nodes under different parents stay
    distinct. The human-readable name is kept in ``display_name`` and
    ``original_name``; the label is mirrored into ``node_type``.
    """
    # Copy so the caller's dict is never modified.
    props = dict(properties or {})
    props["display_name"] = name
    props["original_name"] = name
    if parent_path:
        props["parent_path"] = parent_path
    props["node_type"] = label

    unique_id = f"{parent_path}|{name}" if parent_path else name

    # Labels cannot be supplied as Cypher parameters, hence the interpolation;
    # every label used in this file is a fixed constant.
    query = f"MERGE (n:{label} {{name: $unique_id}}) SET n += $properties"
    tx.run(query, unique_id=unique_id, properties=props)
    return unique_id


def _merge_relationship(tx, start_label, start_name, end_label, end_name, rel_type="包含"):
    """MERGE a ``rel_type`` relationship between two nodes matched by unique id."""
    query = (
        f"MATCH (a:{start_label} {{name: $start_name}}), "
        f"(b:{end_label} {{name: $end_name}}) "
        f"MERGE (a)-[r:{rel_type}]->(b)"
    )
    tx.run(query, start_name=start_name, end_name=end_name)


def _import_rows(tx, rows, root_id):
    """Create the module chain and function node for every row in *rows*.

    Each row is walked from the root downwards: every non-empty module level
    becomes a child of the last non-empty level above it, and the function
    node (if the row names one) is attached to the deepest level present.
    """
    for _, row in rows.iterrows():
        parent_label, parent_id = _ROOT_LABEL, root_id
        path = _ROOT_NAME

        for column, label in _MODULE_LEVELS:
            module_name = _cell_text(row, column)
            if not module_name:
                continue
            node_id = _merge_node(tx, label, module_name, parent_path=path)
            _merge_relationship(tx, parent_label, parent_id, label, node_id)
            parent_label, parent_id = label, node_id
            path = f"{path}|{module_name}"

        function_name = _cell_text(row, "功能名称")
        if function_name:
            function_id = _merge_node(
                tx,
                "功能名称",
                function_name,
                {"描述": _cell_text(row, "功能说明")},
                path,
            )
            _merge_relationship(tx, parent_label, parent_id, "功能名称", function_id)


def create_knowledge_graph(excel_file, batch_size=100):
    """Rebuild the Neo4j knowledge graph from a feature-list workbook.

    WARNING: the database is wiped first (``MATCH (n) DETACH DELETE n``).

    Args:
        excel_file: Path of the .xlsx workbook, read with the openpyxl engine.
            The columns 一级模块 / 二级模块 / 三级模块 / 四级模块 / 功能名称 /
            功能说明 are used; missing or empty cells are skipped.
        batch_size: Number of rows written per transaction.

    Returns:
        None. Progress and errors are reported on stdout.

    Raises:
        Whatever ``pandas.read_excel`` raises for an unreadable workbook.
        Errors raised while talking to Neo4j are caught and printed instead.
    """
    df = pd.read_excel(excel_file, engine="openpyxl")
    driver = GraphDatabase.driver(URI, auth=AUTH)

    try:
        with driver.session() as session:
            # Messages are printed after each managed transaction returns, so a
            # driver-side retry cannot repeat them or print before the commit.
            _run_write(session, _clear_database)
            print("数据库已清空!")

            node_count = _run_write(session, _count_nodes)
            print(f"验证结果: 数据库中剩余节点数量 = {node_count}")
            if node_count == 0:
                print("数据库清空成功!")
            else:
                print(f"警告: 数据库清空不完全,仍有 {node_count} 个节点!")

            root_id = _run_write(session, _merge_node, _ROOT_LABEL, _ROOT_NAME)

            total = len(df)
            for start in range(0, total, batch_size):
                stop = start + batch_size
                # The batch is passed explicitly rather than captured by a
                # closure over the loop variable.
                _run_write(session, _import_rows, df.iloc[start:stop], root_id)
                print(f"已处理 {min(stop, total)}/{total} 条记录")

        print("知识图谱构建完成!")
    except Exception as e:
        # Script-level boundary: report the failure and still close the driver.
        print(f"构建知识图谱时发生错误: {e}")
    finally:
        driver.close()
|
|
|
|
|
|
def export_graph_to_html(output_file="knowledge_graph.html", limit=1000):
    """Export the knowledge graph stored in Neo4j to an interactive HTML file.

    Nodes are read first (at most ``limit`` of them); then relationships whose
    two endpoints are both among those nodes are read (also at most ``limit``)
    and the result is rendered with pyvis.

    Args:
        output_file: Path of the HTML file to write.
        limit: Maximum number of nodes (and of relationships) to export, to
            keep the page responsive in the browser.

    Returns:
        bool: True if the file was written, False if any error occurred (the
        error is printed).
    """
    # Fill colour per node label; labels not listed fall back to grey.
    color_map = {
        "软件": "#FF5733",
        "页面": "#33FF57",
        "页面控件": "#57FF33",
        "TAB控件": "#3357FF",
        "分组控件": "#FF33A8",
        "属性控件": "#33FFF5",
        "功能名称": "#F5FF33",
    }

    driver = None
    try:
        # Cypher's LIMIT needs an integer; coerce so values such as "1000"
        # that worked with string interpolation keep working as a parameter.
        max_rows = int(limit)

        driver = GraphDatabase.driver(URI, auth=AUTH)
        G = nx.DiGraph()

        with driver.session() as session:
            # elementId() is used instead of the deprecated id() function.
            nodes_result = session.run(
                "MATCH (n) "
                "RETURN elementId(n) as id, labels(n) as labels, n.name as name, "
                "n.display_name as display_name, n.original_name as original_name "
                "LIMIT $limit",
                {"limit": max_rows},
            )

            node_ids = []  # ids of the exported nodes, reused to filter relationships
            for record in nodes_result:
                node_id = record["id"]
                node_ids.append(node_id)

                labels = record["labels"]
                node_label = labels[0] if labels else "Unknown"
                # Prefer the display name, then the original name, then the unique id.
                display_name = (
                    record["display_name"] or record["original_name"] or record["name"]
                )

                G.add_node(
                    node_id,
                    label=display_name,
                    title=f"{node_label}: {display_name}",
                    color=color_map.get(node_label, "#CCCCCC"),
                )

            if node_ids:
                edges_result = session.run(
                    """
                    MATCH (a)-[r]->(b)
                    WHERE elementId(a) IN $node_ids AND elementId(b) IN $node_ids
                    RETURN elementId(a) as source, elementId(b) as target, type(r) as type
                    LIMIT $limit
                    """,
                    {"node_ids": node_ids, "limit": max_rows},
                )
                for record in edges_result:
                    G.add_edge(record["source"], record["target"], title=record["type"])

        net = Network(height="800px", width="100%", directed=True, notebook=False)
        net.from_nx(G)

        # Physics layout and interaction options for the vis.js network.
        net.set_options("""
        {
            "physics": {
                "forceAtlas2Based": {
                    "gravitationalConstant": -50,
                    "centralGravity": 0.01,
                    "springLength": 100,
                    "springConstant": 0.1
                },
                "maxVelocity": 50,
                "solver": "forceAtlas2Based",
                "timestep": 0.35,
                "stabilization": {
                    "enabled": true,
                    "iterations": 1000
                }
            },
            "interaction": {
                "navigationButtons": true,
                "keyboard": true
            }
        }
        """)

        net.save_graph(output_file)
        print(f"知识图谱已成功导出为HTML文件: {output_file}")
        return True

    except Exception as e:
        # Callers rely on the boolean result, so report and return False.
        print(f"导出知识图谱时发生错误: {e}")
        return False
    finally:
        # driver stays None if the failure happened before it was created.
        if driver is not None:
            driver.close()
|
|
|
|
|
|
# 创建知识图谱
|
|
# NOTE: this call is at module top level, so it runs whenever the file is executed
# or imported. create_knowledge_graph clears the whole Neo4j database before
# importing the workbook below.
create_knowledge_graph("E:\\文件\\LLM_model\\RAG\\code\\GraphRAG\\data\\博微配网工程计价通D3软件产品功能清单.xlsx")
|
|
|
|
# 导出为HTML文件
|
|
# export_graph_to_html("配网D3软件知识图谱.html")