131 lines
4.6 KiB
Python
131 lines
4.6 KiB
Python
import os
|
||
import sys
|
||
import shutil
|
||
import argparse
|
||
import uuid
|
||
import traceback
|
||
from pathlib import Path
|
||
import tempfile
|
||
|
||
# 仅保留与本流程相关的依赖
|
||
from transform_expense_preview import process_directory
|
||
from supplement_kg import costsummary_upwards
|
||
from build_kg_ontolo import (
|
||
create_KGs_from_folder,
|
||
connect_to_neo4j,
|
||
read_config,
|
||
)
|
||
|
||
# System temporary directory; per-run session folders are created beneath it.
TEMP_DIR = tempfile.gettempdir()
|
||
# Fixed fallback input path (a directory or a single JSON file), used when no
# path is given on the command line. Fill in your own location here,
# for example: r"D:\\your\\json_dir"
DEFAULT_INPUT_PATH = r"E:\文件\LLM_model\RAG\code\Engineering_data_KG-1\4、模版指标库\test"
|
||
|
||
|
||
def _is_json_file(p: Path) -> bool:
|
||
return p.is_file() and p.suffix.lower() == ".json"
|
||
|
||
|
||
def _prepare_workdir(input_path: Path) -> Path:
|
||
"""
|
||
根据输入路径准备工作目录:
|
||
- 如果输入是目录,直接返回该目录作为工作目录;
|
||
- 如果输入是单个 JSON 文件,则拷贝到临时会话目录并返回该目录;
|
||
"""
|
||
if input_path.is_dir():
|
||
return input_path
|
||
|
||
if _is_json_file(input_path):
|
||
session_id = uuid.uuid4().hex
|
||
work_dir = Path(TEMP_DIR) / f"nobcl2kg_{session_id}"
|
||
work_dir.mkdir(parents=True, exist_ok=True)
|
||
shutil.copy2(str(input_path), str(work_dir / input_path.name))
|
||
return work_dir
|
||
|
||
raise ValueError(f"输入路径既不是目录也不是JSON文件: {input_path}")
|
||
|
||
|
||
def _ensure_dir(p: Path):
|
||
p.mkdir(parents=True, exist_ok=True)
|
||
|
||
|
||
def run_pipeline(input_path: str) -> None:
    """Process the JSON files at *input_path* and create knowledge graphs.

    Steps, as labelled in the progress output: connect to Neo4j (0),
    normalise the JSON structure via ``process_directory`` (1), roll costs
    upwards into a temporary merged directory via ``costsummary_upwards``
    (2), and create the knowledge graphs from that merged directory (3).
    BCL calculation and write-back are not part of this flow.

    The function reports problems on stdout and terminates the process with
    ``sys.exit``: 1 = input path missing, 2 = Neo4j connection failed,
    3 = no JSON files found, 10 = any other exception.
    """
    try:
        source = Path(input_path).expanduser().resolve()
        if not source.exists():
            print(f"输入路径不存在: {source}")
            sys.exit(1)

        # Step 0: open the Neo4j connection with credentials from the config.
        print("[步骤0] 连接 Neo4j 数据库 ...")
        config = read_config()
        uri = config.get("neo4j", "uri")
        user = config.get("neo4j", "user")
        password = config.get("neo4j", "password")
        connected = connect_to_neo4j(uri, user, password)
        if not connected:
            print("连接 Neo4j 失败,请检查配置")
            sys.exit(2)

        # Decide which directory holds the JSON files to work on.
        work_dir = _prepare_workdir(source)
        print(f"工作目录: {work_dir}")

        # Count JSON files recursively; stop early when there are none.
        if work_dir.is_dir():
            json_total = sum(1 for _ in work_dir.rglob("*.json"))
        else:
            json_total = 0
        if not json_total:
            print("未在输入路径中发现任何 JSON 文件,终止处理。")
            sys.exit(3)
        print(f"发现 JSON 文件数量: {json_total}")

        # Step 1: normalise the JSON structure (operates on the work dir).
        print("[步骤1] 规范化 JSON 结构 ...")
        process_directory(str(work_dir))

        # Step 2: upward cost summary, written to a fresh temp merge folder.
        print("[步骤2] 费用向上汇总 ...")
        merged_dir = Path(TEMP_DIR) / f"nobcl2kg_merged_{uuid.uuid4().hex}"
        _ensure_dir(merged_dir)
        summaries = costsummary_upwards(str(work_dir), str(merged_dir))
        # A falsy result (e.g. None or empty) is reported as zero files.
        merged_total = len(summaries) if summaries else 0
        print(f"完成费用汇总文件: {merged_total}")

        # Step 3: build the knowledge graphs straight from the merged output
        # (the BCL-related steps are skipped).
        print("[步骤3] 创建知识图谱(基于汇总结果) ...")
        kg_result = create_KGs_from_folder(str(merged_dir))
        success_count, total_count, deleted_projects = kg_result
        if deleted_projects:
            print(f"注意:已删除 {len(deleted_projects)} 个同名工程:{', '.join(deleted_projects)}")

        print(
            "\n处理完成!\n"
            f"成功创建知识图谱: {success_count}/{total_count}\n"
            f"合并目录: {merged_dir}\n"
        )

    except Exception as exc:
        # sys.exit() above raises SystemExit, which is not an Exception
        # subclass, so the specific exit codes pass through untouched.
        print(f"运行出错: {exc}\n{traceback.format_exc()}")
        sys.exit(10)
|
||
|
||
|
||
def main():
    """Command-line entry point: choose the input path and run the pipeline.

    The optional positional ``PATH`` argument takes precedence; when it is
    omitted (or empty) the module-level ``DEFAULT_INPUT_PATH`` is used. If
    neither provides a path, usage hints are printed and the process exits
    with status 1.
    """
    parser = argparse.ArgumentParser(
        description="从目录或单个JSON文件生成知识图谱(无BCL步骤,无前端)"
    )
    parser.add_argument(
        "path",
        metavar="PATH",
        nargs="?",
        default=None,
        help="包含 JSON 文件的目录,或单个 JSON 文件的路径(可不填,则使用 DEFAULT_INPUT_PATH)",
    )
    cli_args = parser.parse_args()

    # An omitted or empty CLI value falls back to the configured default.
    chosen = cli_args.path or DEFAULT_INPUT_PATH
    if chosen:
        run_pipeline(chosen)
        return

    print("未提供路径,且 DEFAULT_INPUT_PATH 为空。请在脚本顶部设置 DEFAULT_INPUT_PATH,或通过命令行提供路径。")
    print('示例:python nobcl2kg.py "D:\\data\\jsons"')
    sys.exit(1)
|
||
|
||
|
||
# Start the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|