更新代码

2025-10-14 16:13:18 +08:00
parent f5f26c5cf8
commit 0a4dedda1c
230 changed files with 7029 additions and 855114 deletions
@@ -1,3 +1,7 @@
+"""
+统计费用
+"""
+
 import os
 import shutil
 import time
@@ -6,13 +10,13 @@ import traceback
 import uuid
 import random
 import string
-import argparse
+import csv

 # 导入各个步骤需要的函数
 from project2json.project_converter import convert_project_to_json
 from transform_expense_preview import process_directory
 from supplement_kg import costsummary_upwards
-from equipment_calculation.main import bcl_calculate
+from equipment_calculation.main import bcl_calculate, parse_json_content
 from cost_comparison import compare_costs_batch
 import tempfile

@@ -136,9 +140,9 @@ def convert_all_steps(files):

 # ========= 批处理与命令行入口（文件末尾追加） =========
 def create_named_workdirs(output_base_dir: str, base_name: str):
-    """在输出根目录下创建 '<工程名>+<GUID>' 的工作目录结构。"""
+    """在输出根目录下创建 '<工程名>_<GUID>' 的工作目录结构。"""
    guid = uuid.uuid4().hex
-    root_name = f"{base_name}+{guid}"
+    root_name = f"{base_name}_{guid}"
    root = os.path.join(output_base_dir, root_name)
    dirs = {
        "root": root,
@@ -219,14 +223,210 @@ def run_batch_with_io(input_dir: str, output_dir: str) -> int:
    return total_success


+# ========= 新增：嵌套目录遍历与结果抽查/汇总 =========
+def _find_leaf_dirs(root_dir: str):
+    """Return every directory under ``root_dir`` that contains no subdirectories.
+
+    ``root_dir`` itself is part of the result when it has no subdirectories.
+    Paths are returned in ``os.walk`` traversal order and are built on top of
+    ``root_dir`` exactly as ``os.walk`` reports them.
+    """
+    # os.walk yields (path, subdir_names, file_names); an empty subdir list marks a leaf.
+    return [path for path, subdirs, _names in os.walk(root_dir) if not subdirs]
+
+
+def _iter_comparison_results_dirs(bcl_results_dir: str):
+    """Collect every directory named 'comparison_results' below ``bcl_results_dir``.
+
+    The whole tree is walked recursively. Each hit is returned as the walked
+    parent path joined with the directory name, so the results are absolute
+    only when ``bcl_results_dir`` is absolute. Returns a list in ``os.walk``
+    traversal order (empty when nothing matches).
+    """
+    target = "comparison_results"
+    matches = []
+    for parent, subdirs, _names in os.walk(bcl_results_dir):
+        # Names within one directory are unique, so a membership test is enough.
+        if target in subdirs:
+            matches.append(os.path.join(parent, target))
+    return matches
+
+
+def _parse_diff_value_from_line(line: str) -> float | None:
+    """Extract the numeric "difference" cell from one fixed-width report line.
+
+    The column layout is the one this file attributes to
+    cost_comparison.save_comparison_to_txt:
+    - item: 20 characters
+    - reference value: 25 characters
+    - calculated value: 25 characters
+    - difference: 25 characters  <-- the cell read here
+    - original data item: 30 characters
+
+    Returns the cell parsed as a float, or None when the cell is blank or
+    cannot be parsed as a number.
+    """
+    leading_widths = (20, 25, 25)  # item, reference value, calculated value
+    diff_width = 25
+    try:
+        # Slice the fixed-width cell; strip() removes the alignment padding.
+        offset = sum(leading_widths)
+        text = line[offset : offset + diff_width].strip()
+        return float(text) if text else None
+    except Exception:
+        return None
+
+
+def _evaluate_comparison_folder(comp_dir: str, sample_k: int = 10) -> str:
+    """Spot-check the TXT reports in ``comp_dir`` and classify the folder.
+
+    The ``.txt`` files (case-insensitive suffix) are shuffled and the first
+    ``sample_k`` of them are scanned. The first two lines of each file are
+    skipped as header and separator; every remaining line is passed to
+    ``_parse_diff_value_from_line``.
+
+    Args:
+        comp_dir: Directory holding the comparison TXT files.
+        sample_k: Upper bound on the number of files to inspect.
+
+    Returns:
+        "有问题" when the folder has no TXT file, when a sampled file cannot be
+        read or decoded as UTF-8, or when any parsed difference is not zero
+        (NaN included); "正确" otherwise.
+    """
+    txt_files = [os.path.join(comp_dir, f) for f in os.listdir(comp_dir) if f.lower().endswith(".txt")]
+    if not txt_files:
+        # Nothing to verify: flag the folder rather than vouch for it.
+        return "有问题"
+
+    random.shuffle(txt_files)
+
+    # Slicing already caps at the list length, so no explicit min() is needed.
+    for path in txt_files[:sample_k]:
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                for line_no, line in enumerate(f):
+                    if line_no < 2:
+                        # Header row and separator line.
+                        continue
+                    diff_val = _parse_diff_value_from_line(line)
+                    if diff_val is None:
+                        continue
+                    # `!= 0.0` accepts 0.0 and -0.0 but, unlike `abs(x) > 0.0`,
+                    # also rejects NaN, which must not count as "no difference".
+                    if diff_val != 0.0:
+                        return "有问题"
+        except (OSError, UnicodeError):
+            # Unreadable or undecodable report: treat the folder as suspect.
+            return "有问题"
+
+    return "正确"
+
+
+def _append_results_to_csv(records: list[tuple[str, str, str, str]], csv_path: str):
+    """Append result rows to the summary CSV, creating the file when needed.
+
+    Args:
+        records: Rows of (file path, software category, project type, result).
+        csv_path: Destination CSV; opened in append mode as UTF-8.
+
+    The header row is written whenever the file is missing or empty, so a
+    zero-byte file left behind earlier still ends up with column names.
+    """
+    header = ["文件路径", "软件类别", "项目类型", "计算结果"]
+    try:
+        needs_header = os.path.getsize(csv_path) == 0
+    except OSError:
+        # The file does not exist yet (or cannot be stat'ed): start it with a header.
+        needs_header = True
+    # newline="" lets the csv module control line endings itself.
+    with open(csv_path, "a", encoding="utf-8", newline="") as f:
+        writer = csv.writer(f)
+        if needs_header:
+            writer.writerow(header)
+        writer.writerows(records)
+
+
+def run_nested_batch_with_io(
+    input_root: str, output_root: str, sample_k: int = 10, summary_csv_path: str | None = None
+) -> int:
+    """Run the full pipeline on every file in the leaf directories of ``input_root``.
+
+    A leaf directory is one without subdirectories (see ``_find_leaf_dirs``);
+    files that sit in non-leaf directories are not picked up. Each file gets
+    its own work area under ``output_root`` and goes through: copy, JSON
+    conversion, JSON processing, upward cost summary, BCL calculation and cost
+    comparison. Afterwards every ``comparison_results`` folder below the BCL
+    results directory is spot-checked (up to ``sample_k`` TXT files each) and
+    one row per folder is appended to the summary CSV with the columns
+    文件路径, 软件类别, 项目类型, 计算结果. When no such folder exists, a single
+    "有问题" row is written for the BCL results directory.
+
+    Args:
+        input_root: Root of the nested input tree.
+        output_root: Directory that receives the per-file work areas; created
+            when missing.
+        sample_k: Maximum number of TXT files inspected per comparison folder.
+        summary_csv_path: Summary CSV to append to. When None,
+            ``comparison_validation_summary.csv`` under ``output_root`` is used.
+
+    Returns:
+        The number of files whose pipeline finished without raising. A failing
+        file is reported on stdout with its traceback and the batch continues;
+        no CSV row is written for it.
+
+    NOTE(review): the leaf scan walks all of ``input_root``; if ``output_root``
+    is located inside it, work areas from earlier runs would be picked up as
+    input — confirm callers keep the two trees apart.
+    """
+    input_root = os.path.abspath(input_root)
+    output_root = os.path.abspath(output_root)
+    os.makedirs(output_root, exist_ok=True)
+
+    # Settle the CSV location up front. Rows are appended one at a time, so a
+    # crash midway keeps what was already recorded and reruns never overwrite
+    # earlier results.
+    if summary_csv_path is None:
+        summary_csv_path = os.path.join(output_root, "comparison_validation_summary.csv")
+    else:
+        # An explicit path may point into a directory that does not exist yet.
+        csv_dir = os.path.dirname(summary_csv_path)
+        if csv_dir:
+            os.makedirs(csv_dir, exist_ok=True)
+
+    leaf_dirs = _find_leaf_dirs(input_root)
+    if not leaf_dirs:
+        print(f"未找到任何叶子目录: {input_root}")
+        return 0
+
+    print(f"共发现 {len(leaf_dirs)} 个最深层级子文件夹，开始逐一处理……")
+
+    total_success = 0
+
+    for didx, leaf in enumerate(leaf_dirs, start=1):
+        # Sorted because os.listdir order is arbitrary; this keeps runs reproducible.
+        files = sorted(
+            os.path.join(leaf, f) for f in os.listdir(leaf) if os.path.isfile(os.path.join(leaf, f))
+        )
+        if not files:
+            print(f"[{didx}/{len(leaf_dirs)}] 叶子目录无文件，跳过: {leaf}")
+            continue
+
+        print(f"[{didx}/{len(leaf_dirs)}] 处理叶子目录: {leaf}（{len(files)} 个文件）")
+
+        for fidx, file_path in enumerate(files, start=1):
+            base_name = os.path.basename(file_path)
+            stem = os.path.splitext(base_name)[0]
+            print(f"  - ({fidx}/{len(files)}) 处理文件: {base_name}")
+
+            try:
+                # Isolated work area for this project file.
+                fdirs = create_named_workdirs(output_root, stem)
+                upload_dir = fdirs["upload_dir"]
+                json_dir = fdirs["json_dir"]
+                merged_dir = fdirs["merged_dir"]
+                bcl_results_dir = fdirs["bcl_results_dir"]
+                bcl_dir = fdirs["bcl_dir"]
+
+                # Stage the input file in the upload directory.
+                shutil.copy(file_path, os.path.join(upload_dir, base_name))
+
+                # Convert the project to JSON.
+                success, _file_num = convert_project_to_json(upload_dir, json_dir, bcl_dir)
+                if not success:
+                    raise RuntimeError("转换为JSON失败")
+
+                # Post-process the JSON files.
+                process_directory(json_dir)
+
+                # Roll costs upwards into merged_dir.
+                costsummary_upwards(json_dir, merged_dir)
+
+                # BCL calculation.
+                bcl_calculate(merged_dir, bcl_results_dir, bcl_dir_path=bcl_dir)
+
+                # Pick one project JSON for the comparison; sorted so the
+                # choice does not depend on directory listing order.
+                merged_jsons = sorted(
+                    os.path.join(merged_dir, nf) for nf in os.listdir(merged_dir) if nf.lower().endswith(".json")
+                )
+                if not merged_jsons:
+                    raise FileNotFoundError("在 merged_dir 中未找到项目 JSON")
+                project_data_json_path = merged_jsons[0]
+
+                # Cost comparison.
+                compare_costs_batch(bcl_results_dir, project_data_json_path)
+
+                # Software category / project type for the CSV row. This is
+                # best effort: a parse failure is reported but must not discard
+                # the comparison results.
+                try:
+                    category, project_type, _engineering_type = parse_json_content(project_data_json_path)
+                except Exception as pe:
+                    print(f"  ⚠️ 解析软件类别/项目类型失败，记为“未知”: {pe}")
+                    category, project_type = "未知", "未知"
+
+                # Spot-check every comparison_results folder and record each
+                # verdict immediately.
+                comp_dirs = _iter_comparison_results_dirs(bcl_results_dir)
+                if not comp_dirs:
+                    # No comparison output at all counts as a problem.
+                    _append_results_to_csv([(bcl_results_dir, category, project_type, "有问题")], summary_csv_path)
+                else:
+                    for comp_dir in comp_dirs:
+                        parent_dir = os.path.dirname(comp_dir)
+                        result = _evaluate_comparison_folder(comp_dir, sample_k=sample_k)
+                        _append_results_to_csv([(parent_dir, category, project_type, result)], summary_csv_path)
+
+                total_success += 1
+            except Exception as fe:
+                # Per-file boundary: report and move on to the next file.
+                print(f"  ❌ 处理文件 {base_name} 失败: {fe}\n{traceback.format_exc()}")
+
+    print(f"\n📄 抽查结果 CSV 路径: {summary_csv_path}")
+
+    print(f"\n✅ 嵌套目录批处理完成。成功处理工程数: {total_success}")
+    return total_success
+
+
 if __name__ == "__main__":

-    input_dir = r"data/input"  # 请修改为你的实际输入路径
-    output_dir = r"data/output"  # 请修改为你的实际输出路径
+    input_path = r"E:\文件\LLM_model\RAG\code\Engineering_data_KG-1\KG_generation\data\input\uploads"  # Change to the root of your nested input directory tree
+    # NOTE(review): the output root is "data/input", whereas the removed output_dir was "data/output" — confirm this is intended.
+    output_path = r"data/input"  # Change to your output root directory
+    summary_csv = None  # Optional summary CSV path; passed through as summary_csv_path
 
    t0 = time.time()
    try:
-        count = run_batch_with_io(input_dir, output_dir)
+        count = run_nested_batch_with_io(input_path, output_path, sample_k=10, summary_csv_path=summary_csv)
        print(f"\n🎉 ✅ 处理完成，共成功 {count} 个。耗时: {int(time.time() - t0)} 秒")
    except Exception as e:
        print(f"\n❌ 执行失败: {e}\n{traceback.format_exc()}")