上传代码

2025-09-08 17:58:02 +08:00
parent be848c3e78
commit f5f26c5cf8
76 changed files with 839039 additions and 2695 deletions
@@ -1,41 +1,102 @@
 import json
 import os
+import chardet


-def convert_json_to_readable(input_file, output_file=None):
+def detect_encoding(file_path):
     """
-    将JSON文件转换为可读格式并保存
-
-    参数:
-        input_file: 输入JSON文件路径
-        output_file: 输出文件路径，如果为None则自动生成
+    Guess the text encoding of a file with chardet.
+
+    The whole file is read in binary mode and passed to ``chardet.detect``.
+    The guessed name is printed and returned exactly as chardet reports it;
+    no Chinese-specific preference is applied in this function.
+
+    Args:
+        file_path: Path of the file to inspect.
+
+    Returns:
+        The ``"encoding"`` entry of chardet's result. The caller in this file
+        treats a falsy value as "no guess".
     """
-    try:
-        # 读取JSON文件
-        with open(input_file, "r", encoding="utf-8") as f:
-            data = json.load(f)
+    with open(file_path, "rb") as fh:
+        payload = fh.read()
+    guess = chardet.detect(payload)["encoding"]
+    print(f"初步检测编码: {guess}")
+    return guess

-        # 如果未指定输出文件，则自动生成
-        if output_file is None:
-            base_name = os.path.splitext(input_file)[0]
-            output_file = f"{base_name}_readable.json"

-        # 以美化格式写入新文件
-        with open(output_file, "w", encoding="utf-8") as f:
-            json.dump(data, f, ensure_ascii=False, indent=4)
+def convert_json_to_readable(input_folder, output_folder=None):
+    """
+    Batch-convert every JSON file in a folder to indented UTF-8 JSON.
+
+    Each file directly inside ``input_folder`` whose name ends in ``.json``
+    (case-insensitive) is decoded with the encoding guessed by
+    ``detect_encoding`` and then, as fallbacks, with utf-8, gb18030, gbk and
+    gb2312. The first encoding that both decodes and parses is used; the data
+    is rewritten as UTF-8 with ``ensure_ascii=False`` and a 4-space indent.
+    Progress, per-file failures and a final summary are printed.
+
+    Args:
+        input_folder: Folder that holds the JSON files to convert.
+        output_folder: Folder that receives the converted files under their
+            original names. Defaults to ``input_folder``, in which case the
+            source files are overwritten in place.
+
+    Returns:
+        None. A missing input folder or a folder without JSON files only
+        prints a message; per-file errors are collected and reported in the
+        summary instead of being raised.
+    """
+    # isdir rather than exists: a path that points at a regular file would
+    # otherwise pass this check and make os.listdir raise further down.
+    if not os.path.isdir(input_folder):
+        print(f"错误：输入文件夹不存在: {input_folder}")
+        return

-        print(f"转换成功！可读格式的文件已保存为: {output_file}")
-        return output_file
-    except Exception as e:
-        print(f"转换过程中出现错误: {str(e)}")
-        return None
+    if output_folder is None:
+        output_folder = input_folder
+    os.makedirs(output_folder, exist_ok=True)
+
+    json_files = [f for f in os.listdir(input_folder) if f.lower().endswith(".json")]
+
+    if not json_files:
+        print(f"警告：在 {input_folder} 中未找到任何 JSON 文件。")
+        return
+
+    processed_count = 0
+    failed_files = []
+
+    # Fallback encodings tried after the detected one, in this order.
+    chinese_encodings = ["utf-8", "gb18030", "gbk", "gb2312"]
+
+    for filename in json_files:
+        input_file = os.path.join(input_folder, filename)
+        output_file = os.path.join(output_folder, filename)
+
+        try:
+            print(f"正在处理: {filename}")
+
+            # Start with chardet's guess, then fall back to the fixed list.
+            detected_encoding = detect_encoding(input_file)
+
+            # Normalise the detected name once so that a guess reported in a
+            # different case (e.g. "GB2312") is not tried a second time.
+            detected = detected_encoding.lower() if detected_encoding else None
+            encodings_to_try = [detected] if detected else []
+            encodings_to_try.extend(enc for enc in chinese_encodings if enc != detected)
+
+            for enc in encodings_to_try:
+                try:
+                    with open(input_file, "r", encoding=enc) as f:
+                        content_str = f.read()
+                    data = json.loads(content_str)
+                    print(f"  使用编码 '{enc}' 成功解码并解析 JSON")
+                    break
+                except UnicodeDecodeError:
+                    print(f"  编码 '{enc}' 解码失败，尝试下一个...")
+                except json.JSONDecodeError as je:
+                    print(f"  编码 '{enc}' 解码成功，但 JSON 格式错误: {je}")
+                except Exception as e:
+                    # e.g. LookupError when the detected codec name is unknown to Python.
+                    print(f"  使用编码 '{enc}' 失败: {e}")
+            else:
+                # Reached only when no encoding hit `break`. Checking the loop
+                # outcome instead of `data is None` keeps a file whose content
+                # is the JSON literal `null` from being reported as a failure.
+                raise ValueError(f"所有编码尝试均失败: {encodings_to_try}")
+
+            # Parsed successfully: save as standard UTF-8 in pretty-printed form.
+            with open(output_file, "w", encoding="utf-8") as f:
+                json.dump(data, f, ensure_ascii=False, indent=4)
+
+            print(f"✅ 转换成功: {output_file}")
+            processed_count += 1
+
+        except Exception as e:
+            # Best-effort batch: record the failure and carry on with the next file.
+            print(f"❌ 转换失败: {filename} -> 错误: {str(e)}")
+            failed_files.append(filename)
+
+    print("\n--- 处理完成 ---")
+    print(f"共处理 {len(json_files)} 个文件，成功 {processed_count} 个。")
+    if failed_files:
+        print(f"失败文件: {failed_files}")


 if __name__ == "__main__":
-    # 指定输入文件路径
-    input_file = (
-        r"E:/文件/LLM_model/RAG/code/Engineering_data_KG-1/equipment_dataset/数据工程/技改/预算/通信线路检修国网.json"
-    )
+    # Folder whose *.json files are converted; the path is relative, so it
+    # resolves against the current working directory.
+    input_folder = r"project2json/outputs/json"

-    # 调用转换函数
-    convert_json_to_readable(input_file)
+    # No output folder is passed, so the converted files are written back
+    # under the same names and overwrite the originals.
+    convert_json_to_readable(input_folder)