# Source: KG_generation/convert_json.py
# Last commit: 0a4dedda1c by chentianrui, message "更新代码" (2025-10-14 16:13:18 +08:00)
# Original file: 103 lines, 3.5 KiB, Python

import json
import os
import chardet
def detect_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and passed to ``chardet.detect``.
    The ``"encoding"`` entry of the detection result is printed and then
    returned unchanged.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Whatever ``chardet`` reports under ``"encoding"``.
        NOTE(review): chardet may report ``None`` when it cannot decide;
        the caller in this file checks truthiness before using the value.
    """
    with open(file_path, "rb") as stream:
        guess = chardet.detect(stream.read())["encoding"]
    print(f"初步检测编码: {guess}")
    return guess
def _candidate_encodings(detected, fallbacks):
    """Return the encodings to try: detected one first, no duplicates."""
    ordered = []
    seen = set()
    names = ([detected] if detected else []) + list(fallbacks)
    for name in names:
        # Detectors may report names in upper case (e.g. "GB2312"), so the
        # comparison is done on the lowercased name to avoid trying the same
        # codec twice.
        key = name.lower()
        if key not in seen:
            seen.add(key)
            ordered.append(key)
    return ordered


def _parse_json_bytes(raw, encodings):
    """Decode *raw* with the first encoding that yields valid JSON.

    Returns the parsed value, which may legitimately be ``None`` for a JSON
    ``null`` document. Raises ``ValueError`` when no encoding works.
    """
    for enc in encodings:
        try:
            text = raw.decode(enc)
        except UnicodeDecodeError:
            print(f" 编码 '{enc}' 解码失败,尝试下一个...")
            continue
        except LookupError as e:
            # The detector reported a codec name Python does not know.
            print(f" 使用编码 '{enc}' 失败: {e}")
            continue
        # json.loads rejects text that starts with a byte-order mark, which
        # is what plain "utf-8" decoding leaves behind for BOM-prefixed files.
        if text.startswith("\ufeff"):
            text = text[1:]
        try:
            data = json.loads(text)
        except json.JSONDecodeError as je:
            print(f" 编码 '{enc}' 解码成功,但 JSON 格式错误: {je}")
            continue
        print(f" 使用编码 '{enc}' 成功解码并解析 JSON")
        return data
    raise ValueError(f"所有编码尝试均失败: {encodings}")


def _write_json_atomically(data, output_file):
    """Write *data* as indented UTF-8 JSON without clobbering on failure."""
    # The output may be the input file itself (in-place conversion), so the
    # result is written to a sibling temp file and only moved over the
    # destination once it has been written completely.
    tmp_file = output_file + ".tmp"
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_file, output_file)
    finally:
        # Only present here if the write or the replace failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def convert_json_to_readable(input_folder, output_folder=None, encoding_detector=None):
    """Rewrite every JSON file in a folder as indented UTF-8 JSON.

    Each ``*.json`` file directly inside *input_folder* is decoded by trying
    the detected encoding first and then utf-8, gb18030, gbk and gb2312. The
    first encoding that both decodes and parses as JSON wins. The parsed
    value is written to *output_folder* under the same file name with
    ``ensure_ascii=False`` and a 4-space indent. Progress, per-file failures
    and a final summary are printed; a failing file never aborts the batch.

    Args:
        input_folder: Folder containing the JSON files to convert.
        output_folder: Destination folder, created if missing. Defaults to
            *input_folder*, i.e. files are converted in place.
        encoding_detector: Optional callable ``path -> encoding name or
            None`` used to guess each file's encoding. Defaults to this
            module's ``detect_encoding``.

    Returns:
        None.
    """
    # isdir (not exists) so that a regular file is reported as a bad folder
    # instead of crashing in os.listdir.
    if not os.path.isdir(input_folder):
        print(f"错误:输入文件夹不存在: {input_folder}")
        return
    if output_folder is None:
        output_folder = input_folder
    os.makedirs(output_folder, exist_ok=True)
    if encoding_detector is None:
        encoding_detector = detect_encoding

    # Sorted for a deterministic processing order; directories that merely
    # end in ".json" are skipped rather than counted as failures.
    json_files = sorted(
        name
        for name in os.listdir(input_folder)
        if name.lower().endswith(".json")
        and os.path.isfile(os.path.join(input_folder, name))
    )
    if not json_files:
        print(f"警告:在 {input_folder} 中未找到任何 JSON 文件。")
        return

    processed_count = 0
    failed_files = []
    # Fallback encodings tried after the detected one (Chinese-friendly).
    chinese_encodings = ("utf-8", "gb18030", "gbk", "gb2312")

    for filename in json_files:
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, filename)
        try:
            print(f"正在处理: {filename}")
            detected_encoding = encoding_detector(input_file)
            encodings_to_try = _candidate_encodings(detected_encoding, chinese_encodings)
            # Read the bytes once and decode them per candidate instead of
            # reopening the file for every encoding.
            with open(input_file, "rb") as f:
                raw = f.read()
            data = _parse_json_bytes(raw, encodings_to_try)
            _write_json_atomically(data, output_file)
            print(f"✅ 转换成功: {output_file}")
            processed_count += 1
        except Exception as e:
            # Per-file boundary: record the failure and keep going.
            print(f"❌ 转换失败: {filename} -> 错误: {str(e)}")
            failed_files.append(filename)

    print(f"\n--- 处理完成 ---")
    print(f"共处理 {len(json_files)} 个文件,成功 {processed_count} 个。")
    if failed_files:
        print(f"失败文件: {failed_files}")
if __name__ == "__main__":
    # Script entry point.
    # Usage: python convert_json.py [input_folder] [output_folder]
    # With no arguments the original default folder is converted in place.
    import sys

    default_input_folder = r"E:\文件\LLM_model\RAG\code\Engineering_data_KG-1\4、模版指标库\test"
    input_folder = sys.argv[1] if len(sys.argv) > 1 else default_input_folder
    output_folder = sys.argv[2] if len(sys.argv) > 2 else None
    convert_json_to_readable(input_folder, output_folder)