上传代码
This commit is contained in:
+89
-28
@@ -1,41 +1,102 @@
|
||||
import json
|
||||
import os
|
||||
import chardet
|
||||
|
||||
|
||||
def detect_encoding(file_path):
    """Guess the text encoding of a file using ``chardet``.

    The file is read in full in binary mode and passed to
    ``chardet.detect``. The guess is echoed to stdout and then returned.
    No encoding is preferred or remapped here; any fallback ordering is
    the caller's responsibility.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of chardet's result. chardet documents
        that this may be ``None`` when it cannot make a guess, so callers
        should treat a falsy value as "unknown".
    """
    with open(file_path, "rb") as f:
        raw_data = f.read()

    result = chardet.detect(raw_data)
    encoding = result["encoding"]
    print(f"初步检测编码: {encoding}")
    return encoding
|
||||
def convert_json_to_readable(input_folder, output_folder=None):
    """Pretty-print every ``*.json`` file of a folder as indented UTF-8.

    Each file is decoded with the encoding guessed by ``detect_encoding``
    first, then with a fixed list of fallbacks (utf-8, gb18030, gbk,
    gb2312). The first encoding that both decodes and parses as JSON wins.
    The parsed value is written under the same file name in
    ``output_folder`` with ``ensure_ascii=False`` and ``indent=4``.

    Failures are reported per file on stdout and do not stop the batch; a
    summary is printed at the end.

    Args:
        input_folder: Directory scanned (non-recursively) for files whose
            name ends with ``.json`` (case-insensitive).
        output_folder: Destination directory, created if missing. When
            ``None`` the input folder is used, i.e. files are overwritten
            in place.

    Returns:
        None.
    """
    # isdir rather than exists: a regular file passes exists() and would
    # then make os.makedirs()/os.listdir() raise instead of being reported.
    if not os.path.isdir(input_folder):
        print(f"错误:输入文件夹不存在: {input_folder}")
        return

    if output_folder is None:
        output_folder = input_folder
    os.makedirs(output_folder, exist_ok=True)

    json_files = [f for f in os.listdir(input_folder) if f.lower().endswith(".json")]

    if not json_files:
        print(f"警告:在 {input_folder} 中未找到任何 JSON 文件。")
        return

    processed_count = 0
    failed_files = []

    # Fallback encodings tried after chardet's guess, in priority order.
    chinese_encodings = ["utf-8", "gb18030", "gbk", "gb2312"]

    for filename in json_files:
        input_file = os.path.join(input_folder, filename)
        output_file = os.path.join(output_folder, filename)

        try:
            print(f"正在处理: {filename}")

            detected_encoding = detect_encoding(input_file)
            encodings_to_try = _candidate_encodings(detected_encoding, chinese_encodings)
            data = _load_json_with_fallback(input_file, encodings_to_try)

            # Always save as UTF-8, keeping non-ASCII characters readable.
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            print(f"✅ 转换成功: {output_file}")
            processed_count += 1

        except Exception as e:
            # Batch boundary: record the failure and continue with the
            # remaining files.
            print(f"❌ 转换失败: {filename} -> 错误: {str(e)}")
            failed_files.append(filename)

    print("\n--- 处理完成 ---")
    print(f"共处理 {len(json_files)} 个文件,成功 {processed_count} 个。")
    if failed_files:
        print(f"失败文件: {failed_files}")


def _candidate_encodings(detected_encoding, fallbacks):
    """Return the ordered, duplicate-free list of encodings to try."""
    candidates = []
    if detected_encoding:
        candidates.append(detected_encoding.lower())
    for enc in fallbacks:
        # Compare case-insensitively against what is already queued so a
        # detector result such as "GB2312" is not attempted twice.
        if enc.lower() not in candidates:
            candidates.append(enc)
    return candidates


def _load_json_with_fallback(input_file, encodings_to_try):
    """Parse ``input_file`` with the first encoding that decodes and parses.

    Returns the parsed JSON value, which may itself be ``None`` for a file
    containing ``null``. Raises ``ValueError`` when every encoding fails.
    """
    for enc in encodings_to_try:
        try:
            with open(input_file, "r", encoding=enc) as f:
                content_str = f.read()
            data = json.loads(content_str)
        except UnicodeDecodeError:
            print(f" 编码 '{enc}' 解码失败,尝试下一个...")
        except json.JSONDecodeError as je:
            print(f" 编码 '{enc}' 解码成功,但 JSON 格式错误: {je}")
        except Exception as e:
            # E.g. LookupError for an encoding name Python does not know.
            print(f" 使用编码 '{enc}' 失败: {e}")
        else:
            print(f" 使用编码 '{enc}' 成功解码并解析 JSON")
            return data

    raise ValueError(f"所有编码尝试均失败: {encodings_to_try}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Folder whose *.json files are converted. No output folder is passed,
    # so the converted files overwrite the originals in place.
    input_folder = r"project2json/outputs/json"

    # Run the batch conversion.
    convert_json_to_readable(input_folder)
|
||||
|
||||
Reference in New Issue
Block a user