Files
langchain_KG/kg_lab_6.13/xml_to_json.py
T
zoujiwen fbc6c06f2b 上传文件至 kg_lab_6.13
6.24 xml自动转json脚本
2025-06-24 11:36:22 +08:00

118 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import chardet
import xml.etree.ElementTree as ET
import json
def read_xml_as_string(file_path):
    """Read a file and return its content decoded to ``str``.

    The whole file is read as bytes, its encoding is guessed with
    ``chardet.detect``, and the bytes are decoded with that encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()
    # chardet reports an encoding of None when it cannot make a guess
    # (e.g. for empty input); bytes.decode(None) would raise TypeError,
    # so fall back to UTF-8 in that case.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    return raw_data.decode(encoding)
def _expand_code_range(code_range):
    """Expand one code token into the list of individual codes it denotes.

    A token such as ``"YX5-67~69"`` becomes
    ``["YX5-67", "YX5-68", "YX5-69"]``. A token without ``"~"``, or one whose
    range part cannot be parsed (no ``"-"`` separating prefix and range, more
    than one ``"~"``, or non-integer bounds), is returned unchanged as a
    single-element list.
    """
    if "~" not in code_range:
        return [code_range]
    try:
        # Both unpackings raise ValueError on malformed tokens, so they are
        # kept inside the try together with the int() conversions.
        prefix, range_part = code_range.rsplit("-", 1)
        start_str, end_str = range_part.split("~")
        start, end = int(start_str), int(end_str)
    except ValueError:
        return [code_range]
    return [f"{prefix}-{num}" for num in range(start, end + 1)]


def parse_keyword(keyword, indicator_name):
    """Turn a raw keyword string into a mapping rule and a list of mapped items.

    The first matching rule wins:

    1. ``indicator_name == "电压等级"``: rule is ``"1"``, items are ``[keyword]``.
    2. ``keyword`` contains ``"||"``: each ``"||"``-separated part of the form
       ``codes@@value`` contributes one Markdown table row; ``codes`` is split
       on ``"、"`` and every ``prefix-a~b`` token is expanded to the individual
       codes ``prefix-a`` .. ``prefix-b``. Parts without ``"@@"`` are skipped.
    3. ``keyword`` contains ``"/"``, ``"("`` and ``")"``: items are the text
       before the first ``"/"`` followed by the ``"+"``-separated terms found
       between the first ``"("`` and the first ``")"``.
    4. ``keyword`` contains ``"+"``: items are the ``"+"``-separated terms.
    5. Otherwise: items are ``[keyword]``.

    Args:
        keyword: Raw keyword expression; must be a ``str`` unless
            ``indicator_name`` is ``"电压等级"``.
        indicator_name: Name of the indicator the keyword belongs to.

    Returns:
        A dict with the keys ``"映射规则"`` (``str``) and ``"指标映射"`` (``list``).
    """
    # Special case: voltage level.
    if indicator_name == "电压等级":
        return {"映射规则": "1", "指标映射": [keyword]}

    # Range expression: "codes@@value" parts joined by "||".
    if "||" in keyword:
        table_rows = []
        all_codes = []
        for part in keyword.split("||"):
            if "@@" not in part:
                continue
            codes_str, value = part.split("@@", 1)
            for code_range in codes_str.split("、"):
                all_codes.extend(_expand_code_range(code_range))
            table_rows.append(f"| {codes_str} | {value} |")
        rule_table = "| 资源识别规则 | 指标值 |\n|-------|-------|\n" + "\n".join(table_rows)
        return {"映射规则": rule_table, "指标映射": all_codes}

    # Formula: numerator / (term + term + ...).
    if "/" in keyword and "(" in keyword and ")" in keyword:
        numerator = keyword.split("/")[0].strip()
        denominator_expr = keyword[keyword.find("(") + 1:keyword.find(")")]
        denominator_items = [item.strip() for item in denominator_expr.split("+")]
        return {"映射规则": keyword, "指标映射": [numerator] + denominator_items}

    # Plain sum of terms.
    if "+" in keyword:
        return {"映射规则": keyword, "指标映射": [item.strip() for item in keyword.split("+")]}

    # Default: the keyword maps to itself.
    return {"映射规则": keyword, "指标映射": [keyword]}
def xml_to_json(xml_content):
    """Convert indicator records in an XML document to a JSON string.

    Every element matched by ``.//records/record`` yields one entry built from
    its ``指标名称`` and ``关键字`` attributes; the keyword is interpreted by
    ``parse_keyword``.

    Args:
        xml_content: The XML document as text.

    Returns:
        A JSON array string (non-ASCII characters kept, 4-space indent) with
        one object per record, each carrying an empty ``"code"`` field.
    """
    def build_entry(record_elem):
        # One output object per <record> element.
        name = record_elem.get("指标名称")
        mapping = parse_keyword(record_elem.get("关键字"), name)
        return {
            "指标名称": name,
            "指标描述": {
                "指标映射": mapping["指标映射"],
                "映射规则": mapping["映射规则"],
            },
            "code": "",
        }

    tree_root = ET.fromstring(xml_content)
    entries = [build_entry(node) for node in tree_root.findall('.//records/record')]
    return json.dumps(entries, ensure_ascii=False, indent=4)
def parse_indicator_string_to_json(indicator_str: str, output_path: str = "output.json"):
    """Validate a JSON string and write it, re-serialized, to a file.

    Args:
        indicator_str: Text expected to contain a JSON document.
        output_path: Destination file; written as UTF-8 with a 2-space indent
            and non-ASCII characters kept as-is.

    Returns:
        The status string ``"结果已保存"`` after the file has been written, or a
        dict ``{"error": ...}`` describing the failure when ``indicator_str``
        is not valid JSON (nothing is written in that case).
    """
    try:
        payload = json.loads(indicator_str)
    except json.JSONDecodeError as exc:
        return {"error": f"JSON解析失败: {exc!s}"}

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)
    return "结果已保存"
# Script entry: read the indicator XML, convert it to JSON text and save it
# to ./data/result.json.
# NOTE(review): these statements are at module top level, so they also run
# whenever this module is imported, not only when it is executed directly.
xml_content = read_xml_as_string('./data/主网架空线路造价分析指标.xml')
json_output = xml_to_json(xml_content)
# The return value (status string or error dict) is not inspected here, so
# the completion message below is printed regardless of it.
parse_indicator_string_to_json(json_output, output_path= "./data/result.json")
print("转换完毕!")