"""Convert an indicator-definition XML file into a JSON description file.

Pipeline (see ``main``): read the XML with encoding detection, turn every
``<record>`` into a JSON entry (``xml_to_json``), then post-process the
textual mapping descriptions and write the result to disk
(``parse_indicator_string_to_json``).
"""

import json
import re
import xml.etree.ElementTree as ET

try:
    import chardet
except ImportError:
    # chardet is third-party and only needed by read_xml_as_string; keep the
    # pure conversion helpers importable when it is not installed.
    chardet = None

# Encoding used when detection is unavailable or inconclusive.
_FALLBACK_ENCODING = "utf-8"

# Header of the markdown table emitted for "||"-separated keyword rules.
_RULE_TABLE_HEADER = "| 资源识别规则 | 指标值 |\n|-------|-------|\n"

# Data sources whose parsed keyword list is copied into the output unchanged.
_DIRECT_SOURCES = ("报表指标", "指标库")
# Data sources whose mapping is rendered as a descriptive sentence.
_SCOPE_BASED_SOURCES = (
    "主材单价", "主材参数", "主材数量", "定额参数", "定额数量", "工程费用",
)
# Data sources described relative to a project-division node.
_PROJECT_DIVISION_SOURCES = ("项目划分费用",)

# Prefix inserted before the first "[...]" group, keyed by data source.
_SOURCE_PREFIXES = {
    "定额数量": "【定额】",
    "主材数量": "【主材】",
    "主材单价": "【主材】",
    "设备数量": "【设备】",
}

_FIRST_BRACKET_RE = re.compile(r"\[(.*?)\]")
_INNERMOST_BRACKET_RE = re.compile(r'\[([^\[\]]*)\]')


def read_xml_as_string(file_path):
    """Read ``file_path`` as bytes and decode it with the detected encoding.

    The whole file is fed to ``chardet.detect``. When chardet is not
    installed, or reports no encoding (its ``encoding`` entry can be
    ``None``), UTF-8 is used instead of failing with a ``TypeError``.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    detected = None
    if chardet is not None:
        detected = chardet.detect(raw_data)['encoding']
    return raw_data.decode(detected or _FALLBACK_ENCODING)


def _expand_code_range(code_range):
    """Expand ``PREFIX-a~b`` into ``PREFIX-a`` ... ``PREFIX-b``.

    Anything that does not match that shape (no ``~``, no ``-``, several
    ``~``, non-integer bounds) is returned unchanged as a single code.
    """
    if "~" not in code_range:
        return [code_range]
    try:
        prefix, range_part = code_range.rsplit("-", 1)
        start_str, end_str = range_part.split("~")
        start, end = int(start_str), int(end_str)
    except ValueError:
        # Covers both failed unpacking and failed int() conversion.
        return [code_range]
    return [f"{prefix}-{num}" for num in range(start, end + 1)]


def parse_keyword(keyword, indicator_name):
    """Split a keyword expression into a rule text and the names it mentions.

    Args:
        keyword: The raw keyword expression; ``None`` is treated as ``""``.
        indicator_name: Accepted for interface compatibility; not used.

    Returns:
        A dict with ``"映射规则"`` (the rule text) and ``"指标映射"`` (a list
        of names). Forms are checked in this order:

        * contains ``||``: each ``codes@@value`` part becomes a markdown
          table row; codes are split on ``、`` and ``PREFIX-a~b`` ranges are
          expanded. Parts without ``@@`` are ignored.
        * contains ``/``, ``(`` and ``)``: the text before the first ``/``
          plus the ``+``-separated terms inside the first parentheses.
        * contains ``+``: the ``+``-separated terms.
        * otherwise: the keyword itself.
    """
    if keyword is None:
        keyword = ""

    # Range-table expression ("||" separates rows).
    if "||" in keyword:
        table_rows = []
        all_codes = []
        for part in keyword.split("||"):
            if "@@" not in part:
                continue
            codes_str, value = part.split("@@", 1)
            for code_range in codes_str.split("、"):
                all_codes.extend(_expand_code_range(code_range))
            table_rows.append(f"| {codes_str} | {value} |")
        rule_table = _RULE_TABLE_HEADER + "\n".join(table_rows)
        return {"映射规则": rule_table, "指标映射": all_codes}

    # Quotient with a parenthesised denominator.
    if "/" in keyword and "(" in keyword and ")" in keyword:
        numerator = keyword.split("/")[0].strip()
        denominator_expr = keyword[keyword.find("(") + 1:keyword.find(")")]
        denominator_items = [
            item.strip() for item in denominator_expr.split("+")
        ]
        return {
            "映射规则": keyword,
            "指标映射": [numerator] + denominator_items,
        }

    # Plain sum.
    if "+" in keyword:
        items = [item.strip() for item in keyword.split("+")]
        return {"映射规则": keyword, "指标映射": items}

    # Ordinary keyword.
    return {"映射规则": keyword, "指标映射": [keyword]}


def _record_fields(record):
    """Return the JSON fields copied verbatim from a ``<record>`` element."""
    return {
        "指标名称": record.get("指标名称"),
        "code": "",
        "单位": record.get("单位"),
        "单价类型": record.get("单价类型"),
        "序号": record.get("序号"),
        "提取方式": record.get("提取方式"),
        "指标类型": record.get("指标类型"),
        "数据来源": record.get("数据来源"),
    }


def _scoped_mapping_text(data_sources, scope, names):
    """Describe a lookup restricted to the project division ``scope``."""
    # The last two characters of the data source name the looked-up property.
    return (
        f"从项目划分【{scope}】下所有子孙项目划分中查找名称属于{names}"
        f"的所有{data_sources[-2:]}"
    )


def _unscoped_mapping_text(data_sources, names):
    """Describe a lookup over the whole data source."""
    return f"从【{data_sources}】中获取{names}的属性"


def xml_to_json2(xml_content):
    """Convert ``.//records/record`` elements to a JSON string (older variant).

    Differences from ``xml_to_json``: records whose ``数据来源`` is not one of
    the known sources are dropped, ``项目划分费用`` is rendered like the other
    scope-based sources, the scoped sentence is used whenever the
    ``指标提取范围`` attribute is present (even if empty), and ``指标描述`` is
    the second key of each entry.
    """
    root = ET.fromstring(xml_content)
    described_sources = _SCOPE_BASED_SOURCES + _PROJECT_DIVISION_SOURCES
    result = []

    for record in root.findall('.//records/record'):
        fields = _record_fields(record)
        data_sources = fields["数据来源"]
        if (data_sources not in _DIRECT_SOURCES
                and data_sources not in described_sources):
            continue

        parsed = parse_keyword(record.get("关键字"), fields["指标名称"])
        mapping = parsed["指标映射"]
        if data_sources in described_sources:
            scope = record.get("指标提取范围")
            if scope is not None:
                mapping = _scoped_mapping_text(data_sources, scope, mapping)
            else:
                mapping = _unscoped_mapping_text(data_sources, mapping)

        indicator_name = fields.pop("指标名称")
        result.append({
            "指标名称": indicator_name,
            "指标描述": {
                "指标映射": mapping,
                "映射规则": f"{parsed['映射规则']}"
                if data_sources in described_sources
                else parsed["映射规则"],
            },
            **fields,
        })

    return json.dumps(result, ensure_ascii=False, indent=4)


def xml_to_json(xml_content):
    """Convert ``.//records/record`` elements of ``xml_content`` to JSON text.

    Every record yields one entry. ``指标描述.指标映射`` depends on the
    record's ``数据来源``:

    * direct or unknown sources: the name list from ``parse_keyword``;
    * ``项目划分费用``: a sentence naming the scope node and the indicator;
    * scope-based sources: a scoped sentence when ``指标提取范围`` is
      non-empty, otherwise a whole-source sentence. The name list is
      interpolated with its ``[...]`` list formatting, which
      ``parse_indicator_string_to_json`` later rewrites.

    Returns:
        A JSON array string (``ensure_ascii=False``, ``indent=4``).
    """
    root = ET.fromstring(xml_content)
    result = []

    for record in root.findall('.//records/record'):
        entry = _record_fields(record)
        data_sources = entry["数据来源"]
        indicator_name = entry["指标名称"]
        scope = record.get("指标提取范围")
        parsed = parse_keyword(record.get("关键字"), indicator_name)

        mapping = parsed["指标映射"]
        if data_sources in _PROJECT_DIVISION_SOURCES:
            mapping = (
                f"查找一下项目划分节点【{scope}】下费用预览的【{indicator_name}】"
            )
        elif data_sources in _SCOPE_BASED_SOURCES:
            if scope:
                mapping = _scoped_mapping_text(data_sources, scope, mapping)
            else:
                mapping = _unscoped_mapping_text(data_sources, mapping)

        entry["指标描述"] = {
            "指标映射": mapping,
            "映射规则": parsed["映射规则"],
        }
        result.append(entry)

    return json.dumps(result, ensure_ascii=False, indent=4)


def transform_text(text, input_str="【FFFFF】"):
    """Rewrite the first ``[content]`` in ``text`` as ``input_str【content】``.

    ``text`` is returned unchanged when it has no ``[...]`` group. The
    replacement is built by a function rather than a template string so that
    backslashes in ``content`` or ``input_str`` are inserted literally
    instead of being parsed as ``re.sub`` escapes.
    """
    return _FIRST_BRACKET_RE.sub(
        lambda match: f"{input_str}【{match.group(1)}】", text, count=1
    )


def replace_last_brackets(s):
    """Rewrite the last innermost ``[content]`` in ``s`` as ``【content】``.

    Only groups without nested square brackets are considered; ``s`` is
    returned unchanged when there is none.
    """
    last = None
    for last in _INNERMOST_BRACKET_RE.finditer(s):
        pass
    if last is None:
        return s
    start, end = last.span()
    return s[:start] + f'【{last.group(1)}】' + s[end:]


def parse_indicator_string_to_json(indicator_str: str,
                                   output_path: str = "output.json"):
    """Post-process indicator JSON text and write it to ``output_path``.

    Pass 1, per entry, on ``指标描述.指标映射``:
      * a string containing ``【工程费用】`` has every ``@...`` run up to the
        next ``.`` removed;
      * a list whose first element is a string wrapped in full-width
        parentheses loses that element.

    Pass 2, per entry, by ``数据来源``: string mappings of the sources listed
    in ``_SOURCE_PREFIXES`` get the source prefix inserted before their
    first ``[...]`` group; any other string mapping has its last ``[...]``
    group converted to ``【...】``. Non-string mappings and entries without
    ``指标描述`` are left untouched in this pass.

    Returns:
        The string ``"结果已保存"`` after the file is written, or
        ``{"error": ...}`` when ``indicator_str`` is not valid JSON (nothing
        is written in that case).
    """
    try:
        result = json.loads(indicator_str)
    except json.JSONDecodeError as e:
        return {"error": f"JSON解析失败: {str(e)}"}

    # Pass 1: clean up the raw mappings.
    for item in result:
        description = item.get("指标描述", {})
        mapping = description.get("指标映射")
        if isinstance(mapping, str):
            if "【工程费用】" in mapping:
                description["指标映射"] = re.sub(r"@.*?\.", "", mapping)
        elif isinstance(mapping, list) and mapping:
            first_elem = mapping[0]
            if (isinstance(first_elem, str)
                    and first_elem.startswith("(")
                    and first_elem.endswith(")")):
                del mapping[0]

    # Pass 2: turn list-style "[...]" fragments into "【...】" notation.
    for item in result:
        description = item.get("指标描述")
        if not isinstance(description, dict):
            continue
        mapping = description.get("指标映射")
        if not isinstance(mapping, str):
            # e.g. list mappings of direct/unknown sources: nothing to rewrite.
            continue
        prefix = _SOURCE_PREFIXES.get(item.get("数据来源"))
        if prefix is not None:
            description["指标映射"] = transform_text(mapping, input_str=prefix)
        else:
            description["指标映射"] = replace_last_brackets(mapping)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    return ("结果已保存")


def main():
    """Convert the bundled indicator XML file and write the JSON result."""
    xml_content = read_xml_as_string('./data/主网架空线路造价分析指标.xml')
    json_output = xml_to_json(xml_content)
    parse_indicator_string_to_json(
        json_output, output_path="./data/result6.27.json"
    )
    print("转换完毕!")


if __name__ == "__main__":
    main()