"""Draw a category-balanced sample of work orders from an Excel export.

Rows are grouped by (software, issue type), an equal quota is sampled from
every group, any shortfall is topped up at random from the rows not yet
chosen, and the result is written to a new Excel file.
"""

import math
import random

import pandas as pd

work_order_excel = "data/excel/6万工单记录.xlsx"
output_excel = "data/excel/均衡提取2000条工单.xlsx"
total_target = 2000

# Issue types that form the second level of every bucket.
ISSUE_TYPES = ("基本功能", "高级功能")

# Bucket (software) names, in the order they are reported and sampled.
SOFTWARE_NAMES = (
    "博微配网计价通D3",
    "储能C1软件",
    "西藏计价通Z1",
    "技改检修工程计价通T1",
    "检修清单计价通T1",
    "电力建设计价通软件",
)

# (bucket name, substring required in 产品线, substring required in 产品名称
# or None).  Rules are evaluated top to bottom and the first match wins.
_CLASSIFY_RULES = (
    ("博微配网计价通D3", "博微配网计价通D3", None),
    ("电力建设计价通软件", "博微电力建设计价通软件", None),
    ("储能C1软件", "新能源系列", "博微新型储能电站建设计价通C1软件"),
    ("西藏计价通Z1", "博微西藏计价通Z1", None),
    ("技改检修工程计价通T1", "博微技改检修计价通T1软件", "技改检修计价通T1软件-概预算"),
    ("检修清单计价通T1", "博微技改检修计价通T1软件", "技改检修计价通T1软件-清单"),
)


def _contains(value, needle):
    """Return True when *value* is a string that contains *needle*."""
    # Missing cells arrive as float NaN / None; `in` on those would raise.
    return isinstance(value, str) and needle in value


def classify_software(product_line, product_name):
    """Map a row's product line / product name to a bucket name.

    Returns the bucket name of the first matching rule, or None when the
    product line is missing or no rule matches.
    """
    if not isinstance(product_line, str):
        return None
    for software, line_key, name_key in _CLASSIFY_RULES:
        if line_key not in product_line:
            continue
        if name_key is None or _contains(product_name, name_key):
            return software
    return None


def group_indices(df):
    """Group row positions of *df* by software and issue type.

    Reads the columns 产品线, 产品名称 and 问题类型.

    Returns:
        (buckets, skipped): ``buckets[software][issue_type]`` is a list of
        integer row positions; ``skipped`` counts rows that matched a
        software but whose issue type is not one of ISSUE_TYPES.
    """
    buckets = {name: {kind: [] for kind in ISSUE_TYPES} for name in SOFTWARE_NAMES}
    skipped = 0
    columns = zip(df["产品线"], df["产品名称"], df["问题类型"])
    for position, (product_line, product_name, issue_type) in enumerate(columns):
        software = classify_software(product_line, product_name)
        if software is None:
            continue
        target = buckets[software].get(issue_type) if isinstance(issue_type, str) else None
        if target is None:
            # Unknown or missing issue type: cannot be placed in a category.
            skipped += 1
            continue
        target.append(position)
    return buckets, skipped


def balanced_sample(buckets, target, rng=None):
    """Pick at most *target* items, spread evenly over all categories.

    Args:
        buckets: ``{software: {issue_type: [item, ...]}}`` with hashable,
            globally unique items.
        target: total number of items wanted.
        rng: object providing ``sample(population, k)``; defaults to the
            ``random`` module.

    Returns:
        (selected, counts): the chosen items (category order first, top-up
        items last) and ``counts[software][issue_type]`` giving how many
        items of each category ended up in ``selected``.
    """
    if target < 0:
        raise ValueError("target must be non-negative")
    rng = random if rng is None else rng

    category_count = sum(len(types) for types in buckets.values())
    # Floor (not ceil) so that quota * categories never exceeds the target;
    # the remainder is filled by the top-up step below.
    quota = target // category_count if category_count else 0

    selected = []
    counts = {}
    leftovers = []  # (software, issue_type, item) not chosen in the first pass
    for software, types in buckets.items():
        counts[software] = {}
        for type_name, items in types.items():
            if len(items) <= quota:
                picked = list(items)
            else:
                picked = rng.sample(items, quota)
            counts[software][type_name] = len(picked)
            selected.extend(picked)
            picked_set = set(picked)
            leftovers.extend(
                (software, type_name, item)
                for item in items
                if item not in picked_set
            )

    shortfall = target - len(selected)
    if shortfall > 0 and leftovers:
        if len(leftovers) > shortfall:
            extra = rng.sample(leftovers, shortfall)
        else:
            extra = leftovers
        for software, type_name, item in extra:
            # Keep the per-category report consistent with the final total.
            counts[software][type_name] += 1
            selected.append(item)
    return selected, counts


def main():
    """Read the work-order workbook, sample it and write the result."""
    df = pd.read_excel(work_order_excel)
    buckets, skipped = group_indices(df)

    # Report how many rows fall into every software / issue-type category.
    total_count = sum(
        len(items) for types in buckets.values() for items in types.values()
    )
    print(f"原始数据总量: {total_count}条")
    for software, types in buckets.items():
        print(f"{software}: 基本功能 {len(types['基本功能'])}条, 高级功能 {len(types['高级功能'])}条")
    if skipped:
        print(f"已跳过{skipped}条问题类型无法识别的记录")

    selected, extracted_counts = balanced_sample(buckets, total_target)

    # Report the outcome of the balanced extraction.
    print(f"\n均衡提取后数据总量: {len(selected)}条")
    for software, types in extracted_counts.items():
        print(f"{software}: 基本功能 {types['基本功能']}条, 高级功能 {types['高级功能']}条")

    # Select by position so column dtypes and order are preserved.
    balanced_df = df.iloc[selected]
    balanced_df.to_excel(output_excel, index=False)
    print(f"\n已将均衡提取的{len(selected)}条数据保存至'{output_excel}'")


if __name__ == "__main__":
    main()