上传代码还原,元数据导出水准
This commit is contained in:
@@ -72,7 +72,7 @@ def get_random_ua():
|
|||||||
def scan_all_parquet(root_dir):
|
def scan_all_parquet(root_dir):
|
||||||
"""递归扫描并分类Parquet文件,过滤空文件"""
|
"""递归扫描并分类Parquet文件,过滤空文件"""
|
||||||
classified_files = {data_type: [] for data_type in DATA_TYPE_MAPPING.keys()}
|
classified_files = {data_type: [] for data_type in DATA_TYPE_MAPPING.keys()}
|
||||||
|
|
||||||
print("递归扫描并分类Parquet文件,过滤空文件", root_dir)
|
print("递归扫描并分类Parquet文件,过滤空文件", root_dir)
|
||||||
for root, dirs, files in os.walk(root_dir):
|
for root, dirs, files in os.walk(root_dir):
|
||||||
# 匹配目录关键词
|
# 匹配目录关键词
|
||||||
@@ -85,12 +85,12 @@ def scan_all_parquet(root_dir):
|
|||||||
if not matched_data_type:
|
if not matched_data_type:
|
||||||
print("跳过", root)
|
print("跳过", root)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 匹配文件关键词并过滤空文件
|
# 匹配文件关键词并过滤空文件
|
||||||
print("匹配文件关键词并过滤空文件",matched_data_type)
|
print("匹配文件关键词并过滤空文件",matched_data_type)
|
||||||
_, file_keyword, _, _ = DATA_TYPE_MAPPING[matched_data_type]
|
_, file_keyword, _, _ = DATA_TYPE_MAPPING[matched_data_type]
|
||||||
print(file_keyword)
|
print(file_keyword)
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
print("检查文件", file)
|
print("检查文件", file)
|
||||||
if file.endswith(".parquet") and file_keyword in file:
|
if file.endswith(".parquet") and file_keyword in file:
|
||||||
@@ -101,7 +101,7 @@ def scan_all_parquet(root_dir):
|
|||||||
print(f"[扫描] 有效文件:{file_path}")
|
print(f"[扫描] 有效文件:{file_path}")
|
||||||
else:
|
else:
|
||||||
print(f"[扫描] 跳过空文件:{file_path}")
|
print(f"[扫描] 跳过空文件:{file_path}")
|
||||||
|
|
||||||
# 打印完整扫描结果
|
# 打印完整扫描结果
|
||||||
print(f"\n=== 扫描完成(完整统计)===")
|
print(f"\n=== 扫描完成(完整统计)===")
|
||||||
for data_type, paths in classified_files.items():
|
for data_type, paths in classified_files.items():
|
||||||
@@ -116,22 +116,22 @@ def read_parquet_by_type(file_paths, data_type):
|
|||||||
critical_fields = {
|
critical_fields = {
|
||||||
"section": ["section_id", "account_id", "mileage", "work_site"],
|
"section": ["section_id", "account_id", "mileage", "work_site"],
|
||||||
"checkpoint": ["point_id", "section_id", "aname", "burial_date"],
|
"checkpoint": ["point_id", "section_id", "aname", "burial_date"],
|
||||||
"settlement": ["NYID", "point_id", "sjName"],
|
"settlement": ["NYID", "point_id", "sjName"],
|
||||||
"level": ["NYID", "linecode", "wsphigh", "createDate"],
|
"level": ["NYID", "linecode", "wsphigh", "createDate"],
|
||||||
"original": ["NYID", "bfpcode", "mtime", "bfpvalue", "sort"]
|
"original": ["NYID", "bfpcode", "mtime", "bfpvalue", "sort"]
|
||||||
}.get(data_type, [])
|
}.get(data_type, [])
|
||||||
|
|
||||||
for file_path in file_paths:
|
for file_path in file_paths:
|
||||||
try:
|
try:
|
||||||
# 读取并处理空值
|
# 读取并处理空值
|
||||||
df = pd.read_parquet(file_path)
|
df = pd.read_parquet(file_path)
|
||||||
df = df.fillna("")
|
df = df.fillna("")
|
||||||
file_basename = os.path.basename(file_path)
|
file_basename = os.path.basename(file_path)
|
||||||
|
|
||||||
# 1. 打印文件实际列名(方便核对字段)
|
# 1. 打印文件实际列名(方便核对字段)
|
||||||
actual_columns = df.columns.tolist()
|
actual_columns = df.columns.tolist()
|
||||||
print(f"[读取] {file_basename} 实际列名:{actual_columns}")
|
print(f"[读取] {file_basename} 实际列名:{actual_columns}")
|
||||||
|
|
||||||
# 2. 校验核心字段是否存在
|
# 2. 校验核心字段是否存在
|
||||||
missing_fields = [f for f in critical_fields if f not in actual_columns]
|
missing_fields = [f for f in critical_fields if f not in actual_columns]
|
||||||
if missing_fields:
|
if missing_fields:
|
||||||
@@ -143,27 +143,27 @@ def read_parquet_by_type(file_paths, data_type):
|
|||||||
"original": ["NYID", "bfpcode"]
|
"original": ["NYID", "bfpcode"]
|
||||||
}.get(data_type, [])
|
}.get(data_type, [])
|
||||||
missing_core = [f for f in core_relation_fields if f not in actual_columns]
|
missing_core = [f for f in core_relation_fields if f not in actual_columns]
|
||||||
|
|
||||||
if missing_core:
|
if missing_core:
|
||||||
print(f"[读取] {file_basename} 缺失核心关联字段:{missing_core} → 跳过")
|
print(f"[读取] {file_basename} 缺失核心关联字段:{missing_core} → 跳过")
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
print(f"[读取] {file_basename} 缺失普通字段:{missing_fields} → 继续处理")
|
print(f"[读取] {file_basename} 缺失普通字段:{missing_fields} → 继续处理")
|
||||||
|
|
||||||
# 3. 转换为字典列表并过滤空记录
|
# 3. 转换为字典列表并过滤空记录
|
||||||
records = df.to_dict("records")
|
records = df.to_dict("records")
|
||||||
valid_records = [r for r in records if any(r.values())] # 过滤全空记录
|
valid_records = [r for r in records if any(r.values())] # 过滤全空记录
|
||||||
if not valid_records:
|
if not valid_records:
|
||||||
print(f"[读取] {file_basename} 无有效记录 → 跳过")
|
print(f"[读取] {file_basename} 无有效记录 → 跳过")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 4. 字段格式化(仅处理存在的字段)
|
# 4. 字段格式化(仅处理存在的字段)
|
||||||
for record in valid_records:
|
for record in valid_records:
|
||||||
# 补充必填字段(如account_id)
|
# 补充必填字段(如account_id)
|
||||||
if "account_id" in required_supplement and "account_id" not in record:
|
if "account_id" in required_supplement and "account_id" not in record:
|
||||||
record["account_id"] = DEFAULT_ACCOUNT_ID
|
record["account_id"] = DEFAULT_ACCOUNT_ID
|
||||||
print(f"[读取] {file_basename} 补充 account_id={DEFAULT_ACCOUNT_ID}")
|
print(f"[读取] {file_basename} 补充 account_id={DEFAULT_ACCOUNT_ID}")
|
||||||
|
|
||||||
# 数值型字段强制转换
|
# 数值型字段强制转换
|
||||||
if data_type == "section" and "section_id" in record:
|
if data_type == "section" and "section_id" in record:
|
||||||
record["section_id"] = int(record["section_id"]) if str(record["section_id"]).isdigit() else 0
|
record["section_id"] = int(record["section_id"]) if str(record["section_id"]).isdigit() else 0
|
||||||
@@ -171,19 +171,19 @@ def read_parquet_by_type(file_paths, data_type):
|
|||||||
record["point_id"] = int(record["point_id"]) if str(record["point_id"]).isdigit() else 0
|
record["point_id"] = int(record["point_id"]) if str(record["point_id"]).isdigit() else 0
|
||||||
if data_type == "settlement" and "NYID" in record:
|
if data_type == "settlement" and "NYID" in record:
|
||||||
record["NYID"] = str(record["NYID"]) # 沉降NYID转为字符串
|
record["NYID"] = str(record["NYID"]) # 沉降NYID转为字符串
|
||||||
|
|
||||||
# 5. 累加数据并打印日志
|
# 5. 累加数据并打印日志
|
||||||
data_list.extend(valid_records)
|
data_list.extend(valid_records)
|
||||||
print(f"[读取] {file_basename} 处理完成 → 有效记录:{len(valid_records)}条,累计:{len(data_list)}条")
|
print(f"[读取] {file_basename} 处理完成 → 有效记录:{len(valid_records)}条,累计:{len(data_list)}条")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[读取] {os.path.basename(file_path)} 读取失败:{str(e)} → 跳过")
|
print(f"[读取] {os.path.basename(file_path)} 读取失败:{str(e)} → 跳过")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 沉降数据为空时的提示
|
# 沉降数据为空时的提示
|
||||||
if data_type == "settlement" and not data_list:
|
if data_type == "settlement" and not data_list:
|
||||||
print(f"\n⚠️ 【沉降数据读取异常】未读取到有效数据,请检查文件字段和内容")
|
print(f"\n⚠️ 【沉降数据读取异常】未读取到有效数据,请检查文件字段和内容")
|
||||||
|
|
||||||
print(f"\n=== {data_type} 数据读取总结 ===")
|
print(f"\n=== {data_type} 数据读取总结 ===")
|
||||||
print(f" 总文件数:{len(file_paths)} 个")
|
print(f" 总文件数:{len(file_paths)} 个")
|
||||||
print(f" 有效记录数:{len(data_list)} 条")
|
print(f" 有效记录数:{len(data_list)} 条")
|
||||||
@@ -265,30 +265,30 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
|
|||||||
if not data_list:
|
if not data_list:
|
||||||
print(f"[入库] 无 {data_type} 数据 → 跳过")
|
print(f"[入库] 无 {data_type} 数据 → 跳过")
|
||||||
return True, []
|
return True, []
|
||||||
|
|
||||||
_, _, import_func, _ = DATA_TYPE_MAPPING[data_type]
|
_, _, import_func, _ = DATA_TYPE_MAPPING[data_type]
|
||||||
total = len(data_list)
|
total = len(data_list)
|
||||||
success_flag = True
|
success_flag = True
|
||||||
success_nyids = []
|
success_nyids = []
|
||||||
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE # 总批次数
|
total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE # 总批次数
|
||||||
|
|
||||||
# 获取未处理批次范围
|
# 获取未处理批次范围
|
||||||
unprocessed_ranges = filter_unprocessed_batches(total_batches, data_type, progress)
|
unprocessed_ranges = filter_unprocessed_batches(total_batches, data_type, progress)
|
||||||
if not unprocessed_ranges:
|
if not unprocessed_ranges:
|
||||||
print(f"[入库] {data_type} 无待处理批次 → 跳过")
|
print(f"[入库] {data_type} 无待处理批次 → 跳过")
|
||||||
return True, success_nyids
|
return True, success_nyids
|
||||||
|
|
||||||
# 处理未完成批次
|
# 处理未完成批次
|
||||||
for (batch_start, batch_end) in unprocessed_ranges:
|
for (batch_start, batch_end) in unprocessed_ranges:
|
||||||
batch_data = data_list[batch_start:batch_end]
|
batch_data = data_list[batch_start:batch_end]
|
||||||
batch_num = (batch_start // BATCH_SIZE) + 1 # 当前批次号
|
batch_num = (batch_start // BATCH_SIZE) + 1 # 当前批次号
|
||||||
batch_len = len(batch_data)
|
batch_len = len(batch_data)
|
||||||
print(f"\n=== [入库] {data_type} 第 {batch_num} 批(共{total}条,当前{batch_len}条)===")
|
print(f"\n=== [入库] {data_type} 第 {batch_num} 批(共{total}条,当前{batch_len}条)===")
|
||||||
|
|
||||||
# 水准数据过滤:仅保留沉降已存在的NYID
|
# 水准数据过滤:仅保留沉降已存在的NYID
|
||||||
# if data_type == "level" and settlement_nyids is not None:
|
# if data_type == "level" and settlement_nyids is not None:
|
||||||
# valid_batch = [
|
# valid_batch = [
|
||||||
# item for item in batch_data
|
# item for item in batch_data
|
||||||
# if str(item.get("NYID", "")) in settlement_nyids
|
# if str(item.get("NYID", "")) in settlement_nyids
|
||||||
# ]
|
# ]
|
||||||
# invalid_count = batch_len - len(valid_batch)
|
# invalid_count = batch_len - len(valid_batch)
|
||||||
@@ -302,16 +302,15 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
|
|||||||
# progress["processed_batches"][data_type].append(batch_num)
|
# progress["processed_batches"][data_type].append(batch_num)
|
||||||
# save_progress(progress)
|
# save_progress(progress)
|
||||||
# continue
|
# continue
|
||||||
|
|
||||||
# 重试机制
|
# 重试机制
|
||||||
retry_count = 0
|
retry_count = 0
|
||||||
while retry_count < MAX_RETRY:
|
while retry_count < MAX_RETRY:
|
||||||
try:
|
try:
|
||||||
result = import_func(batch_data)
|
result = import_func(batch_data)
|
||||||
print(f"[入库] 第 {batch_num} 批接口返回:{json.dumps(result, ensure_ascii=False, indent=2)}")
|
print(f"[入库] 第 {batch_num} 批接口返回:{json.dumps(result, ensure_ascii=False, indent=2)}")
|
||||||
|
|
||||||
# 解析返回结果
|
# 解析返回结果
|
||||||
success = True
|
|
||||||
if isinstance(result, tuple):
|
if isinstance(result, tuple):
|
||||||
# 处理 (status, msg) 格式
|
# 处理 (status, msg) 格式
|
||||||
status, msg = result
|
status, msg = result
|
||||||
@@ -321,7 +320,7 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
|
|||||||
# 处理字典格式(code=0或特定消息为成功)
|
# 处理字典格式(code=0或特定消息为成功)
|
||||||
if result.get("code") == SUCCESS_CODE or result.get("message") == "批量导入完成":
|
if result.get("code") == SUCCESS_CODE or result.get("message") == "批量导入完成":
|
||||||
success = True
|
success = True
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
print(f"[入库] 第 {batch_num} 批成功({retry_count+1}/{MAX_RETRY})")
|
print(f"[入库] 第 {batch_num} 批成功({retry_count+1}/{MAX_RETRY})")
|
||||||
# 标记批次为已处理
|
# 标记批次为已处理
|
||||||
@@ -333,26 +332,26 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
|
|||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print(f"[入库] 第 {batch_num} 批失败({retry_count+1}/{MAX_RETRY})")
|
print(f"[入库] 第 {batch_num} 批失败({retry_count+1}/{MAX_RETRY})")
|
||||||
|
|
||||||
# 指数退避重试
|
# 指数退避重试
|
||||||
delay = RETRY_DELAY * (retry_count + 1)
|
delay = RETRY_DELAY * (retry_count + 1)
|
||||||
print(f"[入库] 重试延迟 {delay} 秒...")
|
print(f"[入库] 重试延迟 {delay} 秒...")
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
retry_count += 1
|
retry_count += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[入库] 第 {batch_num} 批异常({retry_count+1}/{MAX_RETRY}):{str(e)}")
|
print(f"[入库] 第 {batch_num} 批异常({retry_count+1}/{MAX_RETRY}):{str(e)}")
|
||||||
delay = RETRY_DELAY * (retry_count + 1)
|
delay = RETRY_DELAY * (retry_count + 1)
|
||||||
print(f"[入库] 重试延迟 {delay} 秒...")
|
print(f"[入库] 重试延迟 {delay} 秒...")
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
retry_count += 1
|
retry_count += 1
|
||||||
|
|
||||||
# 多次重试失败处理
|
# 多次重试失败处理
|
||||||
if retry_count >= MAX_RETRY:
|
if retry_count >= MAX_RETRY:
|
||||||
print(f"\n[入库] 第 {batch_num} 批经 {MAX_RETRY} 次重试仍失败 → 终止该类型入库")
|
print(f"\n[入库] 第 {batch_num} 批经 {MAX_RETRY} 次重试仍失败 → 终止该类型入库")
|
||||||
success_flag = False
|
success_flag = False
|
||||||
break
|
break
|
||||||
|
|
||||||
return success_flag, success_nyids
|
return success_flag, success_nyids
|
||||||
|
|
||||||
|
|
||||||
@@ -365,21 +364,21 @@ def main():
|
|||||||
print(f" 断点续传:{'开启' if RESUME_ENABLE else '关闭'}(进度文件:{RESUME_PROGRESS_FILE})")
|
print(f" 断点续传:{'开启' if RESUME_ENABLE else '关闭'}(进度文件:{RESUME_PROGRESS_FILE})")
|
||||||
print(f" 接口成功标识:code={SUCCESS_CODE}")
|
print(f" 接口成功标识:code={SUCCESS_CODE}")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
# 加载断点续传进度
|
# 加载断点续传进度
|
||||||
progress = load_progress()
|
progress = load_progress()
|
||||||
# 恢复已入库的沉降NYID
|
# 恢复已入库的沉降NYID
|
||||||
settlement_nyids = set(progress.get("settlement_nyids", []))
|
settlement_nyids = set(progress.get("settlement_nyids", []))
|
||||||
if settlement_nyids:
|
if settlement_nyids:
|
||||||
print(f"[断点续传] 恢复已入库沉降NYID:{len(settlement_nyids)} 个")
|
print(f"[断点续传] 恢复已入库沉降NYID:{len(settlement_nyids)} 个")
|
||||||
|
|
||||||
# 1. 扫描所有Parquet文件
|
# 1. 扫描所有Parquet文件
|
||||||
print(f"\n=== 第一步:扫描数据文件 ===")
|
print(f"\n=== 第一步:扫描数据文件 ===")
|
||||||
classified_files = scan_all_parquet(DATA_ROOT)
|
classified_files = scan_all_parquet(DATA_ROOT)
|
||||||
if not any(classified_files.values()):
|
if not any(classified_files.values()):
|
||||||
print(f"\n❌ 未找到任何有效Parquet文件 → 终止程序")
|
print(f"\n❌ 未找到任何有效Parquet文件 → 终止程序")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 2. 按依赖顺序入库(断面→测点→沉降→水准→原始)
|
# 2. 按依赖顺序入库(断面→测点→沉降→水准→原始)
|
||||||
print(f"\n=== 第二步:按依赖顺序入库 ===")
|
print(f"\n=== 第二步:按依赖顺序入库 ===")
|
||||||
data_type_order = [
|
data_type_order = [
|
||||||
@@ -389,25 +388,25 @@ def main():
|
|||||||
("level", "水准数据"),
|
("level", "水准数据"),
|
||||||
("original", "原始数据")
|
("original", "原始数据")
|
||||||
]
|
]
|
||||||
|
|
||||||
for data_type, data_name in data_type_order:
|
for data_type, data_name in data_type_order:
|
||||||
print(f"\n=====================================")
|
print(f"\n=====================================")
|
||||||
print(f"处理【{data_name}】(类型:{data_type})")
|
print(f"处理【{data_name}】(类型:{data_type})")
|
||||||
print(f"=====================================")
|
print(f"=====================================")
|
||||||
|
|
||||||
# 获取文件路径并过滤已处理文件
|
# 获取文件路径并过滤已处理文件
|
||||||
file_paths = classified_files.get(data_type, [])
|
file_paths = classified_files.get(data_type, [])
|
||||||
unprocessed_files = filter_unprocessed_files(file_paths, data_type, progress)
|
unprocessed_files = filter_unprocessed_files(file_paths, data_type, progress)
|
||||||
if not unprocessed_files:
|
if not unprocessed_files:
|
||||||
print(f"[主逻辑] 【{data_name}】无待处理文件 → 跳过")
|
print(f"[主逻辑] 【{data_name}】无待处理文件 → 跳过")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 读取未处理文件的数据
|
# 读取未处理文件的数据
|
||||||
data_list = read_parquet_by_type(unprocessed_files, data_type)
|
data_list = read_parquet_by_type(unprocessed_files, data_type)
|
||||||
if not data_list:
|
if not data_list:
|
||||||
print(f"\n❌ 【{data_name}】无有效数据 → 终止程序(后续数据依赖该类型)")
|
print(f"\n❌ 【{data_name}】无有效数据 → 终止程序(后续数据依赖该类型)")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 批量入库
|
# 批量入库
|
||||||
print(f"\n[主逻辑] 开始入库:{len(data_list)} 条数据,分 {len(unprocessed_files)} 个文件")
|
print(f"\n[主逻辑] 开始入库:{len(data_list)} 条数据,分 {len(unprocessed_files)} 个文件")
|
||||||
if data_type == "level":
|
if data_type == "level":
|
||||||
@@ -420,15 +419,15 @@ def main():
|
|||||||
progress["settlement_nyids"] = list(settlement_nyids)
|
progress["settlement_nyids"] = list(settlement_nyids)
|
||||||
save_progress(progress)
|
save_progress(progress)
|
||||||
print(f"\n[主逻辑] 沉降数据入库结果:成功 {len(settlement_nyids)} 个NYID(已保存到进度)")
|
print(f"\n[主逻辑] 沉降数据入库结果:成功 {len(settlement_nyids)} 个NYID(已保存到进度)")
|
||||||
|
|
||||||
if not success:
|
if not success:
|
||||||
print(f"\n❌ 【{data_name}】入库失败 → 终止后续流程(进度已保存)")
|
print(f"\n❌ 【{data_name}】入库失败 → 终止后续流程(进度已保存)")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 标记当前类型所有文件为已处理
|
# 标记当前类型所有文件为已处理
|
||||||
progress["processed_files"][data_type].extend(unprocessed_files)
|
progress["processed_files"][data_type].extend(unprocessed_files)
|
||||||
save_progress(progress)
|
save_progress(progress)
|
||||||
|
|
||||||
# 最终统计
|
# 最终统计
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
elapsed = (end_time - start_time) / 60
|
elapsed = (end_time - start_time) / 60
|
||||||
@@ -437,7 +436,7 @@ def main():
|
|||||||
print(f"核心成果:")
|
print(f"核心成果:")
|
||||||
print(f" - 沉降数据:成功入库 {len(settlement_nyids)} 个NYID")
|
print(f" - 沉降数据:成功入库 {len(settlement_nyids)} 个NYID")
|
||||||
print(f" - 所有数据按依赖顺序入库完成,建议后台核对数据完整性")
|
print(f" - 所有数据按依赖顺序入库完成,建议后台核对数据完整性")
|
||||||
|
|
||||||
# 任务完成后删除进度文件(避免下次误读)
|
# 任务完成后删除进度文件(避免下次误读)
|
||||||
# if RESUME_ENABLE and os.path.exists(RESUME_PROGRESS_FILE):
|
# if RESUME_ENABLE and os.path.exists(RESUME_PROGRESS_FILE):
|
||||||
# os.remove(RESUME_PROGRESS_FILE)
|
# os.remove(RESUME_PROGRESS_FILE)
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ def save_point_times(point_id, point_times):
|
|||||||
def batch_import_sections(data_list):
|
def batch_import_sections(data_list):
|
||||||
"""批量导入断面数据到指定API"""
|
"""批量导入断面数据到指定API"""
|
||||||
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_sections"
|
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_sections"
|
||||||
|
|
||||||
# 数据格式校验
|
# 数据格式校验
|
||||||
for index, item in enumerate(data_list):
|
for index, item in enumerate(data_list):
|
||||||
# 检查必填字段
|
# 检查必填字段
|
||||||
@@ -44,23 +44,23 @@ def batch_import_sections(data_list):
|
|||||||
for field in required_fields:
|
for field in required_fields:
|
||||||
if field not in item:
|
if field not in item:
|
||||||
return False, f"第{index+1}条数据缺失必填字段:{field}"
|
return False, f"第{index+1}条数据缺失必填字段:{field}"
|
||||||
|
|
||||||
# 校验section_id是否为整数
|
# 校验section_id是否为整数
|
||||||
if not isinstance(item["section_id"], int):
|
if not isinstance(item["section_id"], int):
|
||||||
return False, f"第{index+1}条数据的section_id必须为整数,实际为:{type(item['section_id']).__name__}"
|
return False, f"第{index+1}条数据的section_id必须为整数,实际为:{type(item['section_id']).__name__}"
|
||||||
|
|
||||||
# 校验account_id是否为整数
|
# 校验account_id是否为整数
|
||||||
if not isinstance(item["account_id"], int):
|
if not isinstance(item["account_id"], int):
|
||||||
return False, f"第{index+1}条数据的account_id必须为整数,实际为:{type(item['account_id']).__name__}"
|
return False, f"第{index+1}条数据的account_id必须为整数,实际为:{type(item['account_id']).__name__}"
|
||||||
|
|
||||||
# 校验字符串字段不为空
|
# 校验字符串字段不为空
|
||||||
for str_field in ["mileage", "work_site", "status"]:
|
for str_field in ["mileage", "work_site", "status"]:
|
||||||
if not isinstance(item[str_field], str) or not item[str_field].strip():
|
if not isinstance(item[str_field], str) or not item[str_field].strip():
|
||||||
return False, f"第{index+1}条数据的{str_field}必须为非空字符串"
|
return False, f"第{index+1}条数据的{str_field}必须为非空字符串"
|
||||||
|
|
||||||
# 构建请求体
|
# 构建请求体
|
||||||
payload = json.dumps({"data": data_list})
|
payload = json.dumps({"data": data_list})
|
||||||
|
|
||||||
# 随机选择一个User-Agent
|
# 随机选择一个User-Agent
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': random.choice(USER_AGENTS), # 核心修改:随机选择
|
'User-Agent': random.choice(USER_AGENTS), # 核心修改:随机选择
|
||||||
@@ -90,10 +90,10 @@ def batch_import_sections(data_list):
|
|||||||
def batch_import_checkpoints(data_list):
|
def batch_import_checkpoints(data_list):
|
||||||
"""批量导入检查点数据到指定API"""
|
"""批量导入检查点数据到指定API"""
|
||||||
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_checkpoints"
|
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_checkpoints"
|
||||||
|
|
||||||
# 构建请求体
|
# 构建请求体
|
||||||
payload = json.dumps({"data": data_list})
|
payload = json.dumps({"data": data_list})
|
||||||
|
|
||||||
# 随机选择User-Agent
|
# 随机选择User-Agent
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
||||||
@@ -102,7 +102,7 @@ def batch_import_checkpoints(data_list):
|
|||||||
'Host': 'www.yuxindazhineng.com:3002',
|
'Host': 'www.yuxindazhineng.com:3002',
|
||||||
'Connection': 'keep-alive'
|
'Connection': 'keep-alive'
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -120,12 +120,11 @@ def batch_import_checkpoints(data_list):
|
|||||||
|
|
||||||
# 导入沉降数据
|
# 导入沉降数据
|
||||||
def batch_import_settlement_data(settlement_data_list):
|
def batch_import_settlement_data(settlement_data_list):
|
||||||
return
|
|
||||||
"""批量导入沉降数据到指定API接口"""
|
"""批量导入沉降数据到指定API接口"""
|
||||||
api_url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_settlement_data"
|
api_url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_settlement_data"
|
||||||
|
|
||||||
request_payload = json.dumps({"data": settlement_data_list})
|
request_payload = json.dumps({"data": settlement_data_list})
|
||||||
|
|
||||||
# 随机选择User-Agent
|
# 随机选择User-Agent
|
||||||
request_headers = {
|
request_headers = {
|
||||||
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
||||||
@@ -134,7 +133,7 @@ def batch_import_settlement_data(settlement_data_list):
|
|||||||
'Host': 'www.yuxindazhineng.com:3002',
|
'Host': 'www.yuxindazhineng.com:3002',
|
||||||
'Connection': 'keep-alive'
|
'Connection': 'keep-alive'
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
url=api_url,
|
url=api_url,
|
||||||
@@ -159,9 +158,9 @@ def batch_import_settlement_data(settlement_data_list):
|
|||||||
def batch_import_level_data(data_list):
|
def batch_import_level_data(data_list):
|
||||||
"""批量导入层级数据到指定API"""
|
"""批量导入层级数据到指定API"""
|
||||||
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_level_data"
|
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_level_data"
|
||||||
|
|
||||||
payload = json.dumps({"data": data_list})
|
payload = json.dumps({"data": data_list})
|
||||||
|
|
||||||
# 随机选择User-Agent
|
# 随机选择User-Agent
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
||||||
@@ -170,7 +169,7 @@ def batch_import_level_data(data_list):
|
|||||||
'Host': 'www.yuxindazhineng.com:3002',
|
'Host': 'www.yuxindazhineng.com:3002',
|
||||||
'Connection': 'keep-alive'
|
'Connection': 'keep-alive'
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -188,27 +187,27 @@ def batch_import_level_data(data_list):
|
|||||||
def batch_import_original_data(data_list):
|
def batch_import_original_data(data_list):
|
||||||
"""批量导入原始数据到指定API"""
|
"""批量导入原始数据到指定API"""
|
||||||
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_original_data"
|
url = "http://www.yuxindazhineng.com:3002/api/comprehensive_data/batch_import_original_data"
|
||||||
|
|
||||||
# 校验数据格式
|
# 校验数据格式
|
||||||
for i, item in enumerate(data_list):
|
for i, item in enumerate(data_list):
|
||||||
required_fields = ["bfpcode", "mtime", "bffb", "bfpl", "bfpvalue", "NYID", "sort"]
|
required_fields = ["bfpcode", "mtime", "bffb", "bfpl", "bfpvalue", "NYID", "sort"]
|
||||||
for field in required_fields:
|
for field in required_fields:
|
||||||
if field not in item:
|
if field not in item:
|
||||||
return False, f"第{i+1}条数据缺少必填字段: {field}"
|
return False, f"第{i+1}条数据缺少必填字段: {field}"
|
||||||
|
|
||||||
# 校验mtime格式
|
# 校验mtime格式
|
||||||
mtime = item["mtime"]
|
mtime = item["mtime"]
|
||||||
try:
|
try:
|
||||||
datetime.strptime(mtime, "%Y-%m-%d %H:%M:%S")
|
datetime.strptime(mtime, "%Y-%m-%d %H:%M:%S")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return False, f"第{i+1}条数据的mtime格式错误,应为'YYYY-MM-DD HH:MM:SS',实际值: {mtime}"
|
return False, f"第{i+1}条数据的mtime格式错误,应为'YYYY-MM-DD HH:MM:SS',实际值: {mtime}"
|
||||||
|
|
||||||
# 校验sort是否为整数
|
# 校验sort是否为整数
|
||||||
if not isinstance(item["sort"], int):
|
if not isinstance(item["sort"], int):
|
||||||
return False, f"第{i+1}条数据的sort必须为整数,实际值: {item['sort']}"
|
return False, f"第{i+1}条数据的sort必须为整数,实际值: {item['sort']}"
|
||||||
|
|
||||||
payload = json.dumps({"data": data_list})
|
payload = json.dumps({"data": data_list})
|
||||||
|
|
||||||
# 随机选择User-Agent
|
# 随机选择User-Agent
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
'User-Agent': random.choice(USER_AGENTS), # 核心修改
|
||||||
@@ -218,7 +217,7 @@ def batch_import_original_data(data_list):
|
|||||||
'Host': '127.0.0.1:8000',
|
'Host': '127.0.0.1:8000',
|
||||||
'Connection': 'keep-alive'
|
'Connection': 'keep-alive'
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
response = requests.post(url, headers=headers, data=payload, timeout=60)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|||||||
558
upload_app/process_parquet_to_excel.py
Normal file
558
upload_app/process_parquet_to_excel.py
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Parquet数据处理与Excel导出脚本
|
||||||
|
|
||||||
|
功能:
|
||||||
|
1. 读取upload_app\data路径下全部parquet文件(按文件夹分组)
|
||||||
|
- 支持两层目录结构:主文件夹/中文子文件夹/parquet文件
|
||||||
|
- 自动识别5种数据类型:section_、point_、settlement_、level_、original_
|
||||||
|
2. 关联5种类型数据:断面、观测点、沉降、水准、原始
|
||||||
|
- 数据关联链:断面→观测点→沉降→水准→原始
|
||||||
|
3. 以水准数据为主体整理数据
|
||||||
|
- 拆分的benchmarkids(起始点/终止点)
|
||||||
|
- 收集测点(同一水准线路的所有观测点)
|
||||||
|
- 计算时间范围(原始数据mtime范围)
|
||||||
|
- 格式化日期(YYYY-MM-DD)
|
||||||
|
4. 导出为Excel文件
|
||||||
|
- 每个数据文件夹生成一个Excel文件
|
||||||
|
- 输出列:日期、水准线路、起始点、终止点、测点、起始时间、终止时间、类型
|
||||||
|
|
||||||
|
依赖:
|
||||||
|
- pandas
|
||||||
|
- numpy
|
||||||
|
- openpyxl (用于Excel导出)
|
||||||
|
|
||||||
|
安装依赖:
|
||||||
|
pip install pandas numpy openpyxl
|
||||||
|
|
||||||
|
作者:Claude Code
|
||||||
|
日期:2025-11-08
|
||||||
|
版本:1.0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
|
||||||
|
# ------------------------------ 配置信息 ------------------------------
|
||||||
|
|
||||||
|
# 数据根目录
|
||||||
|
DATA_ROOT = "./data"
|
||||||
|
|
||||||
|
# 输出目录
|
||||||
|
OUTPUT_DIR = "./output"
|
||||||
|
|
||||||
|
# 文件类型映射
|
||||||
|
DATA_TYPE_MAPPING = {
|
||||||
|
"section": {
|
||||||
|
"keyword": "section_",
|
||||||
|
"fields": ["section_id", "account_id", "mileage", "work_site"]
|
||||||
|
},
|
||||||
|
"checkpoint": {
|
||||||
|
"keyword": "point_",
|
||||||
|
"fields": ["point_id", "section_id", "aname", "burial_date"]
|
||||||
|
},
|
||||||
|
"settlement": {
|
||||||
|
"keyword": "settlement_",
|
||||||
|
"fields": ["NYID", "point_id", "sjName"]
|
||||||
|
},
|
||||||
|
"level": {
|
||||||
|
"keyword": "level_",
|
||||||
|
"fields": ["NYID", "linecode", "wsphigh", "createDate"]
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"keyword": "original_",
|
||||||
|
"fields": ["NYID", "bfpcode", "mtime", "bfpvalue", "sort"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# ------------------------------ 工具函数 ------------------------------
|
||||||
|
|
||||||
|
def scan_parquet_files(root_dir):
|
||||||
|
"""递归扫描parquet文件,按文件夹分组(支持两层目录结构)"""
|
||||||
|
folders = {}
|
||||||
|
|
||||||
|
print(f"开始扫描目录: {os.path.abspath(root_dir)}")
|
||||||
|
|
||||||
|
# 获取所有主文件夹(一级目录)
|
||||||
|
for main_folder in os.listdir(root_dir):
|
||||||
|
main_path = os.path.join(root_dir, main_folder)
|
||||||
|
if os.path.isdir(main_path):
|
||||||
|
print(f"\n发现主文件夹: {main_folder}")
|
||||||
|
|
||||||
|
# 初始化数据结构
|
||||||
|
folders[main_folder] = {
|
||||||
|
"path": main_path,
|
||||||
|
"files": {
|
||||||
|
"section": [],
|
||||||
|
"checkpoint": [],
|
||||||
|
"settlement": [],
|
||||||
|
"level": [],
|
||||||
|
"original": []
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# 扫描子文件夹(二级目录)
|
||||||
|
for sub_folder in os.listdir(main_path):
|
||||||
|
sub_path = os.path.join(main_path, sub_folder)
|
||||||
|
if os.path.isdir(sub_path):
|
||||||
|
print(f" 扫描子文件夹: {sub_folder}")
|
||||||
|
|
||||||
|
# 扫描子文件夹内的parquet文件(三级)
|
||||||
|
for file in os.listdir(sub_path):
|
||||||
|
if file.endswith(".parquet"):
|
||||||
|
# 确定文件类型
|
||||||
|
file_type = None
|
||||||
|
for dtype, config in DATA_TYPE_MAPPING.items():
|
||||||
|
if config["keyword"] in file:
|
||||||
|
file_type = dtype
|
||||||
|
break
|
||||||
|
|
||||||
|
if file_type:
|
||||||
|
file_path = os.path.join(sub_path, file)
|
||||||
|
file_size = os.path.getsize(file_path)
|
||||||
|
if file_size > 1024: # 过滤空文件
|
||||||
|
folders[main_folder]["files"][file_type].append(file_path)
|
||||||
|
print(f" 找到 {dtype} 文件: {file}")
|
||||||
|
else:
|
||||||
|
print(f" 跳过空文件: {file}")
|
||||||
|
|
||||||
|
return folders
|
||||||
|
|
||||||
|
|
||||||
|
def read_parquet_files(file_paths, data_type):
|
||||||
|
"""读取parquet文件列表,返回DataFrame"""
|
||||||
|
all_data = []
|
||||||
|
|
||||||
|
if not file_paths:
|
||||||
|
print(f" 无 {data_type} 文件")
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
print(f" 读取 {data_type} 数据,共 {len(file_paths)} 个文件")
|
||||||
|
|
||||||
|
for file_path in file_paths:
|
||||||
|
try:
|
||||||
|
df = pd.read_parquet(file_path)
|
||||||
|
if not df.empty:
|
||||||
|
# 填充空值
|
||||||
|
df = df.fillna("")
|
||||||
|
all_data.append(df)
|
||||||
|
print(f" 读取: {os.path.basename(file_path)} - {len(df)} 条记录")
|
||||||
|
else:
|
||||||
|
print(f" 跳过空文件: {os.path.basename(file_path)}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" 错误: {os.path.basename(file_path)} - {str(e)}")
|
||||||
|
|
||||||
|
if all_data:
|
||||||
|
result = pd.concat(all_data, ignore_index=True)
|
||||||
|
print(f" {data_type} 数据读取完成,共 {len(result)} 条记录")
|
||||||
|
return result
|
||||||
|
else:
|
||||||
|
print(f" {data_type} 无有效数据")
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_benchmarkids(benchmarkids_str):
|
||||||
|
"""
|
||||||
|
解析benchmarkids,拆分为起始点和终止点
|
||||||
|
|
||||||
|
例如: "JM35-1、JMZJWZQ01" -> ("JM35-1", "JMZJWZQ01")
|
||||||
|
|
||||||
|
Args:
|
||||||
|
benchmarkids_str: benchmarkids字符串,格式为 "起始点、终止点"
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (起始点, 终止点)
|
||||||
|
"""
|
||||||
|
if not benchmarkids_str or pd.isna(benchmarkids_str):
|
||||||
|
return "", ""
|
||||||
|
|
||||||
|
# 按"、"拆分
|
||||||
|
parts = str(benchmarkids_str).split("、")
|
||||||
|
start_point = parts[0].strip() if len(parts) > 0 else ""
|
||||||
|
end_point = parts[1].strip() if len(parts) > 1 else ""
|
||||||
|
|
||||||
|
return start_point, end_point
|
||||||
|
|
||||||
|
|
||||||
|
def format_datetime(dt_str):
|
||||||
|
"""格式化时间字符串,从 '2023-09-28 00:15:46' 转为 '2023-09-28'"""
|
||||||
|
if not dt_str or pd.isna(dt_str):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 解析datetime字符串
|
||||||
|
dt = pd.to_datetime(dt_str)
|
||||||
|
# 返回日期部分
|
||||||
|
return dt.strftime("%Y-%m-%d")
|
||||||
|
except:
|
||||||
|
return str(dt_str)
|
||||||
|
|
||||||
|
|
||||||
|
def find_mtime_range(original_data, nyids):
    """Find the earliest and latest mtime among rows whose NYID is in *nyids*.

    Args:
        original_data: DataFrame with at least "NYID" and "mtime" columns.
        nyids: numpy array (or anything exposing ``.size``) of NYID values.

    Returns:
        tuple: (min_time, max_time) formatted as 'YYYY-MM-DD HH:MM:SS',
        or ("", "") when there is no usable data.
    """
    # Use .size instead of truth-testing: `not nyids` raises ValueError for
    # numpy arrays with more than one element.
    if original_data.empty or nyids.size == 0:
        return "", ""

    # Select only the rows belonging to the requested NYIDs.
    filtered = original_data[original_data["NYID"].isin(nyids)]
    if filtered.empty:
        return "", ""

    try:
        # errors="coerce" turns unparseable timestamps into NaT, which are
        # then dropped so they cannot poison min()/max().
        mtimes = pd.to_datetime(filtered["mtime"], errors="coerce").dropna()
        if mtimes.empty:
            return "", ""

        min_time = mtimes.min().strftime("%Y-%m-%d %H:%M:%S")
        max_time = mtimes.max().strftime("%Y-%m-%d %H:%M:%S")
        return min_time, max_time
    except (KeyError, ValueError, TypeError):
        # Narrowed from a bare `except:`; a missing "mtime" column or odd
        # dtype still degrades to "no range" instead of crashing the run,
        # but KeyboardInterrupt/SystemExit now propagate.
        return "", ""
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------ Core processing functions ------------------------------
|
||||||
|
|
||||||
|
def process_folder_data(folder_name, folder_path, files):
    """Process one data folder: load the five parquet datasets, join them
    around the leveling ("水准") data, and build one report row per
    leveling record.

    Args:
        folder_name: display name of the folder (used in log output).
        folder_path: filesystem path of the folder (currently unused here).
        files: dict mapping data-type keys ("section", "checkpoint",
            "settlement", "level", "original") to lists of parquet paths.

    Returns:
        tuple: (result DataFrame with the report columns,
        Series of duplicated NYIDs and their counts — empty if none).
    """
    print(f"\n{'='*60}")
    print(f"处理文件夹: {folder_name}")
    print(f"{'='*60}")

    # Load every data type.
    # NOTE(review): `read_parquet_files` is not defined in this chunk; the
    # scanner section defines `read_parquet_by_type` — confirm this helper
    # name actually exists elsewhere in the file.
    print(f"\n开始读取数据...")
    section_df = read_parquet_files(files["section"], "section")
    checkpoint_df = read_parquet_files(files["checkpoint"], "checkpoint")
    settlement_df = read_parquet_files(files["settlement"], "settlement")
    level_df = read_parquet_files(files["level"], "level")
    original_df = read_parquet_files(files["original"], "original")

    # Check whether raw ("original") data is available; without it the
    # mtime time-range lookup falls back to the leveling createDate.
    has_original = not original_df.empty if isinstance(original_df, pd.DataFrame) else False
    if not has_original:
        print(f" 警告: {folder_name} 无原始数据,时间范围功能将受限")

    # Accumulator for the output rows.
    result_data = []

    # Leveling data drives the whole join; nothing to do without it.
    if level_df.empty:
        print(f" 警告: {folder_name} 无水准数据,跳过")
        return pd.DataFrame(), pd.Series(dtype=int)  # empty duplicated-NYID Series

    print(f"\n开始处理水准数据...")
    print(f" 水准数据记录数: {len(level_df)}")

    # Inspect the leveling columns; benchmarkids may be absent.
    if not level_df.empty:
        level_columns = level_df.columns.tolist()
        print(f" 水准数据实际列名: {level_columns}")
        if "benchmarkids" not in level_columns:
            print(f" 注意: 未发现benchmarkids字段,起始点/终止点将为空")

    # Detect duplicated NYID (period ID) values in the leveling data.
    print(f"\n 检查NYID期数ID重复...")
    if not level_df.empty:
        nyid_counts = level_df['NYID'].value_counts()
        duplicate_nyids = nyid_counts[nyid_counts > 1]
        if not duplicate_nyids.empty:
            print(f" ⚠️ 发现 {len(duplicate_nyids)} 个重复的NYID:")
            for nyid, count in duplicate_nyids.items():
                print(f" NYID={nyid} 出现 {count} 次")
        else:
            print(f" ✅ 未发现重复的NYID")

    # Progress counters for the main loop.
    total_levels = len(level_df)
    processed_count = 0

    # Data-quality check: each leveling record should yield exactly one
    # output row, so the expected record count equals the input count.
    expected_records = total_levels
    print(f" 预期生成记录数: {expected_records}")
    print(f" 数据质量检验:最终记录数应等于此数字")

    for _, level_row in level_df.iterrows():
        processed_count += 1
        if processed_count % 100 == 0 or processed_count == total_levels:
            print(f" 进度: {processed_count}/{total_levels} ({processed_count*100/total_levels:.1f}%)")

        try:
            nyid = level_row["NYID"]
            linecode = level_row["linecode"]
            createDate = level_row["createDate"]
            benchmarkids = level_row.get("benchmarkids", "")

            # 1. Split benchmarkids into start/end point names.
            #    The field may be missing, in which case both stay empty.
            if benchmarkids:
                start_point, end_point = parse_benchmarkids(benchmarkids)
            else:
                # No benchmarkids field: fall back to empty values.
                start_point = ""
                end_point = ""

            # 2. Normalize createDate to 'YYYY-MM-DD'.
            formatted_date = format_datetime(createDate)

            # 3. Find the settlement rows belonging to this leveling record.
            related_settlements = settlement_df[settlement_df["NYID"] == nyid]

            # Defensive check: make sure we really got a DataFrame.
            if isinstance(related_settlements, pd.DataFrame) and related_settlements.empty:
                print(f" 警告: NYID={nyid} 无对应沉降数据")
                continue

            # 4. Collect all related observation point IDs.
            related_point_ids = related_settlements["point_id"].unique()

            # 5. Map those points to section data to obtain work_site.
            work_site = ""
            # Defensive checks: DataFrames must exist and be non-empty.
            if isinstance(checkpoint_df, pd.DataFrame) and isinstance(section_df, pd.DataFrame):
                if not checkpoint_df.empty and not section_df.empty:
                    # point_id -> section_id via the checkpoint table.
                    related_checkpoints = checkpoint_df[checkpoint_df["point_id"].isin(related_point_ids)]
                    # Defensive check
                    if isinstance(related_checkpoints, pd.DataFrame) and not related_checkpoints.empty:
                        related_section_ids = related_checkpoints["section_id"].unique()
                        # section_id -> work_site via the section table.
                        related_sections = section_df[section_df["section_id"].isin(related_section_ids)]
                        # Defensive check
                        if isinstance(related_sections, pd.DataFrame) and not related_sections.empty:
                            work_sites = related_sections["work_site"].unique()
                            # Use .size: truth-testing a numpy array raises.
                            if work_sites.size > 0:
                                work_site = str(work_sites[0])  # force string
                            else:
                                work_site = ""

            # 6. Gather every observation point touched by the same leveling
            #    line code (all leveling records sharing this linecode).
            same_line_levels = level_df[level_df["linecode"] == linecode]
            same_line_nyids = same_line_levels["NYID"].unique()

            # Settlement rows for all those leveling records.
            all_settlements_same_line = settlement_df[settlement_df["NYID"].isin(same_line_nyids)]

            # Their observation point IDs, sorted and comma-joined.
            all_point_ids = all_settlements_same_line["point_id"].unique()
            point_ids_str = ",".join(map(str, sorted(all_point_ids)))

            # 7. Time range across all NYIDs of the same leveling line,
            #    taken from the raw data's mtime when available.
            if has_original:
                min_mtime, max_mtime = find_mtime_range(original_df, same_line_nyids)
            else:
                # No raw data: span the leveling record's whole day.
                min_mtime = formatted_date + " 00:00:00" if formatted_date else ""
                max_mtime = formatted_date + " 23:59:59" if formatted_date else ""

            # 8. Assemble the output row (column names are user-facing
            #    Chinese report headers and must stay as-is).
            result_row = {
                "日期": formatted_date,
                "水准线路": linecode,
                "起始点": start_point,
                "终止点": end_point,
                "测点": point_ids_str,
                "起始时间": min_mtime,
                "终止时间": max_mtime,
                "类型": work_site
            }

            result_data.append(result_row)

        except Exception as e:
            import traceback
            error_msg = str(e)
            print(f" 错误: 处理水准数据时出错 - {error_msg}")
            # Extra hint for the classic numpy-array truthiness mistake.
            if "truth value of an array" in error_msg:
                print(f" 提示: 可能是使用了错误的布尔判断(应使用 .any() 或 .all())")
            # Show the last traceback line to locate the failure.
            tb_lines = traceback.format_exc().strip().split('\n')
            print(f" 位置: {tb_lines[-1].strip() if tb_lines else '未知'}")
            continue

    result_df = pd.DataFrame(result_data)
    actual_records = len(result_df)
    print(f"\n{folder_name} 处理完成,共生成 {actual_records} 条记录")

    # Data-quality verification: output count should match input count.
    if actual_records == expected_records:
        print(f" ✅ 数据质量检验通过:实际记录数({actual_records}) = 预期记录数({expected_records})")
    else:
        print(f" ⚠️ 数据质量检验警告:")
        print(f" 预期记录数: {expected_records}")
        print(f" 实际记录数: {actual_records}")
        print(f" 差异: {expected_records - actual_records} 条记录")
        print(f" 可能原因:")
        print(f" 1. 某些水准数据无对应的沉降数据")
        print(f" 2. 数据关联过程中出现错误")
        print(f" 3. 数据质量问题")

    # level_df is guaranteed non-empty here (early return above), so
    # duplicate_nyids is always bound; the conditional is belt-and-braces.
    return result_df, duplicate_nyids if not level_df.empty else pd.Series(dtype=int)
|
||||||
|
|
||||||
|
|
||||||
|
def export_to_excel(data_df, folder_name, output_dir=OUTPUT_DIR):
    """Write a DataFrame out as an Excel report.

    Args:
        data_df: DataFrame to export.
        folder_name: folder name, used to build the output file name.
        output_dir: target directory; defaults to the configured OUTPUT_DIR.
    """
    # Nothing to do for an empty frame.
    if data_df.empty:
        print(f" 跳过: 无数据可导出")
        return

    # Make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)

    # Build the report file name from the folder name.
    target_path = os.path.join(output_dir, f"{folder_name}_水准数据报表.xlsx")

    # Write the workbook; any failure is reported rather than raised.
    try:
        with pd.ExcelWriter(target_path, engine='openpyxl') as writer:
            data_df.to_excel(writer, index=False, sheet_name='水准数据')

        print(f" 导出成功: {target_path}")
        print(f" 记录数: {len(data_df)}")
    except Exception as e:
        print(f" 导出失败: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------ Main entry point ------------------------------
|
||||||
|
|
||||||
|
def main():
    """Program entry point.

    Scans the data root for parquet files grouped by folder, processes each
    folder into a leveling report, exports one Excel file per folder, then
    prints global quality and duplicate-NYID summaries.
    """
    print("="*60)
    print("Parquet数据处理与Excel导出程序")
    print("="*60)
    print("\n功能说明:")
    print("1. 读取data目录下所有parquet文件(按文件夹分组)")
    print("2. 关联5种数据:断面、观测点、沉降、水准、原始数据")
    print("3. 以水准数据为主体整理并生成Excel报表")
    print("\n输出列:")
    print("- 日期 (水准数据时间)")
    print("- 水准线路 (linecode)")
    print("- 起始点/终止点 (benchmarkids拆分)")
    print("- 测点 (同一水准线路的观测点集合)")
    print("- 起始时间/终止时间 (原始数据mtime范围)")
    print("- 类型 (work_site)")
    print("\n配置信息:")
    print(f" 数据根目录: {os.path.abspath(DATA_ROOT)}")
    print(f" 输出目录: {os.path.abspath(OUTPUT_DIR)}")
    print(f"\n开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*60)

    # 1. Scan for parquet files, grouped by folder.
    # NOTE(review): `scan_parquet_files` is not defined in this chunk; the
    # scanner section defines `scan_all_parquet` — confirm this helper name
    # exists elsewhere in the file.
    folders = scan_parquet_files(DATA_ROOT)

    if not folders:
        print("\n错误: 未找到任何数据文件夹")
        return

    print(f"\n找到 {len(folders)} 个数据文件夹")

    # Show a per-folder file count for each data type.
    print("\n文件夹文件统计:")
    for folder_name, folder_info in folders.items():
        file_counts = {k: len(v) for k, v in folder_info["files"].items()}
        print(f" {folder_name}:")
        print(f" 断面数据: {file_counts['section']} 个文件")
        print(f" 观测点数据: {file_counts['checkpoint']} 个文件")
        print(f" 沉降数据: {file_counts['settlement']} 个文件")
        print(f" 水准数据: {file_counts['level']} 个文件")
        print(f" 原始数据: {file_counts['original']} 个文件")

    # 2. Process every folder.
    quality_stats = []  # per-folder data-quality statistics
    all_duplicate_nyids = {}  # duplicated NYIDs collected across folders

    for folder_name, folder_info in folders.items():
        try:
            # Build the report rows for this folder.
            result_df, duplicate_nyids = process_folder_data(
                folder_name,
                folder_info["path"],
                folder_info["files"]
            )

            # Remember any duplicated NYIDs for the global summary.
            if not duplicate_nyids.empty:
                all_duplicate_nyids[folder_name] = duplicate_nyids

            # Record quality statistics.
            actual_count = len(result_df) if not result_df.empty else 0
            quality_stats.append({
                "folder": folder_name,
                "actual_records": actual_count
            })

            # Export to Excel when there is anything to export.
            if not result_df.empty:
                export_to_excel(result_df, folder_name)
            else:
                print(f"\n{folder_name}: 无数据可导出")

        except Exception as e:
            # One bad folder must not abort the whole batch.
            print(f"\n错误: 处理文件夹 {folder_name} 时出错 - {str(e)}")
            continue

    # 3. Global data-quality summary.
    if quality_stats:
        print("\n" + "="*60)
        print("全局数据质量统计")
        print("="*60)
        total_records = 0
        for stat in quality_stats:
            print(f"{stat['folder']}: {stat['actual_records']} 条记录")
            total_records += stat['actual_records']
        print(f"\n总计: {total_records} 条记录")
        print("="*60)

    # 4. Duplicate-NYID summary across all folders.
    if all_duplicate_nyids:
        print("\n" + "="*60)
        print("NYID期数ID重复汇总")
        print("="*60)
        total_duplicates = 0
        for folder_name, duplicate_nyids in all_duplicate_nyids.items():
            print(f"\n{folder_name}:")
            for nyid, count in duplicate_nyids.items():
                print(f" NYID={nyid} 出现 {count} 次")
                total_duplicates += (count - 1)  # extra occurrences only
        print(f"\n总计额外重复记录: {total_duplicates} 条")
        print("="*60)
    else:
        print("\n" + "="*60)
        print("NYID期数ID重复检查")
        print("✅ 所有数据集均未发现重复的NYID")
        print("="*60)

    print("\n" + "="*60)
    print("所有任务完成")
    print(f"完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"\n输出目录: {os.path.abspath(OUTPUT_DIR)}")
    print("请查看输出目录中的Excel文件")
    print("="*60)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    main()
    # Post-run hint: the dependencies needed to re-run this script.
    banner = "=" * 60
    for message in (
        "\n" + banner,
        "提示:如需安装依赖,请运行:",
        " pip install pandas numpy openpyxl",
        banner,
    ):
        print(message)
|
||||||
Reference in New Issue
Block a user