上传代码还原，元数据导出水准

2025-11-08 19:31:37 +08:00
parent 47eb275056
commit 74fbece74b
3 changed files with 618 additions and 62 deletions
--- a/upload_app/insert_all.py
+++ b/upload_app/insert_all.py
@@ -72,7 +72,7 @@ def get_random_ua():
 def scan_all_parquet(root_dir):
    """递归扫描并分类Parquet文件，过滤空文件"""
    classified_files = {data_type: [] for data_type in DATA_TYPE_MAPPING.keys()}
-    
+
    print("递归扫描并分类Parquet文件，过滤空文件", root_dir)
    for root, dirs, files in os.walk(root_dir):
        # 匹配目录关键词
@@ -85,12 +85,12 @@ def scan_all_parquet(root_dir):
        if not matched_data_type:
            print("跳过", root)
            continue
-        
+
        # 匹配文件关键词并过滤空文件
        print("匹配文件关键词并过滤空文件",matched_data_type)
        _, file_keyword, _, _ = DATA_TYPE_MAPPING[matched_data_type]
        print(file_keyword)
-        
+
        for file in files:
            print("检查文件", file)
            if file.endswith(".parquet") and file_keyword in file:
@@ -101,7 +101,7 @@ def scan_all_parquet(root_dir):
                    print(f"[扫描] 有效文件：{file_path}")
                else:
                    print(f"[扫描] 跳过空文件：{file_path}")
-    
+
    # 打印完整扫描结果
    print(f"\n=== 扫描完成（完整统计）===")
    for data_type, paths in classified_files.items():
@@ -116,22 +116,22 @@ def read_parquet_by_type(file_paths, data_type):
    critical_fields = {
        "section": ["section_id", "account_id", "mileage", "work_site"],
        "checkpoint": ["point_id", "section_id", "aname", "burial_date"],
-        "settlement": ["NYID", "point_id", "sjName"],  
+        "settlement": ["NYID", "point_id", "sjName"],
        "level": ["NYID", "linecode", "wsphigh", "createDate"],
        "original": ["NYID", "bfpcode", "mtime", "bfpvalue", "sort"]
    }.get(data_type, [])
-    
+
    for file_path in file_paths:
        try:
            # 读取并处理空值
            df = pd.read_parquet(file_path)
            df = df.fillna("")
            file_basename = os.path.basename(file_path)
-            
+
            # 1. 打印文件实际列名（方便核对字段）
            actual_columns = df.columns.tolist()
            print(f"[读取] {file_basename} 实际列名：{actual_columns}")
-            
+
            # 2. 校验核心字段是否存在
            missing_fields = [f for f in critical_fields if f not in actual_columns]
            if missing_fields:
@@ -143,27 +143,27 @@ def read_parquet_by_type(file_paths, data_type):
                    "original": ["NYID", "bfpcode"]
                }.get(data_type, [])
                missing_core = [f for f in core_relation_fields if f not in actual_columns]
-                
+
                if missing_core:
                    print(f"[读取] {file_basename} 缺失核心关联字段：{missing_core} → 跳过")
                    continue
                else:
                    print(f"[读取] {file_basename} 缺失普通字段：{missing_fields} → 继续处理")
-            
+
            # 3. 转换为字典列表并过滤空记录
            records = df.to_dict("records")
            valid_records = [r for r in records if any(r.values())]  # 过滤全空记录
            if not valid_records:
                print(f"[读取] {file_basename} 无有效记录 → 跳过")
                continue
-            
+
            # 4. 字段格式化（仅处理存在的字段）
            for record in valid_records:
                # 补充必填字段（如account_id）
                if "account_id" in required_supplement and "account_id" not in record:
                    record["account_id"] = DEFAULT_ACCOUNT_ID
                    print(f"[读取] {file_basename} 补充 account_id={DEFAULT_ACCOUNT_ID}")
-                
+
                # 数值型字段强制转换
                if data_type == "section" and "section_id" in record:
                    record["section_id"] = int(record["section_id"]) if str(record["section_id"]).isdigit() else 0
@@ -171,19 +171,19 @@ def read_parquet_by_type(file_paths, data_type):
                    record["point_id"] = int(record["point_id"]) if str(record["point_id"]).isdigit() else 0
                if data_type == "settlement" and "NYID" in record:
                    record["NYID"] = str(record["NYID"])  # 沉降NYID转为字符串
-            
+
            # 5. 累加数据并打印日志
            data_list.extend(valid_records)
            print(f"[读取] {file_basename} 处理完成 → 有效记录：{len(valid_records)}条，累计：{len(data_list)}条")
-        
+
        except Exception as e:
            print(f"[读取] {os.path.basename(file_path)} 读取失败：{str(e)} → 跳过")
            continue
-    
+
    # 沉降数据为空时的提示
    if data_type == "settlement" and not data_list:
        print(f"\n⚠️ 【沉降数据读取异常】未读取到有效数据，请检查文件字段和内容")
-    
+
    print(f"\n=== {data_type} 数据读取总结 ===")
    print(f"  总文件数：{len(file_paths)} 个")
    print(f"  有效记录数：{len(data_list)} 条")
@@ -265,30 +265,30 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
    if not data_list:
        print(f"[入库] 无 {data_type} 数据 → 跳过")
        return True, []
-    
+
    _, _, import_func, _ = DATA_TYPE_MAPPING[data_type]
    total = len(data_list)
    success_flag = True
    success_nyids = []
    total_batches = (total + BATCH_SIZE - 1) // BATCH_SIZE  # 总批次数
-    
+
    # 获取未处理批次范围
    unprocessed_ranges = filter_unprocessed_batches(total_batches, data_type, progress)
    if not unprocessed_ranges:
        print(f"[入库] {data_type} 无待处理批次 → 跳过")
        return True, success_nyids
-    
+
    # 处理未完成批次
    for (batch_start, batch_end) in unprocessed_ranges:
        batch_data = data_list[batch_start:batch_end]
        batch_num = (batch_start // BATCH_SIZE) + 1  # 当前批次号
        batch_len = len(batch_data)
        print(f"\n=== [入库] {data_type} 第 {batch_num} 批（共{total}条，当前{batch_len}条）===")
-        
+
        # 水准数据过滤：仅保留沉降已存在的NYID
        # if data_type == "level" and settlement_nyids is not None:
        #     valid_batch = [
-        #         item for item in batch_data 
+        #         item for item in batch_data
        #         if str(item.get("NYID", "")) in settlement_nyids
        #     ]
        #     invalid_count = batch_len - len(valid_batch)
@@ -302,16 +302,15 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
        #             progress["processed_batches"][data_type].append(batch_num)
        #             save_progress(progress)
        #             continue
-        
+
        # 重试机制
        retry_count = 0
        while retry_count < MAX_RETRY:
            try:
                result = import_func(batch_data)
                print(f"[入库] 第 {batch_num} 批接口返回：{json.dumps(result, ensure_ascii=False, indent=2)}")
-                
+
                # 解析返回结果
-                success = True
                if isinstance(result, tuple):
                    # 处理 (status, msg) 格式
                    status, msg = result
@@ -321,7 +320,7 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
                    # 处理字典格式（code=0或特定消息为成功）
                    if result.get("code") == SUCCESS_CODE or result.get("message") == "批量导入完成":
                        success = True
-                
+
                if success:
                    print(f"[入库] 第 {batch_num} 批成功（{retry_count+1}/{MAX_RETRY}）")
                    # 标记批次为已处理
@@ -333,26 +332,26 @@ def batch_import(data_list, data_type, settlement_nyids=None, progress=None):
                    break
                else:
                    print(f"[入库] 第 {batch_num} 批失败（{retry_count+1}/{MAX_RETRY}）")
-                
+
                # 指数退避重试
                delay = RETRY_DELAY * (retry_count + 1)
                print(f"[入库] 重试延迟 {delay} 秒...")
                time.sleep(delay)
                retry_count += 1
-            
+
            except Exception as e:
                print(f"[入库] 第 {batch_num} 批异常（{retry_count+1}/{MAX_RETRY}）：{str(e)}")
                delay = RETRY_DELAY * (retry_count + 1)
                print(f"[入库] 重试延迟 {delay} 秒...")
                time.sleep(delay)
                retry_count += 1
-        
+
        # 多次重试失败处理
        if retry_count >= MAX_RETRY:
            print(f"\n[入库] 第 {batch_num} 批经 {MAX_RETRY} 次重试仍失败 → 终止该类型入库")
            success_flag = False
            break
-    
+
    return success_flag, success_nyids


@@ -365,21 +364,21 @@ def main():
    print(f"  断点续传：{'开启' if RESUME_ENABLE else '关闭'}（进度文件：{RESUME_PROGRESS_FILE}）")
    print(f"  接口成功标识：code={SUCCESS_CODE}")
    start_time = time.time()
-    
+
    # 加载断点续传进度
    progress = load_progress()
    # 恢复已入库的沉降NYID
    settlement_nyids = set(progress.get("settlement_nyids", []))
    if settlement_nyids:
        print(f"[断点续传] 恢复已入库沉降NYID：{len(settlement_nyids)} 个")
-    
+
    # 1. 扫描所有Parquet文件
    print(f"\n=== 第一步：扫描数据文件 ===")
    classified_files = scan_all_parquet(DATA_ROOT)
    if not any(classified_files.values()):
        print(f"\n❌ 未找到任何有效Parquet文件 → 终止程序")
        return
-    
+
    # 2. 按依赖顺序入库（断面→测点→沉降→水准→原始）
    print(f"\n=== 第二步：按依赖顺序入库 ===")
    data_type_order = [
@@ -389,25 +388,25 @@ def main():
        ("level", "水准数据"),
        ("original", "原始数据")
    ]
-    
+
    for data_type, data_name in data_type_order:
        print(f"\n=====================================")
        print(f"处理【{data_name}】（类型：{data_type}）")
        print(f"=====================================")
-        
+
        # 获取文件路径并过滤已处理文件
        file_paths = classified_files.get(data_type, [])
        unprocessed_files = filter_unprocessed_files(file_paths, data_type, progress)
        if not unprocessed_files:
            print(f"[主逻辑] 【{data_name}】无待处理文件 → 跳过")
            continue
-        
+
        # 读取未处理文件的数据
        data_list = read_parquet_by_type(unprocessed_files, data_type)
        if not data_list:
            print(f"\n❌ 【{data_name}】无有效数据 → 终止程序（后续数据依赖该类型）")
            return
-        
+
        # 批量入库
        print(f"\n[主逻辑] 开始入库：{len(data_list)} 条数据，分 {len(unprocessed_files)} 个文件")
        if data_type == "level":
@@ -420,15 +419,15 @@ def main():
                progress["settlement_nyids"] = list(settlement_nyids)
                save_progress(progress)
                print(f"\n[主逻辑] 沉降数据入库结果：成功 {len(settlement_nyids)} 个NYID（已保存到进度）")
-        
+
        if not success:
            print(f"\n❌ 【{data_name}】入库失败 → 终止后续流程（进度已保存）")
            return
-        
+
        # 标记当前类型所有文件为已处理
        progress["processed_files"][data_type].extend(unprocessed_files)
        save_progress(progress)
-    
+
    # 最终统计
    end_time = time.time()
    elapsed = (end_time - start_time) / 60
@@ -437,7 +436,7 @@ def main():
    print(f"核心成果：")
    print(f"  - 沉降数据：成功入库 {len(settlement_nyids)} 个NYID")
    print(f"  - 所有数据按依赖顺序入库完成，建议后台核对数据完整性")
-    
+
    # 任务完成后删除进度文件（避免下次误读）
    # if RESUME_ENABLE and os.path.exists(RESUME_PROGRESS_FILE):
    #     os.remove(RESUME_PROGRESS_FILE)