Batch import optimization

lhx
2025-11-17 16:14:12 +08:00
parent 502ef50a87
commit 54ac4037d5
3 changed files with 283 additions and 154 deletions


@@ -37,8 +37,10 @@ class CheckpointService(BaseService[Checkpoint]):
     def batch_import_checkpoints(self, db: Session, data: List) -> Dict[str, Any]:
         """
-        Batch import checkpoint data; use the checkpoint ID to detect duplicates and update duplicates instead of inserting them.
-        Check whether the section id exists; if it does not, import nothing at all.
+        Batch import checkpoint data - performance-optimized version.
+        Uses batch queries and batch writes to substantially speed up the import.
+        1. Check whether the section id exists; if it does not, skip that row.
+        2. Use the checkpoint ID to detect duplicates; duplicate rows are skipped, not updated.
+        Supports transaction rollback; retries once on failure.
         """
         import logging
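
The result-dict contract the docstring describes can be exercised like this (a minimal sketch: the method and the row field names come from this diff, while the `SessionLocal` factory and the no-argument `CheckpointService()` construction are assumptions):

    from app.db.session import SessionLocal  # hypothetical session factory

    db = SessionLocal()
    rows = [
        {'point_id': 'CP-001', 'aname': 'Checkpoint 1', 'section_id': 'S-01', 'burial_date': '2025-01-01'},
        {'point_id': 'CP-002', 'aname': 'Checkpoint 2', 'section_id': 'S-99', 'burial_date': '2025-01-02'},
    ]
    result = CheckpointService().batch_import_checkpoints(db, rows)
    if not result['success']:
        for item in result['failed_items']:
            print(item['error'], item['data'].get('point_id'))
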
@@ -49,6 +51,16 @@ class CheckpointService(BaseService[Checkpoint]):
         failed_count = 0
         failed_items = []
+        if total_count == 0:
+            return {
+                'success': False,
+                'message': 'Import data must not be empty',
+                'total_count': 0,
+                'success_count': 0,
+                'failed_count': 0,
+                'failed_items': []
+            }
         for attempt in range(2):  # retry at most once
             try:
                 db.begin()
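
One caveat on the transaction handling shown above: on SQLAlchemy 1.4+ a Session autobegins a transaction as soon as it is used, and `db.begin()` raises InvalidRequestError if one is already active. A sketch of the same retry-once pattern using the context-manager form instead (`_import_once` is a hypothetical helper holding the loop body):

    from sqlalchemy.exc import SQLAlchemyError

    for attempt in range(2):  # retry at most once
        try:
            with db.begin():  # commits on success, rolls back on any exception
                result = _import_once(db, data)
            break
        except SQLAlchemyError:
            if attempt == 1:  # second failure: give up and re-raise
                raise
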
@@ -56,40 +68,112 @@ class CheckpointService(BaseService[Checkpoint]):
                 failed_count = 0
                 failed_items = []
+                # ===== Performance optimization 1: batch-load section data with a single IN query =====
+                # Cast everything to str: the section_id column in the database is VARCHAR
+                section_id_list = list(set(str(item.get('section_id')) for item in data if item.get('section_id')))
+                logger.info(f"Checking {len(section_id_list)} unique section_ids in section data")
+                sections = db.query(SectionData).filter(SectionData.section_id.in_(section_id_list)).all()
+                section_map = {s.section_id: s for s in sections}
+                missing_section_ids = set(section_id_list) - set(section_map.keys())
+                # Record the rows whose section is missing
                 for item_data in data:
-                    try:
-                        # Check whether the section id exists
-                        if not self._check_section_exists(db, item_data.get('section_id')):
-                            logger.error(f"Section {item_data.get('section_id')} not found")
-                            raise Exception(f"Section {item_data.get('section_id')} not found")
-                        checkpoint = self.get_by_point_id(db, item_data.get('point_id'))
-                        if checkpoint:
-                            # Update the existing record
-                            checkpoint.aname = item_data.get('aname')
-                            checkpoint.section_id = item_data.get('section_id')
-                            checkpoint.burial_date = item_data.get('burial_date')
-                            logger.info(f"Updated checkpoint: {item_data.get('point_id')}")
-                        else:
-                            # Insert a new record
-                            checkpoint = Checkpoint(
-                                point_id=item_data.get('point_id'),
-                                aname=item_data.get('aname'),
-                                section_id=item_data.get('section_id'),
-                                burial_date=item_data.get('burial_date'),
-                            )
-                            db.add(checkpoint)
-                            logger.info(f"Created checkpoint: {item_data.get('point_id')}")
-                        success_count += 1
-                    except Exception as e:
+                    section_id = str(item_data.get('section_id'))  # cast to str for the VARCHAR column
+                    if section_id in missing_section_ids:
                         failed_count += 1
                         failed_items.append({
                             'data': item_data,
-                            'error': str(e)
+                            'error': 'Section ID does not exist; insert skipped'
                         })
-                        logger.error(f"Failed to process checkpoint {item_data.get('point_id')}: {str(e)}")
-                        raise e
+                # If every row failed, return immediately
+                if failed_count == total_count:
+                    db.rollback()
+                    return {
+                        'success': False,
+                        'message': 'None of the section IDs exist',
+                        'total_count': total_count,
+                        'success_count': 0,
+                        'failed_count': total_count,
+                        'failed_items': failed_items
+                    }
+                # ===== Performance optimization 2: batch-load existing checkpoints with a single IN query =====
+                # Only rows whose section exists are considered
+                valid_items = [item for item in data if str(item.get('section_id')) not in missing_section_ids]
+                if valid_items:
+                    # Cast everything to str: the point_id column in the database is VARCHAR
+                    point_id_list = list(set(str(item.get('point_id')) for item in valid_items if item.get('point_id')))
+                    existing_checkpoints = db.query(Checkpoint).filter(Checkpoint.point_id.in_(point_id_list)).all()
+                    # Build a lookup table keyed by point_id
+                    existing_map = {
+                        checkpoint.point_id: checkpoint
+                        for checkpoint in existing_checkpoints
+                    }
+                    logger.info(f"Found {len(existing_checkpoints)} existing checkpoints")
+                    # ===== Performance optimization 3: split rows into inserts and skips =====
+                    to_insert = []
+                    for item_data in valid_items:
+                        point_id = str(item_data.get('point_id'))  # cast to str for the VARCHAR column
+                        if point_id in existing_map:
+                            # Row already exists, skip it
+                            logger.info(f"Skipping existing checkpoint: {point_id}")
+                            failed_count += 1
+                            failed_items.append({
+                                'data': item_data,
+                                'error': 'Record already exists; insert skipped'
+                            })
+                        else:
+                            # Queue the row for insertion
+                            to_insert.append(item_data)
+                    # ===== Perform the batch insert =====
+                    if to_insert:
+                        logger.info(f"Inserting {len(to_insert)} new records")
+                        # Insert in batches of 500 to keep each SQL statement short
+                        batch_size = 500
+                        for i in range(0, len(to_insert), batch_size):
+                            batch = to_insert[i:i + batch_size]
+                            try:
+                                checkpoint_list = [
+                                    Checkpoint(
+                                        point_id=str(item.get('point_id')),  # cast to str for VARCHAR
+                                        aname=item.get('aname'),
+                                        section_id=str(item.get('section_id')),  # cast to str for VARCHAR
+                                        burial_date=item.get('burial_date')
+                                    )
+                                    for item in batch
+                                ]
+                                db.add_all(checkpoint_list)
+                                success_count += len(batch)
+                                logger.info(f"Inserted batch {i//batch_size + 1}: {len(batch)} records")
+                            except Exception as e:
+                                failed_count += len(batch)
+                                failed_items.extend([
+                                    {
+                                        'data': item,
+                                        'error': f'Insert failed: {str(e)}'
+                                    }
+                                    for item in batch
+                                ])
+                                logger.error(f"Failed to insert batch: {str(e)}")
+                                raise e
+                # If any rows failed, do not commit the transaction
+                if failed_items:
+                    db.rollback()
+                    return {
+                        'success': False,
+                        'message': f'Batch import failed: {len(failed_items)} records could not be processed',
+                        'total_count': total_count,
+                        'success_count': success_count,
+                        'failed_count': failed_count,
+                        'failed_items': failed_items
+                    }
+                db.commit()
+                logger.info(f"Batch import checkpoints completed. Success: {success_count}, Failed: {failed_count}")