Batch import optimization

lhx
2025-11-17 16:14:12 +08:00
parent 502ef50a87
commit 54ac4037d5
3 changed files with 283 additions and 154 deletions


@@ -37,8 +37,10 @@ class CheckpointService(BaseService[Checkpoint]):
     def batch_import_checkpoints(self, db: Session, data: List) -> Dict[str, Any]:
         """
-        Batch import checkpoint data; use the checkpoint ID to detect duplicates and update duplicates instead of inserting them.
-        Check whether the section id exists; if it does not, import nothing at all.
+        Batch import checkpoint data - performance-optimized version.
+        Uses batch queries and batch writes to substantially speed up the import.
+        1. Check whether the section id exists; if it does not, skip that row.
+        2. Use the checkpoint ID to detect duplicates; duplicate rows are skipped, not updated.
+        Supports transaction rollback; retries once on failure.
         """
         import logging
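
The result-dict contract the docstring describes can be exercised like this (a minimal sketch: the method and the row field names come from this diff, while the `SessionLocal` factory and the no-argument `CheckpointService()` construction are assumptions):

    from app.db.session import SessionLocal  # hypothetical session factory

    db = SessionLocal()
    rows = [
        {'point_id': 'CP-001', 'aname': 'Checkpoint 1', 'section_id': 'S-01', 'burial_date': '2025-01-01'},
        {'point_id': 'CP-002', 'aname': 'Checkpoint 2', 'section_id': 'S-99', 'burial_date': '2025-01-02'},
    ]
    result = CheckpointService().batch_import_checkpoints(db, rows)
    if not result['success']:
        for item in result['failed_items']:
            print(item['error'], item['data'].get('point_id'))
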
@@ -49,6 +51,16 @@ class CheckpointService(BaseService[Checkpoint]):
         failed_count = 0
         failed_items = []
+        if total_count == 0:
+            return {
+                'success': False,
+                'message': 'Import data must not be empty',
+                'total_count': 0,
+                'success_count': 0,
+                'failed_count': 0,
+                'failed_items': []
+            }
         for attempt in range(2):  # retry at most once
             try:
                 db.begin()
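
One caveat on the transaction handling shown above: on SQLAlchemy 1.4+ a Session autobegins a transaction as soon as it is used, and `db.begin()` raises InvalidRequestError if one is already active. A sketch of the same retry-once pattern using the context-manager form instead (`_import_once` is a hypothetical helper holding the loop body):

    from sqlalchemy.exc import SQLAlchemyError

    for attempt in range(2):  # retry at most once
        try:
            with db.begin():  # commits on success, rolls back on any exception
                result = _import_once(db, data)
            break
        except SQLAlchemyError:
            if attempt == 1:  # second failure: give up and re-raise
                raise
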
@@ -56,40 +68,112 @@ class CheckpointService(BaseService[Checkpoint]):
                 failed_count = 0
                 failed_items = []
+                # ===== Performance optimization 1: batch-load section data with a single IN query =====
+                # Cast everything to str: the section_id column in the database is VARCHAR
+                section_id_list = list(set(str(item.get('section_id')) for item in data if item.get('section_id')))
+                logger.info(f"Checking {len(section_id_list)} unique section_ids in section data")
+                sections = db.query(SectionData).filter(SectionData.section_id.in_(section_id_list)).all()
+                section_map = {s.section_id: s for s in sections}
+                missing_section_ids = set(section_id_list) - set(section_map.keys())
+                # Record the rows whose section is missing
                 for item_data in data:
-                    try:
-                        # Check whether the section id exists
-                        if not self._check_section_exists(db, item_data.get('section_id')):
-                            logger.error(f"Section {item_data.get('section_id')} not found")
-                            raise Exception(f"Section {item_data.get('section_id')} not found")
-                        checkpoint = self.get_by_point_id(db, item_data.get('point_id'))
-                        if checkpoint:
-                            # Update the existing record
-                            checkpoint.aname = item_data.get('aname')
-                            checkpoint.section_id = item_data.get('section_id')
-                            checkpoint.burial_date = item_data.get('burial_date')
-                            logger.info(f"Updated checkpoint: {item_data.get('point_id')}")
-                        else:
-                            # Insert a new record
-                            checkpoint = Checkpoint(
-                                point_id=item_data.get('point_id'),
-                                aname=item_data.get('aname'),
-                                section_id=item_data.get('section_id'),
-                                burial_date=item_data.get('burial_date'),
-                            )
-                            db.add(checkpoint)
-                            logger.info(f"Created checkpoint: {item_data.get('point_id')}")
-                        success_count += 1
-                    except Exception as e:
+                    section_id = str(item_data.get('section_id'))  # cast to str for the VARCHAR column
+                    if section_id in missing_section_ids:
                         failed_count += 1
                         failed_items.append({
                             'data': item_data,
-                            'error': str(e)
+                            'error': 'Section ID does not exist; insert skipped'
                         })
-                        logger.error(f"Failed to process checkpoint {item_data.get('point_id')}: {str(e)}")
-                        raise e
+                # If every row failed, return immediately
+                if failed_count == total_count:
+                    db.rollback()
+                    return {
+                        'success': False,
+                        'message': 'None of the section IDs exist',
+                        'total_count': total_count,
+                        'success_count': 0,
+                        'failed_count': total_count,
+                        'failed_items': failed_items
+                    }
+                # ===== Performance optimization 2: batch-load existing checkpoints with a single IN query =====
+                # Only rows whose section exists are considered
+                valid_items = [item for item in data if str(item.get('section_id')) not in missing_section_ids]
+                if valid_items:
+                    # Cast everything to str: the point_id column in the database is VARCHAR
+                    point_id_list = list(set(str(item.get('point_id')) for item in valid_items if item.get('point_id')))
+                    existing_checkpoints = db.query(Checkpoint).filter(Checkpoint.point_id.in_(point_id_list)).all()
+                    # Build a lookup table keyed by point_id
+                    existing_map = {
+                        checkpoint.point_id: checkpoint
+                        for checkpoint in existing_checkpoints
+                    }
+                    logger.info(f"Found {len(existing_checkpoints)} existing checkpoints")
+                    # ===== Performance optimization 3: split rows into inserts and skips =====
+                    to_insert = []
+                    for item_data in valid_items:
+                        point_id = str(item_data.get('point_id'))  # cast to str for the VARCHAR column
+                        if point_id in existing_map:
+                            # Row already exists, skip it
+                            logger.info(f"Skipping existing checkpoint: {point_id}")
+                            failed_count += 1
+                            failed_items.append({
+                                'data': item_data,
+                                'error': 'Record already exists; insert skipped'
+                            })
+                        else:
+                            # Queue the row for insertion
+                            to_insert.append(item_data)
+                    # ===== Perform the batch insert =====
+                    if to_insert:
+                        logger.info(f"Inserting {len(to_insert)} new records")
+                        # Insert in batches of 500 to keep each SQL statement short
+                        batch_size = 500
+                        for i in range(0, len(to_insert), batch_size):
+                            batch = to_insert[i:i + batch_size]
+                            try:
+                                checkpoint_list = [
+                                    Checkpoint(
+                                        point_id=str(item.get('point_id')),  # cast to str for VARCHAR
+                                        aname=item.get('aname'),
+                                        section_id=str(item.get('section_id')),  # cast to str for VARCHAR
+                                        burial_date=item.get('burial_date')
+                                    )
+                                    for item in batch
+                                ]
+                                db.add_all(checkpoint_list)
+                                success_count += len(batch)
+                                logger.info(f"Inserted batch {i//batch_size + 1}: {len(batch)} records")
+                            except Exception as e:
+                                failed_count += len(batch)
+                                failed_items.extend([
+                                    {
+                                        'data': item,
+                                        'error': f'Insert failed: {str(e)}'
+                                    }
+                                    for item in batch
+                                ])
+                                logger.error(f"Failed to insert batch: {str(e)}")
+                                raise e
+                # If any rows failed, do not commit the transaction
+                if failed_items:
+                    db.rollback()
+                    return {
+                        'success': False,
+                        'message': f'Batch import failed: {len(failed_items)} records could not be processed',
+                        'total_count': total_count,
+                        'success_count': success_count,
+                        'failed_count': failed_count,
+                        'failed_items': failed_items
+                    }
+                db.commit()
+                logger.info(f"Batch import checkpoints completed. Success: {success_count}, Failed: {failed_count}")