""" 数据库连接池监控模块 监控连接池状态、事务执行情况,预防雪崩效应 """ import logging import time import threading from sqlalchemy import create_engine, event from sqlalchemy.engine import Engine from sqlalchemy.pool import QueuePool from typing import Dict, Any, Optional from datetime import datetime, timedelta from .database import engine from .logging_config import get_logger logger = get_logger(__name__) # 全局监控数据 _pool_stats = { 'total_connections': 0, 'checked_in': 0, 'checked_out': 0, 'overflow': 0, 'invalidate_count': 0, 'transactions': [], # 事务统计 'slow_queries': [], # 慢查询 'connection_errors': [], # 连接错误 'peak_connections': 0, 'last_reset': datetime.now() } # 告警配置 _alert_thresholds = { 'pool_usage_percent': 80, # 连接池使用率告警阈值 'slow_query_time': 5.0, # 慢查询阈值(秒) 'max_transaction_time': 30.0, # 最大事务执行时间 'connection_error_count': 10, # 连接错误告警阈值 'alert_cooldown': 300 # 告警冷却时间(秒) } _last_alerts = {} def get_pool_status() -> Dict[str, Any]: """获取连接池状态""" if hasattr(engine.pool, 'status'): pool = engine.pool stats = { 'total': pool.size() if hasattr(pool, 'size') else 0, 'checked_in': pool.checkedin() if hasattr(pool, 'checkedin') else 0, 'checked_out': pool.checkedout() if hasattr(pool, 'checkedout') else 0, 'overflow': pool.overflow() if hasattr(pool, 'overflow') else 0, 'invalidate': pool.invalid() if hasattr(pool, 'invalid') else 0, } else: # 估算值 stats = { 'total': _pool_stats.get('total_connections', 0), 'checked_in': _pool_stats.get('checked_in', 0), 'checked_out': _pool_stats.get('checked_out', 0), 'overflow': _pool_stats.get('overflow', 0), 'invalidate': _pool_stats.get('invalidate_count', 0), } # 计算使用率 if stats['total'] > 0: usage_percent = (stats['checked_out'] / stats['total']) * 100 stats['usage_percent'] = round(usage_percent, 2) else: stats['usage_percent'] = 0 # 更新峰值 if stats['checked_out'] > _pool_stats['peak_connections']: _pool_stats['peak_connections'] = stats['checked_out'] return stats def check_pool_alerts(): """检查连接池告警""" current_time = time.time() stats = get_pool_status() # 连接池使用率告警 if stats.get('usage_percent', 0) >= _alert_thresholds['pool_usage_percent']: alert_key = 'pool_usage' if alert_key not in _last_alerts or (current_time - _last_alerts.get(alert_key, 0)) > _alert_thresholds['alert_cooldown']: logger.warning( f"🚨 数据库连接池告警: 使用率 {stats['usage_percent']}% 超过阈值 {_alert_thresholds['pool_usage_percent']}% " f"(已使用: {stats['checked_out']}/{stats['total']})" ) _last_alerts[alert_key] = current_time # 连接错误告警 error_count = len(_pool_stats['connection_errors']) if error_count >= _alert_thresholds['connection_error_count']: alert_key = 'connection_errors' if alert_key not in _last_alerts or (current_time - _last_alerts.get(alert_key, 0)) > _alert_thresholds['alert_cooldown']: recent_errors = [e for e in _pool_stats['connection_errors'] if (current_time - e['timestamp']) < 300] logger.warning( f"🚨 数据库连接错误告警: 近5分钟内发生 {len(recent_errors)} 次连接错误" ) _last_alerts[alert_key] = current_time # 慢查询告警 slow_queries = [q for q in _pool_stats['slow_queries'] if (current_time - q['timestamp']) < 300] if len(slow_queries) >= 5: alert_key = 'slow_queries' if alert_key not in _last_alerts or (current_time - _last_alerts.get(alert_key, 0)) > _alert_thresholds['alert_cooldown']: avg_time = sum(q['duration'] for q in slow_queries) / len(slow_queries) logger.warning( f"🚨 慢查询告警: 近5分钟内 {len(slow_queries)} 个慢查询,平均耗时 {avg_time:.2f}s" ) _last_alerts[alert_key] = current_time def log_transaction_start(sql: str, params: Optional[Dict] = None): """记录事务开始""" transaction_info = { 'sql': sql[:100] + '...' if len(sql) > 100 else sql, 'params': params, 'start_time': time.time(), 'thread_id': threading.get_ident() } _pool_stats['transactions'].append(transaction_info) def log_transaction_end(success: bool = True, error: Optional[str] = None): """记录事务结束""" if not _pool_stats['transactions']: return current_time = time.time() transaction = _pool_stats['transactions'][-1] duration = current_time - transaction['start_time'] # 慢事务告警 if duration >= _alert_thresholds['max_transaction_time']: logger.warning( f"🐌 慢事务告警: 执行时间 {duration:.2f}s 超过阈值 {_alert_thresholds['max_transaction_time']}s " f"SQL: {transaction['sql']}" ) # 记录慢查询 if duration >= _alert_thresholds['slow_query_time']: _pool_stats['slow_queries'].append({ 'sql': transaction['sql'], 'duration': duration, 'timestamp': current_time }) # 清理旧记录(保留最近1000条) if len(_pool_stats['slow_queries']) > 1000: _pool_stats['slow_queries'] = _pool_stats['slow_queries'][-1000:] _pool_stats['transactions'].pop() def log_connection_error(error: str, sql: Optional[str] = None): """记录连接错误""" _pool_stats['connection_errors'].append({ 'error': error, 'sql': sql, 'timestamp': time.time() }) # 清理旧记录(保留最近100条) if len(_pool_stats['connection_errors']) > 100: _pool_stats['connection_errors'] = _pool_stats['connection_errors'][-100:] def reset_stats(): """重置统计信息""" global _pool_stats _pool_stats = { 'total_connections': 0, 'checked_in': 0, 'checked_out': 0, 'overflow': 0, 'invalidate_count': 0, 'transactions': [], 'slow_queries': [], 'connection_errors': [], 'peak_connections': 0, 'last_reset': datetime.now() } logger.info("数据库监控统计已重置") def get_monitoring_report() -> Dict[str, Any]: """获取监控报告""" stats = get_pool_status() current_time = time.time() # 计算最近5分钟的数据 recent_slow_queries = [q for q in _pool_stats['slow_queries'] if (current_time - q['timestamp']) < 300] recent_errors = [e for e in _pool_stats['connection_errors'] if (current_time - e['timestamp']) < 300] report = { 'timestamp': datetime.now().isoformat(), 'pool_status': stats, 'peak_connections': _pool_stats['peak_connections'], 'recent_5min': { 'slow_queries_count': len(recent_slow_queries), 'connection_errors_count': len(recent_errors), 'avg_slow_query_time': sum(q['duration'] for q in recent_slow_queries) / len(recent_slow_queries) if recent_slow_queries else 0 }, 'slow_queries': recent_slow_queries[-10:], # 最近10条慢查询 'connection_errors': recent_errors[-10:], # 最近10条连接错误 'last_reset': _pool_stats['last_reset'].isoformat() } return report # 定时监控任务 def monitoring_task(): """定时监控任务""" while True: try: check_pool_alerts() time.sleep(30) # 每30秒检查一次 except Exception as e: logger.error(f"数据库监控任务异常: {e}") time.sleep(60) # 异常时等待更长时间 # 启动后台监控线程 def start_monitoring(): """启动后台监控""" monitor_thread = threading.Thread(target=monitoring_task, daemon=True) monitor_thread.start() logger.info("数据库连接池监控已启动") # SQLAlchemy事件监听 @event.listens_for(Engine, "before_cursor_execute") def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany): """SQL执行前监听""" log_transaction_start(statement, params) @event.listens_for(Engine, "after_cursor_execute") def receive_after_cursor_execute(conn, cursor, statement, params, context, executemany): """SQL执行后监听""" log_transaction_end(success=True) @event.listens_for(Engine, "handle_error") def receive_handle_error(exception, context): """错误监听""" error_msg = str(exception) sql = context.statement if context and hasattr(context, 'statement') else None log_connection_error(error_msg, sql) log_transaction_end(success=False, error=error_msg) def log_pool_status(): """记录连接池状态到日志""" stats = get_pool_status() logger.info( f"数据库连接池状态: 使用率 {stats['usage_percent']}% " f"(已用: {stats['checked_out']}, 空闲: {stats['checked_in']}, 总计: {stats['total']}) " f"峰值: {_pool_stats['peak_connections']}" )