数据库监控日志、接口监控，熔断机制，提高连接池

2025-11-29 16:02:28 +08:00
parent c82c4b1dbe
commit ae476256a9
5 changed files with 651 additions and 54 deletions
--- a/app/core/database.py
+++ b/app/core/database.py
@@ -1,14 +1,28 @@
 from sqlalchemy import create_engine, MetaData
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
+from sqlalchemy.pool import QueuePool
 from .config import settings

-engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True, echo=False, pool_size=30, max_overflow=60, pool_timeout=30, pool_recycle=3600)
+# Engine backed by an explicit QueuePool so pool usage can be monitored.
+engine = create_engine(
+    settings.DATABASE_URL,
+    poolclass=QueuePool,
+    pool_pre_ping=True,
+    echo=False,  # SQL echo logging is off; recommended for production
+    pool_size=400,  # NOTE(review): with max_overflow this permits up to 1000 connections per process — confirm the DB server's connection limit
+    max_overflow=600,
+    pool_timeout=60,  # wait up to 60 seconds for a free connection (raised from 30)
+    pool_recycle=3600,  # recycle connections after 1 hour
+    pool_reset_on_return='commit'  # NOTE(review): connections are reset with COMMIT (not the default ROLLBACK) when returned — confirm this is intended
+)
+
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

 Base = declarative_base()

 def get_db():
+    """数据库依赖注入函数"""
    db = SessionLocal()
    try:
        yield db
--- a/app/core/db_monitor.py
+++ b/app/core/db_monitor.py
@@ -0,0 +1,252 @@
+"""
+数据库连接池监控模块
+监控连接池状态、事务执行情况，预防雪崩效应
+"""
+import logging
+import time
+import threading
+from sqlalchemy import create_engine, event
+from sqlalchemy.engine import Engine
+from sqlalchemy.pool import QueuePool
+from typing import Dict, Any, Optional
+from datetime import datetime, timedelta
+from .database import engine
+from .logging_config import get_logger
+
+logger = get_logger(__name__)
+
+# Module-wide monitoring state, read and updated by the functions below.
+_pool_stats = {
+    'total_connections': 0,
+    'checked_in': 0,
+    'checked_out': 0,
+    'overflow': 0,
+    'invalidate_count': 0,
+    'transactions': [],  # statements currently being executed
+    'slow_queries': [],  # recorded slow statements
+    'connection_errors': [],  # recorded database errors
+    'peak_connections': 0,
+    'last_reset': datetime.now()
+}
+
+# Alert thresholds
+_alert_thresholds = {
+    'pool_usage_percent': 80,  # pool usage (%) at which a warning is logged
+    'slow_query_time': 5.0,  # slow-query threshold (seconds)
+    'max_transaction_time': 30.0,  # maximum statement execution time (seconds)
+    'connection_error_count': 10,  # error count at which a warning is logged
+    'alert_cooldown': 300  # minimum gap between repeated alerts (seconds)
+}
+
+_last_alerts = {}  # alert key -> time.time() of the last emitted alert
+
+def get_pool_status() -> Dict[str, Any]:
+    """Return a snapshot of the connection pool counters.
+
+    Live values are read from ``engine.pool`` when it exposes ``status``;
+    otherwise the figures cached in ``_pool_stats`` are returned. As a side
+    effect the recorded peak of checked-out connections is raised if exceeded.
+
+    Returns:
+        Dict with ``total``, ``checked_in``, ``checked_out``, ``overflow``,
+        ``invalidate`` and ``usage_percent`` (checked_out / total, in percent).
+    """
+    pool = engine.pool
+    if hasattr(pool, 'status'):
+        # (result key, pool method); a method the pool lacks is reported as 0.
+        live_sources = (
+            ('total', 'size'), ('checked_in', 'checkedin'), ('checked_out', 'checkedout'),
+            ('overflow', 'overflow'), ('invalidate', 'invalid'),
+        )
+        stats = {key: getattr(pool, name)() if hasattr(pool, name) else 0 for key, name in live_sources}
+    else:
+        # (result key, cached key); a missing cache entry is reported as 0.
+        cached_sources = (
+            ('total', 'total_connections'), ('checked_in', 'checked_in'), ('checked_out', 'checked_out'),
+            ('overflow', 'overflow'), ('invalidate', 'invalidate_count'),
+        )
+        stats = {key: _pool_stats.get(name, 0) for key, name in cached_sources}
+
+    in_use, capacity = stats['checked_out'], stats['total']
+    stats['usage_percent'] = round((in_use / capacity) * 100, 2) if capacity > 0 else 0
+    # Track the highest number of simultaneously checked-out connections seen.
+    if in_use > _pool_stats['peak_connections']:
+        _pool_stats['peak_connections'] = in_use
+    return stats
+
+def check_pool_alerts():
+    """Emit rate-limited warnings for pool usage, recent errors and slow queries.
+
+    An alert kind fires at most once per ``alert_cooldown`` seconds. The error
+    alert counts only errors from the last 5 minutes, so stale history entries
+    cannot trigger it and the trigger matches the count shown in the message.
+    """
+    current_time = time.time()
+    cooldown = _alert_thresholds['alert_cooldown']
+
+    def _claim(alert_key):
+        """Return True and restart the cooldown if ``alert_key`` may fire now."""
+        if alert_key in _last_alerts and (current_time - _last_alerts[alert_key]) <= cooldown:
+            return False
+        _last_alerts[alert_key] = current_time
+        return True
+    def _recent(records):
+        """Return the records stamped within the last 300 seconds."""
+        return [r for r in records if (current_time - r['timestamp']) < 300]
+
+    stats = get_pool_status()
+    usage_limit = _alert_thresholds['pool_usage_percent']
+    if stats.get('usage_percent', 0) >= usage_limit and _claim('pool_usage'):
+        logger.warning(
+            f"🚨 数据库连接池告警: 使用率 {stats['usage_percent']}% 超过阈值 {usage_limit}% "
+            f"(已使用: {stats['checked_out']}/{stats['total']})"
+        )
+
+    recent_errors = _recent(_pool_stats['connection_errors'])
+    if len(recent_errors) >= _alert_thresholds['connection_error_count'] and _claim('connection_errors'):
+        logger.warning(f"🚨 数据库连接错误告警: 近5分钟内发生 {len(recent_errors)} 次连接错误")
+
+    slow_queries = _recent(_pool_stats['slow_queries'])
+    if len(slow_queries) >= 5 and _claim('slow_queries'):
+        avg_time = sum(q['duration'] for q in slow_queries) / len(slow_queries)
+        logger.warning(f"🚨 慢查询告警: 近5分钟内 {len(slow_queries)} 个慢查询，平均耗时 {avg_time:.2f}s")
+
+def log_transaction_start(sql: str, params: Optional[Dict] = None):
+    """Push an in-flight record for a statement that is about to execute."""
+    # Only the first 100 characters of the SQL are kept (with a '...' marker).
+    preview = sql if len(sql) <= 100 else sql[:100] + '...'
+    started = time.time()
+    owner = threading.get_ident()
+    _pool_stats['transactions'].append(
+        {'sql': preview, 'params': params, 'start_time': started, 'thread_id': owner}
+    )
+
+_transactions_lock = threading.Lock()  # guards removal from the shared in-flight list
+
+def log_transaction_end(success: bool = True, error: Optional[str] = None):
+    """Close the calling thread's newest in-flight statement and time it.
+
+    The record is matched on ``thread_id`` rather than popped from the end of the
+    shared list, so concurrent threads cannot time or remove each other's entries.
+    Slow statements (>= ``slow_query_time``) are stored, keeping the last 1000;
+    very slow ones (>= ``max_transaction_time``) are logged. Arguments are unused.
+    """
+    current_time = time.time()
+    pending = _pool_stats['transactions']
+    thread_id = threading.get_ident()
+    with _transactions_lock:
+        # Newest first; appends by other threads only extend the tail, so indexes stay valid.
+        idx = next((i for i in reversed(range(len(pending))) if pending[i]['thread_id'] == thread_id), None)
+        if idx is None:
+            return
+        transaction = pending.pop(idx)
+    duration = current_time - transaction['start_time']
+    if duration >= _alert_thresholds['max_transaction_time']:
+        logger.warning(
+            f"🐌 慢事务告警: 执行时间 {duration:.2f}s 超过阈值 {_alert_thresholds['max_transaction_time']}s "
+            f"SQL: {transaction['sql']}"
+        )
+    if duration >= _alert_thresholds['slow_query_time']:
+        _pool_stats['slow_queries'].append({'sql': transaction['sql'], 'duration': duration, 'timestamp': current_time})
+    if len(_pool_stats['slow_queries']) > 1000:
+        _pool_stats['slow_queries'] = _pool_stats['slow_queries'][-1000:]
+
+def log_connection_error(error: str, sql: Optional[str] = None):
+    """Record a database error with its SQL (if any) and the current time.
+
+    Only the 100 most recent entries are retained.
+    """
+    entry = {'error': error, 'sql': sql, 'timestamp': time.time()}
+    errors = _pool_stats['connection_errors']
+    errors.append(entry)
+    # Trim by rebinding to a fresh list, as the history is capped at 100 items.
+    if len(errors) > 100:
+        _pool_stats['connection_errors'] = errors[-100:]
+
+def reset_stats():
+    """Discard all collected statistics and start again from zero.
+
+    Rebinds the module-level ``_pool_stats`` to a new dict with zeroed counters,
+    empty history lists and ``last_reset`` set to the current time.
+    """
+    global _pool_stats
+    counter_names = ('total_connections', 'checked_in', 'checked_out', 'overflow', 'invalidate_count')
+    fresh = dict.fromkeys(counter_names, 0)
+    # Every history needs its own new list object.
+    for history_name in ('transactions', 'slow_queries', 'connection_errors'):
+        fresh[history_name] = []
+    fresh['peak_connections'] = 0
+    fresh['last_reset'] = datetime.now()
+    _pool_stats = fresh
+    logger.info("数据库监控统计已重置")
+
+def get_monitoring_report() -> Dict[str, Any]:
+    """Assemble a report of the pool snapshot and the last 5 minutes of activity.
+
+    Returns:
+        Dict with the pool status, recorded peak, recent slow-query / error
+        counts and average slow-query time, the 10 latest recent slow queries
+        and errors, and ISO timestamps for the report and the last reset.
+    """
+    pool_snapshot = get_pool_status()
+    now = time.time()
+    slow = [q for q in _pool_stats['slow_queries'] if (now - q['timestamp']) < 300]
+    errors = [e for e in _pool_stats['connection_errors'] if (now - e['timestamp']) < 300]
+    # Average duration of the recent slow queries; 0 when there are none.
+    avg_slow = sum(q['duration'] for q in slow) / len(slow) if slow else 0
+    recent_summary = dict(slow_queries_count=len(slow), connection_errors_count=len(errors), avg_slow_query_time=avg_slow)
+    return {
+        'timestamp': datetime.now().isoformat(),
+        'pool_status': pool_snapshot,
+        'peak_connections': _pool_stats['peak_connections'],
+        'recent_5min': recent_summary,
+        'slow_queries': slow[-10:],
+        'connection_errors': errors[-10:],
+        'last_reset': _pool_stats['last_reset'].isoformat(),
+    }
+
+# Periodic monitoring task
+def monitoring_task():
+    """Run ``check_pool_alerts`` in an endless loop; this function never returns."""
+    while True:
+        try:
+            check_pool_alerts()
+            time.sleep(30)  # check every 30 seconds
+        except Exception as e:
+            logger.error(f"数据库监控任务异常: {e}")
+            time.sleep(60)  # wait longer after a failure
+
+# Launch the background monitoring thread (each call starts another thread; there is no guard against repeats)
+def start_monitoring():
+    """Start ``monitoring_task`` on a daemon thread and log that monitoring is running."""
+    monitor_thread = threading.Thread(target=monitoring_task, daemon=True)
+    monitor_thread.start()
+    logger.info("数据库连接池监控已启动")
+
+# SQLAlchemy event listeners (registered on the Engine class)
+@event.listens_for(Engine, "before_cursor_execute")
+def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
+    """Record the start of a statement just before the cursor executes it."""
+    log_transaction_start(statement, params)
+
+@event.listens_for(Engine, "after_cursor_execute")
+def receive_after_cursor_execute(conn, cursor, statement, params, context, executemany):
+    """Record the end of a statement once the cursor has executed it."""
+    log_transaction_end(success=True)
+
+@event.listens_for(Engine, "handle_error")
+def receive_handle_error(exception, context=None):
+    """Record a DB error. SQLAlchemy passes one ExceptionContext; the old (exception, context) call form still works."""
+    context = exception if context is None else context  # single-argument dispatch: the argument is the context
+    error_msg = str(getattr(exception, 'original_exception', exception))
+    log_connection_error(error_msg, getattr(context, 'statement', None))
+    log_transaction_end(success=False, error=error_msg)
+
+def log_pool_status():
+    """Log pool usage, in-use / idle / total counts and the recorded peak."""
+    snapshot = get_pool_status()
+    message = (
+        f"数据库连接池状态: 使用率 {snapshot['usage_percent']}% "
+        f"(已用: {snapshot['checked_out']}, 空闲: {snapshot['checked_in']}, 总计: {snapshot['total']}) "
+        f"峰值: {_pool_stats['peak_connections']}")
+    logger.info(message)
--- a/app/core/retry.py
+++ b/app/core/retry.py
@@ -0,0 +1,211 @@
+"""
+重试机制和雪崩效应防护
+提供指数退避、熔断器、重试装饰器等功能
+"""
+import logging
+import time
+import functools
+from typing import Callable, Any, Optional
+from enum import Enum
+from datetime import datetime, timedelta
+
+logger = logging.getLogger(__name__)
+
+class CircuitBreakerState(Enum):
+    """States a circuit breaker can be in."""
+    CLOSED = "closed"      # normal operation: calls pass through
+    OPEN = "open"          # tripped: calls are rejected
+    HALF_OPEN = "half_open"  # trial state entered once the recovery timeout has elapsed
+
+class CircuitBreaker:
+    """Circuit breaker: opens after ``failure_threshold`` consecutive failures, allows a trial call after ``recovery_timeout`` s."""
+
+    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
+        """Create a closed breaker.
+
+        Args:
+            failure_threshold: Consecutive failures that trip the breaker.
+            recovery_timeout: Seconds to stay open before a trial call is allowed.
+        """
+        self.failure_threshold = failure_threshold
+        self.recovery_timeout = recovery_timeout
+        self.failure_count = 0
+        self.last_failure_time = None  # datetime of the most recent failure
+        self.state = CircuitBreakerState.CLOSED
+
+    def call(self, func: Callable, *args, **kwargs):
+        """Run ``func(*args, **kwargs)`` through the breaker and return its result.
+
+        Raises:
+            Exception: Without calling ``func``, while the breaker is open.
+                Exceptions raised by ``func`` are counted and re-raised unchanged.
+        """
+        if self.state == CircuitBreakerState.OPEN:
+            failed_at = self.last_failure_time
+            # total_seconds() spans the whole interval; timedelta.seconds drops whole days.
+            if not failed_at or (datetime.now() - failed_at).total_seconds() < self.recovery_timeout:
+                raise Exception("熔断器开启，直接拒绝请求")
+            self.state = CircuitBreakerState.HALF_OPEN
+            logger.info("熔断器进入半开状态")
+
+        try:
+            result = func(*args, **kwargs)
+        except Exception:
+            self.failure_count += 1
+            self.last_failure_time = datetime.now()
+            if self.failure_count >= self.failure_threshold:
+                self.state = CircuitBreakerState.OPEN
+                logger.warning(f"熔断器开启，失败次数: {self.failure_count}")
+            raise
+        if self.state == CircuitBreakerState.HALF_OPEN:
+            self.state = CircuitBreakerState.CLOSED
+            logger.info("熔断器关闭，恢复正常")
+        self.failure_count = 0  # a success ends the run of consecutive failures
+        return result
+
+class RetryConfig:
+    """Backoff settings consumed by the ``retry`` decorator."""
+
+    def __init__(
+        self,
+        max_attempts: int = 3,
+        base_delay: float = 1.0,
+        max_delay: float = 60.0,
+        exponential_base: float = 2.0,
+        jitter: bool = True
+    ):
+        """Store the given settings as instance attributes.
+
+        Args:
+            max_attempts: Number of attempts the retry loop makes in total.
+            base_delay: Delay in seconds before the first retry.
+            max_delay: Upper bound in seconds for any single delay.
+            exponential_base: Factor the delay is multiplied by per failed attempt.
+            jitter: Whether delays are randomised.
+        """
+        settings = dict(max_attempts=max_attempts, base_delay=base_delay, max_delay=max_delay,
+                        exponential_base=exponential_base, jitter=jitter)
+        # Attributes are created in declaration order.
+        for name, value in settings.items():
+            setattr(self, name, value)
+
+def retry(
+    config: Optional[RetryConfig] = None,
+    exceptions: tuple = (Exception,)
+):
+    """Build a decorator that retries a function with exponential backoff.
+
+    The wait before retry number ``k`` (0-based) is
+    ``min(base_delay * exponential_base ** k, max_delay)``, scaled by a random
+    factor in [0.5, 1.0) when ``config.jitter`` is set. The function is always
+    called at least once, even if ``config.max_attempts`` is below 1.
+
+    Args:
+        config: Retry settings; a default ``RetryConfig()`` is used when None.
+        exceptions: Exception types that trigger a retry; others propagate at once.
+
+    Returns:
+        A decorator. The wrapped function returns the original result, or
+        re-raises the last exception once every attempt has failed.
+    """
+    import random  # local import: random is not among this module's imports
+
+    if config is None:
+        config = RetryConfig()
+
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # A max_attempts below 1 must not skip the call and return None.
+            attempts = max(1, config.max_attempts)
+            for attempt in range(attempts):
+                try:
+                    return func(*args, **kwargs)
+                except exceptions as e:
+                    if attempt == attempts - 1:
+                        # Final attempt failed: report and propagate with the original traceback.
+                        logger.error(
+                            f"函数 {func.__name__} 经过 {attempts} 次重试后仍然失败: {str(e)}"
+                        )
+                        raise
+
+                    # Exponential backoff, capped at max_delay.
+                    delay = min(
+                        config.base_delay * (config.exponential_base ** attempt),
+                        config.max_delay
+                    )
+                    if config.jitter:
+                        delay = delay * (0.5 + random.random() * 0.5)
+
+                    logger.warning(
+                        f"函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, "
+                        f"{delay:.2f} 秒后重试"
+                    )
+                    time.sleep(delay)
+
+        return wrapper
+    return decorator
+
+# Module-wide circuit breaker instance, created with the default settings
+_default_circuit_breaker = CircuitBreaker()
+
+def circuit_breaker(
+    failure_threshold: int = 5,
+    recovery_timeout: int = 60
+):
+    """Build a decorator that guards a function with its own ``CircuitBreaker``.
+
+    Args:
+        failure_threshold: Failure count passed to the breaker.
+        recovery_timeout: Recovery timeout in seconds passed to the breaker.
+
+    Returns:
+        A decorator; the wrapped function exposes its breaker as ``circuit_breaker``.
+    """
+    def decorator(func: Callable) -> Callable:
+        # One breaker per decorated function, created at decoration time.
+        guard = CircuitBreaker(failure_threshold, recovery_timeout)
+        def guarded(*args, **kwargs):
+            return guard.call(func, *args, **kwargs)
+        guarded = functools.wraps(func)(guarded)
+        guarded.circuit_breaker = guard
+        return guarded
+
+    return decorator
+
+def with_circuit_breaker(func: Callable, *args, **kwargs):
+    """Run ``func(*args, **kwargs)`` through the module-wide default circuit breaker and return its result."""
+    return _default_circuit_breaker.call(func, *args, **kwargs)
+
+# Predefined retry configurations
+RETRY_CONFIG_FAST = RetryConfig(  # short waits: 0.5s base delay, capped at 5s
+    max_attempts=3,
+    base_delay=0.5,
+    max_delay=5.0,
+    exponential_base=2.0,
+    jitter=True
+)
+
+RETRY_CONFIG_SLOW = RetryConfig(  # longer waits: 2s base delay, capped at 60s
+    max_attempts=5,
+    base_delay=2.0,
+    max_delay=60.0,
+    exponential_base=2.0,
+    jitter=True
+)
+
+RETRY_CONFIG_DB = RetryConfig(  # used by retry_db_operation: 1s base delay, capped at 10s
+    max_attempts=3,
+    base_delay=1.0,
+    max_delay=10.0,
+    exponential_base=2.0,
+    jitter=True
+)
+
+# Retry decorator for database operations
+def retry_db_operation(max_attempts: int = 3):
+    """Return a ``retry`` decorator with RETRY_CONFIG_DB's backoff and ``max_attempts`` total attempts."""
+    base = RETRY_CONFIG_DB
+    # Build a per-call config so the max_attempts argument is honoured instead of ignored.
+    config = RetryConfig(max_attempts, base.base_delay, base.max_delay, base.exponential_base, base.jitter)
+    return retry(config=config, exceptions=(Exception,))