数据库监控日志、接口监控,熔断机制,提高连接池
This commit is contained in:
@@ -1,14 +1,28 @@
|
||||
from sqlalchemy import create_engine, MetaData
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.pool import QueuePool
|
||||
from .config import settings
|
||||
|
||||
# Engine with connection-pool monitoring.
# BUG FIX: the file previously called create_engine() twice; the first engine
# (pool_size=30) was immediately shadowed by this one, leaking a whole
# connection pool. Only the monitored configuration is kept.
engine = create_engine(
    settings.DATABASE_URL,
    poolclass=QueuePool,
    pool_pre_ping=True,             # validate a connection before handing it out
    echo=False,                     # keep SQL echo off in production
    pool_size=400,
    max_overflow=600,
    pool_timeout=60,                # wait up to 60s for a free connection
    pool_recycle=3600,              # recycle connections after 1 hour
    pool_reset_on_return='commit'   # reset connection state when returned to the pool
)

# Session factory bound to the single shared engine.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class for ORM models.
Base = declarative_base()
|
||||
|
||||
def get_db():
|
||||
"""数据库依赖注入函数"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
|
||||
252
app/core/db_monitor.py
Normal file
252
app/core/db_monitor.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""
|
||||
数据库连接池监控模块
|
||||
监控连接池状态、事务执行情况,预防雪崩效应
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
import threading
|
||||
from sqlalchemy import create_engine, event
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlalchemy.pool import QueuePool
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from .database import engine
|
||||
from .logging_config import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# 全局监控数据
|
||||
_pool_stats = {
|
||||
'total_connections': 0,
|
||||
'checked_in': 0,
|
||||
'checked_out': 0,
|
||||
'overflow': 0,
|
||||
'invalidate_count': 0,
|
||||
'transactions': [], # 事务统计
|
||||
'slow_queries': [], # 慢查询
|
||||
'connection_errors': [], # 连接错误
|
||||
'peak_connections': 0,
|
||||
'last_reset': datetime.now()
|
||||
}
|
||||
|
||||
# 告警配置
|
||||
_alert_thresholds = {
|
||||
'pool_usage_percent': 80, # 连接池使用率告警阈值
|
||||
'slow_query_time': 5.0, # 慢查询阈值(秒)
|
||||
'max_transaction_time': 30.0, # 最大事务执行时间
|
||||
'connection_error_count': 10, # 连接错误告警阈值
|
||||
'alert_cooldown': 300 # 告警冷却时间(秒)
|
||||
}
|
||||
|
||||
_last_alerts = {}
|
||||
|
||||
def get_pool_status() -> Dict[str, Any]:
    """Return a snapshot of the connection-pool counters.

    Reads the engine pool's accessors when available, otherwise falls back to
    the module-level ``_pool_stats`` estimates. Also derives ``usage_percent``
    and updates the global peak-connections watermark as a side effect.
    """
    pool = engine.pool
    if hasattr(pool, 'status'):
        def _read(attr: str) -> int:
            # Call the pool accessor when present; report 0 otherwise.
            return getattr(pool, attr)() if hasattr(pool, attr) else 0

        stats = {
            'total': _read('size'),
            'checked_in': _read('checkedin'),
            'checked_out': _read('checkedout'),
            'overflow': _read('overflow'),
            'invalidate': _read('invalid'),
        }
    else:
        # Estimated values maintained by the event hooks.
        stats = {
            'total': _pool_stats.get('total_connections', 0),
            'checked_in': _pool_stats.get('checked_in', 0),
            'checked_out': _pool_stats.get('checked_out', 0),
            'overflow': _pool_stats.get('overflow', 0),
            'invalidate': _pool_stats.get('invalidate_count', 0),
        }

    # Derive the usage percentage (0 when the pool size is unknown).
    total = stats['total']
    stats['usage_percent'] = round((stats['checked_out'] / total) * 100, 2) if total > 0 else 0

    # Track the high-water mark of simultaneously checked-out connections.
    if stats['checked_out'] > _pool_stats['peak_connections']:
        _pool_stats['peak_connections'] = stats['checked_out']

    return stats
|
||||
|
||||
def check_pool_alerts():
    """Evaluate alert conditions and emit rate-limited warnings.

    Each alert key fires at most once per ``alert_cooldown`` seconds.
    BUG FIX: the connection-error alert previously gated on the *all-time*
    error buffer while its message reported the last 5 minutes; it now counts
    only errors from the recent window, matching the message.
    """
    current_time = time.time()
    stats = get_pool_status()

    def _cooled_down(alert_key: str) -> bool:
        # True when this alert has never fired or its cooldown has elapsed.
        return (alert_key not in _last_alerts
                or (current_time - _last_alerts.get(alert_key, 0)) > _alert_thresholds['alert_cooldown'])

    # Pool-usage alert.
    if stats.get('usage_percent', 0) >= _alert_thresholds['pool_usage_percent'] and _cooled_down('pool_usage'):
        logger.warning(
            f"🚨 数据库连接池告警: 使用率 {stats['usage_percent']}% 超过阈值 {_alert_thresholds['pool_usage_percent']}% "
            f"(已使用: {stats['checked_out']}/{stats['total']})"
        )
        _last_alerts['pool_usage'] = current_time

    # Connection-error alert (errors within the last 5 minutes).
    recent_errors = [e for e in _pool_stats['connection_errors'] if (current_time - e['timestamp']) < 300]
    if len(recent_errors) >= _alert_thresholds['connection_error_count'] and _cooled_down('connection_errors'):
        logger.warning(
            f"🚨 数据库连接错误告警: 近5分钟内发生 {len(recent_errors)} 次连接错误"
        )
        _last_alerts['connection_errors'] = current_time

    # Slow-query alert (5+ slow queries within the last 5 minutes).
    slow_queries = [q for q in _pool_stats['slow_queries'] if (current_time - q['timestamp']) < 300]
    if len(slow_queries) >= 5 and _cooled_down('slow_queries'):
        avg_time = sum(q['duration'] for q in slow_queries) / len(slow_queries)
        logger.warning(
            f"🚨 慢查询告警: 近5分钟内 {len(slow_queries)} 个慢查询,平均耗时 {avg_time:.2f}s"
        )
        _last_alerts['slow_queries'] = current_time
|
||||
|
||||
def log_transaction_start(sql: str, params: Optional[Dict] = None):
    """Record that a statement is about to execute.

    Args:
        sql: the statement text (truncated to 100 chars for storage).
        params: bound parameters, if any.
    """
    transaction_info = {
        'sql': sql[:100] + '...' if len(sql) > 100 else sql,
        'params': params,
        'start_time': time.time(),
        'thread_id': threading.get_ident()
    }
    _pool_stats['transactions'].append(transaction_info)
    # BUG FIX: cap the buffer — if log_transaction_end() is never reached
    # (e.g. the statement errors before the after-execute hook fires), this
    # list previously grew without bound.
    if len(_pool_stats['transactions']) > 1000:
        _pool_stats['transactions'] = _pool_stats['transactions'][-1000:]
|
||||
|
||||
def log_transaction_end(success: bool = True, error: Optional[str] = None):
    """Close out the most recent transaction record; track slow statements.

    NOTE(review): pops the *last* entry, which assumes statements complete in
    LIFO order across threads — confirm behavior under concurrency.
    """
    stack = _pool_stats['transactions']
    if not stack:
        return

    now = time.time()
    entry = stack[-1]
    elapsed = now - entry['start_time']

    # Warn about transactions that exceed the hard duration threshold.
    if elapsed >= _alert_thresholds['max_transaction_time']:
        logger.warning(
            f"🐌 慢事务告警: 执行时间 {elapsed:.2f}s 超过阈值 {_alert_thresholds['max_transaction_time']}s "
            f"SQL: {entry['sql']}"
        )

    # Record anything slower than the slow-query threshold.
    if elapsed >= _alert_thresholds['slow_query_time']:
        _pool_stats['slow_queries'].append({
            'sql': entry['sql'],
            'duration': elapsed,
            'timestamp': now
        })

    # Keep only the most recent 1000 slow-query records.
    if len(_pool_stats['slow_queries']) > 1000:
        _pool_stats['slow_queries'] = _pool_stats['slow_queries'][-1000:]

    stack.pop()
|
||||
|
||||
def log_connection_error(error: str, sql: Optional[str] = None):
    """Append a connection failure to the rolling error buffer.

    Only the most recent 100 records are retained.
    """
    errors = _pool_stats['connection_errors']
    errors.append({
        'error': error,
        'sql': sql,
        'timestamp': time.time()
    })
    if len(errors) > 100:
        _pool_stats['connection_errors'] = errors[-100:]
|
||||
|
||||
def reset_stats():
    """Rebind ``_pool_stats`` to a fresh, zeroed structure.

    Note: rebinds the module global, so other modules holding a reference to
    the old dict keep seeing the stale object.
    """
    global _pool_stats
    _pool_stats = {
        key: 0
        for key in ('total_connections', 'checked_in', 'checked_out',
                    'overflow', 'invalidate_count')
    }
    _pool_stats.update(
        transactions=[],
        slow_queries=[],
        connection_errors=[],
        peak_connections=0,
        last_reset=datetime.now(),
    )
    logger.info("数据库监控统计已重置")
|
||||
|
||||
def get_monitoring_report() -> Dict[str, Any]:
    """Build a snapshot report of pool health and recent activity."""
    pool_snapshot = get_pool_status()
    now = time.time()
    window = 300  # report on the last 5 minutes

    recent_slow = [q for q in _pool_stats['slow_queries'] if (now - q['timestamp']) < window]
    recent_errs = [e for e in _pool_stats['connection_errors'] if (now - e['timestamp']) < window]
    avg_slow = (sum(q['duration'] for q in recent_slow) / len(recent_slow)) if recent_slow else 0

    return {
        'timestamp': datetime.now().isoformat(),
        'pool_status': pool_snapshot,
        'peak_connections': _pool_stats['peak_connections'],
        'recent_5min': {
            'slow_queries_count': len(recent_slow),
            'connection_errors_count': len(recent_errs),
            'avg_slow_query_time': avg_slow
        },
        'slow_queries': recent_slow[-10:],        # latest 10 slow queries
        'connection_errors': recent_errs[-10:],   # latest 10 connection errors
        'last_reset': _pool_stats['last_reset'].isoformat()
    }
|
||||
|
||||
# Periodic monitoring loop (runs forever; intended for a daemon thread).
def monitoring_task():
    """Poll alert conditions forever, backing off after unexpected errors."""
    poll_interval = 30     # seconds between healthy checks
    backoff_interval = 60  # longer wait after an exception
    while True:
        try:
            check_pool_alerts()
        except Exception as e:
            logger.error(f"数据库监控任务异常: {e}")
            time.sleep(backoff_interval)
        else:
            time.sleep(poll_interval)
|
||||
|
||||
# Launch the background monitoring thread.
def start_monitoring():
    """Spawn a daemon thread running ``monitoring_task``."""
    worker = threading.Thread(target=monitoring_task, daemon=True)
    worker.start()
    logger.info("数据库连接池监控已启动")
|
||||
|
||||
# SQLAlchemy event listeners (registered engine-wide on the Engine class).
@event.listens_for(Engine, "before_cursor_execute")
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    """Hook fired before each statement executes; records its start time."""
    log_transaction_start(statement, params)
|
||||
|
||||
@event.listens_for(Engine, "after_cursor_execute")
def receive_after_cursor_execute(conn, cursor, statement, params, context, executemany):
    """Hook fired after each statement executes; closes out the timing record."""
    log_transaction_end(success=True)
|
||||
|
||||
@event.listens_for(Engine, "handle_error")
def receive_handle_error(exception_context):
    """Hook fired on execution errors: record the failure and close the record.

    BUG FIX: SQLAlchemy's "handle_error" event passes a single
    ExceptionContext argument; the previous two-parameter signature
    (exception, context) would raise TypeError when the hook fired.
    """
    error_msg = str(exception_context.original_exception)
    sql = getattr(exception_context, 'statement', None)
    log_connection_error(error_msg, sql)
    log_transaction_end(success=False, error=error_msg)
|
||||
|
||||
def log_pool_status():
    """Write a one-line pool-usage summary to the log."""
    snapshot = get_pool_status()
    logger.info(
        f"数据库连接池状态: 使用率 {snapshot['usage_percent']}% "
        f"(已用: {snapshot['checked_out']}, 空闲: {snapshot['checked_in']}, 总计: {snapshot['total']}) "
        f"峰值: {_pool_stats['peak_connections']}"
    )
|
||||
211
app/core/retry.py
Normal file
211
app/core/retry.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
重试机制和雪崩效应防护
|
||||
提供指数退避、熔断器、重试装饰器等功能
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
import functools
|
||||
from typing import Callable, Any, Optional
|
||||
from enum import Enum
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class CircuitBreakerState(Enum):
    """Circuit-breaker states."""
    CLOSED = "closed"  # normal operation: calls pass through
    OPEN = "open"  # tripped: calls are rejected outright
    HALF_OPEN = "half_open"  # probing: one call allowed to test recovery


class CircuitBreaker:
    """Circuit breaker: trip after repeated failures, probe after a timeout."""

    def __init__(self, failure_threshold: int = 5, recovery_timeout: int = 60):
        """
        Initialize the breaker.

        Args:
            failure_threshold: consecutive failures before the breaker opens.
            recovery_timeout: seconds to wait before allowing a half-open probe.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.last_failure_time = None  # datetime of the most recent failure
        self.state = CircuitBreakerState.CLOSED

    def call(self, func: Callable, *args, **kwargs):
        """Execute ``func`` through the breaker.

        Raises:
            Exception: immediately, when the breaker is open and the recovery
                timeout has not yet elapsed; otherwise re-raises whatever
                ``func`` raises.
        """
        if self.state == CircuitBreakerState.OPEN:
            # BUG FIX: use total_seconds() — timedelta.seconds is only the
            # seconds *component* (wraps at 24h), so a failure older than a
            # day would keep the breaker open forever.
            if self.last_failure_time and \
                    (datetime.now() - self.last_failure_time).total_seconds() >= self.recovery_timeout:
                self.state = CircuitBreakerState.HALF_OPEN
                logger.info("熔断器进入半开状态")
            else:
                raise Exception("熔断器开启,直接拒绝请求")

        try:
            result = func(*args, **kwargs)
            # Success: a half-open probe that succeeds closes the breaker.
            if self.state == CircuitBreakerState.HALF_OPEN:
                self.state = CircuitBreakerState.CLOSED
                self.failure_count = 0
                logger.info("熔断器关闭,恢复正常")
            return result
        except Exception as e:
            self.failure_count += 1
            self.last_failure_time = datetime.now()

            if self.failure_count >= self.failure_threshold:
                self.state = CircuitBreakerState.OPEN
                logger.warning(f"熔断器开启,失败次数: {self.failure_count}")

            raise e
|
||||
|
||||
class RetryConfig:
    """Retry policy: attempt count plus exponential-backoff timing."""
    def __init__(
        self,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        exponential_base: float = 2.0,
        jitter: bool = True
    ):
        """
        Initialize the retry policy.

        Args:
            max_attempts: total number of attempts (including the first call).
            base_delay: delay before the first retry, in seconds.
            max_delay: upper bound on any single delay, in seconds.
            exponential_base: per-attempt delay multiplier.
            jitter: randomize each delay to 50-100% of its nominal value.
        """
        self.max_attempts = max_attempts
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter


def retry(
    config: Optional[RetryConfig] = None,
    exceptions: tuple = (Exception,)
):
    """
    Retry decorator with exponential backoff.

    Args:
        config: retry policy; defaults to ``RetryConfig()``.
        exceptions: exception types that trigger a retry; anything else
            propagates immediately.
    """
    if config is None:
        config = RetryConfig()

    # FIX: previously `import random` ran inside the retry loop on every
    # failed attempt; hoist it so it executes once per decoration.
    import random

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None

            for attempt in range(config.max_attempts):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    last_exception = e

                    if attempt == config.max_attempts - 1:
                        # Final attempt failed: log and propagate.
                        logger.error(
                            f"函数 {func.__name__} 经过 {config.max_attempts} 次重试后仍然失败: {str(e)}"
                        )
                        raise e

                    # Exponential backoff, capped at max_delay.
                    delay = min(
                        config.base_delay * (config.exponential_base ** attempt),
                        config.max_delay
                    )

                    # Jitter spreads concurrent retries (thundering-herd guard).
                    if config.jitter:
                        delay = delay * (0.5 + random.random() * 0.5)

                    logger.warning(
                        f"函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, "
                        f"{delay:.2f} 秒后重试"
                    )

                    time.sleep(delay)

            # Defensive: the loop always returns or raises before reaching here.
            if last_exception:
                raise last_exception

        return wrapper
    return decorator
|
||||
|
||||
# Module-wide default breaker shared by with_circuit_breaker(); its state
# (failure count, open/closed) persists across all callers in this process.
_default_circuit_breaker = CircuitBreaker()
|
||||
|
||||
def circuit_breaker(
    failure_threshold: int = 5,
    recovery_timeout: int = 60
):
    """
    Decorator that guards a function with its own CircuitBreaker.

    Args:
        failure_threshold: failures before the breaker opens.
        recovery_timeout: seconds before a half-open probe is allowed.
    """
    def decorator(func: Callable) -> Callable:
        guard = CircuitBreaker(failure_threshold, recovery_timeout)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            return guard.call(func, *args, **kwargs)

        # Expose the breaker instance so callers can inspect its state
        # via wrapper.circuit_breaker.
        wrapper.circuit_breaker = guard
        return wrapper

    return decorator
|
||||
|
||||
def with_circuit_breaker(func: Callable, *args, **kwargs):
    """Run ``func`` through the module-wide default circuit breaker."""
    breaker = _default_circuit_breaker
    return breaker.call(func, *args, **kwargs)
|
||||
|
||||
# Pre-defined retry policies.

# Fast: few attempts with short waits — for cheap, quickly-recovering calls.
RETRY_CONFIG_FAST = RetryConfig(
    max_attempts=3,
    base_delay=0.5,
    max_delay=5.0,
    exponential_base=2.0,
    jitter=True
)

# Slow: more attempts with long waits — for slow-recovering dependencies.
RETRY_CONFIG_SLOW = RetryConfig(
    max_attempts=5,
    base_delay=2.0,
    max_delay=60.0,
    exponential_base=2.0,
    jitter=True
)

# Database: moderate waits capped at 10s, intended for database operations.
RETRY_CONFIG_DB = RetryConfig(
    max_attempts=3,
    base_delay=1.0,
    max_delay=10.0,
    exponential_base=2.0,
    jitter=True
)
|
||||
|
||||
# Database-operation retry decorator.
def retry_db_operation(max_attempts: int = 3):
    """Retry decorator tuned for database operations.

    Args:
        max_attempts: total attempts. BUG FIX: this argument was previously
            accepted but ignored — RETRY_CONFIG_DB's attempt count was always
            used. The default of 3 matches the old effective behavior.
    """
    config = RetryConfig(
        max_attempts=max_attempts,  # honor the caller's setting
        base_delay=1.0,
        max_delay=10.0,
        exponential_base=2.0,
        jitter=True
    )
    return retry(
        config=config,
        exceptions=(Exception,)
    )
|
||||
Reference in New Issue
Block a user