BIZ-42: Phase2 可观测性+WebUI+避退模式 — metrics/health/webui/dashboard/adaptive

新增文件:
- metrics.py: Prometheus 指标端点 (:9191), 10+3 个指标
- health.py: /health (liveness) + /health/ready (readiness)
- webui.py: WebUI 后端 API (SSE 实时推送 + 配置热重载)
- static/dashboard.html: 仪表盘前端 (Chart.js, 令牌桶仪表+队列柱状图+吞吐折线图)

更新文件:
- rate_limiter.py: 增加 AdaptiveTokenBucket 避退模式 (ADR-009)
  状态机 NORMAL→RETREAT→RECOVER, 429 率滑动窗口监控
- server.py: structlog 结构化日志 + 避退反馈回路
  挂载 metrics_server (:9191) + health/ready + webui + /status
- pyproject.toml: 增加 prometheus-client, pydantic, types-PyYAML 依赖

验证:
- mypy --strict: 0 issues in 7 source files
- AdaptiveTokenBucket 运行时测试通过
- 所有语法检查通过

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-24 11:54:02 +08:00
parent 205381c4ff
commit e829a4060b
8 changed files with 1235 additions and 19 deletions
+239 -1
View File
@@ -197,4 +197,242 @@ class TokenBucket:
@property
def capacity(self) -> int:
"""桶容量。"""
return self._capacity
return self._capacity
# ---- 动态速率调整(供 AdaptiveTokenBucket 使用) ----
def set_rate(self, rate: float) -> None:
"""动态调整令牌补充速率(令牌/秒)。
Args:
rate: 新速率(令牌/秒)。
"""
with self._lock:
self._refill() # 先补充现有令牌再切换速率
self._rate = float(rate)
# ---------------------------------------------------------------------------
# 避退模式:AdaptiveTokenBucket (§ADR-009)
# ---------------------------------------------------------------------------
class RetreatState:
"""避退状态机常量。"""
NORMAL: str = "normal"
RETREAT: str = "retreat"
RECOVER: str = "recover"
class AdaptiveTokenBucket(TokenBucket):
"""自适应避退令牌桶(ADR-009)。
监控上游 429 率(60s 滑动窗口),自动调整发射速率:
- 429 率 < 5% → NORMAL,保持基准速率
- 429 率 5-10% → RETREAT,速率 × 0.75
- 429 率 10-20% → RETREAT,再次降速
- 429 率 > 20% → RETREAT,最低 5 RPM + 告警
- 连续 120s 429 率 < 2% → RECOVER,逐步 +2 RPM 恢复
线程安全,继承 TokenBucket 的所有公共接口。
"""
# ADR-009 参数(可通过构造函数覆盖)
RETREAT_WINDOW_SECONDS: float = 60.0
RETREAT_429_THRESHOLD: float = 0.05
RETREAT_FACTOR: float = 0.75
RETREAT_MIN_RPM: float = 5.0
RECOVER_WINDOW_SECONDS: float = 120.0
RECOVER_429_THRESHOLD: float = 0.02
RECOVER_INCREMENT_RPM: float = 2.0
def __init__(
self,
rate: float = 40 / 60,
capacity: int = 40,
*,
retreat_window_seconds: float = 60.0,
retreat_429_threshold: float = 0.05,
retreat_factor: float = 0.75,
retreat_min_rpm: float = 5.0,
recover_window_seconds: float = 120.0,
recover_429_threshold: float = 0.02,
recover_increment_rpm: float = 2.0,
) -> None:
"""初始化自适应避退令牌桶。
Args:
rate: 基准令牌补充速率(令牌/秒)。默认 40/60 ≈ 0.667 token/s。
capacity: 桶最大容量。默认 40。
retreat_window_seconds: 429 率滑动窗口大小(秒)。
retreat_429_threshold: 触发避退的 429 率阈值。
retreat_factor: 每次避退速率乘数。
retreat_min_rpm: 避退最低 RPM。
recover_window_seconds: 恢复观察窗口大小(秒)。
recover_429_threshold: 触发恢复的 429 率阈值。
recover_increment_rpm: 每次恢复增加的 RPM。
"""
super().__init__(rate=rate, capacity=capacity)
# 基准速率(不变)
self._base_rate: float = float(rate)
# 避退参数
self.RETREAT_WINDOW_SECONDS = retreat_window_seconds
self.RETREAT_429_THRESHOLD = retreat_429_threshold
self.RETREAT_FACTOR = retreat_factor
self.RETREAT_MIN_RPM = retreat_min_rpm
self.RECOVER_WINDOW_SECONDS = recover_window_seconds
self.RECOVER_429_THRESHOLD = recover_429_threshold
self.RECOVER_INCREMENT_RPM = recover_increment_rpm
# 避退状态机
self._retreat_state: str = RetreatState.NORMAL
# 429 滑动窗口:[(timestamp, is_429), ...]
self._429_window: list[tuple[float, bool]] = []
# 上次状态变更时间
self._last_state_change: float = time.monotonic()
# 避退状态锁
self._retreat_lock: threading.Lock = threading.Lock()
# ---- 429 反馈 ----
def record_response(self, is_429: bool) -> None:
"""记录一次上游响应是否为 429。
Args:
is_429: True 表示上游返回了 429。
"""
now = time.monotonic()
with self._retreat_lock:
self._429_window.append((now, is_429))
# 清理超出观察窗口的旧记录
cutoff = now - max(
self.RETREAT_WINDOW_SECONDS,
self.RECOVER_WINDOW_SECONDS,
)
self._429_window = [
(ts, flag) for ts, flag in self._429_window
if ts >= cutoff
]
def get_429_rate(self, window_seconds: float | None = None) -> float:
"""获取指定窗口内的 429 率。
Args:
window_seconds: 滑动窗口大小;None 使用 RETREAT_WINDOW_SECONDS。
Returns:
0.0-1.0 之间的 429 率。
"""
ws = window_seconds or self.RETREAT_WINDOW_SECONDS
now = time.monotonic()
with self._retreat_lock:
in_window = [flag for ts, flag in self._429_window if now - ts <= ws]
if not in_window:
return 0.0
return sum(1 for f in in_window if f) / len(in_window)
# ---- 避退状态评估 ----
def evaluate_retreat(self) -> str:
"""评估并更新避退状态,返回新状态名。
每次调用根据当前 429 率 + 持续时间决定是否进入 RETREAT / RECOVER。
Returns:
"normal" / "retreat" / "recover"
"""
now = time.monotonic()
with self._retreat_lock:
retreat_rate = self.get_429_rate(self.RETREAT_WINDOW_SECONDS)
recover_rate = self.get_429_rate(self.RECOVER_WINDOW_SECONDS)
if self._retreat_state == RetreatState.NORMAL:
if retreat_rate >= self.RETREAT_429_THRESHOLD:
self._retreat_state = RetreatState.RETREAT
self._last_state_change = now
self._apply_retreat()
elif self._retreat_state == RetreatState.RETREAT:
# 持续高 429 率 → 再次降速
if retreat_rate >= self.RETREAT_429_THRESHOLD * 2:
# 429 > 10%,再次降速
if self._rate > self.RETREAT_MIN_RPM / 60.0:
self._apply_retreat()
elif recover_rate < self.RECOVER_429_THRESHOLD:
time_in_low = now - self._last_state_change
if time_in_low >= self.RECOVER_WINDOW_SECONDS:
self._retreat_state = RetreatState.RECOVER
self._last_state_change = now
self._apply_recover()
elif self._retreat_state == RetreatState.RECOVER:
if retreat_rate >= self.RETREAT_429_THRESHOLD:
# 恢复期间 429 回升,重新进入避退
self._retreat_state = RetreatState.RETREAT
self._last_state_change = now
self._apply_retreat()
elif self._rate >= self._base_rate:
# 已恢复到基准速率
self._rate = self._base_rate
self._retreat_state = RetreatState.NORMAL
self._last_state_change = now
else:
# 继续逐步恢复
self._apply_recover()
return self._retreat_state
def _apply_retreat(self) -> None:
"""执行一次避退降速。"""
new_rate: float = max(
self.RETREAT_MIN_RPM / 60.0,
self._rate * self.RETREAT_FACTOR,
)
self._rate = new_rate
def _apply_recover(self) -> None:
"""执行一次恢复提速。"""
increment: float = self.RECOVER_INCREMENT_RPM / 60.0
new_rate: float = min(self._base_rate, self._rate + increment)
self._rate = new_rate
# ---- 状态查询 ----
def get_retreat_state(self) -> str:
"""获取当前避退状态。
Returns:
"normal" / "retreat" / "recover"
"""
with self._retreat_lock:
return self._retreat_state
def get_effective_rate_rpm(self) -> float:
"""获取当前实际速率(RPM),考虑避退乘数。
Returns:
当前每分钟速率。
"""
with self._lock:
return self._rate * 60.0
def get_base_rate_rpm(self) -> float:
"""获取基准速率(RPM),即未避退时的速率。
Returns:
基准每分钟速率。
"""
return self._base_rate * 60.0
def reset_to_base(self) -> None:
"""手动重置到基准速率(用于运维干预)。"""
with self._retreat_lock:
self._rate = self._base_rate
self._retreat_state = RetreatState.NORMAL
self._last_state_change = time.monotonic()
self._429_window.clear()