b18d243ef2
1. 架构解耦 — SidecarContext + FastAPI Depends 注入 - 新增 context.py: SidecarContext dataclass 收敛全部全局状态 - server.py: 移除模块级全局变量,lifespan 创建 ctx → app.state.sidecar - webui.py: 移除反向导入 server,改用 Depends(get_context) 2. Prometheus 标签基数治理 — model_id → provider - upstream_latency_seconds / upstream_errors_total label 收敛为 provider - 模型级信息保留在 structlog JSON 日志 3. SSE 快照共享缓存 - 1s TTL 共享 snapshot cache + double-check locking - 多客户端不重复构建快照 4. 部署支撑 - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK) - systemd service (安全加固, 资源限制) - .env.example (完整环境变量清单) 5. Readiness HTTP Client 复用 - check_upstream() 注入主 http_client,不再每次创建新 client 6. Retreat 并发回归测试 - 5 个测试用例全部通过(死锁检测 + 状态转换 + 并发安全) 7. Dashboard UX 优化 - 队列柱状图 300ms 平滑动画 - SSE 断连 5s 半透明遮罩 - 队列图标题显示总排队数 - 页面加载同步配置 验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes) Co-authored-by: multica-agent <github@multica.ai>
277 lines
9.4 KiB
Python
277 lines
9.4 KiB
Python
"""
|
||
NVIDIA Sidecar 限流代理 — Prometheus 指标端点 (§3.5)
|
||
|
||
10 个指标,独立端口 :9191,与代理端口 :9190 分离。
|
||
|
||
BIZ-46 Phase3: Prometheus 标签基数治理 — model_id label 收敛为 provider。
|
||
- upstream_latency_seconds: model_id → provider (固定值 "nvidia", 基数=1)
|
||
- upstream_errors_total: model_id → provider
|
||
- 模型级信息迁移到 structlog JSON 日志
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
import threading
|
||
from typing import Any
|
||
|
||
from prometheus_client import (
|
||
CollectorRegistry,
|
||
Counter,
|
||
Gauge,
|
||
Histogram,
|
||
generate_latest,
|
||
make_asgi_app,
|
||
)
|
||
|
||
|
||
class PrometheusMetrics:
|
||
"""Sidecar Prometheus 指标收集器。
|
||
|
||
线程安全,所有公开方法通过 ``threading.Lock`` 保护。
|
||
"""
|
||
|
||
def __init__(self, registry: CollectorRegistry | None = None) -> None:
|
||
"""初始化所有 10 个 Prometheus 指标。
|
||
|
||
Args:
|
||
registry: 可选自定义 Registry;None 则使用默认全局 registry。
|
||
"""
|
||
self._registry: CollectorRegistry = registry or CollectorRegistry()
|
||
self._lock: threading.Lock = threading.Lock()
|
||
self._start_time: float = time.time()
|
||
|
||
# ---- 1. 总请求数(按优先级 + 状态分组) ----
|
||
self.requests_total: Counter = Counter(
|
||
"sidecar_requests_total",
|
||
"Total requests processed by priority and status",
|
||
labelnames=["priority", "status"],
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 2. 可用令牌数 ----
|
||
self.tokens_available: Gauge = Gauge(
|
||
"sidecar_tokens_available",
|
||
"Current number of available tokens",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 3. 令牌生成速率 ----
|
||
self.tokens_rate: Gauge = Gauge(
|
||
"sidecar_tokens_rate",
|
||
"Current token generation rate (tokens per minute)",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 4. 各优先级队列深度 ----
|
||
self.queue_depth: Gauge = Gauge(
|
||
"sidecar_queue_depth",
|
||
"Queue depth by priority",
|
||
labelnames=["priority"],
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 5. 队列等待时间 Histogram ----
|
||
self.queue_latency_seconds: Histogram = Histogram(
|
||
"sidecar_queue_latency_seconds",
|
||
"Request wait time in queue in seconds",
|
||
labelnames=["priority"],
|
||
buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 6. 上游响应延迟 Histogram(label 收敛: model_id → provider) ----
|
||
self.upstream_latency_seconds: Histogram = Histogram(
|
||
"sidecar_upstream_latency_seconds",
|
||
"Upstream response latency in seconds",
|
||
labelnames=["provider"], # BIZ-46: was ["model_id"], converged to fixed-cardinality provider
|
||
buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0),
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 7. 上游错误计数(label 收敛: model_id → provider) ----
|
||
self.upstream_errors_total: Counter = Counter(
|
||
"sidecar_upstream_errors_total",
|
||
"Upstream error count by status code and provider",
|
||
labelnames=["status_code", "provider"], # BIZ-46: was ["model_id"], converged
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 8. 降级直通次数 ----
|
||
self.fallback_passthrough_total: Counter = Counter(
|
||
"sidecar_fallback_passthrough_total",
|
||
"Total fallback / passthrough events (queue full or sidecar unavailable)",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 9. 健康状态 ----
|
||
self.health_status: Gauge = Gauge(
|
||
"sidecar_health_status",
|
||
"Sidecar health: 0=unhealthy, 1=healthy",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# ---- 10. 运行时长 ----
|
||
self.uptime_seconds: Gauge = Gauge(
|
||
"sidecar_uptime_seconds",
|
||
"Process uptime in seconds",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# 避退模式指标(附加,不计入基础 10 个)
|
||
self.retreat_state: Gauge = Gauge(
|
||
"sidecar_retreat_state",
|
||
"Adaptive retreat state: 0=NORMAL, 1=RETREAT, 2=RECOVER",
|
||
registry=self._registry,
|
||
)
|
||
self.effective_rate_rpm: Gauge = Gauge(
|
||
"sidecar_effective_rate_rpm",
|
||
"Current effective rate in RPM (after retreat adjustments)",
|
||
registry=self._registry,
|
||
)
|
||
self.upstream_429_rate: Gauge = Gauge(
|
||
"sidecar_upstream_429_rate",
|
||
"Upstream 429 rate over the retreat observation window (0.0-1.0)",
|
||
registry=self._registry,
|
||
)
|
||
|
||
# 初始化
|
||
self.health_status.set(1)
|
||
|
||
# ---- ASGI app 生成 ----
|
||
|
||
def build_asgi_app(self) -> Any:
|
||
"""生成 Prometheus ASGI 应用,挂载到独立端口。
|
||
|
||
Returns:
|
||
可传给 uvicorn 的 ASGI app。
|
||
"""
|
||
return make_asgi_app(registry=self._registry)
|
||
|
||
# ---- 指标记录方法 ----
|
||
|
||
def record_request(self, priority: str, status: str) -> None:
|
||
"""记录一次请求。
|
||
|
||
Args:
|
||
priority: 优先级名(URGENT / HIGH / NORMAL / LOW)。
|
||
status: 状态(success / ratelimited / error)。
|
||
"""
|
||
with self._lock:
|
||
self.requests_total.labels(priority=priority, status=status).inc()
|
||
|
||
def record_queue_latency(self, priority: str, seconds: float) -> None:
|
||
"""记录排队延迟。
|
||
|
||
Args:
|
||
priority: 优先级名。
|
||
seconds: 排队等待秒数。
|
||
"""
|
||
with self._lock:
|
||
self.queue_latency_seconds.labels(priority=priority).observe(seconds)
|
||
|
||
def record_upstream(self, status_code: int, provider: str) -> None:
|
||
"""记录上游响应(label 收敛: provider 替代 model_id,BIZ-46 Phase3)。
|
||
|
||
Args:
|
||
status_code: HTTP 状态码。
|
||
provider: 上游提供商标识(固定 "nvidia")。
|
||
"""
|
||
with self._lock:
|
||
self.upstream_latency_seconds.labels(provider=provider).observe(0.0)
|
||
|
||
def record_upstream_error(self, status_code: int, provider: str) -> None:
|
||
"""记录上游错误(label 收敛: provider 替代 model_id,BIZ-46 Phase3)。
|
||
|
||
Args:
|
||
status_code: 错误 HTTP 状态码。
|
||
provider: 上游提供商标识(固定 "nvidia")。
|
||
"""
|
||
with self._lock:
|
||
self.upstream_errors_total.labels(
|
||
status_code=str(status_code), provider=provider
|
||
).inc()
|
||
|
||
def record_upstream_latency(self, provider: str, seconds: float) -> None:
|
||
"""记录上游响应延迟(label 收敛: provider 替代 model_id,BIZ-46 Phase3)。
|
||
|
||
Args:
|
||
provider: 上游提供商标识(固定 "nvidia")。
|
||
seconds: 响应延迟秒数。
|
||
"""
|
||
with self._lock:
|
||
self.upstream_latency_seconds.labels(provider=provider).observe(seconds)
|
||
|
||
def update_token_status(self, tokens: float, rate_per_minute: float) -> None:
|
||
"""更新令牌桶状态。
|
||
|
||
Args:
|
||
tokens: 当前可用令牌数。
|
||
rate_per_minute: 每分钟速率。
|
||
"""
|
||
with self._lock:
|
||
self.tokens_available.set(tokens)
|
||
self.tokens_rate.set(rate_per_minute)
|
||
|
||
def update_queue_depth(self, depths: dict[str, int]) -> None:
|
||
"""更新各优先级队列深度。
|
||
|
||
Args:
|
||
depths: {priority_name: count} 映射。
|
||
"""
|
||
with self._lock:
|
||
# 先清零所有已知标签再设置,避免残留旧值
|
||
for pri in ("URGENT", "HIGH", "NORMAL", "LOW"):
|
||
self.queue_depth.labels(priority=pri).set(depths.get(pri, 0))
|
||
|
||
def increment_fallback(self) -> None:
|
||
"""降级直通计数 +1。"""
|
||
with self._lock:
|
||
self.fallback_passthrough_total.inc()
|
||
|
||
def set_health(self, healthy: bool) -> None:
|
||
"""设置健康状态。
|
||
|
||
Args:
|
||
healthy: True=健康, False=不健康。
|
||
"""
|
||
with self._lock:
|
||
self.health_status.set(1 if healthy else 0)
|
||
|
||
def update_uptime(self) -> None:
|
||
"""更新运行时长。"""
|
||
with self._lock:
|
||
self.uptime_seconds.set(time.time() - self._start_time)
|
||
|
||
# ---- 避退模式指标 ----
|
||
|
||
def update_retreat_metrics(
|
||
self,
|
||
retreat_state: str,
|
||
effective_rate_rpm: float,
|
||
upstream_429_rate: float,
|
||
) -> None:
|
||
"""更新避退模式指标。
|
||
|
||
Args:
|
||
retreat_state: "normal" / "retreat" / "recover".
|
||
effective_rate_rpm: 当前实际速率 (RPM)。
|
||
upstream_429_rate: 上游 429 率 (0.0-1.0)。
|
||
"""
|
||
state_map: dict[str, int] = {"normal": 0, "retreat": 1, "recover": 2}
|
||
with self._lock:
|
||
self.retreat_state.set(state_map.get(retreat_state, 0))
|
||
self.effective_rate_rpm.set(effective_rate_rpm)
|
||
self.upstream_429_rate.set(upstream_429_rate)
|
||
|
||
# ---- 导出 ----
|
||
|
||
def generate_latest(self) -> bytes:
|
||
"""生成 Prometheus 文本格式的指标数据。
|
||
|
||
Returns:
|
||
Prometheus 文本格式 bytes。
|
||
"""
|
||
with self._lock:
|
||
self.update_uptime()
|
||
return generate_latest(self._registry) |