Files
EnterpriseArchitect/services/nvidia_sidecar/metrics.py
T
vincent b18d243ef2 BIZ-46 Phase3: 7项 follow-up 开发完成
1. 架构解耦 — SidecarContext + FastAPI Depends 注入
   - 新增 context.py: SidecarContext dataclass 收敛全部全局状态
   - server.py: 移除模块级全局变量,lifespan 创建 ctx → app.state.sidecar
   - webui.py: 移除反向导入 server,改用 Depends(get_context)

2. Prometheus 标签基数治理 — model_id → provider
   - upstream_latency_seconds / upstream_errors_total label 收敛为 provider
   - 模型级信息保留在 structlog JSON 日志

3. SSE 快照共享缓存
   - 1s TTL 共享 snapshot cache + double-check locking
   - 多客户端不重复构建快照

4. 部署支撑
   - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK)
   - systemd service (安全加固, 资源限制)
   - .env.example (完整环境变量清单)

5. Readiness HTTP Client 复用
   - check_upstream() 注入主 http_client,不再每次创建新 client

6. Retreat 并发回归测试
   - 5 个测试用例全部通过(死锁检测 + 状态转换 + 并发安全)

7. Dashboard UX 优化
   - 队列柱状图 300ms 平滑动画
   - SSE 断连 5s 半透明遮罩
   - 队列图标题显示总排队数
   - 页面加载同步配置

验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes)

Co-authored-by: multica-agent <github@multica.ai>
2026-06-24 22:26:35 +08:00

277 lines
9.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
NVIDIA Sidecar 限流代理 — Prometheus 指标端点 (§3.5)
10 个指标,独立端口 :9191,与代理端口 :9190 分离。
BIZ-46 Phase3: Prometheus 标签基数治理 — model_id label 收敛为 provider。
- upstream_latency_seconds: model_id → provider (固定值 "nvidia", 基数=1)
- upstream_errors_total: model_id → provider
- 模型级信息迁移到 structlog JSON 日志
"""
from __future__ import annotations
import time
import threading
from typing import Any
from prometheus_client import (
CollectorRegistry,
Counter,
Gauge,
Histogram,
generate_latest,
make_asgi_app,
)
class PrometheusMetrics:
"""Sidecar Prometheus 指标收集器。
线程安全,所有公开方法通过 ``threading.Lock`` 保护。
"""
def __init__(self, registry: CollectorRegistry | None = None) -> None:
"""初始化所有 10 个 Prometheus 指标。
Args:
registry: 可选自定义 RegistryNone 则使用默认全局 registry。
"""
self._registry: CollectorRegistry = registry or CollectorRegistry()
self._lock: threading.Lock = threading.Lock()
self._start_time: float = time.time()
# ---- 1. 总请求数(按优先级 + 状态分组) ----
self.requests_total: Counter = Counter(
"sidecar_requests_total",
"Total requests processed by priority and status",
labelnames=["priority", "status"],
registry=self._registry,
)
# ---- 2. 可用令牌数 ----
self.tokens_available: Gauge = Gauge(
"sidecar_tokens_available",
"Current number of available tokens",
registry=self._registry,
)
# ---- 3. 令牌生成速率 ----
self.tokens_rate: Gauge = Gauge(
"sidecar_tokens_rate",
"Current token generation rate (tokens per minute)",
registry=self._registry,
)
# ---- 4. 各优先级队列深度 ----
self.queue_depth: Gauge = Gauge(
"sidecar_queue_depth",
"Queue depth by priority",
labelnames=["priority"],
registry=self._registry,
)
# ---- 5. 队列等待时间 Histogram ----
self.queue_latency_seconds: Histogram = Histogram(
"sidecar_queue_latency_seconds",
"Request wait time in queue in seconds",
labelnames=["priority"],
buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
registry=self._registry,
)
# ---- 6. 上游响应延迟 Histogramlabel 收敛: model_id → provider ----
self.upstream_latency_seconds: Histogram = Histogram(
"sidecar_upstream_latency_seconds",
"Upstream response latency in seconds",
labelnames=["provider"], # BIZ-46: was ["model_id"], converged to fixed-cardinality provider
buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0),
registry=self._registry,
)
# ---- 7. 上游错误计数(label 收敛: model_id → provider ----
self.upstream_errors_total: Counter = Counter(
"sidecar_upstream_errors_total",
"Upstream error count by status code and provider",
labelnames=["status_code", "provider"], # BIZ-46: was ["model_id"], converged
registry=self._registry,
)
# ---- 8. 降级直通次数 ----
self.fallback_passthrough_total: Counter = Counter(
"sidecar_fallback_passthrough_total",
"Total fallback / passthrough events (queue full or sidecar unavailable)",
registry=self._registry,
)
# ---- 9. 健康状态 ----
self.health_status: Gauge = Gauge(
"sidecar_health_status",
"Sidecar health: 0=unhealthy, 1=healthy",
registry=self._registry,
)
# ---- 10. 运行时长 ----
self.uptime_seconds: Gauge = Gauge(
"sidecar_uptime_seconds",
"Process uptime in seconds",
registry=self._registry,
)
# 避退模式指标(附加,不计入基础 10 个)
self.retreat_state: Gauge = Gauge(
"sidecar_retreat_state",
"Adaptive retreat state: 0=NORMAL, 1=RETREAT, 2=RECOVER",
registry=self._registry,
)
self.effective_rate_rpm: Gauge = Gauge(
"sidecar_effective_rate_rpm",
"Current effective rate in RPM (after retreat adjustments)",
registry=self._registry,
)
self.upstream_429_rate: Gauge = Gauge(
"sidecar_upstream_429_rate",
"Upstream 429 rate over the retreat observation window (0.0-1.0)",
registry=self._registry,
)
# 初始化
self.health_status.set(1)
# ---- ASGI app 生成 ----
def build_asgi_app(self) -> Any:
"""生成 Prometheus ASGI 应用,挂载到独立端口。
Returns:
可传给 uvicorn 的 ASGI app。
"""
return make_asgi_app(registry=self._registry)
# ---- 指标记录方法 ----
def record_request(self, priority: str, status: str) -> None:
"""记录一次请求。
Args:
priority: 优先级名(URGENT / HIGH / NORMAL / LOW)。
status: 状态(success / ratelimited / error)。
"""
with self._lock:
self.requests_total.labels(priority=priority, status=status).inc()
def record_queue_latency(self, priority: str, seconds: float) -> None:
"""记录排队延迟。
Args:
priority: 优先级名。
seconds: 排队等待秒数。
"""
with self._lock:
self.queue_latency_seconds.labels(priority=priority).observe(seconds)
def record_upstream(self, status_code: int, provider: str) -> None:
"""记录上游响应(label 收敛: provider 替代 model_idBIZ-46 Phase3)。
Args:
status_code: HTTP 状态码。
provider: 上游提供商标识(固定 "nvidia")。
"""
with self._lock:
self.upstream_latency_seconds.labels(provider=provider).observe(0.0)
def record_upstream_error(self, status_code: int, provider: str) -> None:
"""记录上游错误(label 收敛: provider 替代 model_idBIZ-46 Phase3)。
Args:
status_code: 错误 HTTP 状态码。
provider: 上游提供商标识(固定 "nvidia")。
"""
with self._lock:
self.upstream_errors_total.labels(
status_code=str(status_code), provider=provider
).inc()
def record_upstream_latency(self, provider: str, seconds: float) -> None:
"""记录上游响应延迟(label 收敛: provider 替代 model_idBIZ-46 Phase3)。
Args:
provider: 上游提供商标识(固定 "nvidia")。
seconds: 响应延迟秒数。
"""
with self._lock:
self.upstream_latency_seconds.labels(provider=provider).observe(seconds)
def update_token_status(self, tokens: float, rate_per_minute: float) -> None:
"""更新令牌桶状态。
Args:
tokens: 当前可用令牌数。
rate_per_minute: 每分钟速率。
"""
with self._lock:
self.tokens_available.set(tokens)
self.tokens_rate.set(rate_per_minute)
def update_queue_depth(self, depths: dict[str, int]) -> None:
"""更新各优先级队列深度。
Args:
depths: {priority_name: count} 映射。
"""
with self._lock:
# 先清零所有已知标签再设置,避免残留旧值
for pri in ("URGENT", "HIGH", "NORMAL", "LOW"):
self.queue_depth.labels(priority=pri).set(depths.get(pri, 0))
def increment_fallback(self) -> None:
"""降级直通计数 +1。"""
with self._lock:
self.fallback_passthrough_total.inc()
def set_health(self, healthy: bool) -> None:
"""设置健康状态。
Args:
healthy: True=健康, False=不健康。
"""
with self._lock:
self.health_status.set(1 if healthy else 0)
def update_uptime(self) -> None:
"""更新运行时长。"""
with self._lock:
self.uptime_seconds.set(time.time() - self._start_time)
# ---- 避退模式指标 ----
def update_retreat_metrics(
self,
retreat_state: str,
effective_rate_rpm: float,
upstream_429_rate: float,
) -> None:
"""更新避退模式指标。
Args:
retreat_state: "normal" / "retreat" / "recover".
effective_rate_rpm: 当前实际速率 (RPM)。
upstream_429_rate: 上游 429 率 (0.0-1.0)。
"""
state_map: dict[str, int] = {"normal": 0, "retreat": 1, "recover": 2}
with self._lock:
self.retreat_state.set(state_map.get(retreat_state, 0))
self.effective_rate_rpm.set(effective_rate_rpm)
self.upstream_429_rate.set(upstream_429_rate)
# ---- 导出 ----
def generate_latest(self) -> bytes:
"""生成 Prometheus 文本格式的指标数据。
Returns:
Prometheus 文本格式 bytes。
"""
with self._lock:
self.update_uptime()
return generate_latest(self._registry)