Files
EnterpriseArchitect/services/nvidia_sidecar/metrics.py
T
vincent e829a4060b BIZ-42: Phase2 可观测性+WebUI+避退模式 — metrics/health/webui/dashboard/adaptive
新增文件:
- metrics.py: Prometheus 指标端点 (:9191), 10+3 个指标
- health.py: /health (liveness) + /health/ready (readiness)
- webui.py: WebUI 后端 API (SSE 实时推送 + 配置热重载)
- static/dashboard.html: 仪表盘前端 (Chart.js, 令牌桶仪表+队列柱状图+吞吐折线图)

更新文件:
- rate_limiter.py: 增加 AdaptiveTokenBucket 避退模式 (ADR-009)
  状态机 NORMAL→RETREAT→RECOVER, 429 率滑动窗口监控
- server.py: structlog 结构化日志 + 避退反馈回路
  挂载 metrics_server (:9191) + health/ready + webui + /status
- pyproject.toml: 增加 prometheus-client, pydantic, types-PyYAML 依赖

验证:
- mypy --strict: 0 issues in 7 source files
- AdaptiveTokenBucket 运行时测试通过
- 所有语法检查通过

Co-authored-by: multica-agent <github@multica.ai>
2026-06-24 11:54:02 +08:00

272 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
NVIDIA Sidecar 限流代理 — Prometheus 指标端点 (§3.5)
10 个指标,独立端口 :9191,与代理端口 :9190 分离。
"""
from __future__ import annotations
import time
import threading
from typing import Any
from prometheus_client import (
CollectorRegistry,
Counter,
Gauge,
Histogram,
generate_latest,
make_asgi_app,
)
class PrometheusMetrics:
    """Prometheus metrics collector for the sidecar.

    Every public recording/updating method takes ``self._lock`` around its
    metric write. The lock is a plain (non-reentrant) ``threading.Lock``, so
    a method must never call another locking method while holding it; shared
    work goes through lock-free ``_..._locked`` helpers instead.
    """

    def __init__(self, registry: CollectorRegistry | None = None) -> None:
        """Create the 10 base metrics plus the 3 adaptive-retreat metrics.

        Args:
            registry: Optional custom registry. When ``None`` a new, private
                ``CollectorRegistry`` is created; the process-wide default
                registry is NOT used.
        """
        # Explicit ``is None`` check so the fallback never depends on the
        # truthiness of a caller-supplied registry object.
        self._registry: CollectorRegistry = (
            registry if registry is not None else CollectorRegistry()
        )
        self._lock: threading.Lock = threading.Lock()
        self._start_time: float = time.time()

        # ---- 1. Total requests (by priority + status) ----
        self.requests_total: Counter = Counter(
            "sidecar_requests_total",
            "Total requests processed by priority and status",
            labelnames=["priority", "status"],
            registry=self._registry,
        )
        # ---- 2. Available tokens ----
        self.tokens_available: Gauge = Gauge(
            "sidecar_tokens_available",
            "Current number of available tokens",
            registry=self._registry,
        )
        # ---- 3. Token generation rate ----
        self.tokens_rate: Gauge = Gauge(
            "sidecar_tokens_rate",
            "Current token generation rate (tokens per minute)",
            registry=self._registry,
        )
        # ---- 4. Queue depth per priority ----
        self.queue_depth: Gauge = Gauge(
            "sidecar_queue_depth",
            "Queue depth by priority",
            labelnames=["priority"],
            registry=self._registry,
        )
        # ---- 5. Queue wait time histogram ----
        self.queue_latency_seconds: Histogram = Histogram(
            "sidecar_queue_latency_seconds",
            "Request wait time in queue in seconds",
            labelnames=["priority"],
            buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
            registry=self._registry,
        )
        # ---- 6. Upstream response latency histogram ----
        self.upstream_latency_seconds: Histogram = Histogram(
            "sidecar_upstream_latency_seconds",
            "Upstream response latency in seconds",
            labelnames=["model_id"],
            buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0),
            registry=self._registry,
        )
        # ---- 7. Upstream error count ----
        self.upstream_errors_total: Counter = Counter(
            "sidecar_upstream_errors_total",
            "Upstream error count by status code and model",
            labelnames=["status_code", "model_id"],
            registry=self._registry,
        )
        # ---- 8. Fallback / passthrough count ----
        self.fallback_passthrough_total: Counter = Counter(
            "sidecar_fallback_passthrough_total",
            "Total fallback / passthrough events (queue full or sidecar unavailable)",
            registry=self._registry,
        )
        # ---- 9. Health status ----
        self.health_status: Gauge = Gauge(
            "sidecar_health_status",
            "Sidecar health: 0=unhealthy, 1=healthy",
            registry=self._registry,
        )
        # ---- 10. Uptime ----
        self.uptime_seconds: Gauge = Gauge(
            "sidecar_uptime_seconds",
            "Process uptime in seconds",
            registry=self._registry,
        )
        # Adaptive-retreat metrics (extra; not part of the 10 base metrics).
        self.retreat_state: Gauge = Gauge(
            "sidecar_retreat_state",
            "Adaptive retreat state: 0=NORMAL, 1=RETREAT, 2=RECOVER",
            registry=self._registry,
        )
        self.effective_rate_rpm: Gauge = Gauge(
            "sidecar_effective_rate_rpm",
            "Current effective rate in RPM (after retreat adjustments)",
            registry=self._registry,
        )
        self.upstream_429_rate: Gauge = Gauge(
            "sidecar_upstream_429_rate",
            "Upstream 429 rate over the retreat observation window (0.0-1.0)",
            registry=self._registry,
        )
        # The sidecar reports itself healthy from construction onwards.
        self.health_status.set(1)

    # ---- ASGI app ----
    def build_asgi_app(self) -> Any:
        """Build the Prometheus ASGI application for this registry.

        NOTE(review): the app is handed only the registry, so scrapes served
        through it presumably do not pass through ``self.generate_latest()``;
        ``sidecar_uptime_seconds`` would then only be as fresh as the last
        ``update_uptime()`` call — confirm something refreshes it periodically.

        Returns:
            An ASGI app that can be passed to uvicorn.
        """
        return make_asgi_app(registry=self._registry)

    # ---- Recording methods ----
    def record_request(self, priority: str, status: str) -> None:
        """Count one processed request.

        Args:
            priority: Priority name (URGENT / HIGH / NORMAL / LOW).
            status: Outcome (success / ratelimited / error).
        """
        with self._lock:
            self.requests_total.labels(priority=priority, status=status).inc()

    def record_queue_latency(self, priority: str, seconds: float) -> None:
        """Observe how long a request waited in the queue.

        Args:
            priority: Priority name.
            seconds: Time spent queued, in seconds.
        """
        with self._lock:
            self.queue_latency_seconds.labels(priority=priority).observe(seconds)

    def record_upstream(self, status_code: int, model_id: str) -> None:
        """Record that an upstream response was received.

        NOTE(review): ``status_code`` is currently unused and a fixed 0.0 s
        sample is observed, which adds a zero-latency data point to the
        upstream latency histogram. Behavior is kept unchanged for existing
        callers; use ``record_upstream_latency`` when a measured latency is
        available — confirm whether this method should record anything else.

        Args:
            status_code: HTTP status code (unused).
            model_id: Model identifier.
        """
        with self._lock:
            self.upstream_latency_seconds.labels(model_id=model_id).observe(0.0)

    def record_upstream_error(self, status_code: int, model_id: str) -> None:
        """Count one upstream error.

        Args:
            status_code: HTTP status code of the error (stored as a string label).
            model_id: Model identifier.
        """
        with self._lock:
            self.upstream_errors_total.labels(
                status_code=str(status_code), model_id=model_id
            ).inc()

    def record_upstream_latency(self, model_id: str, seconds: float) -> None:
        """Observe an upstream response latency.

        Args:
            model_id: Model identifier.
            seconds: Response latency in seconds.
        """
        with self._lock:
            self.upstream_latency_seconds.labels(model_id=model_id).observe(seconds)

    def update_token_status(self, tokens: float, rate_per_minute: float) -> None:
        """Publish the token bucket state.

        Args:
            tokens: Tokens currently available.
            rate_per_minute: Token generation rate per minute.
        """
        with self._lock:
            self.tokens_available.set(tokens)
            self.tokens_rate.set(rate_per_minute)

    def update_queue_depth(self, depths: dict[str, int]) -> None:
        """Publish the queue depth of each known priority.

        Args:
            depths: ``{priority_name: count}`` mapping.
        """
        with self._lock:
            # Always write all four known priorities so one that is missing
            # from ``depths`` drops to 0 instead of keeping a stale value.
            # Keys in ``depths`` outside this set are ignored.
            for pri in ("URGENT", "HIGH", "NORMAL", "LOW"):
                self.queue_depth.labels(priority=pri).set(depths.get(pri, 0))

    def increment_fallback(self) -> None:
        """Increment the fallback / passthrough counter by one."""
        with self._lock:
            self.fallback_passthrough_total.inc()

    def set_health(self, healthy: bool) -> None:
        """Publish the health status.

        Args:
            healthy: True sets the gauge to 1, False sets it to 0.
        """
        with self._lock:
            self.health_status.set(1 if healthy else 0)

    def update_uptime(self) -> None:
        """Refresh the uptime gauge from the wall clock."""
        with self._lock:
            self._refresh_uptime_locked()

    def _refresh_uptime_locked(self) -> None:
        """Write the current uptime; the caller must already hold ``self._lock``."""
        self.uptime_seconds.set(time.time() - self._start_time)

    # ---- Adaptive-retreat metrics ----
    def update_retreat_metrics(
        self,
        retreat_state: str,
        effective_rate_rpm: float,
        upstream_429_rate: float,
    ) -> None:
        """Publish the adaptive-retreat metrics.

        Args:
            retreat_state: "normal" / "retreat" / "recover". Any other value
                is reported as 0 (the NORMAL code).
            effective_rate_rpm: Current effective rate (RPM).
            upstream_429_rate: Upstream 429 rate (0.0-1.0).
        """
        state_map: dict[str, int] = {"normal": 0, "retreat": 1, "recover": 2}
        with self._lock:
            self.retreat_state.set(state_map.get(retreat_state, 0))
            self.effective_rate_rpm.set(effective_rate_rpm)
            self.upstream_429_rate.set(upstream_429_rate)

    # ---- Export ----
    def generate_latest(self) -> bytes:
        """Render the registry in the Prometheus text exposition format.

        The uptime gauge is refreshed first so the output is current.

        Returns:
            The metrics as Prometheus text-format bytes.
        """
        with self._lock:
            # Must not call update_uptime() here: it takes self._lock again
            # and threading.Lock is non-reentrant, which would deadlock.
            self._refresh_uptime_locked()
            # Resolves to the module-level prometheus_client function, not
            # this method (method names are not in function scope).
            return generate_latest(self._registry)