""" NVIDIA Sidecar 限流代理 — Prometheus 指标端点 (§3.5) 10 个指标,独立端口 :9191,与代理端口 :9190 分离。 """ from __future__ import annotations import time import threading from typing import Any from prometheus_client import ( CollectorRegistry, Counter, Gauge, Histogram, generate_latest, make_asgi_app, ) class PrometheusMetrics: """Sidecar Prometheus 指标收集器。 线程安全,所有公开方法通过 ``threading.Lock`` 保护。 """ def __init__(self, registry: CollectorRegistry | None = None) -> None: """初始化所有 10 个 Prometheus 指标。 Args: registry: 可选自定义 Registry;None 则使用默认全局 registry。 """ self._registry: CollectorRegistry = registry or CollectorRegistry() self._lock: threading.Lock = threading.Lock() self._start_time: float = time.time() # ---- 1. 总请求数(按优先级 + 状态分组) ---- self.requests_total: Counter = Counter( "sidecar_requests_total", "Total requests processed by priority and status", labelnames=["priority", "status"], registry=self._registry, ) # ---- 2. 可用令牌数 ---- self.tokens_available: Gauge = Gauge( "sidecar_tokens_available", "Current number of available tokens", registry=self._registry, ) # ---- 3. 令牌生成速率 ---- self.tokens_rate: Gauge = Gauge( "sidecar_tokens_rate", "Current token generation rate (tokens per minute)", registry=self._registry, ) # ---- 4. 各优先级队列深度 ---- self.queue_depth: Gauge = Gauge( "sidecar_queue_depth", "Queue depth by priority", labelnames=["priority"], registry=self._registry, ) # ---- 5. 队列等待时间 Histogram ---- self.queue_latency_seconds: Histogram = Histogram( "sidecar_queue_latency_seconds", "Request wait time in queue in seconds", labelnames=["priority"], buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0), registry=self._registry, ) # ---- 6. 上游响应延迟 Histogram ---- self.upstream_latency_seconds: Histogram = Histogram( "sidecar_upstream_latency_seconds", "Upstream response latency in seconds", labelnames=["model_id"], buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0), registry=self._registry, ) # ---- 7. 上游错误计数 ---- self.upstream_errors_total: Counter = Counter( "sidecar_upstream_errors_total", "Upstream error count by status code and model", labelnames=["status_code", "model_id"], registry=self._registry, ) # ---- 8. 降级直通次数 ---- self.fallback_passthrough_total: Counter = Counter( "sidecar_fallback_passthrough_total", "Total fallback / passthrough events (queue full or sidecar unavailable)", registry=self._registry, ) # ---- 9. 健康状态 ---- self.health_status: Gauge = Gauge( "sidecar_health_status", "Sidecar health: 0=unhealthy, 1=healthy", registry=self._registry, ) # ---- 10. 运行时长 ---- self.uptime_seconds: Gauge = Gauge( "sidecar_uptime_seconds", "Process uptime in seconds", registry=self._registry, ) # 避退模式指标(附加,不计入基础 10 个) self.retreat_state: Gauge = Gauge( "sidecar_retreat_state", "Adaptive retreat state: 0=NORMAL, 1=RETREAT, 2=RECOVER", registry=self._registry, ) self.effective_rate_rpm: Gauge = Gauge( "sidecar_effective_rate_rpm", "Current effective rate in RPM (after retreat adjustments)", registry=self._registry, ) self.upstream_429_rate: Gauge = Gauge( "sidecar_upstream_429_rate", "Upstream 429 rate over the retreat observation window (0.0-1.0)", registry=self._registry, ) # 初始化 self.health_status.set(1) # ---- ASGI app 生成 ---- def build_asgi_app(self) -> Any: """生成 Prometheus ASGI 应用,挂载到独立端口。 Returns: 可传给 uvicorn 的 ASGI app。 """ return make_asgi_app(registry=self._registry) # ---- 指标记录方法 ---- def record_request(self, priority: str, status: str) -> None: """记录一次请求。 Args: priority: 优先级名(URGENT / HIGH / NORMAL / LOW)。 status: 状态(success / ratelimited / error)。 """ with self._lock: self.requests_total.labels(priority=priority, status=status).inc() def record_queue_latency(self, priority: str, seconds: float) -> None: """记录排队延迟。 Args: priority: 优先级名。 seconds: 排队等待秒数。 """ with self._lock: self.queue_latency_seconds.labels(priority=priority).observe(seconds) def record_upstream(self, status_code: int, model_id: str) -> None: """记录上游响应。 Args: status_code: HTTP 状态码。 model_id: 模型标识符。 """ with self._lock: self.upstream_latency_seconds.labels(model_id=model_id).observe(0.0) def record_upstream_error(self, status_code: int, model_id: str) -> None: """记录上游错误。 Args: status_code: 错误 HTTP 状态码。 model_id: 模型标识符。 """ with self._lock: self.upstream_errors_total.labels( status_code=str(status_code), model_id=model_id ).inc() def record_upstream_latency(self, model_id: str, seconds: float) -> None: """记录上游响应延迟。 Args: model_id: 模型标识符。 seconds: 响应延迟秒数。 """ with self._lock: self.upstream_latency_seconds.labels(model_id=model_id).observe(seconds) def update_token_status(self, tokens: float, rate_per_minute: float) -> None: """更新令牌桶状态。 Args: tokens: 当前可用令牌数。 rate_per_minute: 每分钟速率。 """ with self._lock: self.tokens_available.set(tokens) self.tokens_rate.set(rate_per_minute) def update_queue_depth(self, depths: dict[str, int]) -> None: """更新各优先级队列深度。 Args: depths: {priority_name: count} 映射。 """ with self._lock: # 先清零所有已知标签再设置,避免残留旧值 for pri in ("URGENT", "HIGH", "NORMAL", "LOW"): self.queue_depth.labels(priority=pri).set(depths.get(pri, 0)) def increment_fallback(self) -> None: """降级直通计数 +1。""" with self._lock: self.fallback_passthrough_total.inc() def set_health(self, healthy: bool) -> None: """设置健康状态。 Args: healthy: True=健康, False=不健康。 """ with self._lock: self.health_status.set(1 if healthy else 0) def update_uptime(self) -> None: """更新运行时长。""" with self._lock: self.uptime_seconds.set(time.time() - self._start_time) # ---- 避退模式指标 ---- def update_retreat_metrics( self, retreat_state: str, effective_rate_rpm: float, upstream_429_rate: float, ) -> None: """更新避退模式指标。 Args: retreat_state: "normal" / "retreat" / "recover". effective_rate_rpm: 当前实际速率 (RPM)。 upstream_429_rate: 上游 429 率 (0.0-1.0)。 """ state_map: dict[str, int] = {"normal": 0, "retreat": 1, "recover": 2} with self._lock: self.retreat_state.set(state_map.get(retreat_state, 0)) self.effective_rate_rpm.set(effective_rate_rpm) self.upstream_429_rate.set(upstream_429_rate) # ---- 导出 ---- def generate_latest(self) -> bytes: """生成 Prometheus 文本格式的指标数据。 Returns: Prometheus 文本格式 bytes。 """ with self._lock: self.update_uptime() return generate_latest(self._registry)