""" NVIDIA Sidecar 限流代理 — 健康检查端点 (§3.6) 提供 Kubernetes / systemd 兼容的健康检查: GET /health — 存活检查 GET /health/ready — 就绪检查(含上游连通性) """ from __future__ import annotations import asyncio import time from dataclasses import dataclass from typing import Any import httpx @dataclass class HealthService: """健康检查服务。 封装存活检查和就绪检查的逻辑,供 server.py 路由调用。 """ start_time: float = 0.0 version: str = "0.1.0" def __post_init__(self) -> None: if self.start_time == 0.0: self.start_time = time.time() @property def uptime_seconds(self) -> float: """服务运行时长(秒)。""" return time.time() - self.start_time async def check_upstream( self, upstream_url: str, timeout: float = 5.0, api_key: str = "", ) -> bool: """检查上游连通性。 Args: upstream_url: NVIDIA API base URL。 timeout: 超时秒数。 api_key: 可选的 API Key 用于认证。 Returns: True 上游可达。 """ try: headers: dict[str, str] = {} if api_key: headers["authorization"] = f"Bearer {api_key}" async with httpx.AsyncClient(timeout=timeout) as client: resp = await client.get( f"{upstream_url.rstrip('/')}/v1/models", headers=headers, ) return resp.status_code < 500 except Exception: return False def check_queue_healthy( self, current_size: int, max_size: int, threshold_ratio: float = 0.9, ) -> bool: """检查队列是否健康(未接近满载)。 Args: current_size: 当前队列长度。 max_size: 队列最大容量。 threshold_ratio: 告警阈值比例,默认 0.9。 Returns: True 队列健康。 """ if max_size <= 0: return True return current_size < max_size * threshold_ratio def check_token_bucket_healthy( self, available_tokens: float, capacity: int, threshold: float = 0.05, ) -> bool: """检查令牌桶是否健康(token 未耗尽)。 Args: available_tokens: 当前可用令牌数。 capacity: 桶容量。 threshold: 令牌数低于此比例视为不健康。 Returns: True 令牌桶健康。 """ if capacity <= 0: return False return available_tokens > capacity * threshold def liveness(self) -> dict[str, Any]: """存活检查响应。 Returns: liveness JSON payload。 """ return { "status": "ok", "uptime": round(self.uptime_seconds, 1), "version": self.version, } async def readiness( self, upstream_url: str, upstream_api_key: str = "", queue_current_size: int = 0, queue_max_size: int = 500, available_tokens: float = 0.0, bucket_capacity: int = 40, ) -> dict[str, Any]: """就绪检查响应。 Args: upstream_url: 上游 API 地址。 upstream_api_key: API Key。 queue_current_size: 当前队列长度。 queue_max_size: 队列最大容量。 available_tokens: 当前令牌数。 bucket_capacity: 桶容量。 Returns: readiness JSON payload。 """ upstream_ok = await self.check_upstream(upstream_url, api_key=upstream_api_key) queue_ok = self.check_queue_healthy(queue_current_size, queue_max_size) token_ok = self.check_token_bucket_healthy(available_tokens, bucket_capacity) all_ready = upstream_ok and queue_ok and token_ok return { "ready": all_ready, "upstream_reachable": upstream_ok, "queue_healthy": queue_ok, "token_bucket_healthy": token_ok, }