# EnterpriseArchitect/services/nvidia_sidecar/health.py

"""
NVIDIA Sidecar 限流代理 — 健康检查端点 (§3.6)

提供 Kubernetes / systemd 兼容的健康检查：
    GET /health       — 存活检查
    GET /health/ready — 就绪检查（含上游连通性）

BIZ-46 Phase3: Readiness HTTP Client 复用 — 注入主 http_client，
不再每次检查创建新 client，降低 K8s/systemd 高频探测的连接开销。
"""

from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Any

import httpx


@dataclass
class HealthService:
    """Health-check service for the NVIDIA sidecar rate-limiting proxy.

    Bundles the liveness and readiness logic so that the route handlers in
    server.py only need to call into this class.

    Attributes:
        start_time: Timestamp (as returned by ``time.time()``) used as the
            start of the uptime measurement. The default ``0.0`` is a
            sentinel that ``__post_init__`` replaces with the current time.
        version: Version string reported in the liveness payload.
    """

    start_time: float = 0.0
    version: str = "0.1.0"

    def __post_init__(self) -> None:
        """Stamp the start time when the caller did not supply one."""
        if self.start_time == 0.0:
            self.start_time = time.time()

    @property
    def uptime_seconds(self) -> float:
        """Seconds elapsed between ``start_time`` and now."""
        return time.time() - self.start_time

    async def check_upstream(
        self,
        upstream_url: str,
        http_client: httpx.AsyncClient,
        timeout: float = 5.0,
        api_key: str = "",
    ) -> bool:
        """Probe the upstream API using an injected, reusable HTTP client.

        Issues ``GET <upstream_url>/v1/models`` on ``http_client`` (BIZ-46
        Phase3: the client is reused instead of being created per check).

        Args:
            upstream_url: Base URL of the upstream API; trailing slashes are
                stripped before ``/v1/models`` is appended.
            http_client: Client used to send the request.
            timeout: Timeout in seconds, passed as a per-request override.
            api_key: Optional API key; when non-empty it is sent as an
                ``authorization: Bearer <key>`` header.

        Returns:
            True when a response arrives with a status code below 500;
            False on a 5xx response or on any exception.
        """
        try:
            headers: dict[str, str] = {}
            if api_key:
                headers["authorization"] = f"Bearer {api_key}"

            resp = await http_client.get(
                f"{upstream_url.rstrip('/')}/v1/models",
                headers=headers,
                timeout=timeout,
            )
            # Any non-5xx answer (including 4xx) counts as "reachable".
            return resp.status_code < 500
        except Exception:
            # Deliberately broad: a health probe reports failure instead of
            # raising into the route handler.
            return False

    def check_queue_healthy(
        self,
        current_size: int,
        max_size: int,
        threshold_ratio: float = 0.9,
    ) -> bool:
        """Report whether the queue is still below its fill threshold.

        Args:
            current_size: Current queue length.
            max_size: Maximum queue capacity.
            threshold_ratio: Fraction of ``max_size`` at or above which the
                queue is considered unhealthy. Defaults to 0.9.

        Returns:
            True when ``current_size < max_size * threshold_ratio``. A
            non-positive ``max_size`` is always reported as healthy.
        """
        if max_size <= 0:
            return True
        return current_size < max_size * threshold_ratio

    def check_token_bucket_healthy(
        self,
        available_tokens: float,
        capacity: int,
        threshold: float = 0.05,
    ) -> bool:
        """Report whether the token bucket still holds enough tokens.

        Args:
            available_tokens: Tokens currently available.
            capacity: Bucket capacity.
            threshold: Fraction of ``capacity``; the bucket is unhealthy when
                the available tokens do not exceed it. Defaults to 0.05.

        Returns:
            True when ``available_tokens > capacity * threshold``. A
            non-positive ``capacity`` is always reported as unhealthy.
        """
        if capacity <= 0:
            return False
        return available_tokens > capacity * threshold

    def liveness(self) -> dict[str, Any]:
        """Build the liveness response.

        Returns:
            Dict with ``status`` (always ``"ok"``), ``uptime`` (seconds,
            rounded to one decimal place) and ``version``.
        """
        return {
            "status": "ok",
            "uptime": round(self.uptime_seconds, 1),
            "version": self.version,
        }

    async def readiness(
        self,
        upstream_url: str,
        upstream_api_key: str = "",
        queue_current_size: int = 0,
        queue_max_size: int = 500,
        available_tokens: float = 0.0,
        bucket_capacity: int = 40,
        http_client: httpx.AsyncClient | None = None,
        *,
        upstream_timeout: float = 5.0,
    ) -> dict[str, Any]:
        """Build the readiness response.

        Combines the upstream probe, the queue check and the token-bucket
        check; the service is ready only when all three pass.

        Args:
            upstream_url: Upstream API base URL.
            upstream_api_key: API key forwarded to the upstream probe.
            queue_current_size: Current queue length.
            queue_max_size: Maximum queue capacity.
            available_tokens: Tokens currently available in the bucket.
            bucket_capacity: Token bucket capacity.
            http_client: Reusable ``httpx.AsyncClient`` (BIZ-46 Phase3).
                When None, falls back to creating a new client for this
                check (kept for older callers).
            upstream_timeout: Timeout in seconds for the upstream probe.
                Defaults to 5.0, the value previously always used.

        Returns:
            Dict with the boolean keys ``ready``, ``upstream_reachable``,
            ``queue_healthy`` and ``token_bucket_healthy``.
        """
        if http_client is not None:
            upstream_ok = await self.check_upstream(
                upstream_url,
                http_client=http_client,
                timeout=upstream_timeout,
                api_key=upstream_api_key,
            )
        else:
            # Backward compatibility: no shared client was injected.
            upstream_ok = await self.check_upstream_standalone(
                upstream_url,
                timeout=upstream_timeout,
                api_key=upstream_api_key,
            )

        queue_ok = self.check_queue_healthy(queue_current_size, queue_max_size)
        token_ok = self.check_token_bucket_healthy(available_tokens, bucket_capacity)
        all_ready = upstream_ok and queue_ok and token_ok

        return {
            "ready": all_ready,
            "upstream_reachable": upstream_ok,
            "queue_healthy": queue_ok,
            "token_bucket_healthy": token_ok,
        }

    async def check_upstream_standalone(
        self,
        upstream_url: str,
        timeout: float = 5.0,
        api_key: str = "",
    ) -> bool:
        """Probe the upstream API with a throwaway client.

        Backward-compatible variant that creates a new ``httpx.AsyncClient``
        for each call and delegates the actual probe to ``check_upstream``,
        so both paths share the same request and status evaluation.

        Args:
            upstream_url: Base URL of the upstream API.
            timeout: Timeout in seconds.
            api_key: Optional API key.

        Returns:
            True when the upstream answers with a status code below 500;
            False otherwise, including when the client cannot be created.
        """
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                return await self.check_upstream(
                    upstream_url,
                    http_client=client,
                    timeout=timeout,
                    api_key=api_key,
                )
        except Exception:
            # Covers failures creating or closing the temporary client.
            return False