e829a4060b
新增文件: - metrics.py: Prometheus 指标端点 (:9191), 10+3 个指标 - health.py: /health (liveness) + /health/ready (readiness) - webui.py: WebUI 后端 API (SSE 实时推送 + 配置热重载) - static/dashboard.html: 仪表盘前端 (Chart.js, 令牌桶仪表+队列柱状图+吞吐折线图) 更新文件: - rate_limiter.py: 增加 AdaptiveTokenBucket 避退模式 (ADR-009) 状态机 NORMAL→RETREAT→RECOVER, 429 率滑动窗口监控 - server.py: structlog 结构化日志 + 避退反馈回路 挂载 metrics_server (:9191) + health/ready + webui + /status - pyproject.toml: 增加 prometheus-client, pydantic, types-PyYAML 依赖 验证: - mypy --strict: 0 issues in 7 source files - AdaptiveTokenBucket 运行时测试通过 - 所有语法检查通过 Co-authored-by: multica-agent <github@multica.ai>
152 lines
4.2 KiB
Python
152 lines
4.2 KiB
Python
"""
|
|
NVIDIA Sidecar 限流代理 — 健康检查端点 (§3.6)
|
|
|
|
提供 Kubernetes / systemd 兼容的健康检查:
|
|
GET /health — 存活检查
|
|
GET /health/ready — 就绪检查(含上游连通性)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
|
|
@dataclass
class HealthService:
    """Health-check service for the sidecar rate-limiting proxy.

    Bundles the liveness and readiness logic so that the route handlers in
    server.py only have to call into this class and serialize the result.

    Attributes:
        start_time: Wall-clock timestamp (``time.time()``) at which the
            service started. The default ``0.0`` is a sentinel meaning
            "not provided"; it is replaced with the current time on init.
        version: Version string reported by the liveness payload.
    """

    start_time: float = 0.0
    version: str = "0.1.0"

    def __post_init__(self) -> None:
        """Stamp the start time when the caller did not supply one."""
        if self.start_time == 0.0:
            self.start_time = time.time()

    @property
    def uptime_seconds(self) -> float:
        """Seconds elapsed since ``start_time``, never negative.

        ``start_time`` is a wall-clock value, so a backwards clock adjustment
        (or a caller-supplied timestamp in the future) would otherwise yield
        a negative uptime; the result is clamped at zero.
        """
        return max(0.0, time.time() - self.start_time)

    async def check_upstream(
        self,
        upstream_url: str,
        timeout: float = 5.0,
        api_key: str = "",
    ) -> bool:
        """Probe the upstream API for reachability.

        Issues ``GET {upstream_url}/v1/models`` and treats any response with
        a status code below 500 as "reachable" (so 4xx responses such as an
        authentication failure still count as reachable).

        Args:
            upstream_url: Base URL of the upstream API; a trailing slash is
                tolerated.
            timeout: Request timeout in seconds, passed to the HTTP client.
            api_key: Optional API key; when non-empty it is sent as a
                ``Bearer`` token in the ``authorization`` header.

        Returns:
            True if the upstream answered with a status code < 500,
            False on a 5xx response or on any error while probing.
        """
        try:
            headers: dict[str, str] = {}
            if api_key:
                headers["authorization"] = f"Bearer {api_key}"

            async with httpx.AsyncClient(timeout=timeout) as client:
                resp = await client.get(
                    f"{upstream_url.rstrip('/')}/v1/models",
                    headers=headers,
                )
                return resp.status_code < 500
        except Exception:
            # Deliberately broad: a health probe must never raise. Any
            # failure (bad URL, DNS, connect/read timeout, TLS, ...) is
            # reported as "upstream not reachable".
            return False

    def check_queue_healthy(
        self,
        current_size: int,
        max_size: int,
        threshold_ratio: float = 0.9,
    ) -> bool:
        """Report whether the request queue is below its alert threshold.

        Args:
            current_size: Current number of queued items.
            max_size: Maximum queue capacity. A non-positive value is
                treated as healthy (no meaningful limit to compare against).
            threshold_ratio: Fraction of ``max_size`` at which the queue is
                considered close to full. Defaults to 0.9.

        Returns:
            True if ``current_size`` is strictly below
            ``max_size * threshold_ratio`` (or ``max_size <= 0``).
        """
        if max_size <= 0:
            return True
        return current_size < max_size * threshold_ratio

    def check_token_bucket_healthy(
        self,
        available_tokens: float,
        capacity: int,
        threshold: float = 0.05,
    ) -> bool:
        """Report whether the token bucket still has tokens to spare.

        Args:
            available_tokens: Tokens currently available in the bucket.
            capacity: Bucket capacity. A non-positive capacity is always
                reported as unhealthy.
            threshold: Fraction of ``capacity``; the bucket is unhealthy when
                the available tokens are at or below this fraction.

        Returns:
            True if ``available_tokens`` is strictly greater than
            ``capacity * threshold``.
        """
        if capacity <= 0:
            return False
        return available_tokens > capacity * threshold

    def liveness(self) -> dict[str, Any]:
        """Build the liveness response.

        Returns:
            A JSON-serializable dict with ``status`` (always ``"ok"``),
            ``uptime`` (seconds, rounded to one decimal) and ``version``.
        """
        return {
            "status": "ok",
            "uptime": round(self.uptime_seconds, 1),
            "version": self.version,
        }

    async def readiness(
        self,
        upstream_url: str,
        upstream_api_key: str = "",
        queue_current_size: int = 0,
        queue_max_size: int = 500,
        available_tokens: float = 0.0,
        bucket_capacity: int = 40,
        upstream_timeout: float = 5.0,
    ) -> dict[str, Any]:
        """Build the readiness response.

        The service is ready only when the upstream is reachable, the queue
        is below its alert threshold, and the token bucket is not depleted.

        Args:
            upstream_url: Upstream API base URL.
            upstream_api_key: Optional API key for the upstream probe.
            queue_current_size: Current queue length.
            queue_max_size: Maximum queue capacity.
            available_tokens: Tokens currently available in the bucket.
            bucket_capacity: Token bucket capacity.
            upstream_timeout: Timeout in seconds for the upstream probe.
                Keep it below the timeout of whatever polls this endpoint,
                otherwise a slow upstream makes the poller time out instead
                of receiving a "not ready" payload.

        Returns:
            A JSON-serializable dict with the overall ``ready`` flag and the
            individual ``upstream_reachable``, ``queue_healthy`` and
            ``token_bucket_healthy`` results.
        """
        upstream_ok = await self.check_upstream(
            upstream_url,
            timeout=upstream_timeout,
            api_key=upstream_api_key,
        )
        queue_ok = self.check_queue_healthy(queue_current_size, queue_max_size)
        token_ok = self.check_token_bucket_healthy(available_tokens, bucket_capacity)
        all_ready = upstream_ok and queue_ok and token_ok

        return {
            "ready": all_ready,
            "upstream_reachable": upstream_ok,
            "queue_healthy": queue_ok,
            "token_bucket_healthy": token_ok,
        }