b18d243ef2
1. 架构解耦 — SidecarContext + FastAPI Depends 注入 - 新增 context.py: SidecarContext dataclass 收敛全部全局状态 - server.py: 移除模块级全局变量,lifespan 创建 ctx → app.state.sidecar - webui.py: 移除反向导入 server,改用 Depends(get_context) 2. Prometheus 标签基数治理 — model_id → provider - upstream_latency_seconds / upstream_errors_total label 收敛为 provider - 模型级信息保留在 structlog JSON 日志 3. SSE 快照共享缓存 - 1s TTL 共享 snapshot cache + double-check locking - 多客户端不重复构建快照 4. 部署支撑 - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK) - systemd service (安全加固, 资源限制) - .env.example (完整环境变量清单) 5. Readiness HTTP Client 复用 - check_upstream() 注入主 http_client,不再每次创建新 client 6. Retreat 并发回归测试 - 5 个测试用例全部通过(死锁检测 + 状态转换 + 并发安全) 7. Dashboard UX 优化 - 队列柱状图 300ms 平滑动画 - SSE 断连 5s 半透明遮罩 - 队列图标题显示总排队数 - 页面加载同步配置 验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes) Co-authored-by: multica-agent <github@multica.ai>
198 lines
5.8 KiB
Python
198 lines
5.8 KiB
Python
"""
|
||
NVIDIA Sidecar 限流代理 — 健康检查端点 (§3.6)
|
||
|
||
提供 Kubernetes / systemd 兼容的健康检查:
|
||
GET /health — 存活检查
|
||
GET /health/ready — 就绪检查(含上游连通性)
|
||
|
||
BIZ-46 Phase3: Readiness HTTP Client 复用 — 注入主 http_client,
|
||
不再每次检查创建新 client,降低 K8s/systemd 高频探测的连接开销。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
from dataclasses import dataclass
|
||
from typing import Any
|
||
|
||
import httpx
|
||
|
||
|
||
@dataclass
class HealthService:
    """Health-check service.

    Bundles the liveness and readiness logic so that the route handlers in
    server.py only have to call into this class.

    Attributes are documented inline below.
    """

    # Start timestamp from time.time(); 0.0 means "fill in at construction".
    start_time: float = 0.0
    # Version string echoed in the liveness payload.
    version: str = "0.1.0"

    def __post_init__(self) -> None:
        """Stamp the start time when the caller did not supply one."""
        if self.start_time != 0.0:
            return
        self.start_time = time.time()

    @property
    def uptime_seconds(self) -> float:
        """Seconds elapsed between ``start_time`` and now (``time.time()``)."""
        now = time.time()
        return now - self.start_time

    @staticmethod
    def _models_request(upstream_url: str, api_key: str) -> tuple[str, dict[str, str]]:
        """Build the probe URL and headers shared by both upstream checks."""
        auth_headers: dict[str, str] = {}
        if api_key:
            auth_headers["authorization"] = f"Bearer {api_key}"
        return f"{upstream_url.rstrip('/')}/v1/models", auth_headers

    async def check_upstream(
        self,
        upstream_url: str,
        http_client: httpx.AsyncClient,
        timeout: float = 5.0,
        api_key: str = "",
    ) -> bool:
        """Probe upstream connectivity through the injected client (BIZ-46 Phase3).

        Args:
            upstream_url: Upstream API base URL; a trailing ``/`` is stripped
                before ``/v1/models`` is appended.
            http_client: Shared ``httpx.AsyncClient`` to issue the GET with.
            timeout: Timeout in seconds, passed as a per-request override.
            api_key: Optional API key; when non-empty it is sent as a
                ``Bearer`` token in the ``authorization`` header.

        Returns:
            True when a response arrives with a status code below 500;
            False when the status is 500 or above or any exception is raised.
        """
        # Everything stays inside the try: any failure, including building the
        # request, is reported as "unreachable" rather than raised.
        try:
            probe_url, auth_headers = self._models_request(upstream_url, api_key)
            response = await http_client.get(
                probe_url,
                headers=auth_headers,
                timeout=timeout,
            )
            return response.status_code < 500
        except Exception:
            return False

    def check_queue_healthy(
        self,
        current_size: int,
        max_size: int,
        threshold_ratio: float = 0.9,
    ) -> bool:
        """Report whether the queue is still below its alert threshold.

        Args:
            current_size: Current queue length.
            max_size: Maximum queue capacity.
            threshold_ratio: Fraction of ``max_size`` at which the queue is
                considered unhealthy; defaults to 0.9.

        Returns:
            True when ``max_size`` is not positive, or when ``current_size``
            is strictly below ``max_size * threshold_ratio``.
        """
        return max_size <= 0 or current_size < max_size * threshold_ratio

    def check_token_bucket_healthy(
        self,
        available_tokens: float,
        capacity: int,
        threshold: float = 0.05,
    ) -> bool:
        """Report whether the token bucket still holds enough tokens.

        Args:
            available_tokens: Tokens currently available.
            capacity: Bucket capacity.
            threshold: Fraction of ``capacity``; at or below it the bucket is
                considered unhealthy.

        Returns:
            False when ``capacity`` is not positive; otherwise True only when
            ``available_tokens`` is strictly above ``capacity * threshold``.
        """
        return capacity > 0 and available_tokens > capacity * threshold

    def liveness(self) -> dict[str, Any]:
        """Build the liveness response.

        Returns:
            A dict with ``status`` ("ok"), ``uptime`` (seconds, rounded to one
            decimal place) and ``version``.
        """
        uptime = round(self.uptime_seconds, 1)
        return {"status": "ok", "uptime": uptime, "version": self.version}

    async def readiness(
        self,
        upstream_url: str,
        upstream_api_key: str = "",
        queue_current_size: int = 0,
        queue_max_size: int = 500,
        available_tokens: float = 0.0,
        bucket_capacity: int = 40,
        http_client: httpx.AsyncClient | None = None,
    ) -> dict[str, Any]:
        """Build the readiness response.

        Args:
            upstream_url: Upstream API address.
            upstream_api_key: API key forwarded to the upstream probe.
            queue_current_size: Current queue length.
            queue_max_size: Maximum queue capacity.
            available_tokens: Tokens currently available.
            bucket_capacity: Token bucket capacity.
            http_client: Shared ``httpx.AsyncClient`` (BIZ-46 Phase3). When
                None, the probe falls back to ``check_upstream_standalone``,
                which creates a new client per call (kept for older callers).

        Returns:
            A dict with ``ready`` (all three checks passed),
            ``upstream_reachable``, ``queue_healthy`` and
            ``token_bucket_healthy``.
        """
        if http_client is None:
            # Backward compatibility: no shared client was supplied.
            upstream_ok = await self.check_upstream_standalone(
                upstream_url,
                api_key=upstream_api_key,
            )
        else:
            upstream_ok = await self.check_upstream(
                upstream_url,
                http_client=http_client,
                api_key=upstream_api_key,
            )

        queue_ok = self.check_queue_healthy(queue_current_size, queue_max_size)
        token_ok = self.check_token_bucket_healthy(available_tokens, bucket_capacity)

        return {
            "ready": upstream_ok and queue_ok and token_ok,
            "upstream_reachable": upstream_ok,
            "queue_healthy": queue_ok,
            "token_bucket_healthy": token_ok,
        }

    async def check_upstream_standalone(
        self,
        upstream_url: str,
        timeout: float = 5.0,
        api_key: str = "",
    ) -> bool:
        """Probe upstream connectivity with a client created for this call.

        Kept for backward compatibility with callers that do not inject a
        shared client.

        Args:
            upstream_url: Upstream API base URL; a trailing ``/`` is stripped
                before ``/v1/models`` is appended.
            timeout: Timeout in seconds, set on the temporary client.
            api_key: Optional API key; when non-empty it is sent as a
                ``Bearer`` token in the ``authorization`` header.

        Returns:
            True when a response arrives with a status code below 500;
            False when the status is 500 or above or any exception is raised.
        """
        # Client construction and teardown are inside the try as well, so a
        # failure there is also reported as "unreachable".
        try:
            probe_url, auth_headers = self._models_request(upstream_url, api_key)
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.get(probe_url, headers=auth_headers)
                return response.status_code < 500
        except Exception:
            return False