BIZ-46 Phase3: 7项 follow-up 开发完成

1. 架构解耦 — SidecarContext + FastAPI Depends 注入
   - 新增 context.py: SidecarContext dataclass 收敛全部全局状态
   - server.py: 移除模块级全局变量,lifespan 创建 ctx → app.state.sidecar
   - webui.py: 移除反向导入 server,改用 Depends(get_context)

2. Prometheus 标签基数治理 — model_id → provider
   - upstream_latency_seconds / upstream_errors_total label 收敛为 provider
   - 模型级信息保留在 structlog JSON 日志

3. SSE 快照共享缓存
   - 1s TTL 共享 snapshot cache + double-check locking
   - 多客户端不重复构建快照

4. 部署支撑
   - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK)
   - systemd service (安全加固, 资源限制)
   - .env.example (完整环境变量清单)

5. Readiness HTTP Client 复用
   - check_upstream() 注入主 http_client,不再每次创建新 client

6. Retreat 并发回归测试
   - 5 个测试用例全部通过(死锁检测 + 状态转换 + 并发安全)

7. Dashboard UX 优化
   - 队列柱状图 300ms 平滑动画
   - SSE 断连 5s 半透明遮罩
   - 队列图标题显示总排队数
   - 页面加载同步配置

验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes)

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-24 22:26:35 +08:00
parent 8a12ff9693
commit b18d243ef2
12 changed files with 928 additions and 312 deletions
+178 -169
View File
@@ -5,6 +5,8 @@ NVIDIA Sidecar 限流代理 — FastAPI 代理主入口 (§3.4)
接收 → 网关识别 → [NVIDIA: 排队 → 令牌限流] → httpx 转发 → 返回
非 NVIDIA 请求直通上游,NVIDIA 请求经过四级优先级队列 + 令牌桶限流。
BIZ-46 Phase3: 架构解耦 — 所有全局状态收敛为 SidecarContext (§1)
"""
from __future__ import annotations
@@ -19,11 +21,12 @@ from typing import Any
import httpx
import structlog
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi import Depends, FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from nvidia_sidecar.config import load_config, SidecarConfig
from nvidia_sidecar.context import SidecarContext
from nvidia_sidecar.rate_limiter import (
Priority,
AdaptiveTokenBucket,
@@ -64,42 +67,18 @@ logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar")
# ---------------------------------------------------------------------------
# 全局状态(通过 lifespan 初始化,模块级引用方便路由访问)
# FastAPI 依赖注入
# ---------------------------------------------------------------------------
_config: SidecarConfig
_http_client: httpx.AsyncClient
_priority_queue: PriorityRequestQueue
_token_bucket: AdaptiveTokenBucket
_prometheus: PrometheusMetrics
_health_service: HealthService
_pending_requests: dict[str, tuple[asyncio.Future[httpx.Response], float]]
"""request_id → (response future, enqueued_at) 的映射。"""
_metrics_task: asyncio.Task[None] | None = None
# 统计计数器(受 _stats_lock 保护, 修复梁思筑评审 #1: data race)
_stats: dict[str, int] = {
"total_requests": 0,
"nvidia_requests": 0,
"passthrough_requests": 0,
"ratelimited_requests": 0,
"queue_full_rejects": 0,
"upstream_errors": 0,
"start_time": 0,
}
_stats_lock: asyncio.Lock = asyncio.Lock()
def get_context(request: Request) -> SidecarContext:
    """Return the SidecarContext stored on ``app.state`` (FastAPI dependency).

    Args:
        request: Incoming request; only ``request.app.state.sidecar`` is read.

    Returns:
        Whatever object is stored under ``app.state.sidecar``.
    """
    # The state attribute is dynamically typed, hence the explicit ignore on return.
    app_state = request.app.state
    return app_state.sidecar  # type: ignore[no-any-return]
# ---------------------------------------------------------------------------
# 工具函数
# ---------------------------------------------------------------------------
async def _increment_stat(key: str, delta: int = 1) -> None:
"""线程安全的 _stats 计数器自增(梁思筑评审 #1 修复:消除 data race)。"""
async with _stats_lock:
_stats[key] = _stats.get(key, 0) + delta
def _extract_model(body: Any) -> str | None:
"""从请求体中提取模型标识符(兼容 OpenAI Chat/Completions 格式)。
@@ -135,6 +114,7 @@ def _resolve_priority(headers: dict[str, str]) -> Priority:
# ---------------------------------------------------------------------------
async def _forward_to_upstream(
ctx: SidecarContext,
method: str,
path: str,
body: bytes | None,
@@ -144,6 +124,7 @@ async def _forward_to_upstream(
"""将请求转发到 NVIDIA 上游 API。
Args:
ctx: SidecarContext 运行时上下文。
method: HTTP 方法。
path: 请求路径(如 ``/v1/chat/completions``)。
body: 原始请求体 bytes。
@@ -156,28 +137,28 @@ async def _forward_to_upstream(
Raises:
httpx.HTTPError: HTTP 请求失败。
"""
upstream_url = _config.upstream_url.rstrip("/") + path
upstream_url = ctx.config.upstream_url.rstrip("/") + path
forward_headers: dict[str, str] = {
k: v for k, v in headers.items()
if k.lower() not in ("host", "content-length", "transfer-encoding")
}
if _config.upstream_api_key:
forward_headers["authorization"] = f"Bearer {_config.upstream_api_key}"
if ctx.config.upstream_api_key:
forward_headers["authorization"] = f"Bearer {ctx.config.upstream_api_key}"
elif "authorization" not in {k.lower() for k in forward_headers}:
forward_headers["authorization"] = "Bearer nvidia"
try:
req = _http_client.build_request(
req = ctx.http_client.build_request(
method=method,
url=upstream_url,
headers=forward_headers,
content=body,
timeout=_config.request_timeout,
timeout=ctx.config.request_timeout,
)
response = await _http_client.send(req, stream=stream)
response = await ctx.http_client.send(req, stream=stream)
return response
except httpx.TimeoutException:
logger.warning("upstream_timeout", path=path, timeout=_config.request_timeout)
logger.warning("upstream_timeout", path=path, timeout=ctx.config.request_timeout)
raise
except httpx.HTTPError as exc:
logger.error("upstream_error", path=path, error=str(exc))
@@ -188,14 +169,18 @@ async def _forward_to_upstream(
# worker 协程:消费优先级队列 + 令牌桶 + 转发
# ---------------------------------------------------------------------------
async def _worker_loop() -> None:
"""后台 worker:持续从优先级队列取请求 → 令牌限流 → 转发 → 设置 future 结果。"""
async def _worker_loop(ctx: SidecarContext) -> None:
"""后台 worker:持续从优先级队列取请求 → 令牌限流 → 转发 → 设置 future 结果。
Args:
ctx: SidecarContext 运行时上下文。
"""
log = logger.bind(worker="main")
log.info("worker_started")
while True:
try:
queue_item = await _priority_queue.get(timeout=1.0)
queue_item = await ctx.priority_queue.get(timeout=1.0)
if queue_item is None:
continue
@@ -205,7 +190,7 @@ async def _worker_loop() -> None:
enqueued_at = queue_item.enqueued_at
# 查找对应的 pending future
pending_entry = _pending_requests.get(request_id)
pending_entry = ctx.pending_requests.get(request_id)
if pending_entry is None:
log.warning("orphan_request", request_id=request_id)
continue
@@ -215,31 +200,30 @@ async def _worker_loop() -> None:
if queue_item.priority == Priority.LOW:
# 放线程池执行阻塞的令牌桶调用
got_token = await asyncio.to_thread(
_token_bucket.try_consume,
ctx.token_bucket.try_consume,
tokens=1,
timeout=_config.low_priority_timeout,
timeout=ctx.config.low_priority_timeout,
)
if not got_token:
log.info("low_priority_timeout", request_id=request_id)
await _increment_stat("ratelimited_requests")
_prometheus.record_request(queue_item.priority.name, "ratelimited")
await ctx.increment_stat("ratelimited_requests")
ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
if not future.done():
future.set_exception(
_RateLimitedError(
f"低优先级请求令牌等待超时 ({_config.low_priority_timeout}s)"
f"低优先级请求令牌等待超时 ({ctx.config.low_priority_timeout}s)"
)
)
_pending_requests.pop(request_id, None)
ctx.pending_requests.pop(request_id, None)
continue
else:
# 非低优先级:在 worker 内轮询等待令牌,避免重入队导致 future 悬挂
# (重入队会生成新 request_id,原 future 永不 resolve → 客户端永久 hang)
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
if not got_token:
token_deadline = time.monotonic() + _config.request_timeout
token_deadline = time.monotonic() + ctx.config.request_timeout
while not got_token:
await asyncio.sleep(0.1)
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
if time.monotonic() > token_deadline:
break
if not got_token:
@@ -247,17 +231,17 @@ async def _worker_loop() -> None:
"token_wait_timeout",
request_id=request_id,
priority=queue_item.priority.name,
timeout=_config.request_timeout,
timeout=ctx.config.request_timeout,
)
await _increment_stat("ratelimited_requests")
_prometheus.record_request(queue_item.priority.name, "ratelimited")
await ctx.increment_stat("ratelimited_requests")
ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
if not future.done():
future.set_exception(
_RateLimitedError(
f"令牌等待超时 ({_config.request_timeout:.0f}s)"
f"令牌等待超时 ({ctx.config.request_timeout:.0f}s)"
)
)
_pending_requests.pop(request_id, None)
ctx.pending_requests.pop(request_id, None)
continue
# 转发到上游
@@ -272,6 +256,7 @@ async def _worker_loop() -> None:
}
resp = await _forward_to_upstream(
ctx=ctx,
method=method,
path=path,
body=payload.get("_raw_body"),
@@ -284,19 +269,22 @@ async def _worker_loop() -> None:
total_latency = upstream_latency + queue_latency
is_429: bool = resp.status_code == 429
_token_bucket.record_response(is_429)
ctx.token_bucket.record_response(is_429)
# 避退状态评估 + 指标更新
_token_bucket.evaluate_retreat()
retreat_state = _token_bucket.get_retreat_state()
effective_rpm = _token_bucket.get_effective_rate_rpm()
upstream_429_rate = _token_bucket.get_429_rate()
_prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
ctx.token_bucket.evaluate_retreat()
retreat_state = ctx.token_bucket.get_retreat_state()
effective_rpm = ctx.token_bucket.get_effective_rate_rpm()
upstream_429_rate = ctx.token_bucket.get_429_rate()
ctx.prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
# 模型级信息写入 JSON 日志 (BIZ-46 Phase3: provider label 收敛后保留)
model_id = _extract_model(payload) or "unknown"
log.info(
"request_completed",
request_id=request_id,
status=resp.status_code,
model_id=model_id,
upstream_latency=round(upstream_latency, 3),
queue_latency=round(queue_latency, 3),
total_latency=round(total_latency, 3),
@@ -304,26 +292,26 @@ async def _worker_loop() -> None:
effective_rpm=round(effective_rpm, 1),
)
# 记录 Prometheus 指标
model_id = _extract_model(payload) or "unknown"
_prometheus.record_upstream_latency(model_id, upstream_latency)
# 记录 Prometheus 指标 — provider 收敛(BIZ-46 Phase3)
provider = "nvidia"
ctx.prometheus.record_upstream_latency(provider, upstream_latency)
if not resp.is_success:
_prometheus.record_upstream_error(resp.status_code, model_id)
_prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
_prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
ctx.prometheus.record_upstream_error(resp.status_code, provider)
ctx.prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
ctx.prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
if not future.done():
future.set_result(resp)
except (httpx.HTTPError, OSError) as exc:
log.error("upstream_request_failed", request_id=request_id, error=str(exc))
await _increment_stat("upstream_errors")
_prometheus.record_request(queue_item.priority.name, "error")
_prometheus.set_health(False)
await ctx.increment_stat("upstream_errors")
ctx.prometheus.record_request(queue_item.priority.name, "error")
ctx.prometheus.set_health(False)
if not future.done():
future.set_exception(exc)
_pending_requests.pop(request_id, None)
ctx.pending_requests.pop(request_id, None)
except asyncio.CancelledError:
log.info("worker_cancelled")
@@ -337,6 +325,7 @@ async def _worker_loop() -> None:
# ---------------------------------------------------------------------------
async def _passthrough_with_rate_limit(
ctx: SidecarContext,
request: Request,
path: str,
body_bytes: bytes,
@@ -346,6 +335,7 @@ async def _passthrough_with_rate_limit(
"""队列满时的 PASSTHROUGH 直通路径:仍受令牌桶限流,但不排队。
Args:
ctx: SidecarContext 运行时上下文。
request: FastAPI Request。
path: 请求路径。
body_bytes: 原始请求体。
@@ -355,45 +345,43 @@ async def _passthrough_with_rate_limit(
Returns:
FastAPI Response。
"""
await _increment_stat("passthrough_requests")
_prometheus.increment_fallback()
await ctx.increment_stat("passthrough_requests")
ctx.prometheus.increment_fallback()
# 低优先级走令牌桶等待
if priority == Priority.LOW:
got_token = await asyncio.to_thread(
_token_bucket.try_consume,
ctx.token_bucket.try_consume,
tokens=1,
timeout=_config.low_priority_timeout,
timeout=ctx.config.low_priority_timeout,
)
if not got_token:
await _increment_stat("ratelimited_requests")
_prometheus.record_request(priority.name, "ratelimited")
await ctx.increment_stat("ratelimited_requests")
ctx.prometheus.record_request(priority.name, "ratelimited")
return JSONResponse(
status_code=429,
content={
"error": {
"message": f"令牌不足(队列满 + passthrough),超时 {_config.low_priority_timeout}s",
"message": f"令牌不足(队列满 + passthrough),超时 {ctx.config.low_priority_timeout}s",
"type": "RateLimitedError",
}
},
)
else:
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
if not got_token:
# 非低优先级轮询等待,使用 config.request_timeout 替代硬编码 30s
# (严维序评审 minor / 梁思筑评审 #3: hot-reload 假生效修复)
deadline = time.monotonic() + _config.request_timeout
deadline = time.monotonic() + ctx.config.request_timeout
while not got_token:
await asyncio.sleep(0.1)
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
if time.monotonic() > deadline:
await _increment_stat("ratelimited_requests")
_prometheus.record_request(priority.name, "ratelimited")
await ctx.increment_stat("ratelimited_requests")
ctx.prometheus.record_request(priority.name, "ratelimited")
return JSONResponse(
status_code=429,
content={
"error": {
"message": f"令牌不足(队列满 + passthrough),等待超时 {_config.request_timeout:.0f}s",
"message": f"令牌不足(队列满 + passthrough),等待超时 {ctx.config.request_timeout:.0f}s",
"type": "RateLimitedError",
}
},
@@ -403,24 +391,25 @@ async def _passthrough_with_rate_limit(
try:
clean_headers = {k: v for k, v in raw_headers.items()}
resp = await _forward_to_upstream(
ctx=ctx,
method=request.method,
path=path,
body=body_bytes if body_bytes else None,
headers=clean_headers,
stream=False,
)
retreat_state = _token_bucket.get_retreat_state()
_token_bucket.evaluate_retreat()
_prometheus.update_retreat_metrics(
retreat_state = ctx.token_bucket.get_retreat_state()
ctx.token_bucket.evaluate_retreat()
ctx.prometheus.update_retreat_metrics(
retreat_state,
_token_bucket.get_effective_rate_rpm(),
_token_bucket.get_429_rate(),
ctx.token_bucket.get_effective_rate_rpm(),
ctx.token_bucket.get_429_rate(),
)
return _build_response(resp)
except Exception as exc:
status, msg = _map_exception(exc)
logger.error("passthrough_error", path=path, error=str(exc))
_prometheus.set_health(False)
ctx.prometheus.set_health(False)
return JSONResponse(
status_code=status,
content={"error": {"message": msg, "type": type(exc).__name__}},
@@ -463,40 +452,49 @@ def _map_exception(exc: Exception) -> tuple[int, str]:
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
"""应用生命周期管理:初始化/清理全局资源。"""
global _config, _http_client, _priority_queue, _token_bucket, _pending_requests
global _prometheus, _health_service, _metrics_task
"""应用生命周期管理:初始化/清理全局资源。
BIZ-46 Phase3: 所有资源收敛到 SidecarContext,挂载于 app.state.sidecar。
"""
# 启动
_config = load_config()
logging.getLogger().setLevel(_config.log_level.upper())
config: SidecarConfig = load_config()
logging.getLogger().setLevel(config.log_level.upper())
_http_client = httpx.AsyncClient(
timeout=httpx.Timeout(_config.request_timeout),
http_client: httpx.AsyncClient = httpx.AsyncClient(
timeout=httpx.Timeout(config.request_timeout),
limits=httpx.Limits(
max_connections=100,
max_keepalive_connections=20,
),
)
_priority_queue = PriorityRequestQueue(max_size=_config.queue_max_size)
_token_bucket = AdaptiveTokenBucket(
rate=_config.rate_rpm / 60.0,
capacity=_config.bucket_capacity,
priority_queue: PriorityRequestQueue = PriorityRequestQueue(max_size=config.queue_max_size)
token_bucket: AdaptiveTokenBucket = AdaptiveTokenBucket(
rate=config.rate_rpm / 60.0,
capacity=config.bucket_capacity,
)
_prometheus = PrometheusMetrics()
_health_service = HealthService()
_pending_requests = {}
_stats["start_time"] = int(time.time())
prometheus: PrometheusMetrics = PrometheusMetrics()
health: HealthService = HealthService()
ctx: SidecarContext = SidecarContext(
config=config,
http_client=http_client,
token_bucket=token_bucket,
priority_queue=priority_queue,
prometheus=prometheus,
health=health,
)
ctx.stats["start_time"] = int(time.time())
app.state.sidecar = ctx # 注入 FastAPI
# 启动 worker 协程
worker_task = asyncio.create_task(_worker_loop())
worker_task = asyncio.create_task(_worker_loop(ctx))
# 在独立端口 :9191 启动 Prometheus metrics 服务器
metrics_app = _prometheus.build_asgi_app()
metrics_app = prometheus.build_asgi_app()
metrics_config = uvicorn.Config(
metrics_app,
host=_config.listen_host,
port=_config.metrics_port,
host=config.listen_host,
port=config.metrics_port,
log_level="error",
)
metrics_server = uvicorn.Server(metrics_config)
@@ -515,7 +513,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
app.include_router(webui_router)
# upstream_api_key 启动检查(严维序评审 #5)
if not _config.upstream_api_key:
if not config.upstream_api_key:
logger.warning(
"upstream_api_key_empty",
message="SIDECAR_API_KEY 未设置,NVIDIA 请求将因 401 认证失败",
@@ -523,11 +521,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
logger.info(
"sidecar_started",
host=_config.listen_host,
port=_config.listen_port,
metrics_port=_config.metrics_port,
rate_rpm=_config.rate_rpm,
queue_max=_config.queue_max_size,
host=config.listen_host,
port=config.listen_port,
metrics_port=config.metrics_port,
rate_rpm=config.rate_rpm,
queue_max=config.queue_max_size,
retreat_enabled=True,
)
@@ -540,17 +538,25 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
except asyncio.CancelledError:
pass
if _metrics_task is not None:
_metrics_task.cancel()
try:
await _metrics_task
except asyncio.CancelledError:
pass
_metrics_task.cancel()
try:
await _metrics_task
except asyncio.CancelledError:
pass
await _http_client.aclose()
await http_client.aclose()
logger.info("sidecar_stopped")
def _mask_api_key(key: str) -> str:
"""对 API Key 进行脱敏处理,仅保留前 4 位以供识别。"""
if not key:
return ""
if len(key) <= 4:
return key[:2] + "****"
return key[:4] + "****"
app: FastAPI = FastAPI(
title="NVIDIA Sidecar Rate-Limiting Proxy",
version="0.1.0",
@@ -562,7 +568,7 @@ app: FastAPI = FastAPI(
# 核心代理处理器
# ---------------------------------------------------------------------------
async def _handle_proxy_request(request: Request, path: str) -> Response:
async def _handle_proxy_request(ctx: SidecarContext, request: Request, path: str) -> Response:
"""统一的代理请求处理入口。
执行完整链路:
@@ -570,7 +576,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
2. 网关识别 → 非 NVIDIA 直通
3. NVIDIA → 排队 + 令牌限流 + 转发
"""
await _increment_stat("total_requests")
await ctx.increment_stat("total_requests")
# 解析请求
body_bytes: bytes = await request.body()
@@ -590,9 +596,10 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
# 非 NVIDIA → 直接转发
if not is_nvidia:
await _increment_stat("passthrough_requests")
await ctx.increment_stat("passthrough_requests")
try:
resp = await _forward_to_upstream(
ctx=ctx,
method=request.method,
path=path,
body=body_bytes if body_bytes else None,
@@ -609,7 +616,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
)
# NVIDIA → 排队 + 限流 + 转发
await _increment_stat("nvidia_requests")
await ctx.increment_stat("nvidia_requests")
priority: Priority = _resolve_priority(raw_headers)
# 注入内部元数据到 payload
@@ -618,7 +625,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
# 尝试入队;PASSTHROUGH 策略下队列满时走直通路径
try:
request_id = await _priority_queue.put(
request_id = await ctx.priority_queue.put(
item=payload_for_queue,
priority=priority,
headers={
@@ -628,7 +635,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
},
)
except QueueFullError:
await _increment_stat("queue_full_rejects")
await ctx.increment_stat("queue_full_rejects")
return JSONResponse(
status_code=503,
content={
@@ -639,18 +646,16 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
},
)
except QueueFullPassthrough:
# 队列满 + PASSTHROUGH:绕过排队,尝试令牌桶后直接转发
await _increment_stat("passthrough_requests")
await ctx.increment_stat("passthrough_requests")
logger.info("queue_full_passthrough", path=path)
return await _passthrough_with_rate_limit(request, path, body_bytes, raw_headers, priority)
return await _passthrough_with_rate_limit(ctx, request, path, body_bytes, raw_headers, priority)
# 创建 future 并注册到 pending
loop = asyncio.get_running_loop()
future: asyncio.Future[httpx.Response] = loop.create_future()
_pending_requests[request_id] = (future, time.monotonic())
ctx.pending_requests[request_id] = (future, time.monotonic())
try:
# 等待 worker 完成处理
resp = await future
return _build_response(resp)
except _RateLimitedError as exc:
@@ -708,89 +713,93 @@ def _build_response(resp: httpx.Response) -> Response:
# ---------------------------------------------------------------------------
@app.get("/health")
async def health() -> dict[str, Any]:
async def health(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
"""存活检查 (liveness)。"""
return _health_service.liveness()
return ctx.health.liveness()
@app.get("/health/ready")
async def health_ready() -> dict[str, Any]:
"""就绪检查 (readiness),含上游连通性。"""
queue_size = await _priority_queue.get_queue_size()
bucket_status = _token_bucket.get_status()
return await _health_service.readiness(
upstream_url=_config.upstream_url,
upstream_api_key=_config.upstream_api_key or "",
async def health_ready(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
"""就绪检查 (readiness),含上游连通性。
BIZ-46 Phase3: 复用 ctx.http_client,不再每次创建新 client。
"""
queue_size = await ctx.priority_queue.get_queue_size()
bucket_status = ctx.token_bucket.get_status()
return await ctx.health.readiness(
upstream_url=ctx.config.upstream_url,
upstream_api_key=ctx.config.upstream_api_key or "",
queue_current_size=queue_size,
queue_max_size=_config.queue_max_size,
queue_max_size=ctx.config.queue_max_size,
available_tokens=bucket_status["tokens"],
bucket_capacity=bucket_status["capacity"],
http_client=ctx.http_client, # 复用主 client
)
@app.get("/status")
async def status() -> dict[str, Any]:
async def status(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
"""调试用:限流器 + 队列 + 避退完整状态。"""
queue_stats = await _priority_queue.get_stats()
bucket_status = _token_bucket.get_status()
queue_stats = await ctx.priority_queue.get_stats()
bucket_status = ctx.token_bucket.get_status()
return {
"requests": {
"total": _stats["total_requests"],
"nvidia": _stats["nvidia_requests"],
"passthrough": _stats["passthrough_requests"],
"ratelimited": _stats["ratelimited_requests"],
"total": ctx.stats["total_requests"],
"nvidia": ctx.stats["nvidia_requests"],
"passthrough": ctx.stats["passthrough_requests"],
"ratelimited": ctx.stats["ratelimited_requests"],
},
"errors": {
"queue_full_rejects": _stats["queue_full_rejects"],
"upstream_errors": _stats["upstream_errors"],
"queue_full_rejects": ctx.stats["queue_full_rejects"],
"upstream_errors": ctx.stats["upstream_errors"],
},
"queue": queue_stats,
"token_bucket": bucket_status,
"retreat": {
"state": _token_bucket.get_retreat_state(),
"effective_rpm": round(_token_bucket.get_effective_rate_rpm(), 1),
"base_rpm": round(_token_bucket.get_base_rate_rpm(), 1),
"upstream_429_rate": round(_token_bucket.get_429_rate(), 4),
"state": ctx.token_bucket.get_retreat_state(),
"effective_rpm": round(ctx.token_bucket.get_effective_rate_rpm(), 1),
"base_rpm": round(ctx.token_bucket.get_base_rate_rpm(), 1),
"upstream_429_rate": round(ctx.token_bucket.get_429_rate(), 4),
},
"uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
"uptime_seconds": ctx.uptime_seconds,
}
# ---- OpenAI 兼容端点 ----
@app.post("/v1/chat/completions")
async def chat_completions(request: Request) -> Response:
async def chat_completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
"""OpenAI Chat Completions API 代理(含流式支持)。"""
return await _handle_proxy_request(request, "/v1/chat/completions")
return await _handle_proxy_request(ctx, request, "/v1/chat/completions")
@app.post("/v1/completions")
async def completions(request: Request) -> Response:
async def completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
"""OpenAI Completions API 代理(legacy)。"""
return await _handle_proxy_request(request, "/v1/completions")
return await _handle_proxy_request(ctx, request, "/v1/completions")
@app.post("/v1/embeddings")
async def embeddings(request: Request) -> Response:
async def embeddings(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
"""OpenAI Embeddings API 代理。"""
return await _handle_proxy_request(request, "/v1/embeddings")
return await _handle_proxy_request(ctx, request, "/v1/embeddings")
@app.get("/v1/models")
@app.get("/v1/models/{model_id:path}")
async def list_models(request: Request, model_id: str | None = None) -> Response:
async def list_models(request: Request, model_id: str | None = None, ctx: SidecarContext = Depends(get_context)) -> Response:
"""OpenAI Models API 代理。"""
path = f"/v1/models/{model_id}" if model_id else "/v1/models"
return await _handle_proxy_request(request, path)
return await _handle_proxy_request(ctx, request, path)
# ---- 通用代理(catch-all 用于非标准 NVIDIA 端点) ----
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
async def catch_all(request: Request, path: str) -> Response:
async def catch_all(request: Request, path: str, ctx: SidecarContext = Depends(get_context)) -> Response:
"""通用代理端点:转发任何未匹配的路径到上游。"""
target_path = f"/{path}" if not path.startswith("/") else path
return await _handle_proxy_request(request, target_path)
return await _handle_proxy_request(ctx, request, target_path)
# ---------------------------------------------------------------------------