b18d243ef2
1. 架构解耦 — SidecarContext + FastAPI Depends 注入 - 新增 context.py: SidecarContext dataclass 收敛全部全局状态 - server.py: 移除模块级全局变量,lifespan 创建 ctx → app.state.sidecar - webui.py: 移除反向导入 server,改用 Depends(get_context) 2. Prometheus 标签基数治理 — model_id → provider - upstream_latency_seconds / upstream_errors_total label 收敛为 provider - 模型级信息保留在 structlog JSON 日志 3. SSE 快照共享缓存 - 1s TTL 共享 snapshot cache + double-check locking - 多客户端不重复构建快照 4. 部署支撑 - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK) - systemd service (安全加固, 资源限制) - .env.example (完整环境变量清单) 5. Readiness HTTP Client 复用 - check_upstream() 注入主 http_client,不再每次创建新 client 6. Retreat 并发回归测试 - 5 个测试用例全部通过(死锁检测 + 状态转换 + 并发安全) 7. Dashboard UX 优化 - 队列柱状图 300ms 平滑动画 - SSE 断连 5s 半透明遮罩 - 队列图标题显示总排队数 - 页面加载同步配置 验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes) Co-authored-by: multica-agent <github@multica.ai>
822 lines
29 KiB
Python
822 lines
29 KiB
Python
"""
|
||
NVIDIA Sidecar 限流代理 — FastAPI 代理主入口 (§3.4)
|
||
|
||
完整的 API 代理链路:
|
||
接收 → 网关识别 → [NVIDIA: 排队 → 令牌限流] → httpx 转发 → 返回
|
||
|
||
非 NVIDIA 请求直通上游,NVIDIA 请求经过四级优先级队列 + 令牌桶限流。
|
||
|
||
BIZ-46 Phase3: 架构解耦 — 所有全局状态收敛为 SidecarContext (§1)
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import time
|
||
from collections.abc import AsyncGenerator
|
||
from contextlib import asynccontextmanager
|
||
from typing import Any
|
||
|
||
import httpx
|
||
import structlog
|
||
import uvicorn
|
||
from fastapi import Depends, FastAPI, Request, Response
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from fastapi.responses import JSONResponse, StreamingResponse
|
||
|
||
from nvidia_sidecar.config import load_config, SidecarConfig
|
||
from nvidia_sidecar.context import SidecarContext
|
||
from nvidia_sidecar.rate_limiter import (
|
||
Priority,
|
||
AdaptiveTokenBucket,
|
||
is_nvidia_gateway,
|
||
)
|
||
from nvidia_sidecar.priority_queue import (
|
||
PriorityRequestQueue,
|
||
QueueFullError,
|
||
QueueFullPassthrough,
|
||
QueueFullPolicy,
|
||
)
|
||
from nvidia_sidecar.metrics import PrometheusMetrics
|
||
from nvidia_sidecar.health import HealthService
|
||
from nvidia_sidecar.webui import webui_router
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 结构化日志
|
||
# ---------------------------------------------------------------------------
|
||
|
||
structlog.configure(
|
||
processors=[
|
||
structlog.stdlib.filter_by_level,
|
||
structlog.stdlib.add_logger_name,
|
||
structlog.stdlib.add_log_level,
|
||
structlog.stdlib.PositionalArgumentsFormatter(),
|
||
structlog.processors.TimeStamper(fmt="iso"),
|
||
structlog.processors.StackInfoRenderer(),
|
||
structlog.processors.format_exc_info,
|
||
structlog.processors.UnicodeDecoder(),
|
||
structlog.processors.JSONRenderer(),
|
||
],
|
||
context_class=dict,
|
||
logger_factory=structlog.PrintLoggerFactory(),
|
||
wrapper_class=structlog.stdlib.BoundLogger,
|
||
cache_logger_on_first_use=True,
|
||
)
|
||
logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# FastAPI 依赖注入
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def get_context(request: Request) -> SidecarContext:
|
||
"""从 app.state 获取 SidecarContext(FastAPI 依赖注入)。"""
|
||
return request.app.state.sidecar # type: ignore[no-any-return]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 工具函数
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _extract_model(body: Any) -> str | None:
|
||
"""从请求体中提取模型标识符(兼容 OpenAI Chat/Completions 格式)。
|
||
|
||
Args:
|
||
body: 已解析的 JSON 请求体。
|
||
|
||
Returns:
|
||
模型标识符字符串,或 None。
|
||
"""
|
||
if isinstance(body, dict):
|
||
return str(body.get("model", "")) or None
|
||
return None
|
||
|
||
|
||
def _resolve_priority(headers: dict[str, str]) -> Priority:
|
||
"""从请求 headers 解析优先级。
|
||
|
||
检查 ``X-Priority`` header,值为 ``urgent``/``high``/``normal``/``low``,
|
||
不区分大小写。默认 NORMAL。
|
||
"""
|
||
raw = headers.get("x-priority", "").strip().lower()
|
||
mapping: dict[str, Priority] = {
|
||
"urgent": Priority.URGENT,
|
||
"high": Priority.HIGH,
|
||
"normal": Priority.NORMAL,
|
||
"low": Priority.LOW,
|
||
}
|
||
return mapping.get(raw, Priority.NORMAL)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 上游转发
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def _forward_to_upstream(
|
||
ctx: SidecarContext,
|
||
method: str,
|
||
path: str,
|
||
body: bytes | None,
|
||
headers: dict[str, str],
|
||
stream: bool = False,
|
||
) -> httpx.Response:
|
||
"""将请求转发到 NVIDIA 上游 API。
|
||
|
||
Args:
|
||
ctx: SidecarContext 运行时上下文。
|
||
method: HTTP 方法。
|
||
path: 请求路径(如 ``/v1/chat/completions``)。
|
||
body: 原始请求体 bytes。
|
||
headers: 要转发的请求 headers(会追加 Authorization)。
|
||
stream: 是否请求流式响应。
|
||
|
||
Returns:
|
||
httpx.Response 对象。
|
||
|
||
Raises:
|
||
httpx.HTTPError: HTTP 请求失败。
|
||
"""
|
||
upstream_url = ctx.config.upstream_url.rstrip("/") + path
|
||
forward_headers: dict[str, str] = {
|
||
k: v for k, v in headers.items()
|
||
if k.lower() not in ("host", "content-length", "transfer-encoding")
|
||
}
|
||
if ctx.config.upstream_api_key:
|
||
forward_headers["authorization"] = f"Bearer {ctx.config.upstream_api_key}"
|
||
elif "authorization" not in {k.lower() for k in forward_headers}:
|
||
forward_headers["authorization"] = "Bearer nvidia"
|
||
|
||
try:
|
||
req = ctx.http_client.build_request(
|
||
method=method,
|
||
url=upstream_url,
|
||
headers=forward_headers,
|
||
content=body,
|
||
timeout=ctx.config.request_timeout,
|
||
)
|
||
response = await ctx.http_client.send(req, stream=stream)
|
||
return response
|
||
except httpx.TimeoutException:
|
||
logger.warning("upstream_timeout", path=path, timeout=ctx.config.request_timeout)
|
||
raise
|
||
except httpx.HTTPError as exc:
|
||
logger.error("upstream_error", path=path, error=str(exc))
|
||
raise
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# worker 协程:消费优先级队列 + 令牌桶 + 转发
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def _worker_loop(ctx: SidecarContext) -> None:
|
||
"""后台 worker:持续从优先级队列取请求 → 令牌限流 → 转发 → 设置 future 结果。
|
||
|
||
Args:
|
||
ctx: SidecarContext 运行时上下文。
|
||
"""
|
||
log = logger.bind(worker="main")
|
||
log.info("worker_started")
|
||
|
||
while True:
|
||
try:
|
||
queue_item = await ctx.priority_queue.get(timeout=1.0)
|
||
if queue_item is None:
|
||
continue
|
||
|
||
request_id = queue_item.request_id
|
||
payload = queue_item.payload
|
||
headers = queue_item.headers
|
||
enqueued_at = queue_item.enqueued_at
|
||
|
||
# 查找对应的 pending future
|
||
pending_entry = ctx.pending_requests.get(request_id)
|
||
if pending_entry is None:
|
||
log.warning("orphan_request", request_id=request_id)
|
||
continue
|
||
future, _ = pending_entry
|
||
|
||
# 低优先级令牌等待超时处理
|
||
if queue_item.priority == Priority.LOW:
|
||
# 放线程池执行阻塞的令牌桶调用
|
||
got_token = await asyncio.to_thread(
|
||
ctx.token_bucket.try_consume,
|
||
tokens=1,
|
||
timeout=ctx.config.low_priority_timeout,
|
||
)
|
||
if not got_token:
|
||
log.info("low_priority_timeout", request_id=request_id)
|
||
await ctx.increment_stat("ratelimited_requests")
|
||
ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
|
||
if not future.done():
|
||
future.set_exception(
|
||
_RateLimitedError(
|
||
f"低优先级请求令牌等待超时 ({ctx.config.low_priority_timeout}s)"
|
||
)
|
||
)
|
||
ctx.pending_requests.pop(request_id, None)
|
||
continue
|
||
else:
|
||
# 非低优先级:在 worker 内轮询等待令牌,避免重入队导致 future 悬挂
|
||
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
|
||
if not got_token:
|
||
token_deadline = time.monotonic() + ctx.config.request_timeout
|
||
while not got_token:
|
||
await asyncio.sleep(0.1)
|
||
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
|
||
if time.monotonic() > token_deadline:
|
||
break
|
||
if not got_token:
|
||
log.warning(
|
||
"token_wait_timeout",
|
||
request_id=request_id,
|
||
priority=queue_item.priority.name,
|
||
timeout=ctx.config.request_timeout,
|
||
)
|
||
await ctx.increment_stat("ratelimited_requests")
|
||
ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
|
||
if not future.done():
|
||
future.set_exception(
|
||
_RateLimitedError(
|
||
f"令牌等待超时 ({ctx.config.request_timeout:.0f}s)"
|
||
)
|
||
)
|
||
ctx.pending_requests.pop(request_id, None)
|
||
continue
|
||
|
||
# 转发到上游
|
||
upstream_start = time.monotonic()
|
||
try:
|
||
path = headers.get("x-original-path", "/v1/chat/completions")
|
||
method = headers.get("x-original-method", "POST")
|
||
# 过滤内部 headers
|
||
clean_headers = {
|
||
k: v for k, v in headers.items()
|
||
if not k.startswith("x-original-") and not k.startswith("x-request-id")
|
||
}
|
||
|
||
resp = await _forward_to_upstream(
|
||
ctx=ctx,
|
||
method=method,
|
||
path=path,
|
||
body=payload.get("_raw_body"),
|
||
headers=clean_headers,
|
||
stream=payload.get("stream", False),
|
||
)
|
||
|
||
upstream_latency = time.monotonic() - upstream_start
|
||
queue_latency = time.monotonic() - enqueued_at
|
||
total_latency = upstream_latency + queue_latency
|
||
|
||
is_429: bool = resp.status_code == 429
|
||
ctx.token_bucket.record_response(is_429)
|
||
|
||
# 避退状态评估 + 指标更新
|
||
ctx.token_bucket.evaluate_retreat()
|
||
retreat_state = ctx.token_bucket.get_retreat_state()
|
||
effective_rpm = ctx.token_bucket.get_effective_rate_rpm()
|
||
upstream_429_rate = ctx.token_bucket.get_429_rate()
|
||
ctx.prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
|
||
|
||
# 模型级信息写入 JSON 日志 (BIZ-46 Phase3: provider label 收敛后保留)
|
||
model_id = _extract_model(payload) or "unknown"
|
||
log.info(
|
||
"request_completed",
|
||
request_id=request_id,
|
||
status=resp.status_code,
|
||
model_id=model_id,
|
||
upstream_latency=round(upstream_latency, 3),
|
||
queue_latency=round(queue_latency, 3),
|
||
total_latency=round(total_latency, 3),
|
||
retreat_state=retreat_state,
|
||
effective_rpm=round(effective_rpm, 1),
|
||
)
|
||
|
||
# 记录 Prometheus 指标 — provider 收敛(BIZ-46 Phase3)
|
||
provider = "nvidia"
|
||
ctx.prometheus.record_upstream_latency(provider, upstream_latency)
|
||
if not resp.is_success:
|
||
ctx.prometheus.record_upstream_error(resp.status_code, provider)
|
||
ctx.prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
|
||
ctx.prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
|
||
|
||
if not future.done():
|
||
future.set_result(resp)
|
||
|
||
except (httpx.HTTPError, OSError) as exc:
|
||
log.error("upstream_request_failed", request_id=request_id, error=str(exc))
|
||
await ctx.increment_stat("upstream_errors")
|
||
ctx.prometheus.record_request(queue_item.priority.name, "error")
|
||
ctx.prometheus.set_health(False)
|
||
if not future.done():
|
||
future.set_exception(exc)
|
||
|
||
ctx.pending_requests.pop(request_id, None)
|
||
|
||
except asyncio.CancelledError:
|
||
log.info("worker_cancelled")
|
||
break
|
||
except Exception:
|
||
log.exception("worker_unexpected_error")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# PASSTHROUGH 直通路径(队列满 + PASSTHROUGH 策略)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def _passthrough_with_rate_limit(
|
||
ctx: SidecarContext,
|
||
request: Request,
|
||
path: str,
|
||
body_bytes: bytes,
|
||
raw_headers: dict[str, str],
|
||
priority: Priority,
|
||
) -> Response:
|
||
"""队列满时的 PASSSTHROUGH 直通路径:仍受令牌桶限流,但不排队。
|
||
|
||
Args:
|
||
ctx: SidecarContext 运行时上下文。
|
||
request: FastAPI Request。
|
||
path: 请求路径。
|
||
body_bytes: 原始请求体。
|
||
raw_headers: 请求 headers。
|
||
priority: 请求优先级。
|
||
|
||
Returns:
|
||
FastAPI Response。
|
||
"""
|
||
await ctx.increment_stat("passthrough_requests")
|
||
ctx.prometheus.increment_fallback()
|
||
|
||
# 低优先级走令牌桶等待
|
||
if priority == Priority.LOW:
|
||
got_token = await asyncio.to_thread(
|
||
ctx.token_bucket.try_consume,
|
||
tokens=1,
|
||
timeout=ctx.config.low_priority_timeout,
|
||
)
|
||
if not got_token:
|
||
await ctx.increment_stat("ratelimited_requests")
|
||
ctx.prometheus.record_request(priority.name, "ratelimited")
|
||
return JSONResponse(
|
||
status_code=429,
|
||
content={
|
||
"error": {
|
||
"message": f"令牌不足(队列满 + passthrough),超时 {ctx.config.low_priority_timeout}s",
|
||
"type": "RateLimitedError",
|
||
}
|
||
},
|
||
)
|
||
else:
|
||
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
|
||
if not got_token:
|
||
deadline = time.monotonic() + ctx.config.request_timeout
|
||
while not got_token:
|
||
await asyncio.sleep(0.1)
|
||
got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
|
||
if time.monotonic() > deadline:
|
||
await ctx.increment_stat("ratelimited_requests")
|
||
ctx.prometheus.record_request(priority.name, "ratelimited")
|
||
return JSONResponse(
|
||
status_code=429,
|
||
content={
|
||
"error": {
|
||
"message": f"令牌不足(队列满 + passthrough),等待超时 {ctx.config.request_timeout:.0f}s",
|
||
"type": "RateLimitedError",
|
||
}
|
||
},
|
||
)
|
||
|
||
# 拿到令牌,直接转发
|
||
try:
|
||
clean_headers = {k: v for k, v in raw_headers.items()}
|
||
resp = await _forward_to_upstream(
|
||
ctx=ctx,
|
||
method=request.method,
|
||
path=path,
|
||
body=body_bytes if body_bytes else None,
|
||
headers=clean_headers,
|
||
stream=False,
|
||
)
|
||
retreat_state = ctx.token_bucket.get_retreat_state()
|
||
ctx.token_bucket.evaluate_retreat()
|
||
ctx.prometheus.update_retreat_metrics(
|
||
retreat_state,
|
||
ctx.token_bucket.get_effective_rate_rpm(),
|
||
ctx.token_bucket.get_429_rate(),
|
||
)
|
||
return _build_response(resp)
|
||
except Exception as exc:
|
||
status, msg = _map_exception(exc)
|
||
logger.error("passthrough_error", path=path, error=str(exc))
|
||
ctx.prometheus.set_health(False)
|
||
return JSONResponse(
|
||
status_code=status,
|
||
content={"error": {"message": msg, "type": type(exc).__name__}},
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 自定义异常
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class _RateLimitedError(Exception):
|
||
"""429 限流错误。"""
|
||
pass
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 异常处理矩阵 (§3.4)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_EXCEPTION_MATRIX: dict[type[Exception], tuple[int, str]] = {
|
||
_RateLimitedError: (429, "Too Many Requests — 令牌不足"),
|
||
QueueFullError: (503, "Service Unavailable — 队列已满"),
|
||
httpx.TimeoutException: (504, "Gateway Timeout — 上游超时"),
|
||
httpx.ConnectError: (502, "Bad Gateway — 上游连接失败"),
|
||
httpx.HTTPStatusError: (502, "Bad Gateway — 上游返回错误状态"),
|
||
}
|
||
|
||
|
||
def _map_exception(exc: Exception) -> tuple[int, str]:
|
||
"""将异常映射为 HTTP 状态码 + 错误信息。"""
|
||
for exc_type, (status, msg) in _EXCEPTION_MATRIX.items():
|
||
if isinstance(exc, exc_type):
|
||
return status, msg
|
||
return 500, f"Internal Server Error — {type(exc).__name__}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# FastAPI 应用 + lifespan
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@asynccontextmanager
|
||
async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
|
||
"""应用生命周期管理:初始化/清理全局资源。
|
||
|
||
BIZ-46 Phase3: 所有资源收敛到 SidecarContext,挂载于 app.state.sidecar。
|
||
"""
|
||
# 启动
|
||
config: SidecarConfig = load_config()
|
||
logging.getLogger().setLevel(config.log_level.upper())
|
||
|
||
http_client: httpx.AsyncClient = httpx.AsyncClient(
|
||
timeout=httpx.Timeout(config.request_timeout),
|
||
limits=httpx.Limits(
|
||
max_connections=100,
|
||
max_keepalive_connections=20,
|
||
),
|
||
)
|
||
priority_queue: PriorityRequestQueue = PriorityRequestQueue(max_size=config.queue_max_size)
|
||
token_bucket: AdaptiveTokenBucket = AdaptiveTokenBucket(
|
||
rate=config.rate_rpm / 60.0,
|
||
capacity=config.bucket_capacity,
|
||
)
|
||
prometheus: PrometheusMetrics = PrometheusMetrics()
|
||
health: HealthService = HealthService()
|
||
|
||
ctx: SidecarContext = SidecarContext(
|
||
config=config,
|
||
http_client=http_client,
|
||
token_bucket=token_bucket,
|
||
priority_queue=priority_queue,
|
||
prometheus=prometheus,
|
||
health=health,
|
||
)
|
||
ctx.stats["start_time"] = int(time.time())
|
||
app.state.sidecar = ctx # 注入 FastAPI
|
||
|
||
# 启动 worker 协程
|
||
worker_task = asyncio.create_task(_worker_loop(ctx))
|
||
|
||
# 在独立端口 :9191 启动 Prometheus metrics 服务器
|
||
metrics_app = prometheus.build_asgi_app()
|
||
metrics_config = uvicorn.Config(
|
||
metrics_app,
|
||
host=config.listen_host,
|
||
port=config.metrics_port,
|
||
log_level="error",
|
||
)
|
||
metrics_server = uvicorn.Server(metrics_config)
|
||
_metrics_task = asyncio.create_task(metrics_server.serve())
|
||
|
||
# CORS 中间件(严维序评审 #8)
|
||
app.add_middleware(
|
||
CORSMiddleware,
|
||
allow_origins=["*"],
|
||
allow_credentials=False,
|
||
allow_methods=["*"],
|
||
allow_headers=["*"],
|
||
)
|
||
|
||
# 挂载 webui 子路由
|
||
app.include_router(webui_router)
|
||
|
||
# upstream_api_key 启动检查(严维序评审 #5)
|
||
if not config.upstream_api_key:
|
||
logger.warning(
|
||
"upstream_api_key_empty",
|
||
message="SIDECAR_API_KEY 未设置,NVIDIA 请求将因 401 认证失败",
|
||
)
|
||
|
||
logger.info(
|
||
"sidecar_started",
|
||
host=config.listen_host,
|
||
port=config.listen_port,
|
||
metrics_port=config.metrics_port,
|
||
rate_rpm=config.rate_rpm,
|
||
queue_max=config.queue_max_size,
|
||
retreat_enabled=True,
|
||
)
|
||
|
||
yield # app 运行中
|
||
|
||
# 关闭
|
||
worker_task.cancel()
|
||
try:
|
||
await worker_task
|
||
except asyncio.CancelledError:
|
||
pass
|
||
|
||
_metrics_task.cancel()
|
||
try:
|
||
await _metrics_task
|
||
except asyncio.CancelledError:
|
||
pass
|
||
|
||
await http_client.aclose()
|
||
logger.info("sidecar_stopped")
|
||
|
||
|
||
def _mask_api_key(key: str) -> str:
|
||
"""对 API Key 进行脱敏处理,仅保留前 4 位以供识别。"""
|
||
if not key:
|
||
return ""
|
||
if len(key) <= 4:
|
||
return key[:2] + "****"
|
||
return key[:4] + "****"
|
||
|
||
|
||
app: FastAPI = FastAPI(
|
||
title="NVIDIA Sidecar Rate-Limiting Proxy",
|
||
version="0.1.0",
|
||
lifespan=lifespan,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 核心代理处理器
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def _handle_proxy_request(ctx: SidecarContext, request: Request, path: str) -> Response:
|
||
"""统一的代理请求处理入口。
|
||
|
||
执行完整链路:
|
||
1. 解析请求体 → 提取 model
|
||
2. 网关识别 → 非 NVIDIA 直通
|
||
3. NVIDIA → 排队 + 令牌限流 + 转发
|
||
"""
|
||
await ctx.increment_stat("total_requests")
|
||
|
||
# 解析请求
|
||
body_bytes: bytes = await request.body()
|
||
raw_headers: dict[str, str] = dict(request.headers)
|
||
|
||
# 尝试解析 JSON body
|
||
body_json: dict[str, Any] = {}
|
||
try:
|
||
if body_bytes:
|
||
body_json = __import__("json").loads(body_bytes)
|
||
except (ValueError, TypeError):
|
||
body_json = {}
|
||
|
||
# 提取 model 进行网关识别
|
||
model: str | None = _extract_model(body_json)
|
||
is_nvidia: bool = is_nvidia_gateway(model)
|
||
|
||
# 非 NVIDIA → 直接转发
|
||
if not is_nvidia:
|
||
await ctx.increment_stat("passthrough_requests")
|
||
try:
|
||
resp = await _forward_to_upstream(
|
||
ctx=ctx,
|
||
method=request.method,
|
||
path=path,
|
||
body=body_bytes if body_bytes else None,
|
||
headers=raw_headers,
|
||
stream=body_json.get("stream", False),
|
||
)
|
||
return _build_response(resp)
|
||
except Exception as exc:
|
||
status, msg = _map_exception(exc)
|
||
logger.error("passthrough_error", path=path, error=str(exc))
|
||
return JSONResponse(
|
||
status_code=status,
|
||
content={"error": {"message": msg, "type": type(exc).__name__}},
|
||
)
|
||
|
||
# NVIDIA → 排队 + 限流 + 转发
|
||
await ctx.increment_stat("nvidia_requests")
|
||
priority: Priority = _resolve_priority(raw_headers)
|
||
|
||
# 注入内部元数据到 payload
|
||
payload_for_queue: dict[str, Any] = dict(body_json)
|
||
payload_for_queue["_raw_body"] = body_bytes
|
||
|
||
# 尝试入队;PASSTHROUGH 策略下队列满时走直通路径
|
||
try:
|
||
request_id = await ctx.priority_queue.put(
|
||
item=payload_for_queue,
|
||
priority=priority,
|
||
headers={
|
||
**raw_headers,
|
||
"x-original-path": path,
|
||
"x-original-method": request.method,
|
||
},
|
||
)
|
||
except QueueFullError:
|
||
await ctx.increment_stat("queue_full_rejects")
|
||
return JSONResponse(
|
||
status_code=503,
|
||
content={
|
||
"error": {
|
||
"message": "队列已满,当前策略: reject",
|
||
"type": "QueueFullError",
|
||
}
|
||
},
|
||
)
|
||
except QueueFullPassthrough:
|
||
await ctx.increment_stat("passthrough_requests")
|
||
logger.info("queue_full_passthrough", path=path)
|
||
return await _passthrough_with_rate_limit(ctx, request, path, body_bytes, raw_headers, priority)
|
||
|
||
# 创建 future 并注册到 pending
|
||
loop = asyncio.get_running_loop()
|
||
future: asyncio.Future[httpx.Response] = loop.create_future()
|
||
ctx.pending_requests[request_id] = (future, time.monotonic())
|
||
|
||
try:
|
||
resp = await future
|
||
return _build_response(resp)
|
||
except _RateLimitedError as exc:
|
||
return JSONResponse(
|
||
status_code=429,
|
||
content={
|
||
"error": {
|
||
"message": str(exc),
|
||
"type": "RateLimitedError",
|
||
}
|
||
},
|
||
)
|
||
except Exception as exc:
|
||
status, msg = _map_exception(exc)
|
||
logger.error("proxy_error", path=path, request_id=request_id, error=str(exc))
|
||
return JSONResponse(
|
||
status_code=status,
|
||
content={"error": {"message": msg, "type": type(exc).__name__}},
|
||
)
|
||
|
||
|
||
def _build_response(resp: httpx.Response) -> Response:
|
||
"""将 httpx.Response 转换为 FastAPI Response。
|
||
|
||
支持 JSON 和流式 (SSE) 两种响应类型。
|
||
"""
|
||
content_type = resp.headers.get("content-type", "")
|
||
|
||
# 流式响应 (SSE)
|
||
if "text/event-stream" in content_type or "stream" in content_type:
|
||
return StreamingResponse(
|
||
content=resp.aiter_bytes(),
|
||
status_code=resp.status_code,
|
||
headers={
|
||
k: v for k, v in resp.headers.items()
|
||
if k.lower() not in ("content-encoding", "transfer-encoding")
|
||
},
|
||
media_type="text/event-stream",
|
||
)
|
||
|
||
# 普通 JSON 响应
|
||
return Response(
|
||
content=resp.content,
|
||
status_code=resp.status_code,
|
||
headers={
|
||
k: v for k, v in resp.headers.items()
|
||
if k.lower() not in ("content-encoding", "transfer-encoding")
|
||
},
|
||
media_type=content_type or "application/json",
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 路由
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@app.get("/health")
|
||
async def health(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
|
||
"""存活检查 (liveness)。"""
|
||
return ctx.health.liveness()
|
||
|
||
|
||
@app.get("/health/ready")
|
||
async def health_ready(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
|
||
"""就绪检查 (readiness),含上游连通性。
|
||
|
||
BIZ-46 Phase3: 复用 ctx.http_client,不再每次创建新 client。
|
||
"""
|
||
queue_size = await ctx.priority_queue.get_queue_size()
|
||
bucket_status = ctx.token_bucket.get_status()
|
||
return await ctx.health.readiness(
|
||
upstream_url=ctx.config.upstream_url,
|
||
upstream_api_key=ctx.config.upstream_api_key or "",
|
||
queue_current_size=queue_size,
|
||
queue_max_size=ctx.config.queue_max_size,
|
||
available_tokens=bucket_status["tokens"],
|
||
bucket_capacity=bucket_status["capacity"],
|
||
http_client=ctx.http_client, # 复用主 client
|
||
)
|
||
|
||
|
||
@app.get("/status")
|
||
async def status(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
|
||
"""调试用:限流器 + 队列 + 避退完整状态。"""
|
||
queue_stats = await ctx.priority_queue.get_stats()
|
||
bucket_status = ctx.token_bucket.get_status()
|
||
return {
|
||
"requests": {
|
||
"total": ctx.stats["total_requests"],
|
||
"nvidia": ctx.stats["nvidia_requests"],
|
||
"passthrough": ctx.stats["passthrough_requests"],
|
||
"ratelimited": ctx.stats["ratelimited_requests"],
|
||
},
|
||
"errors": {
|
||
"queue_full_rejects": ctx.stats["queue_full_rejects"],
|
||
"upstream_errors": ctx.stats["upstream_errors"],
|
||
},
|
||
"queue": queue_stats,
|
||
"token_bucket": bucket_status,
|
||
"retreat": {
|
||
"state": ctx.token_bucket.get_retreat_state(),
|
||
"effective_rpm": round(ctx.token_bucket.get_effective_rate_rpm(), 1),
|
||
"base_rpm": round(ctx.token_bucket.get_base_rate_rpm(), 1),
|
||
"upstream_429_rate": round(ctx.token_bucket.get_429_rate(), 4),
|
||
},
|
||
"uptime_seconds": ctx.uptime_seconds,
|
||
}
|
||
|
||
|
||
# ---- OpenAI 兼容端点 ----
|
||
|
||
@app.post("/v1/chat/completions")
|
||
async def chat_completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
|
||
"""OpenAI Chat Completions API 代理(含流式支持)。"""
|
||
return await _handle_proxy_request(ctx, request, "/v1/chat/completions")
|
||
|
||
|
||
@app.post("/v1/completions")
|
||
async def completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
|
||
"""OpenAI Completions API 代理(legacy)。"""
|
||
return await _handle_proxy_request(ctx, request, "/v1/completions")
|
||
|
||
|
||
@app.post("/v1/embeddings")
|
||
async def embeddings(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
|
||
"""OpenAI Embeddings API 代理。"""
|
||
return await _handle_proxy_request(ctx, request, "/v1/embeddings")
|
||
|
||
|
||
@app.get("/v1/models")
|
||
@app.get("/v1/models/{model_id:path}")
|
||
async def list_models(request: Request, model_id: str | None = None, ctx: SidecarContext = Depends(get_context)) -> Response:
|
||
"""OpenAI Models API 代理。"""
|
||
path = f"/v1/models/{model_id}" if model_id else "/v1/models"
|
||
return await _handle_proxy_request(ctx, request, path)
|
||
|
||
|
||
# ---- 通用代理(catch-all 用于非标准 NVIDIA 端点) ----
|
||
|
||
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
|
||
async def catch_all(request: Request, path: str, ctx: SidecarContext = Depends(get_context)) -> Response:
|
||
"""通用代理端点:转发任何未匹配的路径到上游。"""
|
||
target_path = f"/{path}" if not path.startswith("/") else path
|
||
return await _handle_proxy_request(ctx, request, target_path)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 入口
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main() -> None:
|
||
"""开发/调试入口。"""
|
||
import uvicorn
|
||
cfg: SidecarConfig = load_config()
|
||
uvicorn.run(
|
||
"nvidia_sidecar.server:app",
|
||
host=cfg.listen_host,
|
||
port=cfg.listen_port,
|
||
log_level=cfg.log_level.lower(),
|
||
)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main() |