BIZ-46 Phase3: 7项 follow-up 开发完成

1. 架构解耦 — SidecarContext + FastAPI Depends 注入
   - 新增 context.py: SidecarContext dataclass 收敛全部全局状态
   - server.py: 移除模块级全局变量，lifespan 创建 ctx → app.state.sidecar
   - webui.py: 移除反向导入 server，改用 Depends(get_context)
2. Prometheus 标签基数治理 — model_id → provider
   - upstream_latency_seconds / upstream_errors_total label 收敛为 provider
   - 模型级信息保留在 structlog JSON 日志
3. SSE 快照共享缓存
   - 1s TTL 共享 snapshot cache + double-check locking
   - 多客户端不重复构建快照
4. 部署支撑
   - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK)
   - systemd service (安全加固, 资源限制)
   - .env.example (完整环境变量清单)
5. Readiness HTTP Client 复用
   - check_upstream() 注入主 http_client，不再每次创建新 client
6. Retreat 并发回归测试
   - 5 个测试用例全部通过（死锁检测 + 状态转换 + 并发安全）
7. Dashboard UX 优化
   - 队列柱状图 300ms 平滑动画
   - SSE 断连 5s 半透明遮罩
   - 队列图标题显示总排队数
   - 页面加载同步配置

验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes)

Co-authored-by: multica-agent <github@multica.ai>
2026-06-24 22:26:35 +08:00
parent 8a12ff9693
commit b18d243ef2
12 changed files with 928 additions and 312 deletions
@@ -5,6 +5,8 @@ NVIDIA Sidecar 限流代理 — FastAPI 代理主入口 (§3.4)
    接收 → 网关识别 → [NVIDIA: 排队 → 令牌限流] → httpx 转发 → 返回

 非 NVIDIA 请求直通上游，NVIDIA 请求经过四级优先级队列 + 令牌桶限流。
+
+BIZ-46 Phase3: 架构解耦 — 所有全局状态收敛为 SidecarContext (§1)
 """

 from __future__ import annotations
@@ -19,11 +21,12 @@ from typing import Any
 import httpx
 import structlog
 import uvicorn
-from fastapi import FastAPI, Request, Response
+from fastapi import Depends, FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse

 from nvidia_sidecar.config import load_config, SidecarConfig
+from nvidia_sidecar.context import SidecarContext
 from nvidia_sidecar.rate_limiter import (
    Priority,
    AdaptiveTokenBucket,
@@ -64,42 +67,18 @@ logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar")


 # ---------------------------------------------------------------------------
-# 全局状态（通过 lifespan 初始化，模块级引用方便路由访问）
+# FastAPI 依赖注入
 # ---------------------------------------------------------------------------

-_config: SidecarConfig
-_http_client: httpx.AsyncClient
-_priority_queue: PriorityRequestQueue
-_token_bucket: AdaptiveTokenBucket
-_prometheus: PrometheusMetrics
-_health_service: HealthService
-_pending_requests: dict[str, tuple[asyncio.Future[httpx.Response], float]]
-"""request_id → (response future, enqueued_at) 的映射。"""
-_metrics_task: asyncio.Task[None] | None = None
-
-# 统计计数器（受 _stats_lock 保护, 修复梁思筑评审 #1: data race）
-_stats: dict[str, int] = {
-    "total_requests": 0,
-    "nvidia_requests": 0,
-    "passthrough_requests": 0,
-    "ratelimited_requests": 0,
-    "queue_full_rejects": 0,
-    "upstream_errors": 0,
-    "start_time": 0,
-}
-_stats_lock: asyncio.Lock = asyncio.Lock()
+def get_context(request: Request) -> SidecarContext:
+    """FastAPI dependency: return the SidecarContext stored on ``request.app.state.sidecar``."""
+    return request.app.state.sidecar  # type: ignore[no-any-return]


 # ---------------------------------------------------------------------------
 # 工具函数
 # ---------------------------------------------------------------------------

-async def _increment_stat(key: str, delta: int = 1) -> None:
-    """线程安全的 _stats 计数器自增（梁思筑评审 #1 修复：消除 data race）。"""
-    async with _stats_lock:
-        _stats[key] = _stats.get(key, 0) + delta
-
-
 def _extract_model(body: Any) -> str | None:
    """从请求体中提取模型标识符（兼容 OpenAI Chat/Completions 格式）。

@@ -135,6 +114,7 @@ def _resolve_priority(headers: dict[str, str]) -> Priority:
 # ---------------------------------------------------------------------------

 async def _forward_to_upstream(
+    ctx: SidecarContext,
    method: str,
    path: str,
    body: bytes | None,
@@ -144,6 +124,7 @@ async def _forward_to_upstream(
    """将请求转发到 NVIDIA 上游 API。

    Args:
+        ctx: SidecarContext 运行时上下文。
        method: HTTP 方法。
        path: 请求路径（如 ``/v1/chat/completions``）。
        body: 原始请求体 bytes。
@@ -156,28 +137,28 @@ async def _forward_to_upstream(
    Raises:
        httpx.HTTPError: HTTP 请求失败。
    """
-    upstream_url = _config.upstream_url.rstrip("/") + path
+    upstream_url = ctx.config.upstream_url.rstrip("/") + path
    forward_headers: dict[str, str] = {
        k: v for k, v in headers.items()
        if k.lower() not in ("host", "content-length", "transfer-encoding")
    }
-    if _config.upstream_api_key:
-        forward_headers["authorization"] = f"Bearer {_config.upstream_api_key}"
+    if ctx.config.upstream_api_key:
+        forward_headers["authorization"] = f"Bearer {ctx.config.upstream_api_key}"
    elif "authorization" not in {k.lower() for k in forward_headers}:
        forward_headers["authorization"] = "Bearer nvidia"

    try:
-        req = _http_client.build_request(
+        req = ctx.http_client.build_request(
            method=method,
            url=upstream_url,
            headers=forward_headers,
            content=body,
-            timeout=_config.request_timeout,
+            timeout=ctx.config.request_timeout,
        )
-        response = await _http_client.send(req, stream=stream)
+        response = await ctx.http_client.send(req, stream=stream)
        return response
    except httpx.TimeoutException:
-        logger.warning("upstream_timeout", path=path, timeout=_config.request_timeout)
+        logger.warning("upstream_timeout", path=path, timeout=ctx.config.request_timeout)
        raise
    except httpx.HTTPError as exc:
        logger.error("upstream_error", path=path, error=str(exc))
@@ -188,14 +169,18 @@ async def _forward_to_upstream(
 # worker 协程：消费优先级队列 + 令牌桶 + 转发
 # ---------------------------------------------------------------------------

-async def _worker_loop() -> None:
-    """后台 worker：持续从优先级队列取请求 → 令牌限流 → 转发 → 设置 future 结果。"""
+async def _worker_loop(ctx: SidecarContext) -> None:
+    """后台 worker：持续从优先级队列取请求 → 令牌限流 → 转发 → 设置 future 结果。
+
+    Args:
+        ctx: SidecarContext 运行时上下文。
+    """
    log = logger.bind(worker="main")
    log.info("worker_started")

    while True:
        try:
-            queue_item = await _priority_queue.get(timeout=1.0)
+            queue_item = await ctx.priority_queue.get(timeout=1.0)
            if queue_item is None:
                continue

@@ -205,7 +190,7 @@ async def _worker_loop() -> None:
            enqueued_at = queue_item.enqueued_at

            # 查找对应的 pending future
-            pending_entry = _pending_requests.get(request_id)
+            pending_entry = ctx.pending_requests.get(request_id)
            if pending_entry is None:
                log.warning("orphan_request", request_id=request_id)
                continue
@@ -215,31 +200,30 @@ async def _worker_loop() -> None:
            if queue_item.priority == Priority.LOW:
                # 放线程池执行阻塞的令牌桶调用
                got_token = await asyncio.to_thread(
-                    _token_bucket.try_consume,
+                    ctx.token_bucket.try_consume,
                    tokens=1,
-                    timeout=_config.low_priority_timeout,
+                    timeout=ctx.config.low_priority_timeout,
                )
                if not got_token:
                    log.info("low_priority_timeout", request_id=request_id)
-                    await _increment_stat("ratelimited_requests")
-                    _prometheus.record_request(queue_item.priority.name, "ratelimited")
+                    await ctx.increment_stat("ratelimited_requests")
+                    ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
                    if not future.done():
                        future.set_exception(
                            _RateLimitedError(
-                                f"低优先级请求令牌等待超时 ({_config.low_priority_timeout}s)"
+                                f"低优先级请求令牌等待超时 ({ctx.config.low_priority_timeout}s)"
                            )
                        )
-                    _pending_requests.pop(request_id, None)
+                    ctx.pending_requests.pop(request_id, None)
                    continue
            else:
                # 非低优先级：在 worker 内轮询等待令牌，避免重入队导致 future 悬挂
-                # （重入队会生成新 request_id，原 future 永不 resolve → 客户端永久 hang）
-                got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
+                got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
                if not got_token:
-                    token_deadline = time.monotonic() + _config.request_timeout
+                    token_deadline = time.monotonic() + ctx.config.request_timeout
                    while not got_token:
                        await asyncio.sleep(0.1)
-                        got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
+                        got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
                        if time.monotonic() > token_deadline:
                            break
                    if not got_token:
@@ -247,17 +231,17 @@ async def _worker_loop() -> None:
                            "token_wait_timeout",
                            request_id=request_id,
                            priority=queue_item.priority.name,
-                            timeout=_config.request_timeout,
+                            timeout=ctx.config.request_timeout,
                        )
-                        await _increment_stat("ratelimited_requests")
-                        _prometheus.record_request(queue_item.priority.name, "ratelimited")
+                        await ctx.increment_stat("ratelimited_requests")
+                        ctx.prometheus.record_request(queue_item.priority.name, "ratelimited")
                        if not future.done():
                            future.set_exception(
                                _RateLimitedError(
-                                    f"令牌等待超时 ({_config.request_timeout:.0f}s)"
+                                    f"令牌等待超时 ({ctx.config.request_timeout:.0f}s)"
                                )
                            )
-                        _pending_requests.pop(request_id, None)
+                        ctx.pending_requests.pop(request_id, None)
                        continue

            # 转发到上游
@@ -272,6 +256,7 @@ async def _worker_loop() -> None:
                }

                resp = await _forward_to_upstream(
+                    ctx=ctx,
                    method=method,
                    path=path,
                    body=payload.get("_raw_body"),
@@ -284,19 +269,22 @@ async def _worker_loop() -> None:
                total_latency = upstream_latency + queue_latency

                is_429: bool = resp.status_code == 429
-                _token_bucket.record_response(is_429)
+                ctx.token_bucket.record_response(is_429)

                # 避退状态评估 + 指标更新
-                _token_bucket.evaluate_retreat()
-                retreat_state = _token_bucket.get_retreat_state()
-                effective_rpm = _token_bucket.get_effective_rate_rpm()
-                upstream_429_rate = _token_bucket.get_429_rate()
-                _prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
+                ctx.token_bucket.evaluate_retreat()
+                retreat_state = ctx.token_bucket.get_retreat_state()
+                effective_rpm = ctx.token_bucket.get_effective_rate_rpm()
+                upstream_429_rate = ctx.token_bucket.get_429_rate()
+                ctx.prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)

+                # 模型级信息写入 JSON 日志 (BIZ-46 Phase3: provider label 收敛后保留)
+                model_id = _extract_model(payload) or "unknown"
                log.info(
                    "request_completed",
                    request_id=request_id,
                    status=resp.status_code,
+                    model_id=model_id,
                    upstream_latency=round(upstream_latency, 3),
                    queue_latency=round(queue_latency, 3),
                    total_latency=round(total_latency, 3),
@@ -304,26 +292,26 @@ async def _worker_loop() -> None:
                    effective_rpm=round(effective_rpm, 1),
                )

-                # 记录 Prometheus 指标
-                model_id = _extract_model(payload) or "unknown"
-                _prometheus.record_upstream_latency(model_id, upstream_latency)
+                # 记录 Prometheus 指标 — provider 收敛（BIZ-46 Phase3）
+                provider = "nvidia"
+                ctx.prometheus.record_upstream_latency(provider, upstream_latency)
                if not resp.is_success:
-                    _prometheus.record_upstream_error(resp.status_code, model_id)
-                _prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
-                _prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
+                    ctx.prometheus.record_upstream_error(resp.status_code, provider)
+                ctx.prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
+                ctx.prometheus.record_queue_latency(queue_item.priority.name, queue_latency)

                if not future.done():
                    future.set_result(resp)

            except (httpx.HTTPError, OSError) as exc:
                log.error("upstream_request_failed", request_id=request_id, error=str(exc))
-                await _increment_stat("upstream_errors")
-                _prometheus.record_request(queue_item.priority.name, "error")
-                _prometheus.set_health(False)
+                await ctx.increment_stat("upstream_errors")
+                ctx.prometheus.record_request(queue_item.priority.name, "error")
+                ctx.prometheus.set_health(False)
                if not future.done():
                    future.set_exception(exc)

-            _pending_requests.pop(request_id, None)
+            ctx.pending_requests.pop(request_id, None)

        except asyncio.CancelledError:
            log.info("worker_cancelled")
@@ -337,6 +325,7 @@ async def _worker_loop() -> None:
 # ---------------------------------------------------------------------------

 async def _passthrough_with_rate_limit(
+    ctx: SidecarContext,
    request: Request,
    path: str,
    body_bytes: bytes,
@@ -346,6 +335,7 @@ async def _passthrough_with_rate_limit(
    """队列满时的 PASSSTHROUGH 直通路径：仍受令牌桶限流，但不排队。

    Args:
+        ctx: SidecarContext 运行时上下文。
        request: FastAPI Request。
        path: 请求路径。
        body_bytes: 原始请求体。
@@ -355,45 +345,43 @@ async def _passthrough_with_rate_limit(
    Returns:
        FastAPI Response。
    """
-    await _increment_stat("passthrough_requests")
-    _prometheus.increment_fallback()
+    await ctx.increment_stat("passthrough_requests")
+    ctx.prometheus.increment_fallback()

    # 低优先级走令牌桶等待
    if priority == Priority.LOW:
        got_token = await asyncio.to_thread(
-            _token_bucket.try_consume,
+            ctx.token_bucket.try_consume,
            tokens=1,
-            timeout=_config.low_priority_timeout,
+            timeout=ctx.config.low_priority_timeout,
        )
        if not got_token:
-            await _increment_stat("ratelimited_requests")
-            _prometheus.record_request(priority.name, "ratelimited")
+            await ctx.increment_stat("ratelimited_requests")
+            ctx.prometheus.record_request(priority.name, "ratelimited")
            return JSONResponse(
                status_code=429,
                content={
                    "error": {
-                        "message": f"令牌不足（队列满 + passthrough），超时 {_config.low_priority_timeout}s",
+                        "message": f"令牌不足（队列满 + passthrough），超时 {ctx.config.low_priority_timeout}s",
                        "type": "RateLimitedError",
                    }
                },
            )
    else:
-        got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
+        got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
        if not got_token:
-            # 非低优先级轮询等待，使用 config.request_timeout 替代硬编码 30s
-            # （严维序评审 minor / 梁思筑评审 #3：hot-reload 假生效修复）
-            deadline = time.monotonic() + _config.request_timeout
+            deadline = time.monotonic() + ctx.config.request_timeout
            while not got_token:
                await asyncio.sleep(0.1)
-                got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
+                got_token = await asyncio.to_thread(ctx.token_bucket.consume, tokens=1)
                if time.monotonic() > deadline:
-                    await _increment_stat("ratelimited_requests")
-                    _prometheus.record_request(priority.name, "ratelimited")
+                    await ctx.increment_stat("ratelimited_requests")
+                    ctx.prometheus.record_request(priority.name, "ratelimited")
                    return JSONResponse(
                        status_code=429,
                        content={
                            "error": {
-                                "message": f"令牌不足（队列满 + passthrough），等待超时 {_config.request_timeout:.0f}s",
+                                "message": f"令牌不足（队列满 + passthrough），等待超时 {ctx.config.request_timeout:.0f}s",
                                "type": "RateLimitedError",
                            }
                        },
@@ -403,24 +391,25 @@ async def _passthrough_with_rate_limit(
    try:
        clean_headers = {k: v for k, v in raw_headers.items()}
        resp = await _forward_to_upstream(
+            ctx=ctx,
            method=request.method,
            path=path,
            body=body_bytes if body_bytes else None,
            headers=clean_headers,
            stream=False,
        )
-        retreat_state = _token_bucket.get_retreat_state()
-        _token_bucket.evaluate_retreat()
-        _prometheus.update_retreat_metrics(
+        retreat_state = ctx.token_bucket.get_retreat_state()
+        ctx.token_bucket.evaluate_retreat()
+        ctx.prometheus.update_retreat_metrics(
            retreat_state,
-            _token_bucket.get_effective_rate_rpm(),
-            _token_bucket.get_429_rate(),
+            ctx.token_bucket.get_effective_rate_rpm(),
+            ctx.token_bucket.get_429_rate(),
        )
        return _build_response(resp)
    except Exception as exc:
        status, msg = _map_exception(exc)
        logger.error("passthrough_error", path=path, error=str(exc))
-        _prometheus.set_health(False)
+        ctx.prometheus.set_health(False)
        return JSONResponse(
            status_code=status,
            content={"error": {"message": msg, "type": type(exc).__name__}},
@@ -463,40 +452,49 @@ def _map_exception(exc: Exception) -> tuple[int, str]:

@asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
-    """应用生命周期管理：初始化/清理全局资源。"""
-    global _config, _http_client, _priority_queue, _token_bucket, _pending_requests
-    global _prometheus, _health_service, _metrics_task
+    """应用生命周期管理：初始化/清理全局资源。

+    BIZ-46 Phase3: 所有资源收敛到 SidecarContext，挂载于 app.state.sidecar。
+    """
    # 启动
-    _config = load_config()
-    logging.getLogger().setLevel(_config.log_level.upper())
+    config: SidecarConfig = load_config()
+    logging.getLogger().setLevel(config.log_level.upper())

-    _http_client = httpx.AsyncClient(
-        timeout=httpx.Timeout(_config.request_timeout),
+    http_client: httpx.AsyncClient = httpx.AsyncClient(
+        timeout=httpx.Timeout(config.request_timeout),
        limits=httpx.Limits(
            max_connections=100,
            max_keepalive_connections=20,
        ),
    )
-    _priority_queue = PriorityRequestQueue(max_size=_config.queue_max_size)
-    _token_bucket = AdaptiveTokenBucket(
-        rate=_config.rate_rpm / 60.0,
-        capacity=_config.bucket_capacity,
+    priority_queue: PriorityRequestQueue = PriorityRequestQueue(max_size=config.queue_max_size)
+    token_bucket: AdaptiveTokenBucket = AdaptiveTokenBucket(
+        rate=config.rate_rpm / 60.0,
+        capacity=config.bucket_capacity,
    )
-    _prometheus = PrometheusMetrics()
-    _health_service = HealthService()
-    _pending_requests = {}
-    _stats["start_time"] = int(time.time())
+    prometheus: PrometheusMetrics = PrometheusMetrics()
+    health: HealthService = HealthService()
+
+    ctx: SidecarContext = SidecarContext(
+        config=config,
+        http_client=http_client,
+        token_bucket=token_bucket,
+        priority_queue=priority_queue,
+        prometheus=prometheus,
+        health=health,
+    )
+    ctx.stats["start_time"] = int(time.time())
+    app.state.sidecar = ctx  # 注入 FastAPI

    # 启动 worker 协程
-    worker_task = asyncio.create_task(_worker_loop())
+    worker_task = asyncio.create_task(_worker_loop(ctx))

    # 在独立端口 :9191 启动 Prometheus metrics 服务器
-    metrics_app = _prometheus.build_asgi_app()
+    metrics_app = prometheus.build_asgi_app()
    metrics_config = uvicorn.Config(
        metrics_app,
-        host=_config.listen_host,
-        port=_config.metrics_port,
+        host=config.listen_host,
+        port=config.metrics_port,
        log_level="error",
    )
    metrics_server = uvicorn.Server(metrics_config)
@@ -515,7 +513,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
    app.include_router(webui_router)

    # upstream_api_key 启动检查（严维序评审 #5）
-    if not _config.upstream_api_key:
+    if not config.upstream_api_key:
        logger.warning(
            "upstream_api_key_empty",
            message="SIDECAR_API_KEY 未设置，NVIDIA 请求将因 401 认证失败",
@@ -523,11 +521,11 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:

    logger.info(
        "sidecar_started",
-        host=_config.listen_host,
-        port=_config.listen_port,
-        metrics_port=_config.metrics_port,
-        rate_rpm=_config.rate_rpm,
-        queue_max=_config.queue_max_size,
+        host=config.listen_host,
+        port=config.listen_port,
+        metrics_port=config.metrics_port,
+        rate_rpm=config.rate_rpm,
+        queue_max=config.queue_max_size,
        retreat_enabled=True,
    )

@@ -540,17 +538,25 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
    except asyncio.CancelledError:
        pass

-    if _metrics_task is not None:
-        _metrics_task.cancel()
-        try:
-            await _metrics_task
-        except asyncio.CancelledError:
-            pass
+    _metrics_task.cancel()
+    try:
+        await _metrics_task
+    except asyncio.CancelledError:
+        pass

-    await _http_client.aclose()
+    await http_client.aclose()
    logger.info("sidecar_stopped")


+def _mask_api_key(key: str) -> str:
+    """Mask an API key for display: "" if empty, "****" if 4 chars or fewer, else first 4 chars + "****"."""
+    if not key:
+        return ""
+    if len(key) <= 4:
+        return "****"  # too short to expose any prefix without leaking most or all of the secret
+    return key[:4] + "****"
+
+
 app: FastAPI = FastAPI(
    title="NVIDIA Sidecar Rate-Limiting Proxy",
    version="0.1.0",
@@ -562,7 +568,7 @@ app: FastAPI = FastAPI(
 # 核心代理处理器
 # ---------------------------------------------------------------------------

-async def _handle_proxy_request(request: Request, path: str) -> Response:
+async def _handle_proxy_request(ctx: SidecarContext, request: Request, path: str) -> Response:
    """统一的代理请求处理入口。

    执行完整链路：
@@ -570,7 +576,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
    2. 网关识别 → 非 NVIDIA 直通
    3. NVIDIA → 排队 + 令牌限流 + 转发
    """
-    await _increment_stat("total_requests")
+    await ctx.increment_stat("total_requests")

    # 解析请求
    body_bytes: bytes = await request.body()
@@ -590,9 +596,10 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:

    # 非 NVIDIA → 直接转发
    if not is_nvidia:
-        await _increment_stat("passthrough_requests")
+        await ctx.increment_stat("passthrough_requests")
        try:
            resp = await _forward_to_upstream(
+                ctx=ctx,
                method=request.method,
                path=path,
                body=body_bytes if body_bytes else None,
@@ -609,7 +616,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
            )

    # NVIDIA → 排队 + 限流 + 转发
-    await _increment_stat("nvidia_requests")
+    await ctx.increment_stat("nvidia_requests")
    priority: Priority = _resolve_priority(raw_headers)

    # 注入内部元数据到 payload
@@ -618,7 +625,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:

    # 尝试入队；PASSTHROUGH 策略下队列满时走直通路径
    try:
-        request_id = await _priority_queue.put(
+        request_id = await ctx.priority_queue.put(
            item=payload_for_queue,
            priority=priority,
            headers={
@@ -628,7 +635,7 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
            },
        )
    except QueueFullError:
-        await _increment_stat("queue_full_rejects")
+        await ctx.increment_stat("queue_full_rejects")
        return JSONResponse(
            status_code=503,
            content={
@@ -639,18 +646,16 @@ async def _handle_proxy_request(request: Request, path: str) -> Response:
            },
        )
    except QueueFullPassthrough:
-        # 队列满 + PASSTHROUGH：绕过排队，尝试令牌桶后直接转发
-        await _increment_stat("passthrough_requests")
+        await ctx.increment_stat("passthrough_requests")
        logger.info("queue_full_passthrough", path=path)
-        return await _passthrough_with_rate_limit(request, path, body_bytes, raw_headers, priority)
+        return await _passthrough_with_rate_limit(ctx, request, path, body_bytes, raw_headers, priority)

    # 创建 future 并注册到 pending
    loop = asyncio.get_running_loop()
    future: asyncio.Future[httpx.Response] = loop.create_future()
-    _pending_requests[request_id] = (future, time.monotonic())
+    ctx.pending_requests[request_id] = (future, time.monotonic())

    try:
-        # 等待 worker 完成处理
        resp = await future
        return _build_response(resp)
    except _RateLimitedError as exc:
@@ -708,89 +713,93 @@ def _build_response(resp: httpx.Response) -> Response:
 # ---------------------------------------------------------------------------

@app.get("/health")
-async def health() -> dict[str, Any]:
+async def health(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
    """存活检查 (liveness)。"""
-    return _health_service.liveness()
+    return ctx.health.liveness()


@app.get("/health/ready")
-async def health_ready() -> dict[str, Any]:
-    """就绪检查 (readiness)，含上游连通性。"""
-    queue_size = await _priority_queue.get_queue_size()
-    bucket_status = _token_bucket.get_status()
-    return await _health_service.readiness(
-        upstream_url=_config.upstream_url,
-        upstream_api_key=_config.upstream_api_key or "",
+async def health_ready(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
+    """就绪检查 (readiness)，含上游连通性。
+
+    BIZ-46 Phase3: 复用 ctx.http_client，不再每次创建新 client。
+    """
+    queue_size = await ctx.priority_queue.get_queue_size()
+    bucket_status = ctx.token_bucket.get_status()
+    return await ctx.health.readiness(
+        upstream_url=ctx.config.upstream_url,
+        upstream_api_key=ctx.config.upstream_api_key or "",
        queue_current_size=queue_size,
-        queue_max_size=_config.queue_max_size,
+        queue_max_size=ctx.config.queue_max_size,
        available_tokens=bucket_status["tokens"],
        bucket_capacity=bucket_status["capacity"],
+        http_client=ctx.http_client,  # 复用主 client
    )


@app.get("/status")
-async def status() -> dict[str, Any]:
+async def status(ctx: SidecarContext = Depends(get_context)) -> dict[str, Any]:
    """调试用：限流器 + 队列 + 避退完整状态。"""
-    queue_stats = await _priority_queue.get_stats()
-    bucket_status = _token_bucket.get_status()
+    queue_stats = await ctx.priority_queue.get_stats()
+    bucket_status = ctx.token_bucket.get_status()
    return {
        "requests": {
-            "total": _stats["total_requests"],
-            "nvidia": _stats["nvidia_requests"],
-            "passthrough": _stats["passthrough_requests"],
-            "ratelimited": _stats["ratelimited_requests"],
+            "total": ctx.stats["total_requests"],
+            "nvidia": ctx.stats["nvidia_requests"],
+            "passthrough": ctx.stats["passthrough_requests"],
+            "ratelimited": ctx.stats["ratelimited_requests"],
        },
        "errors": {
-            "queue_full_rejects": _stats["queue_full_rejects"],
-            "upstream_errors": _stats["upstream_errors"],
+            "queue_full_rejects": ctx.stats["queue_full_rejects"],
+            "upstream_errors": ctx.stats["upstream_errors"],
        },
        "queue": queue_stats,
        "token_bucket": bucket_status,
        "retreat": {
-            "state": _token_bucket.get_retreat_state(),
-            "effective_rpm": round(_token_bucket.get_effective_rate_rpm(), 1),
-            "base_rpm": round(_token_bucket.get_base_rate_rpm(), 1),
-            "upstream_429_rate": round(_token_bucket.get_429_rate(), 4),
+            "state": ctx.token_bucket.get_retreat_state(),
+            "effective_rpm": round(ctx.token_bucket.get_effective_rate_rpm(), 1),
+            "base_rpm": round(ctx.token_bucket.get_base_rate_rpm(), 1),
+            "upstream_429_rate": round(ctx.token_bucket.get_429_rate(), 4),
        },
-        "uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
+        "uptime_seconds": ctx.uptime_seconds,
    }


 # ---- OpenAI 兼容端点 ----

@app.post("/v1/chat/completions")
-async def chat_completions(request: Request) -> Response:
+async def chat_completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
    """OpenAI Chat Completions API 代理（含流式支持）。"""
-    return await _handle_proxy_request(request, "/v1/chat/completions")
+    return await _handle_proxy_request(ctx, request, "/v1/chat/completions")


@app.post("/v1/completions")
-async def completions(request: Request) -> Response:
+async def completions(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
    """OpenAI Completions API 代理（legacy）。"""
-    return await _handle_proxy_request(request, "/v1/completions")
+    return await _handle_proxy_request(ctx, request, "/v1/completions")


@app.post("/v1/embeddings")
-async def embeddings(request: Request) -> Response:
+async def embeddings(request: Request, ctx: SidecarContext = Depends(get_context)) -> Response:
    """OpenAI Embeddings API 代理。"""
-    return await _handle_proxy_request(request, "/v1/embeddings")
+    return await _handle_proxy_request(ctx, request, "/v1/embeddings")


@app.get("/v1/models")
@app.get("/v1/models/{model_id:path}")
-async def list_models(request: Request, model_id: str | None = None) -> Response:
+async def list_models(request: Request, model_id: str | None = None, ctx: SidecarContext = Depends(get_context)) -> Response:
    """OpenAI Models API 代理。"""
    path = f"/v1/models/{model_id}" if model_id else "/v1/models"
-    return await _handle_proxy_request(request, path)
+    return await _handle_proxy_request(ctx, request, path)


 # ---- 通用代理（catch-all 用于非标准 NVIDIA 端点） ----

@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
-async def catch_all(request: Request, path: str) -> Response:
+async def catch_all(request: Request, path: str, ctx: SidecarContext = Depends(get_context)) -> Response:
    """通用代理端点：转发任何未匹配的路径到上游。"""
    target_path = f"/{path}" if not path.startswith("/") else path
-    return await _handle_proxy_request(request, target_path)
+    return await _handle_proxy_request(ctx, request, target_path)


 # ---------------------------------------------------------------------------