BIZ-46 Phase3: 7项 follow-up 开发完成

1. 架构解耦 — SidecarContext + FastAPI Depends 注入
   - 新增 context.py: SidecarContext dataclass 收敛全部全局状态
   - server.py: 移除模块级全局变量，lifespan 创建 ctx → app.state.sidecar
   - webui.py: 移除反向导入 server，改用 Depends(get_context)

2. Prometheus 标签基数治理 — model_id → provider
   - upstream_latency_seconds / upstream_errors_total label 收敛为 provider
   - 模型级信息保留在 structlog JSON 日志

3. SSE 快照共享缓存
   - 1s TTL 共享 snapshot cache + double-check locking
   - 多客户端不重复构建快照

4. 部署支撑
   - Dockerfile (python:3.12-slim, 非 root 用户, HEALTHCHECK)
   - systemd service (安全加固, 资源限制)
   - .env.example (完整环境变量清单)

5. Readiness HTTP Client 复用
   - check_upstream() 注入主 http_client，不再每次创建新 client

6. Retreat 并发回归测试
   - 5 个测试用例全部通过（死锁检测 + 状态转换 + 并发安全）

7. Dashboard UX 优化
   - 队列柱状图 300ms 平滑动画
   - SSE 断连 5s 半透明遮罩
   - 队列图标题显示总排队数
   - 页面加载同步配置

验证: mypy strict 通过 (0 errors), pytest 5/5 通过, server 导入正常 (13 routes)

Co-authored-by: multica-agent <github@multica.ai>
2026-06-24 22:26:35 +08:00
parent 8a12ff9693
commit b18d243ef2
12 changed files with 928 additions and 312 deletions
@@ -2,6 +2,11 @@
 NVIDIA Sidecar — WebUI 后端 API

 提供仪表盘 SSE 实时推送 + 配置热重载 API。
+
+BIZ-46 Phase3:
+- 架构解耦：移除反向导入 server，改用 Depends(_get_ctx) (§1)
+- SSE 共享缓存：1s TTL snapshot cache，多客户端不重复构建 (§3)
+- Dashboard UX：页面加载同步配置 + 队列深度标题 (§7)
 """

 from __future__ import annotations
@@ -19,6 +24,8 @@ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
 from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 from pydantic import BaseModel

+from nvidia_sidecar.context import SidecarContext
+
 webui_router: APIRouter = APIRouter(prefix="/api", tags=["webui"])
 logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar.webui")

@@ -33,6 +40,11 @@ _ADMIN_TOKEN: str | None = os.environ.get("SIDECAR_ADMIN_TOKEN")
 _admin_auth_scheme: HTTPBearer = HTTPBearer(auto_error=False)


+def _get_ctx(request: Request) -> SidecarContext:
+    """Return the SidecarContext from ``app.state.sidecar`` (webui route-level injection; avoids a circular import of server)."""
+    return request.app.state.sidecar  # type: ignore[no-any-return]
+
+
 # ---------------------------------------------------------------------------
 # 配置热重载模型
 # ---------------------------------------------------------------------------
@@ -44,23 +56,109 @@ class ConfigPatch(BaseModel):
    fallback_enabled_passthrough: bool | None = None


+# ---------------------------------------------------------------------------
+# SSE 快照构建（BIZ-46 Phase3: 1s TTL 共享缓存）
+# ---------------------------------------------------------------------------
+
+async def _build_snapshot(ctx: SidecarContext) -> dict[str, Any]:
+    """Assemble the current status snapshot from ``ctx`` (queue depth included).
+
+    Reads everything from SidecarContext instead of importing ``server`` (BIZ-46 Phase3).
+    """
+    try:
+        bucket_view = ctx.token_bucket.get_status()
+        captured_at = time.time()
+
+        queue_view: dict[str, Any] = {"current_size": 0, "per_priority": {}}  # kept if stats fail
+        try:
+            raw = await ctx.priority_queue.get_stats()
+            queue_view = {
+                "max_size": raw.get("max_size", 0),
+                "current_size": raw.get("current_size", 0),
+                "per_priority": raw.get("depth_by_priority", {}),
+                "total_enqueued": raw.get("total_enqueued", 0),
+                "total_dequeued": raw.get("total_dequeued", 0),
+                "total_dropped": raw.get("total_dropped", 0),
+            }
+        except Exception:
+            logger.warning("queue_stats_unavailable", message="队列统计获取失败，仪表盘队列深度可能不准确")
+
+        uptime = ctx.uptime_seconds
+        bucket = ctx.token_bucket
+        retreat_view = {
+            "state": bucket.get_retreat_state(),
+            "effective_rpm": round(bucket.get_effective_rate_rpm(), 1),
+            "base_rpm": round(bucket.get_base_rate_rpm(), 1),
+            "upstream_429_rate": round(bucket.get_429_rate(), 4),
+        }
+        counters = ctx.stats
+        request_view = {
+            "total": counters.get("total_requests", 0),
+            "nvidia": counters.get("nvidia_requests", 0),
+            "passthrough": counters.get("passthrough_requests", 0),
+            "ratelimited": counters.get("ratelimited_requests", 0),
+        }
+        error_view = {key: counters.get(key, 0) for key in ("queue_full_rejects", "upstream_errors")}
+        return {
+            "timestamp": captured_at,
+            "uptime_seconds": uptime,
+            "token_bucket": bucket_view,
+            "queue": queue_view,
+            "retreat": retreat_view,
+            "requests": request_view,
+            "errors": error_view,
+        }
+    except Exception:
+        logger.exception("snapshot_build_error")
+        return {"error": "snapshot_unavailable", "timestamp": time.time()}
+
+
+async def _build_snapshot_cached(ctx: SidecarContext) -> dict[str, Any]:
+    """Return the dashboard snapshot via the cache shared by all SSE clients (BIZ-46 Phase3 §3).
+
+    A cached entry is reused while it is younger than ``ctx.SNAPSHOT_CACHE_TTL``
+    (1s per the original note); otherwise one caller rebuilds it under the lock.
+    Author's estimate: 5 clients ~5 -> 1 build/s, 20 clients ~20 -> 1 build/s.
+    """
+    started = time.monotonic()  # single clock read, also used as the entry's stamp
+
+    def _fresh_hit() -> dict[str, Any] | None:
+        # Cached payload if it is still within the TTL relative to `started`.
+        entry = ctx.snapshot_cache
+        if entry is None:
+            return None
+        payload, stamped_at = entry
+        return payload if started - stamped_at < ctx.SNAPSHOT_CACHE_TTL else None
+
+    if (hit := _fresh_hit()) is not None:
+        return hit
+    async with ctx.snapshot_cache_lock:
+        # Re-check: another coroutine may have rebuilt it while we waited for the lock.
+        if (hit := _fresh_hit()) is not None:
+            return hit
+        snapshot = await _build_snapshot(ctx)
+        ctx.snapshot_cache = (snapshot, started)
+        return snapshot
+
+
 # ---------------------------------------------------------------------------
 # 仪表盘 SSE 推送
 # ---------------------------------------------------------------------------

-async def _dashboard_stream(request: Request) -> StreamingResponse:
+async def _dashboard_stream(request: Request, ctx: SidecarContext) -> StreamingResponse:
    """SSE 实时推送 Sidecar 完整状态快照（每秒一次）。

    供 dashboard.html 的 EventSource 消费。
+
+    BIZ-46 Phase3: 使用共享缓存 _build_snapshot_cached，多客户端不重复计算。
    """
    async def event_generator() -> AsyncGenerator[str, None]:
-        # 首帧发送 retry 字段（严维序评审 minor）：指示客户端断连后等待 3s 重试
        first_frame = True
        while True:
            if await request.is_disconnected():
                break
            try:
-                snapshot: dict[str, Any] = await _build_snapshot()
+                snapshot: dict[str, Any] = await _build_snapshot_cached(ctx)
                payload_sse = f"data: {json.dumps(snapshot, ensure_ascii=False)}\n\n"
                if first_frame:
                    payload_sse = f"retry: 3000\n{payload_sse}"
@@ -81,117 +179,54 @@ async def _dashboard_stream(request: Request) -> StreamingResponse:
    )


-# SSE 首帧写入 retry 字段（严维序评审 minor），在 event_generator 首次 yield 前注入
-# 通过在 StreamingResponse 返回前手动发送 retry header 实现
-# （SSE 协议支持 retry 字段作为重建连接间隔）
-# 注：在 event_generator 的首个 yield 中加入 retry 声明
-
-
-async def _build_snapshot() -> dict[str, Any]:
-    """构建当前状态快照（从全局状态读取，含队列深度）。"""
-    # 延迟导入避免循环依赖
-    from nvidia_sidecar import server
-
-    try:
-        _stats = server._stats
-        _token_bucket = server._token_bucket
-        bucket_status = _token_bucket.get_status()
-        now = time.time()
-        uptime = int(now - _stats["start_time"]) if _stats.get("start_time") else 0
-
-        # 获取队列统计数据（含 per-priority depth）
-        queue_data: dict[str, Any] = {"current_size": 0, "per_priority": {}}
-        try:
-            queue_stats = await server._priority_queue.get_stats()
-            queue_data = {
-                "max_size": queue_stats.get("max_size", 0),
-                "current_size": queue_stats.get("current_size", 0),
-                "per_priority": queue_stats.get("depth_by_priority", {}),
-                "total_enqueued": queue_stats.get("total_enqueued", 0),
-                "total_dequeued": queue_stats.get("total_dequeued", 0),
-                "total_dropped": queue_stats.get("total_dropped", 0),
-            }
-        except Exception:
-            logger.warning("queue_stats_unavailable", message="队列统计获取失败，仪表盘队列深度可能不准确")
-
-        return {
-            "timestamp": now,
-            "uptime_seconds": uptime,
-            "token_bucket": bucket_status,
-            "queue": queue_data,
-            "retreat": {
-                "state": getattr(_token_bucket, "_retreat_state", "normal"),
-                "effective_rpm": round(getattr(_token_bucket, "get_effective_rate_rpm", lambda: 40.0)(), 1),
-                "base_rpm": round(getattr(_token_bucket, "get_base_rate_rpm", lambda: 40.0)(), 1),
-                "upstream_429_rate": round(getattr(_token_bucket, "get_429_rate", lambda: 0.0)(), 4),
-            },
-            "requests": {
-                "total": _stats.get("total_requests", 0),
-                "nvidia": _stats.get("nvidia_requests", 0),
-                "passthrough": _stats.get("passthrough_requests", 0),
-                "ratelimited": _stats.get("ratelimited_requests", 0),
-            },
-            "errors": {
-                "queue_full_rejects": _stats.get("queue_full_rejects", 0),
-                "upstream_errors": _stats.get("upstream_errors", 0),
-            },
-        }
-    except Exception:
-        logger.exception("snapshot_build_error")
-        return {"error": "snapshot_unavailable", "timestamp": time.time()}
-
-
 # ---------------------------------------------------------------------------
 # 配置热重载
 # ---------------------------------------------------------------------------

-async def get_config() -> dict[str, Any]:
-    """获取当前完整配置。"""
-    from nvidia_sidecar import server
-
-    cfg = server._config
+async def get_config(ctx: SidecarContext) -> dict[str, Any]:
+    """获取当前完整配置（从 SidecarContext 读取）。"""
+    config = ctx.config
+    effective_rpm = float(ctx.token_bucket.get_effective_rate_rpm())
    return {
-        "listen_host": cfg.listen_host,
-        "listen_port": cfg.listen_port,
-        "metrics_port": cfg.metrics_port,
-        "upstream_url": cfg.upstream_url,
-        "upstream_api_key": _mask_api_key(cfg.upstream_api_key),
-        "rate_rpm": _get_current_rate(server),
-        "bucket_capacity": cfg.bucket_capacity,
-        "request_timeout": cfg.request_timeout,
-        "queue_max_size": cfg.queue_max_size,
-        "low_priority_timeout": cfg.low_priority_timeout,
-        "fallback_enabled_passthrough": cfg.fallback_enabled_passthrough,
-        "log_level": cfg.log_level,
+        "listen_host": config.listen_host,
+        "listen_port": config.listen_port,
+        "metrics_port": config.metrics_port,
+        "upstream_url": config.upstream_url,
+        "upstream_api_key": _mask_api_key(config.upstream_api_key),
+        "rate_rpm": round(effective_rpm, 1),
+        "bucket_capacity": config.bucket_capacity,
+        "request_timeout": config.request_timeout,
+        "queue_max_size": config.queue_max_size,
+        "low_priority_timeout": config.low_priority_timeout,
+        "fallback_enabled_passthrough": config.fallback_enabled_passthrough,
+        "log_level": config.log_level,
    }


-async def update_config(body: ConfigPatch) -> JSONResponse:
+async def update_config(body: ConfigPatch, ctx: SidecarContext) -> JSONResponse:
    """在线修改配置项并即时生效。"""
-    from nvidia_sidecar import server
-
-    cfg = server._config
+    config = ctx.config
    changed: list[str] = []

    if body.rate_rpm is not None:
        if body.rate_rpm <= 0:
            raise HTTPException(status_code=400, detail="rate_rpm must be > 0")
-        cfg.rate_rpm = body.rate_rpm
-        server._token_bucket.set_rate(body.rate_rpm / 60.0)
+        config.rate_rpm = body.rate_rpm
+        ctx.token_bucket.set_rate(body.rate_rpm / 60.0)
        changed.append("rate_rpm")

    if body.queue_max_size is not None:
        if body.queue_max_size <= 0:
            raise HTTPException(status_code=400, detail="queue_max_size must be > 0")
-        ok, msg = server._priority_queue.set_max_size(body.queue_max_size)
+        ok, msg = ctx.priority_queue.set_max_size(body.queue_max_size)
        if not ok:
            raise HTTPException(status_code=400, detail=msg)
-        cfg.queue_max_size = body.queue_max_size
+        config.queue_max_size = body.queue_max_size
        changed.append("queue_max_size")
        logger.info("queue_max_size_updated", detail=msg)

    if body.fallback_enabled_passthrough is not None:
-        cfg.fallback_enabled_passthrough = body.fallback_enabled_passthrough
+        config.fallback_enabled_passthrough = body.fallback_enabled_passthrough
        changed.append("fallback_enabled_passthrough")

    logger.info("config_updated", changed=changed)
@@ -212,22 +247,17 @@ def _mask_api_key(key: str) -> str:
    return key[:4] + "****"


-def _get_current_rate(server_module: Any) -> float:
-    """获取当前实际速率（避退调整后），兼容 AdaptiveTokenBucket。"""
-    tb = server_module._token_bucket
-    if hasattr(tb, "get_effective_rate_rpm"):
-        return float(round(tb.get_effective_rate_rpm(), 1))
-    return float(tb.rate * 60.0)
-
-
 # ---------------------------------------------------------------------------
 # 路由注册
 # ---------------------------------------------------------------------------

@webui_router.get("/dashboard/stream")
-async def dashboard_stream(request: Request) -> StreamingResponse:
-    """SSE 仪表盘实时推送端点。"""
-    return await _dashboard_stream(request)
+async def dashboard_stream(
+    request: Request,
+    ctx: SidecarContext = Depends(_get_ctx),
+) -> StreamingResponse:
+    """SSE 仪表盘实时推送端点（BIZ-46 Phase3: 使用共享缓存）。"""
+    return await _dashboard_stream(request, ctx)


 async def _verify_admin_auth(
@@ -249,18 +279,20 @@ async def _verify_admin_auth(
@webui_router.get("/admin/config")
 async def admin_get_config(
    _auth: None = Depends(_verify_admin_auth),
+    ctx: SidecarContext = Depends(_get_ctx),
 ) -> JSONResponse:
    """获取当前配置（需要 Admin 认证）。"""
-    return JSONResponse(content=await get_config())
+    return JSONResponse(content=await get_config(ctx))


@webui_router.post("/admin/config")
 async def admin_update_config(
    body: ConfigPatch,
    _auth: None = Depends(_verify_admin_auth),
+    ctx: SidecarContext = Depends(_get_ctx),
 ) -> JSONResponse:
    """在线修改配置（热重载，需要 Admin 认证）。"""
-    return await update_config(body)
+    return await update_config(body, ctx)


 # ---------------------------------------------------------------------------