BIZ-46: Phase3 架构设计 — SidecarContext解耦/Prometheus治理/部署支撑/测试/UX

Co-authored-by: multica-agent <github@multica.ai>
2026-06-24 20:01:25 +08:00
parent 4fd89b038d
commit 8a12ff9693
13 changed files with 3502 additions and 0 deletions
@@ -0,0 +1,293 @@
+"""
+NVIDIA Sidecar — WebUI 后端 API
+
+提供仪表盘 SSE 实时推送 + 配置热重载 API。
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any, AsyncGenerator
+
+import structlog
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
+from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
+from pydantic import BaseModel
+
+# Router for every WebUI endpoint in this module; all paths are mounted under "/api".
+webui_router: APIRouter = APIRouter(prefix="/api", tags=["webui"])
+# Structured logger shared by the handlers below.
+logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar.webui")
+
+# Directory holding static assets ("static" next to this module file).
+STATIC_DIR: Path = Path(__file__).parent / "static"
+
+# dashboard.html cache (Yan Weixu (严维序) review #6 / Liang Sizhu (梁思筑) review #8:
+# avoid reading the file from disk on every request).
+# Holds (html_text, time.monotonic() value at load time), or None before the first load.
+_dashboard_html_cache: tuple[str, float] | None = None
+_DASHBOARD_CACHE_TTL: float = 300.0  # 5 minutes, in seconds
+
+# Admin API authentication (Yan Weixu (严维序) review #1).
+# The token is read from the environment once, at import time; when the variable
+# is unset the value is None and _verify_admin_auth skips authentication.
+_ADMIN_TOKEN: str | None = os.environ.get("SIDECAR_ADMIN_TOKEN")
+# auto_error=False: a missing Authorization header reaches _verify_admin_auth as
+# None instead of being rejected by the security scheme itself.
+_admin_auth_scheme: HTTPBearer = HTTPBearer(auto_error=False)
+
+
+# ---------------------------------------------------------------------------
+# 配置热重载模型
+# ---------------------------------------------------------------------------
+
+class ConfigPatch(BaseModel):
+    """可在线修改的配置字段。"""
+    # Request body for the config hot-reload endpoint: the fields that may be
+    # changed at runtime. Every field defaults to None, and update_config only
+    # applies the fields that are not None.
+    # The docstring above is intentionally left verbatim: pydantic copies a
+    # model's docstring into the generated JSON schema description.
+
+    # New base rate; update_config divides it by 60 before calling set_rate,
+    # so this is presumably requests per minute. Must be > 0 (checked in update_config).
+    rate_rpm: int | None = None
+    # New capacity for the priority queue. Must be > 0 (checked in update_config).
+    queue_max_size: int | None = None
+    # New value for the config flag of the same name.
+    fallback_enabled_passthrough: bool | None = None
+
+
+# ---------------------------------------------------------------------------
+# 仪表盘 SSE 推送
+# ---------------------------------------------------------------------------
+
+async def _dashboard_stream(request: Request) -> StreamingResponse:
+    """Stream Sidecar state snapshots to the dashboard as Server-Sent Events.
+
+    Consumed by the EventSource in dashboard.html. A snapshot is built and
+    sent, then the generator sleeps one second before the next round; the
+    loop ends once the client is reported as disconnected.
+
+    Args:
+        request: The incoming request, polled for client disconnection.
+
+    Returns:
+        A ``text/event-stream`` StreamingResponse with caching disabled.
+    """
+    # "X-Accel-Buffering: no" is presumably there to stop nginx-style reverse
+    # proxies from buffering the stream.
+    sse_headers = {
+        "Cache-Control": "no-cache",
+        "X-Accel-Buffering": "no",
+    }
+
+    async def event_generator() -> AsyncGenerator[str, None]:
+        # Prepended to the first successfully built frame only (Yan Weixu
+        # (严维序) review, minor): asks the client to wait 3 s before reconnecting.
+        retry_prefix = "retry: 3000\n"
+        while not await request.is_disconnected():
+            try:
+                state: dict[str, Any] = await _build_snapshot()
+                encoded = json.dumps(state, ensure_ascii=False)
+                frame = f"{retry_prefix}data: {encoded}\n\n"
+                retry_prefix = ""
+                yield frame
+            except Exception:
+                # Keep the stream alive: log and send an error frame instead.
+                logger.exception("dashboard_sse_error")
+                yield f"data: {json.dumps({'error': 'internal'})}\n\n"
+            await asyncio.sleep(1.0)
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers=sse_headers,
+    )
+
+
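+# SSE reconnect interval (Yan Weixu (严维序) review, minor): the `retry: 3000`
+# field is emitted by event_generator inside _dashboard_stream, prepended to
+# the first successfully built frame. It travels in the event-stream body, not
+# in an HTTP header; the SSE protocol uses it as the client's reconnect delay (ms).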
+
+
+async def _build_snapshot() -> dict[str, Any]:
+    """Assemble the current state snapshot from the server module's globals.
+
+    Reads the stats dict, the token bucket and the priority queue held by
+    ``nvidia_sidecar.server``. Queue statistics are best-effort: if they
+    cannot be fetched, a warning is logged and a zeroed placeholder is used.
+
+    Returns:
+        A dict with ``timestamp``, ``uptime_seconds``, ``token_bucket``,
+        ``queue``, ``retreat``, ``requests`` and ``errors`` sections, or
+        ``{"error": "snapshot_unavailable", "timestamp": ...}`` when any
+        other step fails.
+    """
+    # Imported lazily to avoid a circular import with the server module.
+    from nvidia_sidecar import server
+
+    try:
+        stats = server._stats
+        bucket = server._token_bucket
+        bucket_view = bucket.get_status()
+        captured_at = time.time()
+        started_at = stats.get("start_time")
+        uptime_seconds = int(captured_at - started_at) if started_at else 0
+
+        def read_metric(getter_name: str, fallback: float, digits: int) -> float:
+            # Buckets lacking the getter fall back to a fixed default value.
+            return round(getattr(bucket, getter_name, lambda: fallback)(), digits)
+
+        # Queue statistics, including the per-priority depth.
+        queue_view: dict[str, Any] = {"current_size": 0, "per_priority": {}}
+        try:
+            raw_queue = await server._priority_queue.get_stats()
+            queue_view = {
+                out_key: raw_queue.get(src_key, fallback)
+                for out_key, src_key, fallback in (
+                    ("max_size", "max_size", 0),
+                    ("current_size", "current_size", 0),
+                    ("per_priority", "depth_by_priority", {}),
+                    ("total_enqueued", "total_enqueued", 0),
+                    ("total_dequeued", "total_dequeued", 0),
+                    ("total_dropped", "total_dropped", 0),
+                )
+            }
+        except Exception:
+            logger.warning("queue_stats_unavailable", message="队列统计获取失败，仪表盘队列深度可能不准确")
+
+        request_counters = (
+            ("total", "total_requests"),
+            ("nvidia", "nvidia_requests"),
+            ("passthrough", "passthrough_requests"),
+            ("ratelimited", "ratelimited_requests"),
+        )
+        error_counters = (
+            ("queue_full_rejects", "queue_full_rejects"),
+            ("upstream_errors", "upstream_errors"),
+        )
+
+        return {
+            "timestamp": captured_at,
+            "uptime_seconds": uptime_seconds,
+            "token_bucket": bucket_view,
+            "queue": queue_view,
+            "retreat": {
+                "state": getattr(bucket, "_retreat_state", "normal"),
+                "effective_rpm": read_metric("get_effective_rate_rpm", 40.0, 1),
+                "base_rpm": read_metric("get_base_rate_rpm", 40.0, 1),
+                "upstream_429_rate": read_metric("get_429_rate", 0.0, 4),
+            },
+            "requests": {
+                label: stats.get(counter, 0) for label, counter in request_counters
+            },
+            "errors": {
+                label: stats.get(counter, 0) for label, counter in error_counters
+            },
+        }
+    except Exception:
+        logger.exception("snapshot_build_error")
+        return {"error": "snapshot_unavailable", "timestamp": time.time()}
+
+
+# ---------------------------------------------------------------------------
+# 配置热重载
+# ---------------------------------------------------------------------------
+
+async def get_config() -> dict[str, Any]:
+    """Return the current configuration as a plain dict.
+
+    Values are copied from ``server._config`` with two exceptions:
+    ``upstream_api_key`` is passed through ``_mask_api_key`` and ``rate_rpm``
+    comes from ``_get_current_rate`` rather than from the config object.
+    """
+    from nvidia_sidecar import server
+
+    cfg = server._config
+
+    def pick(*names: str) -> dict[str, Any]:
+        # Copy the named attributes off the config object, preserving order.
+        return {name: getattr(cfg, name) for name in names}
+
+    view = pick("listen_host", "listen_port", "metrics_port", "upstream_url")
+    view["upstream_api_key"] = _mask_api_key(cfg.upstream_api_key)
+    view["rate_rpm"] = _get_current_rate(server)
+    view.update(
+        pick(
+            "bucket_capacity",
+            "request_timeout",
+            "queue_max_size",
+            "low_priority_timeout",
+            "fallback_enabled_passthrough",
+            "log_level",
+        )
+    )
+    return view
+
+
+async def update_config(body: ConfigPatch) -> JSONResponse:
+    """Apply an online configuration change that takes effect immediately.
+
+    Only the fields of ``body`` that are not None are applied. All supplied
+    values are validated before any live state is touched, and the queue
+    resize (the one step that can still be rejected while applying) runs
+    first, so a request answered with 400 leaves the running configuration
+    unchanged.
+
+    Args:
+        body: The requested changes.
+
+    Returns:
+        A JSON response ``{"status": "ok", "changed": [...]}`` listing the
+        applied field names.
+
+    Raises:
+        HTTPException: 400 when ``rate_rpm`` or ``queue_max_size`` is not
+            positive, or when the priority queue refuses the new size.
+    """
+    from nvidia_sidecar import server
+
+    # Phase 1: validate everything up front so nothing is half-applied.
+    if body.rate_rpm is not None and body.rate_rpm <= 0:
+        raise HTTPException(status_code=400, detail="rate_rpm must be > 0")
+    if body.queue_max_size is not None and body.queue_max_size <= 0:
+        raise HTTPException(status_code=400, detail="queue_max_size must be > 0")
+
+    cfg = server._config
+
+    # Phase 2: the queue resize may still be refused, so do it before any
+    # other mutation.
+    if body.queue_max_size is not None:
+        ok, msg = server._priority_queue.set_max_size(body.queue_max_size)
+        if not ok:
+            raise HTTPException(status_code=400, detail=msg)
+        cfg.queue_max_size = body.queue_max_size
+        logger.info("queue_max_size_updated", detail=msg)
+
+    if body.rate_rpm is not None:
+        # rate_rpm is divided by 60 before being handed to the bucket. The
+        # bucket is updated before the config so a failing set_rate does not
+        # leave the config claiming a rate that was never applied.
+        server._token_bucket.set_rate(body.rate_rpm / 60.0)
+        cfg.rate_rpm = body.rate_rpm
+
+    if body.fallback_enabled_passthrough is not None:
+        cfg.fallback_enabled_passthrough = body.fallback_enabled_passthrough
+
+    # Reported in a fixed order, independent of the order of application.
+    changed: list[str] = [
+        field
+        for field in ("rate_rpm", "queue_max_size", "fallback_enabled_passthrough")
+        if getattr(body, field) is not None
+    ]
+
+    logger.info("config_updated", changed=changed)
+    return JSONResponse(
+        content={"status": "ok", "changed": changed},
+    )
+
+
+def _mask_api_key(key: str) -> str:
+    """Mask an API key for display, keeping at most a 4-character prefix.
+
+    Yan Weixu (严维序) review #2 / Shen Luming (沈路明) review #3: prevents the
+    API key from leaking in plain text.
+
+    Args:
+        key: The raw API key; may be empty.
+
+    Returns:
+        ``""`` for an empty key, ``"****"`` for keys of four characters or
+        fewer, otherwise the first four characters followed by ``"****"``.
+    """
+    if not key:
+        return ""
+    # Short keys are masked completely: showing any prefix of a key this
+    # short would expose half of it or all of it.
+    if len(key) <= 4:
+        return "****"
+    return key[:4] + "****"
+
+
+def _get_current_rate(server_module: Any) -> float:
+    """Return the rate currently in effect, in requests per minute.
+
+    Buckets that expose ``get_effective_rate_rpm`` (the adaptive variant)
+    report that value rounded to one decimal; any other bucket reports its
+    ``rate`` attribute multiplied by 60.
+    """
+    bucket = server_module._token_bucket
+    absent = object()
+    effective_getter = getattr(bucket, "get_effective_rate_rpm", absent)
+    if effective_getter is absent:
+        return float(bucket.rate * 60.0)
+    return float(round(effective_getter(), 1))
+
+
+# ---------------------------------------------------------------------------
+# 路由注册
+# ---------------------------------------------------------------------------
+
+@webui_router.get("/dashboard/stream")
+async def dashboard_stream(request: Request) -> StreamingResponse:
+    """SSE 仪表盘实时推送端点。"""
+    # GET /api/dashboard/stream: SSE endpoint that pushes live dashboard data;
+    # all work is delegated to _dashboard_stream.
+    # The docstring above is left verbatim because FastAPI publishes a route's
+    # docstring as its OpenAPI description.
+    # NOTE(review): unlike the /admin routes, this route declares no auth dependency.
+    return await _dashboard_stream(request)
+
+
+async def _verify_admin_auth(
+    credentials: HTTPAuthorizationCredentials | None = Depends(_admin_auth_scheme),
+) -> None:
+    """Bearer-token authentication for the Admin API (Yan Weixu (严维序) review #1).
+
+    When the SIDECAR_ADMIN_TOKEN environment variable is set, the request
+    must carry a matching Bearer token. When it is unset, authentication is
+    skipped (development / test environments).
+
+    Args:
+        credentials: Parsed Authorization header, or None when the header is
+            missing (the scheme is created with ``auto_error=False``).
+
+    Raises:
+        HTTPException: 401 when a token is required but none was sent;
+            403 when the presented token does not match.
+    """
+    # Local import: keeps this block self-contained without touching the
+    # module's import section.
+    import hmac
+
+    if _ADMIN_TOKEN is None:
+        return  # no admin token configured: unauthenticated access is allowed
+    if credentials is None:
+        raise HTTPException(
+            status_code=401,
+            detail="需要 Bearer Token 认证（Admin API）",
+            # A 401 response is expected to carry an authentication challenge.
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+    # Constant-time comparison so response timing does not reveal how much of
+    # the token matched. Bytes are compared because compare_digest rejects
+    # non-ASCII str arguments.
+    presented = credentials.credentials.encode("utf-8")
+    expected = _ADMIN_TOKEN.encode("utf-8")
+    if not hmac.compare_digest(presented, expected):
+        raise HTTPException(status_code=403, detail="Admin Token 无效")
+
+
+@webui_router.get("/admin/config")
+async def admin_get_config(
+    _auth: None = Depends(_verify_admin_auth),
+) -> JSONResponse:
+    """获取当前配置（需要 Admin 认证）。"""
+    # GET /api/admin/config: returns the dict built by get_config as JSON.
+    # The _auth parameter exists only to run _verify_admin_auth first; its
+    # value is not used.
+    # The docstring above is left verbatim because FastAPI publishes a route's
+    # docstring as its OpenAPI description.
+    return JSONResponse(content=await get_config())
+
+
+@webui_router.post("/admin/config")
+async def admin_update_config(
+    body: ConfigPatch,
+    _auth: None = Depends(_verify_admin_auth),
+) -> JSONResponse:
+    """在线修改配置（热重载，需要 Admin 认证）。"""
+    # POST /api/admin/config: hot-reloads configuration by passing the parsed
+    # ConfigPatch body to update_config and returning its response.
+    # The _auth parameter exists only to run _verify_admin_auth first; its
+    # value is not used.
+    # The docstring above is left verbatim because FastAPI publishes a route's
+    # docstring as its OpenAPI description.
+    return await update_config(body)
+
+
+# ---------------------------------------------------------------------------
+# 仪表盘静态页面
+# ---------------------------------------------------------------------------
+
+def _get_dashboard_html() -> str:
+    """Return the dashboard HTML, served from an in-memory cache when fresh.
+
+    Yan Weixu (严维序) review #6 / Liang Sizhu (梁思筑) review #8: after a
+    successful load the content is cached for 5 minutes so that requests do
+    not read the file from disk each time. A missing file yields a small
+    placeholder page, which is not cached.
+    """
+    global _dashboard_html_cache
+    checked_at = time.monotonic()
+
+    entry = _dashboard_html_cache
+    if entry is not None and checked_at - entry[1] < _DASHBOARD_CACHE_TTL:
+        return entry[0]
+
+    html_file = STATIC_DIR / "dashboard.html"
+    if not html_file.is_file():
+        return "<h1>dashboard.html not found</h1>"
+
+    html = html_file.read_text(encoding="utf-8")
+    _dashboard_html_cache = (html, checked_at)
+    return html
+
+
+@webui_router.get("/dashboard", include_in_schema=False)
+async def dashboard_page() -> HTMLResponse:
+    """Serve the dashboard HTML page (content comes from the cached loader)."""
+    # GET /api/dashboard, excluded from the OpenAPI schema.
+    # NOTE(review): when dashboard.html is missing, _get_dashboard_html returns
+    # a placeholder page and no explicit error status is set here.
+    return HTMLResponse(content=_get_dashboard_html())