# EnterpriseArchitect/services/nvidia_sidecar/webui.py

"""
NVIDIA Sidecar — WebUI 后端 API

提供仪表盘 SSE 实时推送 + 配置热重载 API。
"""

from __future__ import annotations

import asyncio
import json
import os
import secrets
import time
from pathlib import Path
from typing import Any, AsyncGenerator

import structlog
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel

# Router for all WebUI endpoints; every route below is mounted under the "/api" prefix.
webui_router: APIRouter = APIRouter(prefix="/api", tags=["webui"])
logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar.webui")

# Directory next to this module from which dashboard.html is read.
STATIC_DIR: Path = Path(__file__).parent / "static"

# dashboard.html cache (review items: Yan Weixu #6 / Liang Sizhu #8 — avoid reading the disk on every request).
# Holds (html_content, time.monotonic() value at load), or None before the first successful load.
_dashboard_html_cache: tuple[str, float] | None = None
_DASHBOARD_CACHE_TTL: float = 300.0  # 5 minutes

# Admin API authentication (review item: Yan Weixu #1).
# The token is read once at import time; None means the environment variable is not set.
_ADMIN_TOKEN: str | None = os.environ.get("SIDECAR_ADMIN_TOKEN")
# auto_error=False: a missing Authorization header yields None instead of an automatic error response.
_admin_auth_scheme: HTTPBearer = HTTPBearer(auto_error=False)


# ---------------------------------------------------------------------------
# 配置热重载模型
# ---------------------------------------------------------------------------

class ConfigPatch(BaseModel):
    """Configuration fields that can be modified online.

    Every field defaults to ``None``; ``update_config`` only touches fields
    whose value is not ``None``.
    """
    # Rate limit in requests per minute; update_config divides it by 60 before
    # handing it to the token bucket.
    rate_rpm: int | None = None
    # New maximum size for the priority queue.
    queue_max_size: int | None = None
    # New value for the config flag of the same name.
    fallback_enabled_passthrough: bool | None = None


# ---------------------------------------------------------------------------
# 仪表盘 SSE 推送
# ---------------------------------------------------------------------------

async def _dashboard_stream(request: Request) -> StreamingResponse:
    """Stream full Sidecar state snapshots to the client as Server-Sent Events.

    One frame is produced per loop iteration, followed by a one-second pause.
    The stream ends once the client is reported as disconnected. Intended to
    be consumed by the EventSource in dashboard.html.

    Args:
        request: Incoming request, polled for client disconnection.

    Returns:
        A ``text/event-stream`` response with caching and proxy buffering
        disabled through response headers.
    """

    async def event_generator() -> AsyncGenerator[str, None]:
        # The first successfully built frame carries a `retry` field telling
        # the client to wait 3 s before reconnecting after a dropped connection.
        retry_pending = True
        while not await request.is_disconnected():
            try:
                state: dict[str, Any] = await _build_snapshot()
                frame = "data: " + json.dumps(state, ensure_ascii=False) + "\n\n"
                if retry_pending:
                    frame = "retry: 3000\n" + frame
                    retry_pending = False
                yield frame
            except Exception:
                # Keep the stream alive: report a generic error frame instead of aborting.
                logger.exception("dashboard_sse_error")
                yield "data: " + json.dumps({"error": "internal"}) + "\n\n"
            await asyncio.sleep(1.0)

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers=sse_headers,
    )


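# NOTE on the SSE `retry` field (minor review item, Yan Weixu): the reconnect hint is not
# an HTTP header. event_generator() prepends "retry: 3000" to the first frame it yields.
# The SSE protocol defines `retry` as the client's reconnection delay in milliseconds,
# so a client waits 3 s before re-establishing a dropped connection.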


async def _build_snapshot() -> dict[str, Any]:
    """Assemble the current state snapshot from the server module's globals.

    The snapshot covers the token bucket status, queue statistics (including
    per-priority depth), retreat state, request counters and error counters.

    Returns:
        The snapshot dict. If queue statistics cannot be read, a minimal queue
        section is used and a warning is logged. If anything else fails, a dict
        with ``"error": "snapshot_unavailable"`` and a timestamp is returned.
    """
    # Imported lazily to avoid a circular dependency.
    from nvidia_sidecar import server

    try:
        counters = server._stats
        bucket = server._token_bucket
        bucket_status = bucket.get_status()
        captured_at = time.time()
        if counters.get("start_time"):
            uptime_seconds = int(captured_at - counters["start_time"])
        else:
            uptime_seconds = 0

        # Fallback queue section, replaced below when the stats call succeeds.
        queue_view: dict[str, Any] = {"current_size": 0, "per_priority": {}}
        try:
            raw_queue = await server._priority_queue.get_stats()
            queue_view = {
                "max_size": raw_queue.get("max_size", 0),
                "current_size": raw_queue.get("current_size", 0),
                "per_priority": raw_queue.get("depth_by_priority", {}),
                "total_enqueued": raw_queue.get("total_enqueued", 0),
                "total_dequeued": raw_queue.get("total_dequeued", 0),
                "total_dropped": raw_queue.get("total_dropped", 0),
            }
        except Exception:
            logger.warning("queue_stats_unavailable", message="队列统计获取失败，仪表盘队列深度可能不准确")

        def bucket_metric(method_name: str, fallback: float, digits: int) -> float:
            # Call the named bucket method if present, else use the fallback value.
            return round(getattr(bucket, method_name, lambda: fallback)(), digits)

        retreat_view = {
            "state": getattr(bucket, "_retreat_state", "normal"),
            "effective_rpm": bucket_metric("get_effective_rate_rpm", 40.0, 1),
            "base_rpm": bucket_metric("get_base_rate_rpm", 40.0, 1),
            "upstream_429_rate": bucket_metric("get_429_rate", 0.0, 4),
        }
        request_counts = {
            label: counters.get(f"{label}_requests", 0)
            for label in ("total", "nvidia", "passthrough", "ratelimited")
        }
        error_counts = {
            name: counters.get(name, 0)
            for name in ("queue_full_rejects", "upstream_errors")
        }

        return {
            "timestamp": captured_at,
            "uptime_seconds": uptime_seconds,
            "token_bucket": bucket_status,
            "queue": queue_view,
            "retreat": retreat_view,
            "requests": request_counts,
            "errors": error_counts,
        }
    except Exception:
        logger.exception("snapshot_build_error")
        return {"error": "snapshot_unavailable", "timestamp": time.time()}


# ---------------------------------------------------------------------------
# 配置热重载
# ---------------------------------------------------------------------------

async def get_config() -> dict[str, Any]:
    """Return the complete current configuration as a plain dict.

    The upstream API key is passed through ``_mask_api_key`` and ``rate_rpm``
    is taken from ``_get_current_rate`` (the live token bucket) rather than
    from the stored config attribute. All other values are copied from the
    server config object unchanged.
    """
    from nvidia_sidecar import server

    settings = server._config
    view: dict[str, Any] = {}

    for attr in ("listen_host", "listen_port", "metrics_port", "upstream_url"):
        view[attr] = getattr(settings, attr)

    view["upstream_api_key"] = _mask_api_key(settings.upstream_api_key)
    view["rate_rpm"] = _get_current_rate(server)

    for attr in (
        "bucket_capacity",
        "request_timeout",
        "queue_max_size",
        "low_priority_timeout",
        "fallback_enabled_passthrough",
        "log_level",
    ):
        view[attr] = getattr(settings, attr)

    return view


async def update_config(body: ConfigPatch) -> JSONResponse:
    """Apply an online configuration patch so that it takes effect immediately.

    Only fields of ``body`` that are not ``None`` are considered. All simple
    input checks run before any state is touched, and the queue resize (the
    only step that can be refused at apply time) runs before the remaining
    changes. A request answered with HTTP 400 therefore leaves the rate limit
    and the passthrough flag untouched.

    Args:
        body: The requested changes.

    Returns:
        A JSON response ``{"status": "ok", "changed": [...]}`` listing the
        changed fields in the order rate_rpm, queue_max_size,
        fallback_enabled_passthrough.

    Raises:
        HTTPException: 400 if ``rate_rpm`` or ``queue_max_size`` is not
            positive, or if the priority queue refuses the new size.
    """
    from nvidia_sidecar import server

    cfg = server._config

    # Phase 1: validate every supplied value before mutating anything.
    if body.rate_rpm is not None and body.rate_rpm <= 0:
        raise HTTPException(status_code=400, detail="rate_rpm must be > 0")
    if body.queue_max_size is not None and body.queue_max_size <= 0:
        raise HTTPException(status_code=400, detail="queue_max_size must be > 0")

    # Phase 2: the queue resize may still be refused, so attempt it first.
    queue_resized = False
    if body.queue_max_size is not None:
        ok, msg = server._priority_queue.set_max_size(body.queue_max_size)
        if not ok:
            raise HTTPException(status_code=400, detail=msg)
        cfg.queue_max_size = body.queue_max_size
        logger.info("queue_max_size_updated", detail=msg)
        queue_resized = True

    # Phase 3: apply the remaining changes and report them in the original order.
    changed: list[str] = []

    if body.rate_rpm is not None:
        # Update the live bucket before the stored value, so the config never
        # records a rate the bucket failed to accept. RPM -> per-second rate.
        server._token_bucket.set_rate(body.rate_rpm / 60.0)
        cfg.rate_rpm = body.rate_rpm
        changed.append("rate_rpm")

    if queue_resized:
        changed.append("queue_max_size")

    if body.fallback_enabled_passthrough is not None:
        cfg.fallback_enabled_passthrough = body.fallback_enabled_passthrough
        changed.append("fallback_enabled_passthrough")

    logger.info("config_updated", changed=changed)
    return JSONResponse(
        content={"status": "ok", "changed": changed},
    )


def _mask_api_key(key: str) -> str:
    """对 API Key 进行脱敏处理，仅保留前 4 位以供识别。

    严维序评审 #2 / 沈路明评审 #3：防止 API Key 明文泄露。
    """
    if not key:
        return ""
    if len(key) <= 4:
        return key[:2] + "****"
    return key[:4] + "****"


def _get_current_rate(server_module: Any) -> float:
    """获取当前实际速率（避退调整后），兼容 AdaptiveTokenBucket。"""
    tb = server_module._token_bucket
    if hasattr(tb, "get_effective_rate_rpm"):
        return float(round(tb.get_effective_rate_rpm(), 1))
    return float(tb.rate * 60.0)


# ---------------------------------------------------------------------------
# 路由注册
# ---------------------------------------------------------------------------

@webui_router.get("/dashboard/stream")
async def dashboard_stream(request: Request) -> StreamingResponse:
    """Route handler for the dashboard's real-time SSE feed.

    Delegates to ``_dashboard_stream`` and returns its streaming response.
    """
    sse_response = await _dashboard_stream(request)
    return sse_response


async def _verify_admin_auth(
    credentials: HTTPAuthorizationCredentials | None = Depends(_admin_auth_scheme),
) -> None:
    """Bearer-token authentication for the Admin API (review item: Yan Weixu #1).

    If the SIDECAR_ADMIN_TOKEN environment variable was set at import time,
    the request must carry a matching Bearer token. If it was not set,
    authentication is skipped (development / test environments).

    Args:
        credentials: Parsed ``Authorization`` header, or ``None`` when the
            request carries no Bearer credentials.

    Raises:
        HTTPException: 401 (with a ``WWW-Authenticate: Bearer`` challenge) if
            a token is required but none was supplied; 403 if the supplied
            token does not match.
    """
    if _ADMIN_TOKEN is None:
        return  # No admin token configured: allow unauthenticated access.
    if credentials is None:
        raise HTTPException(
            status_code=401,
            detail="需要 Bearer Token 认证（Admin API）",
            headers={"WWW-Authenticate": "Bearer"},
        )
    # Constant-time comparison, so response timing does not reveal how much of
    # the token matched. Compared as bytes because compare_digest rejects
    # non-ASCII str arguments.
    supplied = credentials.credentials.encode("utf-8")
    expected = _ADMIN_TOKEN.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(status_code=403, detail="Admin Token 无效")


@webui_router.get("/admin/config")
async def admin_get_config(
    _auth: None = Depends(_verify_admin_auth),
) -> JSONResponse:
    """Return the current configuration as JSON (Admin authentication required)."""
    current_config = await get_config()
    return JSONResponse(content=current_config)


@webui_router.post("/admin/config")
async def admin_update_config(
    body: ConfigPatch,
    _auth: None = Depends(_verify_admin_auth),
) -> JSONResponse:
    """Apply an online configuration change (hot reload, Admin authentication required)."""
    outcome = await update_config(body)
    return outcome


# ---------------------------------------------------------------------------
# 仪表盘静态页面
# ---------------------------------------------------------------------------

def _get_dashboard_html() -> str:
    """Return the dashboard HTML, served from an in-memory cache when fresh.

    Review items Yan Weixu #6 / Liang Sizhu #8: after a successful load the
    content is cached for ``_DASHBOARD_CACHE_TTL`` seconds, so the disk is not
    read on every request. If dashboard.html is not a regular file, a
    placeholder page is returned and nothing is cached.
    """
    global _dashboard_html_cache

    checked_at = time.monotonic()
    entry = _dashboard_html_cache
    if entry is not None:
        html, loaded_at = entry
        if checked_at - loaded_at < _DASHBOARD_CACHE_TTL:
            return html

    html_file = STATIC_DIR / "dashboard.html"
    if not html_file.is_file():
        return "<h1>dashboard.html not found</h1>"

    fresh_html = html_file.read_text(encoding="utf-8")
    _dashboard_html_cache = (fresh_html, checked_at)
    return fresh_html


@webui_router.get("/dashboard", include_in_schema=False)
async def dashboard_page() -> HTMLResponse:
    """Serve the dashboard HTML page, using the cached content when available."""
    page_html = _get_dashboard_html()
    return HTMLResponse(content=page_html)