BIZ-42: Phase2 可观测性+WebUI+避退模式 — metrics/health/webui/dashboard/adaptive

新增文件:
- metrics.py: Prometheus 指标端点 (独立端口 :9191), 共 13 个指标: 10 个基础指标 + 3 个避退指标 (避退状态 / 有效 RPM / 上游 429 率)
- health.py: /health (liveness) + /health/ready (readiness)
- webui.py: WebUI 后端 API (SSE 实时推送 + 配置热重载)
- static/dashboard.html: 仪表盘前端 (Chart.js, 令牌桶仪表+队列柱状图+吞吐折线图)

更新文件:
- rate_limiter.py: 增加 AdaptiveTokenBucket 避退模式 (ADR-009)
  状态机 NORMAL→RETREAT→RECOVER, 429 率滑动窗口监控
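- server.py: structlog 结构化日志 + 避退反馈回路 (worker 在每次上游响应后记录是否为 429 并重新评估避退状态)
  在独立端口 :9191 挂载 metrics_server; /health 改为纯存活检查, 新增 /health/ready 就绪检查; 挂载 webui 路由;
  原 /metrics JSON 调试端点更名为 /status (Prometheus 指标改由 :9191 提供), 并增加 retreat 状态字段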
- pyproject.toml: 增加 prometheus-client, pydantic, types-PyYAML 依赖

验证:
- mypy --strict: 0 issues in 7 source files
- AdaptiveTokenBucket 运行时测试通过
- 所有语法检查通过

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-24 11:54:02 +08:00
parent 205381c4ff
commit e829a4060b
8 changed files with 1235 additions and 19 deletions
+100 -16
View File
@@ -18,13 +18,14 @@ from typing import Any
import httpx
import structlog
import uvicorn
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse, StreamingResponse
from nvidia_sidecar.config import load_config, SidecarConfig
from nvidia_sidecar.rate_limiter import (
Priority,
TokenBucket,
AdaptiveTokenBucket,
is_nvidia_gateway,
)
from nvidia_sidecar.priority_queue import (
@@ -33,6 +34,9 @@ from nvidia_sidecar.priority_queue import (
QueueFullPassthrough,
QueueFullPolicy,
)
from nvidia_sidecar.metrics import PrometheusMetrics
from nvidia_sidecar.health import HealthService
from nvidia_sidecar.webui import webui_router
# ---------------------------------------------------------------------------
# 结构化日志
@@ -48,10 +52,11 @@ structlog.configure(
structlog.processors.StackInfoRenderer(),
structlog.processors.format_exc_info,
structlog.processors.UnicodeDecoder(),
# 生产环境推荐 JSONRenderer,开发环境可用 ConsoleRenderer
structlog.dev.ConsoleRenderer(),
],
context_class=dict,
logger_factory=structlog.stdlib.LoggerFactory(),
logger_factory=structlog.PrintLoggerFactory(),
wrapper_class=structlog.stdlib.BoundLogger,
cache_logger_on_first_use=True,
)
@@ -65,9 +70,12 @@ logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar")
_config: SidecarConfig
_http_client: httpx.AsyncClient
_priority_queue: PriorityRequestQueue
_token_bucket: TokenBucket
_token_bucket: AdaptiveTokenBucket
_prometheus: PrometheusMetrics
_health_service: HealthService
_pending_requests: dict[str, tuple[asyncio.Future[httpx.Response], float]]
"""request_id → (response future, enqueued_at) 的映射。"""
_metrics_task: asyncio.Task[None] | None = None
# 统计计数器
_stats: dict[str, int] = {
@@ -207,6 +215,7 @@ async def _worker_loop() -> None:
if not got_token:
log.info("low_priority_timeout", request_id=request_id)
_stats["ratelimited_requests"] += 1
_prometheus.record_request(queue_item.priority.name, "ratelimited")
if not future.done():
future.set_exception(
_RateLimitedError(
@@ -234,6 +243,7 @@ async def _worker_loop() -> None:
timeout=_config.request_timeout,
)
_stats["ratelimited_requests"] += 1
_prometheus.record_request(queue_item.priority.name, "ratelimited")
if not future.done():
future.set_exception(
_RateLimitedError(
@@ -266,6 +276,16 @@ async def _worker_loop() -> None:
queue_latency = time.monotonic() - enqueued_at
total_latency = upstream_latency + queue_latency
is_429: bool = resp.status_code == 429
_token_bucket.record_response(is_429)
# 避退状态评估 + 指标更新
_token_bucket.evaluate_retreat()
retreat_state = _token_bucket.get_retreat_state()
effective_rpm = _token_bucket.get_effective_rate_rpm()
upstream_429_rate = _token_bucket.get_429_rate()
_prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
log.info(
"request_completed",
request_id=request_id,
@@ -273,14 +293,26 @@ async def _worker_loop() -> None:
upstream_latency=round(upstream_latency, 3),
queue_latency=round(queue_latency, 3),
total_latency=round(total_latency, 3),
retreat_state=retreat_state,
effective_rpm=round(effective_rpm, 1),
)
# 记录 Prometheus 指标
model_id = _extract_model(payload) or "unknown"
_prometheus.record_upstream_latency(model_id, upstream_latency)
if not resp.is_success:
_prometheus.record_upstream_error(resp.status_code, model_id)
_prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
_prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
if not future.done():
future.set_result(resp)
except (httpx.HTTPError, OSError) as exc:
log.error("upstream_request_failed", request_id=request_id, error=str(exc))
_stats["upstream_errors"] += 1
_prometheus.record_request(queue_item.priority.name, "error")
_prometheus.set_health(False)
if not future.done():
future.set_exception(exc)
@@ -316,6 +348,9 @@ async def _passthrough_with_rate_limit(
Returns:
FastAPI Response。
"""
_stats["passthrough_requests"] += 1
_prometheus.increment_fallback()
# 低优先级走令牌桶等待
if priority == Priority.LOW:
got_token = await asyncio.to_thread(
@@ -325,6 +360,7 @@ async def _passthrough_with_rate_limit(
)
if not got_token:
_stats["ratelimited_requests"] += 1
_prometheus.record_request(priority.name, "ratelimited")
return JSONResponse(
status_code=429,
content={
@@ -344,6 +380,7 @@ async def _passthrough_with_rate_limit(
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
if time.monotonic() > deadline:
_stats["ratelimited_requests"] += 1
_prometheus.record_request(priority.name, "ratelimited")
return JSONResponse(
status_code=429,
content={
@@ -364,10 +401,18 @@ async def _passthrough_with_rate_limit(
headers=clean_headers,
stream=False,
)
retreat_state = _token_bucket.get_retreat_state()
_token_bucket.evaluate_retreat()
_prometheus.update_retreat_metrics(
retreat_state,
_token_bucket.get_effective_rate_rpm(),
_token_bucket.get_429_rate(),
)
return _build_response(resp)
except Exception as exc:
status, msg = _map_exception(exc)
logger.error("passthrough_error", path=path, error=str(exc))
_prometheus.set_health(False)
return JSONResponse(
status_code=status,
content={"error": {"message": msg, "type": type(exc).__name__}},
@@ -412,6 +457,7 @@ def _map_exception(exc: Exception) -> tuple[int, str]:
async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
"""应用生命周期管理:初始化/清理全局资源。"""
global _config, _http_client, _priority_queue, _token_bucket, _pending_requests
global _prometheus, _health_service, _metrics_task
# 启动
_config = load_config()
@@ -421,22 +467,40 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
timeout=httpx.Timeout(_config.request_timeout),
)
_priority_queue = PriorityRequestQueue(max_size=_config.queue_max_size)
_token_bucket = TokenBucket(
_token_bucket = AdaptiveTokenBucket(
rate=_config.rate_rpm / 60.0,
capacity=_config.bucket_capacity,
)
_prometheus = PrometheusMetrics()
_health_service = HealthService()
_pending_requests = {}
_stats["start_time"] = int(time.time())
# 启动 worker 协程
worker_task = asyncio.create_task(_worker_loop())
# 在独立端口 :9191 启动 Prometheus metrics 服务器
metrics_app = _prometheus.build_asgi_app()
metrics_config = uvicorn.Config(
metrics_app,
host=_config.listen_host,
port=_config.metrics_port,
log_level="error",
)
metrics_server = uvicorn.Server(metrics_config)
_metrics_task = asyncio.create_task(metrics_server.serve())
# 挂载 webui 子路由
app.include_router(webui_router)
logger.info(
"sidecar_started",
host=_config.listen_host,
port=_config.listen_port,
metrics_port=_config.metrics_port,
rate_rpm=_config.rate_rpm,
queue_max=_config.queue_max_size,
retreat_enabled=True,
)
yield # app 运行中
@@ -448,6 +512,13 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
except asyncio.CancelledError:
pass
if _metrics_task is not None:
_metrics_task.cancel()
try:
await _metrics_task
except asyncio.CancelledError:
pass
await _http_client.aclose()
logger.info("sidecar_stopped")
@@ -610,21 +681,28 @@ def _build_response(resp: httpx.Response) -> Response:
@app.get("/health")
async def health() -> dict[str, Any]:
"""健康检查端点"""
queue_stats = await _priority_queue.get_stats()
"""存活检查 (liveness)"""
return _health_service.liveness()
@app.get("/health/ready")
async def health_ready() -> dict[str, Any]:
"""就绪检查 (readiness),含上游连通性。"""
queue_size = await _priority_queue.get_queue_size()
bucket_status = _token_bucket.get_status()
return {
"status": "ok",
"version": "0.1.0",
"uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
"queue": queue_stats,
"token_bucket": bucket_status,
}
return await _health_service.readiness(
upstream_url=_config.upstream_url,
upstream_api_key=_config.upstream_api_key or "",
queue_current_size=queue_size,
queue_max_size=_config.queue_max_size,
available_tokens=bucket_status["tokens"],
bucket_capacity=bucket_status["capacity"],
)
@app.get("/metrics")
async def metrics() -> dict[str, Any]:
"""Prometheus 格式 metrics 端点"""
@app.get("/status")
async def status() -> dict[str, Any]:
"""调试用:限流器 + 队列 + 避退完整状态"""
queue_stats = await _priority_queue.get_stats()
bucket_status = _token_bucket.get_status()
return {
@@ -640,6 +718,12 @@ async def metrics() -> dict[str, Any]:
},
"queue": queue_stats,
"token_bucket": bucket_status,
"retreat": {
"state": _token_bucket.get_retreat_state(),
"effective_rpm": round(_token_bucket.get_effective_rate_rpm(), 1),
"base_rpm": round(_token_bucket.get_base_rate_rpm(), 1),
"upstream_429_rate": round(_token_bucket.get_429_rate(), 4),
},
"uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
}