BIZ-42: Phase2 可观测性+WebUI+避退模式 — metrics/health/webui/dashboard/adaptive
新增文件:
- metrics.py: Prometheus 指标端点 (:9191), 10+3 个指标
- health.py: /health (liveness) + /health/ready (readiness)
- webui.py: WebUI 后端 API (SSE 实时推送 + 配置热重载)
- static/dashboard.html: 仪表盘前端 (Chart.js, 令牌桶仪表+队列柱状图+吞吐折线图)

更新文件:
- rate_limiter.py: 增加 AdaptiveTokenBucket 避退模式 (ADR-009)
  状态机 NORMAL→RETREAT→RECOVER, 429 率滑动窗口监控
- server.py: structlog 结构化日志 + 避退反馈回路
  挂载 metrics_server (:9191) + health/ready + webui + /status
- pyproject.toml: 增加 prometheus-client, pydantic, types-PyYAML 依赖

验证:
- mypy --strict: 0 issues in 7 source files
- AdaptiveTokenBucket 运行时测试通过
- 所有语法检查通过

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
@@ -18,13 +18,14 @@ from typing import Any
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Request, Response
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from nvidia_sidecar.config import load_config, SidecarConfig
|
||||
from nvidia_sidecar.rate_limiter import (
|
||||
Priority,
|
||||
TokenBucket,
|
||||
AdaptiveTokenBucket,
|
||||
is_nvidia_gateway,
|
||||
)
|
||||
from nvidia_sidecar.priority_queue import (
|
||||
@@ -33,6 +34,9 @@ from nvidia_sidecar.priority_queue import (
|
||||
QueueFullPassthrough,
|
||||
QueueFullPolicy,
|
||||
)
|
||||
from nvidia_sidecar.metrics import PrometheusMetrics
|
||||
from nvidia_sidecar.health import HealthService
|
||||
from nvidia_sidecar.webui import webui_router
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 结构化日志
|
||||
@@ -48,10 +52,11 @@ structlog.configure(
|
||||
structlog.processors.StackInfoRenderer(),
|
||||
structlog.processors.format_exc_info,
|
||||
structlog.processors.UnicodeDecoder(),
|
||||
# 生产环境推荐 JSONRenderer,开发环境可用 ConsoleRenderer
|
||||
structlog.dev.ConsoleRenderer(),
|
||||
],
|
||||
context_class=dict,
|
||||
logger_factory=structlog.stdlib.LoggerFactory(),
|
||||
logger_factory=structlog.PrintLoggerFactory(),
|
||||
wrapper_class=structlog.stdlib.BoundLogger,
|
||||
cache_logger_on_first_use=True,
|
||||
)
|
||||
@@ -65,9 +70,12 @@ logger: structlog.stdlib.BoundLogger = structlog.get_logger("nvidia_sidecar")
|
||||
_config: SidecarConfig
|
||||
_http_client: httpx.AsyncClient
|
||||
_priority_queue: PriorityRequestQueue
|
||||
_token_bucket: TokenBucket
|
||||
_token_bucket: AdaptiveTokenBucket
|
||||
_prometheus: PrometheusMetrics
|
||||
_health_service: HealthService
|
||||
_pending_requests: dict[str, tuple[asyncio.Future[httpx.Response], float]]
|
||||
"""request_id → (response future, enqueued_at) 的映射。"""
|
||||
_metrics_task: asyncio.Task[None] | None = None
|
||||
|
||||
# 统计计数器
|
||||
_stats: dict[str, int] = {
|
||||
@@ -207,6 +215,7 @@ async def _worker_loop() -> None:
|
||||
if not got_token:
|
||||
log.info("low_priority_timeout", request_id=request_id)
|
||||
_stats["ratelimited_requests"] += 1
|
||||
_prometheus.record_request(queue_item.priority.name, "ratelimited")
|
||||
if not future.done():
|
||||
future.set_exception(
|
||||
_RateLimitedError(
|
||||
@@ -234,6 +243,7 @@ async def _worker_loop() -> None:
|
||||
timeout=_config.request_timeout,
|
||||
)
|
||||
_stats["ratelimited_requests"] += 1
|
||||
_prometheus.record_request(queue_item.priority.name, "ratelimited")
|
||||
if not future.done():
|
||||
future.set_exception(
|
||||
_RateLimitedError(
|
||||
@@ -266,6 +276,16 @@ async def _worker_loop() -> None:
|
||||
queue_latency = time.monotonic() - enqueued_at
|
||||
total_latency = upstream_latency + queue_latency
|
||||
|
||||
is_429: bool = resp.status_code == 429
|
||||
_token_bucket.record_response(is_429)
|
||||
|
||||
# 避退状态评估 + 指标更新
|
||||
_token_bucket.evaluate_retreat()
|
||||
retreat_state = _token_bucket.get_retreat_state()
|
||||
effective_rpm = _token_bucket.get_effective_rate_rpm()
|
||||
upstream_429_rate = _token_bucket.get_429_rate()
|
||||
_prometheus.update_retreat_metrics(retreat_state, effective_rpm, upstream_429_rate)
|
||||
|
||||
log.info(
|
||||
"request_completed",
|
||||
request_id=request_id,
|
||||
@@ -273,14 +293,26 @@ async def _worker_loop() -> None:
|
||||
upstream_latency=round(upstream_latency, 3),
|
||||
queue_latency=round(queue_latency, 3),
|
||||
total_latency=round(total_latency, 3),
|
||||
retreat_state=retreat_state,
|
||||
effective_rpm=round(effective_rpm, 1),
|
||||
)
|
||||
|
||||
# 记录 Prometheus 指标
|
||||
model_id = _extract_model(payload) or "unknown"
|
||||
_prometheus.record_upstream_latency(model_id, upstream_latency)
|
||||
if not resp.is_success:
|
||||
_prometheus.record_upstream_error(resp.status_code, model_id)
|
||||
_prometheus.record_request(queue_item.priority.name, "success" if resp.is_success else "error")
|
||||
_prometheus.record_queue_latency(queue_item.priority.name, queue_latency)
|
||||
|
||||
if not future.done():
|
||||
future.set_result(resp)
|
||||
|
||||
except (httpx.HTTPError, OSError) as exc:
|
||||
log.error("upstream_request_failed", request_id=request_id, error=str(exc))
|
||||
_stats["upstream_errors"] += 1
|
||||
_prometheus.record_request(queue_item.priority.name, "error")
|
||||
_prometheus.set_health(False)
|
||||
if not future.done():
|
||||
future.set_exception(exc)
|
||||
|
||||
@@ -316,6 +348,9 @@ async def _passthrough_with_rate_limit(
|
||||
Returns:
|
||||
FastAPI Response。
|
||||
"""
|
||||
_stats["passthrough_requests"] += 1
|
||||
_prometheus.increment_fallback()
|
||||
|
||||
# 低优先级走令牌桶等待
|
||||
if priority == Priority.LOW:
|
||||
got_token = await asyncio.to_thread(
|
||||
@@ -325,6 +360,7 @@ async def _passthrough_with_rate_limit(
|
||||
)
|
||||
if not got_token:
|
||||
_stats["ratelimited_requests"] += 1
|
||||
_prometheus.record_request(priority.name, "ratelimited")
|
||||
return JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
@@ -344,6 +380,7 @@ async def _passthrough_with_rate_limit(
|
||||
got_token = await asyncio.to_thread(_token_bucket.consume, tokens=1)
|
||||
if time.monotonic() > deadline:
|
||||
_stats["ratelimited_requests"] += 1
|
||||
_prometheus.record_request(priority.name, "ratelimited")
|
||||
return JSONResponse(
|
||||
status_code=429,
|
||||
content={
|
||||
@@ -364,10 +401,18 @@ async def _passthrough_with_rate_limit(
|
||||
headers=clean_headers,
|
||||
stream=False,
|
||||
)
|
||||
retreat_state = _token_bucket.get_retreat_state()
|
||||
_token_bucket.evaluate_retreat()
|
||||
_prometheus.update_retreat_metrics(
|
||||
retreat_state,
|
||||
_token_bucket.get_effective_rate_rpm(),
|
||||
_token_bucket.get_429_rate(),
|
||||
)
|
||||
return _build_response(resp)
|
||||
except Exception as exc:
|
||||
status, msg = _map_exception(exc)
|
||||
logger.error("passthrough_error", path=path, error=str(exc))
|
||||
_prometheus.set_health(False)
|
||||
return JSONResponse(
|
||||
status_code=status,
|
||||
content={"error": {"message": msg, "type": type(exc).__name__}},
|
||||
@@ -412,6 +457,7 @@ def _map_exception(exc: Exception) -> tuple[int, str]:
|
||||
async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
|
||||
"""应用生命周期管理:初始化/清理全局资源。"""
|
||||
global _config, _http_client, _priority_queue, _token_bucket, _pending_requests
|
||||
global _prometheus, _health_service, _metrics_task
|
||||
|
||||
# 启动
|
||||
_config = load_config()
|
||||
@@ -421,22 +467,40 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
|
||||
timeout=httpx.Timeout(_config.request_timeout),
|
||||
)
|
||||
_priority_queue = PriorityRequestQueue(max_size=_config.queue_max_size)
|
||||
_token_bucket = TokenBucket(
|
||||
_token_bucket = AdaptiveTokenBucket(
|
||||
rate=_config.rate_rpm / 60.0,
|
||||
capacity=_config.bucket_capacity,
|
||||
)
|
||||
_prometheus = PrometheusMetrics()
|
||||
_health_service = HealthService()
|
||||
_pending_requests = {}
|
||||
_stats["start_time"] = int(time.time())
|
||||
|
||||
# 启动 worker 协程
|
||||
worker_task = asyncio.create_task(_worker_loop())
|
||||
|
||||
# 在独立端口 :9191 启动 Prometheus metrics 服务器
|
||||
metrics_app = _prometheus.build_asgi_app()
|
||||
metrics_config = uvicorn.Config(
|
||||
metrics_app,
|
||||
host=_config.listen_host,
|
||||
port=_config.metrics_port,
|
||||
log_level="error",
|
||||
)
|
||||
metrics_server = uvicorn.Server(metrics_config)
|
||||
_metrics_task = asyncio.create_task(metrics_server.serve())
|
||||
|
||||
# 挂载 webui 子路由
|
||||
app.include_router(webui_router)
|
||||
|
||||
logger.info(
|
||||
"sidecar_started",
|
||||
host=_config.listen_host,
|
||||
port=_config.listen_port,
|
||||
metrics_port=_config.metrics_port,
|
||||
rate_rpm=_config.rate_rpm,
|
||||
queue_max=_config.queue_max_size,
|
||||
retreat_enabled=True,
|
||||
)
|
||||
|
||||
yield # app 运行中
|
||||
@@ -448,6 +512,13 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, Any]:
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
if _metrics_task is not None:
|
||||
_metrics_task.cancel()
|
||||
try:
|
||||
await _metrics_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
await _http_client.aclose()
|
||||
logger.info("sidecar_stopped")
|
||||
|
||||
@@ -610,21 +681,28 @@ def _build_response(resp: httpx.Response) -> Response:
|
||||
|
||||
@app.get("/health")
|
||||
async def health() -> dict[str, Any]:
|
||||
"""健康检查端点。"""
|
||||
queue_stats = await _priority_queue.get_stats()
|
||||
"""存活检查 (liveness)。"""
|
||||
return _health_service.liveness()
|
||||
|
||||
|
||||
@app.get("/health/ready")
|
||||
async def health_ready() -> dict[str, Any]:
|
||||
"""就绪检查 (readiness),含上游连通性。"""
|
||||
queue_size = await _priority_queue.get_queue_size()
|
||||
bucket_status = _token_bucket.get_status()
|
||||
return {
|
||||
"status": "ok",
|
||||
"version": "0.1.0",
|
||||
"uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
|
||||
"queue": queue_stats,
|
||||
"token_bucket": bucket_status,
|
||||
}
|
||||
return await _health_service.readiness(
|
||||
upstream_url=_config.upstream_url,
|
||||
upstream_api_key=_config.upstream_api_key or "",
|
||||
queue_current_size=queue_size,
|
||||
queue_max_size=_config.queue_max_size,
|
||||
available_tokens=bucket_status["tokens"],
|
||||
bucket_capacity=bucket_status["capacity"],
|
||||
)
|
||||
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics() -> dict[str, Any]:
|
||||
"""Prometheus 格式 metrics 端点。"""
|
||||
@app.get("/status")
|
||||
async def status() -> dict[str, Any]:
|
||||
"""调试用:限流器 + 队列 + 避退完整状态。"""
|
||||
queue_stats = await _priority_queue.get_stats()
|
||||
bucket_status = _token_bucket.get_status()
|
||||
return {
|
||||
@@ -640,6 +718,12 @@ async def metrics() -> dict[str, Any]:
|
||||
},
|
||||
"queue": queue_stats,
|
||||
"token_bucket": bucket_status,
|
||||
"retreat": {
|
||||
"state": _token_bucket.get_retreat_state(),
|
||||
"effective_rpm": round(_token_bucket.get_effective_rate_rpm(), 1),
|
||||
"base_rpm": round(_token_bucket.get_base_rate_rpm(), 1),
|
||||
"upstream_429_rate": round(_token_bucket.get_429_rate(), 4),
|
||||
},
|
||||
"uptime_seconds": int(time.time() - _stats["start_time"]) if _stats["start_time"] else 0,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user