fix(BIZ-42): critical deadlock + major review issues

- Fix #1 (Critical): evaluate_retreat() deadlock — changed _retreat_lock to
  threading.RLock() to allow reentrant acquisition when evaluate_retreat()
  calls get_429_rate() while holding the lock
- Fix #2 (Major): Dashboard queue chart now uses snap.queue.per_priority
  instead of Math.random() mock data
- Fix #3 (Major): structlog uses JSONRenderer instead of ConsoleRenderer
  for JSON-format output as required by acceptance criteria
- Bonus: webui.py _build_snapshot() is now async and includes queue data
  with per-priority depth for dashboard consumption

Reviewed-by: 梁思筑 (architect)
Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-24 12:20:23 +08:00
parent e829a4060b
commit ba5b932f50
4 changed files with 28 additions and 12 deletions
+2 -2
View File
@@ -295,8 +295,8 @@ class AdaptiveTokenBucket(TokenBucket):
# 上次状态变更时间 # 上次状态变更时间
self._last_state_change: float = time.monotonic() self._last_state_change: float = time.monotonic()
# 避退状态锁 # 避退状态锁(RLock 防止 evaluate_retreat() → get_429_rate() 重入死锁)
self._retreat_lock: threading.Lock = threading.Lock() self._retreat_lock: threading.RLock = threading.RLock()
# ---- 429 反馈 ---- # ---- 429 反馈 ----
+1 -2
View File
@@ -52,8 +52,7 @@ structlog.configure(
structlog.processors.StackInfoRenderer(), structlog.processors.StackInfoRenderer(),
structlog.processors.format_exc_info, structlog.processors.format_exc_info,
structlog.processors.UnicodeDecoder(), structlog.processors.UnicodeDecoder(),
# 生产环境推荐 JSONRenderer,开发环境可用 ConsoleRenderer structlog.processors.JSONRenderer(),
structlog.dev.ConsoleRenderer(),
], ],
context_class=dict, context_class=dict,
logger_factory=structlog.PrintLoggerFactory(), logger_factory=structlog.PrintLoggerFactory(),
@@ -186,12 +186,13 @@ function updateDashboard(snap) {
]; ];
chartTokens.update(); chartTokens.update();
const mb = (snap.metrics_buffer || {}); const qs = snap.queue || {};
const perPriority = qs.per_priority || {};
chartQueue.data.datasets[0].data = [ chartQueue.data.datasets[0].data = [
Math.round(Math.random() * 5), perPriority.URGENT || 0,
Math.round(Math.random() * 10), perPriority.HIGH || 0,
Math.round(Math.random() * 15), perPriority.NORMAL || 0,
Math.round(Math.random() * 20) perPriority.LOW || 0
]; ];
chartQueue.update(); chartQueue.update();
+19 -3
View File
@@ -48,7 +48,7 @@ async def _dashboard_stream(request: Request) -> StreamingResponse:
if await request.is_disconnected(): if await request.is_disconnected():
break break
try: try:
snapshot: dict[str, Any] = _build_snapshot() snapshot: dict[str, Any] = await _build_snapshot()
yield f"data: {json.dumps(snapshot, ensure_ascii=False)}\n\n" yield f"data: {json.dumps(snapshot, ensure_ascii=False)}\n\n"
except Exception: except Exception:
logger.exception("dashboard_sse_error") logger.exception("dashboard_sse_error")
@@ -65,8 +65,8 @@ async def _dashboard_stream(request: Request) -> StreamingResponse:
) )
def _build_snapshot() -> dict[str, Any]: async def _build_snapshot() -> dict[str, Any]:
"""构建当前状态快照(同步部分,从全局状态读取)。""" """构建当前状态快照(从全局状态读取,含队列深度)。"""
# 延迟导入避免循环依赖 # 延迟导入避免循环依赖
from nvidia_sidecar import server from nvidia_sidecar import server
@@ -77,10 +77,26 @@ def _build_snapshot() -> dict[str, Any]:
now = time.time() now = time.time()
uptime = int(now - _stats["start_time"]) if _stats.get("start_time") else 0 uptime = int(now - _stats["start_time"]) if _stats.get("start_time") else 0
# 获取队列统计数据(含 per-priority depth)
queue_data: dict[str, Any] = {"current_size": 0, "per_priority": {}}
try:
queue_stats = await server._priority_queue.get_stats()
queue_data = {
"max_size": queue_stats.get("max_size", 0),
"current_size": queue_stats.get("current_size", 0),
"per_priority": queue_stats.get("depth_by_priority", {}),
"total_enqueued": queue_stats.get("total_enqueued", 0),
"total_dequeued": queue_stats.get("total_dequeued", 0),
"total_dropped": queue_stats.get("total_dropped", 0),
}
except Exception:
pass
return { return {
"timestamp": now, "timestamp": now,
"uptime_seconds": uptime, "uptime_seconds": uptime,
"token_bucket": bucket_status, "token_bucket": bucket_status,
"queue": queue_data,
"retreat": { "retreat": {
"state": getattr(_token_bucket, "_retreat_state", "normal"), "state": getattr(_token_bucket, "_retreat_state", "normal"),
"effective_rpm": round(getattr(_token_bucket, "get_effective_rate_rpm", lambda: 40.0)(), 1), "effective_rpm": round(getattr(_token_bucket, "get_effective_rate_rpm", lambda: 40.0)(), 1),