fix(sidecar-v2): incorporate review feedback - P0/P1 fixes

P0 fixes:
- Admin API Bearer Token auth middleware
- Encryption key missing -> CRITICAL log + sys.exit(1)
- Prometheus metrics endpoint (:9191)
- requirements.txt + Dockerfile + docker-compose.yml + systemd + nginx

P1 fixes:
- Dead code removed from _refresh_cooldowns()
- Stream detection fixed (text/event-stream, or chunked responses that are not application/octet-stream; a bare "stream" substring in the content type no longer matches)
- Emergency passthrough (10% RPM retry before 503)
- Active health probing for backends
- SQLite daily backup loop with retention
- Chart.js CDN fallback
- Key rotation SOP document
- JSON log format support
- Deploy files: systemd unit + nginx config

BIZ-52 review re-entry

Co-authored-by: multica-agent <github@multica.ai>
2026-06-25 17:11:35 +08:00
parent 611ebd11a8
commit 4f415fb500
9 changed files with 630 additions and 93 deletions
@@ -82,7 +82,9 @@ def build_response(resp: httpx.Response) -> Response:
        if k.lower() not in ("content-encoding", "transfer-encoding")
    }

-    if "text/event-stream" in content_type or "stream" in content_type:
+    is_sse = "text/event-stream" in content_type
+    is_chunked = resp.headers.get("transfer-encoding", "").lower() == "chunked"
+    if is_sse or (is_chunked and headers.get("content-type", "") != "application/octet-stream"):
        return StreamingResponse(
            content=resp.aiter_bytes(),
            status_code=resp.status_code,
@@ -176,7 +178,7 @@ async def handle_proxy_request(
    max_retries = config.max_pool_retries
    for attempt in range(max_retries):
        # Check and clear expired cooldowns before picking
-        _refresh_cooldowns(pool_manager)
+        _refresh_cooldowns()

        backend = router.pick_backend(canonical_model)
        if backend is None:
@@ -286,7 +288,7 @@ async def handle_proxy_request(
            )
            continue

-    # All backends exhausted — emergency rate-limited passthrough
+    # All pools exhausted — emergency rate-limited passthrough
    emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
    if emergency_rpm < 1:
        emergency_rpm = 1
@@ -297,7 +299,60 @@ async def handle_proxy_request(
        emergency_rpm=emergency_rpm,
    )

-    # Emergency: just return a clear error telling OpenClaw to use its fallback
+    # Emergency: try to get a token from any fallback backend at reduced RPM
+    emergency_retries = 3
+    for attempt in range(emergency_retries):
+        backends = pool_manager.get_any_healthy_backends()
+        for backend in backends:
+            if rate_limiter.consume(backend.id, emergency_rpm):
+                try:
+                    resp = await forward_to_backend(
+                        backend=backend,
+                        method=request.method,
+                        path=path,
+                        body=body_bytes if body_bytes else None,
+                        headers=raw_headers,
+                        stream=is_stream,
+                    )
+                    elapsed_ms = int((time.monotonic() - start_time) * 1000)
+
+                    if resp.status_code == 429:
+                        start_cooldown(backend.id, backend.consecutive_429_count + 1)
+                        continue
+
+                    # Success in emergency mode
+                    try:
+                        resp_json: dict[str, Any] = {}
+                        if not is_stream and resp.content:
+                            resp_json = json.loads(resp.content)
+                    except Exception:
+                        resp_json = {}
+
+                    prompt_tokens, completion_tokens, total_tokens = extract_usage_from_response(
+                        resp, resp_json, canonical_model
+                    )
+                    cost_em = calculate_cost(backend, canonical_model, prompt_tokens, completion_tokens)
+
+                    record_usage(
+                        backend_id=backend.id,
+                        model=canonical_model,
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        cost=cost_em,
+                        latency_ms=elapsed_ms,
+                    )
+
+                    logger.info(
+                        "emergency_passthrough_success",
+                        backend_id=backend.id,
+                        model=canonical_model,
+                        emergency_rpm=emergency_rpm,
+                    )
+                    return build_response(resp)
+                except Exception:
+                    continue
+
+    # All emergency attempts failed — return 503 for OpenClaw fallback chain
    return build_error_response(
        503,
        "All provider pools exhausted. OpenClaw fallback chain should activate.",
@@ -305,15 +360,11 @@ async def handle_proxy_request(
    )


-def _refresh_cooldowns(pool_manager: PoolManager) -> None:
-    """Check and clear expired cooldowns for all active backends."""
-    for pool in ["primary", "fallback"]:
-        backends = pool_manager.get_any_healthy_backends(pool=pool)
-        for backend in backends:
-            # Only check backends in non-healthy state
-            pass
+def _refresh_cooldowns() -> None:
+    """Check and clear expired cooldowns for backends currently in cooling state.

-    # Actually check all backends including cooling ones
+    Only queries backends with status='cooling' (the health_check_loop handles
+    the periodic scanning; this is the on-demand refresh before proxy routing)."""
    from storage.backend_store import list_backends
    backends = list_backends(decrypt_key=False)
    for backend in backends: