fix(sidecar-v2): incorporate review feedback - P0/P1 fixes

P0 fixes:
- Admin API Bearer Token auth middleware
- Encryption key missing -> CRITICAL log + sys.exit(1)
- Prometheus metrics endpoint (:9191)
- requirements.txt + Dockerfile + docker-compose.yml + systemd + nginx

P1 fixes:
- Dead code removed from _refresh_cooldowns()
- Stream detection fixed (text/event-stream only)
- Emergency passthrough (10% RPM retry before 503)
- Active health probing for backends
- SQLite daily backup loop with retention
- Chart.js CDN fallback
- Key rotation SOP document
- JSON log format support
- Deploy files: systemd unit + nginx config

BIZ-52 review re-entry

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-25 17:11:35 +08:00
parent 611ebd11a8
commit 4f415fb500
9 changed files with 630 additions and 93 deletions
+63 -12
View File
@@ -82,7 +82,9 @@ def build_response(resp: httpx.Response) -> Response:
if k.lower() not in ("content-encoding", "transfer-encoding")
}
if "text/event-stream" in content_type or "stream" in content_type:
is_sse = "text/event-stream" in content_type
is_chunked = resp.headers.get("transfer-encoding", "").lower() == "chunked"
if is_sse or (is_chunked and headers.get("content-type", "") != "application/octet-stream"):
return StreamingResponse(
content=resp.aiter_bytes(),
status_code=resp.status_code,
@@ -176,7 +178,7 @@ async def handle_proxy_request(
max_retries = config.max_pool_retries
for attempt in range(max_retries):
# Check and clear expired cooldowns before picking
_refresh_cooldowns(pool_manager)
_refresh_cooldowns()
backend = router.pick_backend(canonical_model)
if backend is None:
@@ -286,7 +288,7 @@ async def handle_proxy_request(
)
continue
# All backends exhausted — emergency rate-limited passthrough
# All pools exhausted — emergency rate-limited passthrough
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
if emergency_rpm < 1:
emergency_rpm = 1
@@ -297,7 +299,60 @@ async def handle_proxy_request(
emergency_rpm=emergency_rpm,
)
# Emergency: just return a clear error telling OpenClaw to use its fallback
# Emergency: try to get a token from any fallback backend at reduced RPM
emergency_retries = 3
for attempt in range(emergency_retries):
backends = pool_manager.get_any_healthy_backends()
for backend in backends:
if rate_limiter.consume(backend.id, emergency_rpm):
try:
resp = await forward_to_backend(
backend=backend,
method=request.method,
path=path,
body=body_bytes if body_bytes else None,
headers=raw_headers,
stream=is_stream,
)
elapsed_ms = int((time.monotonic() - start_time) * 1000)
if resp.status_code == 429:
start_cooldown(backend.id, backend.consecutive_429_count + 1)
continue
# Success in emergency mode
try:
resp_json: dict[str, Any] = {}
if not is_stream and resp.content:
resp_json = json.loads(resp.content)
except Exception:
resp_json = {}
prompt_tokens, completion_tokens, total_tokens = extract_usage_from_response(
resp, resp_json, canonical_model
)
cost_em = calculate_cost(backend, canonical_model, prompt_tokens, completion_tokens)
record_usage(
backend_id=backend.id,
model=canonical_model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
cost=cost_em,
latency_ms=elapsed_ms,
)
logger.info(
"emergency_passthrough_success",
backend_id=backend.id,
model=canonical_model,
emergency_rpm=emergency_rpm,
)
return build_response(resp)
except Exception:
continue
# All emergency attempts failed — return 503 for OpenClaw fallback chain
return build_error_response(
503,
"All provider pools exhausted. OpenClaw fallback chain should activate.",
@@ -305,15 +360,11 @@ async def handle_proxy_request(
)
def _refresh_cooldowns(pool_manager: PoolManager) -> None:
"""Check and clear expired cooldowns for all active backends."""
for pool in ["primary", "fallback"]:
backends = pool_manager.get_any_healthy_backends(pool=pool)
for backend in backends:
# Only check backends in non-healthy state
pass
def _refresh_cooldowns() -> None:
"""Check and clear expired cooldowns for backends currently in cooling state.
# Actually check all backends including cooling ones
Only queries backends with status='cooling' (the health_check_loop handles
the periodic scanning; this is the on-demand refresh before proxy routing)."""
from storage.backend_store import list_backends
backends = list_backends(decrypt_key=False)
for backend in backends: