fix(sidecar-v2): incorporate review feedback - P0/P1 fixes
P0 fixes:
- Admin API Bearer Token auth middleware
- Encryption key missing -> CRITICAL log + sys.exit(1)
- Prometheus metrics endpoint (:9191)
- requirements.txt + Dockerfile + docker-compose.yml + systemd + nginx

P1 fixes:
- Dead code removed from _refresh_cooldowns()
- Stream detection fixed (text/event-stream only)
- Emergency passthrough (10% RPM retry before 503)
- Active health probing for backends
- SQLite daily backup loop with retention
- Chart.js CDN fallback
- Key rotation SOP document
- JSON log format support
- Deploy files: systemd unit + nginx config

BIZ-52 review re-entry

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
@@ -82,7 +82,9 @@ def build_response(resp: httpx.Response) -> Response:
|
||||
if k.lower() not in ("content-encoding", "transfer-encoding")
|
||||
}
|
||||
|
||||
if "text/event-stream" in content_type or "stream" in content_type:
|
||||
is_sse = "text/event-stream" in content_type
|
||||
is_chunked = resp.headers.get("transfer-encoding", "").lower() == "chunked"
|
||||
if is_sse or (is_chunked and headers.get("content-type", "") != "application/octet-stream"):
|
||||
return StreamingResponse(
|
||||
content=resp.aiter_bytes(),
|
||||
status_code=resp.status_code,
|
||||
@@ -176,7 +178,7 @@ async def handle_proxy_request(
|
||||
max_retries = config.max_pool_retries
|
||||
for attempt in range(max_retries):
|
||||
# Check and clear expired cooldowns before picking
|
||||
_refresh_cooldowns(pool_manager)
|
||||
_refresh_cooldowns()
|
||||
|
||||
backend = router.pick_backend(canonical_model)
|
||||
if backend is None:
|
||||
@@ -286,7 +288,7 @@ async def handle_proxy_request(
|
||||
)
|
||||
continue
|
||||
|
||||
# All backends exhausted — emergency rate-limited passthrough
|
||||
# All pools exhausted — emergency rate-limited passthrough
|
||||
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
|
||||
if emergency_rpm < 1:
|
||||
emergency_rpm = 1
|
||||
@@ -297,7 +299,60 @@ async def handle_proxy_request(
|
||||
emergency_rpm=emergency_rpm,
|
||||
)
|
||||
|
||||
# Emergency: just return a clear error telling OpenClaw to use its fallback
|
||||
# Emergency: try to get a token from any fallback backend at reduced RPM
|
||||
emergency_retries = 3
|
||||
for attempt in range(emergency_retries):
|
||||
backends = pool_manager.get_any_healthy_backends()
|
||||
for backend in backends:
|
||||
if rate_limiter.consume(backend.id, emergency_rpm):
|
||||
try:
|
||||
resp = await forward_to_backend(
|
||||
backend=backend,
|
||||
method=request.method,
|
||||
path=path,
|
||||
body=body_bytes if body_bytes else None,
|
||||
headers=raw_headers,
|
||||
stream=is_stream,
|
||||
)
|
||||
elapsed_ms = int((time.monotonic() - start_time) * 1000)
|
||||
|
||||
if resp.status_code == 429:
|
||||
start_cooldown(backend.id, backend.consecutive_429_count + 1)
|
||||
continue
|
||||
|
||||
# Success in emergency mode
|
||||
try:
|
||||
resp_json: dict[str, Any] = {}
|
||||
if not is_stream and resp.content:
|
||||
resp_json = json.loads(resp.content)
|
||||
except Exception:
|
||||
resp_json = {}
|
||||
|
||||
prompt_tokens, completion_tokens, total_tokens = extract_usage_from_response(
|
||||
resp, resp_json, canonical_model
|
||||
)
|
||||
cost_em = calculate_cost(backend, canonical_model, prompt_tokens, completion_tokens)
|
||||
|
||||
record_usage(
|
||||
backend_id=backend.id,
|
||||
model=canonical_model,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
cost=cost_em,
|
||||
latency_ms=elapsed_ms,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"emergency_passthrough_success",
|
||||
backend_id=backend.id,
|
||||
model=canonical_model,
|
||||
emergency_rpm=emergency_rpm,
|
||||
)
|
||||
return build_response(resp)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# All emergency attempts failed — return 503 for OpenClaw fallback chain
|
||||
return build_error_response(
|
||||
503,
|
||||
"All provider pools exhausted. OpenClaw fallback chain should activate.",
|
||||
@@ -305,15 +360,11 @@ async def handle_proxy_request(
|
||||
)
|
||||
|
||||
|
||||
def _refresh_cooldowns(pool_manager: PoolManager) -> None:
|
||||
"""Check and clear expired cooldowns for all active backends."""
|
||||
for pool in ["primary", "fallback"]:
|
||||
backends = pool_manager.get_any_healthy_backends(pool=pool)
|
||||
for backend in backends:
|
||||
# Only check backends in non-healthy state
|
||||
pass
|
||||
def _refresh_cooldowns() -> None:
|
||||
"""Check and clear expired cooldowns for backends currently in cooling state.
|
||||
|
||||
# Actually check all backends including cooling ones
|
||||
Only queries backends with status='cooling' (the health_check_loop handles
|
||||
the periodic scanning; this is the on-demand refresh before proxy routing)."""
|
||||
from storage.backend_store import list_backends
|
||||
backends = list_backends(decrypt_key=False)
|
||||
for backend in backends:
|
||||
|
||||
Reference in New Issue
Block a user