Files
sidecar-v2/router.py
T
vincent 376ce97d91 feat: Primary-Wait backoff queuing — BIZ-60
When all primary backends are in cooldown, wait and retry the primary pool
before falling through to fallback/emergency. This reduces unnecessary
spend on paid fallback providers during temporary 429 storms.

Config:
- primary_wait_ms (default 5000, env SIDECAR_PRIMARY_WAIT_MS)
- primary_wait_max_retries (default 6, env SIDECAR_PRIMARY_WAIT_MAX_RETRIES)

Implementation:
- config.py: 2 new config fields + env var loading
- router.py: pick_primary_backend() — primary-pool-only selection
- proxy.py: primary-wait loop between standard retries and emergency

Expected win: the 17% error rate seen under high concurrency should drop, and
the emergency passthrough count should fall, because requests wait for the
NVIDIA pool to recover instead of immediately routing to the SiliconFlow fallback.
2026-06-25 22:22:02 +08:00

76 lines
2.7 KiB
Python

"""Model → Backend routing logic for Sidecar V2."""
import structlog
from typing import Optional
from storage.models import Backend
from pool_manager import PoolManager
from rate_limiter import PerBackendRateLimiter
logger = structlog.get_logger("sidecar_v2.router")
class Router:
    """Route model requests to the best available backend.

    Pick strategy:
      1. Primary pool: backends the pool manager reports as available
         for the model.
      2. Rate-limiter check: a backend is skipped when the rate limiter's
         ``consume`` call refuses it (RPM exhausted).
      3. Fallback pool: repeat the two steps above.
      4. If nothing is selectable, return None (caller handles emergency).
    """

    # Order in which pick_backend() walks the pools.
    _POOL_ORDER = ("primary", "fallback")

    def __init__(self, pool_manager: PoolManager, rate_limiter: PerBackendRateLimiter):
        """Store the collaborators used for every routing decision.

        Args:
            pool_manager: Source of candidate backends per model and pool.
            rate_limiter: Per-backend limiter consulted before a backend
                is handed out.
        """
        self._pool_manager = pool_manager
        self._rate_limiter = rate_limiter

    def _pick_from_pool(self, canonical_model: str, pool: str) -> Optional[Backend]:
        """Return the first backend in ``pool`` that the rate limiter accepts.

        Shared by pick_backend() and pick_primary_backend() so that both
        paths select and log identically.

        Args:
            canonical_model: Model name passed through to the pool manager.
            pool: Pool name, e.g. "primary" or "fallback".

        Returns:
            The selected backend, or None when the pool yields no candidate
            or every candidate is refused by the rate limiter.
        """
        backends = self._pool_manager.get_available_backends(
            canonical_model, pool=pool
        )
        for backend in backends:
            # consume() is called once per candidate, in pool order; the
            # first backend it accepts wins.
            if self._rate_limiter.consume(backend.id, backend.rpm_limit):
                return backend
            logger.debug(
                "backend_rate_limited",
                backend_id=backend.id,
                pool=pool,
                model=canonical_model,
            )

        # Distinguish "pool returned nothing" from "pool had candidates but
        # all of them were refused by the rate limiter".
        if not backends:
            logger.debug("pool_exhausted", pool=pool, model=canonical_model)
        else:
            logger.debug("pool_rpm_exhausted", pool=pool, model=canonical_model)
        return None

    def pick_backend(self, canonical_model: str) -> Optional[Backend]:
        """Pick the best available backend for a model.

        Tries the primary pool first, then the fallback pool. Within each
        pool, backends refused by the rate limiter are skipped.

        Args:
            canonical_model: Model name to route.

        Returns:
            The selected backend, or None if no pool produced one.
        """
        for pool in self._POOL_ORDER:
            backend = self._pick_from_pool(canonical_model, pool)
            if backend is not None:
                return backend
        return None

    def pick_primary_backend(self, canonical_model: str) -> Optional[Backend]:
        """Pick a backend from the primary pool only (no fallback).

        Used by Primary-Wait: when all primary backends are cooling, the
        caller waits and retries the primary pool exclusively before
        falling through to fallback.

        Args:
            canonical_model: Model name to route.

        Returns:
            The selected primary backend, or None if none is selectable.
        """
        return self._pick_from_pool(canonical_model, "primary")

    def get_all_pools_exhausted_info(self, canonical_model: str) -> bool:
        """Return True when the pool manager reports no pool available for the model."""
        return not self._pool_manager.is_any_pool_available(canonical_model)