376ce97d91
When all primary backends are in cooldown, wait and retry the primary pool before falling through to fallback/emergency. This reduces unnecessary spend on paid fallback providers during temporary 429 storms. Config: - primary_wait_ms (default 5000, env SIDECAR_PRIMARY_WAIT_MS) - primary_wait_max_retries (default 6, env SIDECAR_PRIMARY_WAIT_MAX_RETRIES) Implementation: - config.py: 2 new config fields + env var loading - router.py: pick_primary_backend() — primary-pool-only selection - proxy.py: primary-wait loop between standard retries and emergency Expected win: the 17% error rate seen under high concurrency drops, and the emergency passthrough count falls, because requests wait for the NVIDIA pool to recover instead of immediately routing to the SiliconFlow fallback.
76 lines
2.7 KiB
Python
76 lines
2.7 KiB
Python
"""Model → Backend routing logic for Sidecar V2."""
|
|
|
|
import structlog
|
|
from typing import Optional
|
|
|
|
from storage.models import Backend
|
|
from pool_manager import PoolManager
|
|
from rate_limiter import PerBackendRateLimiter
|
|
|
|
# Module-level structlog logger for the router, named "sidecar_v2.router".
logger = structlog.get_logger("sidecar_v2.router")
|
|
|
|
|
|
class Router:
    """Route model requests to the best available backend.

    Pick strategy:
      1. Primary pool → backends the pool manager reports as available
         for the model
      2. Rate-limiter check → skip a backend when ``consume()`` refuses it
      3. Fallback pool → repeat above
      4. If all exhausted → return None (caller handles emergency)
    """

    # Pool names in the order pick_backend() tries them.
    _POOL_ORDER = ("primary", "fallback")

    def __init__(
        self,
        pool_manager: "PoolManager",
        rate_limiter: "PerBackendRateLimiter",
    ):
        """Store the collaborators used for backend selection.

        Args:
            pool_manager: Supplies the candidate backends per pool and the
                overall pool-availability check.
            rate_limiter: Consulted once per candidate backend via
                ``consume(backend.id, backend.rpm_limit)``.
        """
        self._pool_manager = pool_manager
        self._rate_limiter = rate_limiter

    def _pick_from_pool(
        self, canonical_model: str, pool: str
    ) -> Optional["Backend"]:
        """Return the first backend in ``pool`` the rate limiter accepts.

        Shared by pick_backend() and pick_primary_backend() so both apply
        the same selection rule and emit the same debug events.

        Args:
            canonical_model: Model name passed to the pool manager.
            pool: Pool name passed to the pool manager (e.g. "primary").

        Returns:
            The first backend for which ``consume()`` returns a truthy
            value, or None when the pool yields no such backend.
        """
        backends = self._pool_manager.get_available_backends(
            canonical_model, pool=pool
        )
        for backend in backends:
            if self._rate_limiter.consume(backend.id, backend.rpm_limit):
                return backend
            # Rate limiter refused this backend; move on to the next one.
            logger.debug(
                "backend_rate_limited",
                backend_id=backend.id,
                pool=pool,
                model=canonical_model,
            )

        # Distinguish "no candidates at all" from "every candidate was
        # refused by the rate limiter" for easier diagnosis.
        if not backends:
            logger.debug("pool_exhausted", pool=pool, model=canonical_model)
        else:
            logger.debug(
                "pool_rpm_exhausted", pool=pool, model=canonical_model
            )
        return None

    def pick_backend(self, canonical_model: str) -> Optional["Backend"]:
        """Pick the best available backend for a model.

        Tries the primary pool first, then the fallback pool. Within each
        pool, backends refused by the rate limiter are skipped.

        Returns:
            The selected backend, or None if no pool yields one.
        """
        for pool in self._POOL_ORDER:
            backend = self._pick_from_pool(canonical_model, pool)
            if backend is not None:
                return backend
        return None

    def pick_primary_backend(
        self, canonical_model: str
    ) -> Optional["Backend"]:
        """Pick a backend from the primary pool only (no fallback).

        Used by Primary-Wait: when all primary backends are cooling,
        wait and retry primary exclusively before falling through to
        fallback. Uses the same per-pool selection (and debug logging)
        as pick_backend(), restricted to the "primary" pool.

        Returns:
            The selected primary backend, or None if none is accepted.
        """
        return self._pick_from_pool(canonical_model, "primary")

    def get_all_pools_exhausted_info(self, canonical_model: str) -> bool:
        """Check if ALL pools are exhausted for a model.

        Returns:
            True when the pool manager's ``is_any_pool_available()``
            reports nothing available for the model, False otherwise.
        """
        return not self._pool_manager.is_any_pool_available(canonical_model)