376ce97d91
When all primary backends are in cooldown, wait and retry the primary pool before falling through to fallback/emergency. This reduces unnecessary spend on paid fallback providers during temporary 429 storms.

Config:
- primary_wait_ms (default 5000, env SIDECAR_PRIMARY_WAIT_MS)
- primary_wait_max_retries (default 6, env SIDECAR_PRIMARY_WAIT_MAX_RETRIES)

Implementation:
- config.py: 2 new config fields + env var loading
- router.py: pick_primary_backend() — primary-pool-only selection
- proxy.py: primary-wait loop between standard retries and emergency

Expected win: the 17% error rate seen under high concurrency drops, and the emergency passthrough count falls, because requests wait for NVIDIA pool recovery instead of immediately routing to the SiliconFlow fallback.
179 lines
5.1 KiB
Python
179 lines
5.1 KiB
Python
"""System configuration management for Sidecar V2."""
|
|
|
|
import os
|
|
import json
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
class Config:
    """Sidecar V2 runtime configuration.

    Sources (priority order):
    1. Environment variables (highest)
    2. system_config table in SQLite
    3. Defaults defined here
    """

    # Listen
    host: str = "127.0.0.1"
    port: int = 9190
    metrics_port: int = 9191

    # Queue
    queue_max_depth: int = 500
    queue_timeout_seconds: float = 30.0

    # Provider
    default_rpm_limit: int = 40

    # Cooldown
    cooldown_base_seconds: float = 30.0
    cooldown_max_seconds: float = 600.0
    cooldown_exponential_backoff: bool = True

    # Emergency channel: RPM fraction when all pools exhausted
    emergency_rpm_fraction: float = 0.10

    # Health check
    health_check_interval_seconds: int = 60
    health_check_timeout_seconds: int = 10
    health_probe_endpoint: str = "/v1/models"

    # Admin auth
    admin_token: str = ""

    # Encryption
    encryption_key: str = ""

    # Logging
    log_level: str = "INFO"

    # Database
    db_path: str = ""
    backup_dir: str = ""
    backup_retention_days: int = 7

    # Rate limiter
    rate_limiter_refill_interval_ms: int = 50

    # Router
    router_refresh_interval_seconds: float = 5.0

    # Max pool-internal retries
    max_pool_retries: int = 5

    # Pre-check cooldown threshold (seconds remaining)
    cooldown_precheck_threshold_seconds: float = 10.0

    # Dashboard
    dashboard_sse_interval_seconds: float = 1.0

    # Stats
    stats_refresh_interval_seconds: float = 30.0

    # Primary-Wait: when all primary backends are cooling, wait before fallback
    primary_wait_ms: int = 5000
    primary_wait_max_retries: int = 6

    # Request timeout
    default_request_timeout_seconds: int = 120

    # Field name -> environment variable that overrides it. Deliberately left
    # unannotated so @dataclass treats it as a plain class attribute, not a
    # field. Shared by from_env() (loading) and merge_db() (override check) so
    # the two can never disagree about which variable controls which field.
    _ENV_VARS = {
        "host": "SIDECAR_HOST",
        "port": "SIDECAR_PORT",
        "metrics_port": "SIDECAR_METRICS_PORT",
        "queue_max_depth": "SIDECAR_QUEUE_MAX",
        "queue_timeout_seconds": "SIDECAR_QUEUE_TIMEOUT",
        "default_rpm_limit": "SIDECAR_RATE_RPM",
        "cooldown_base_seconds": "SIDECAR_COOLDOWN_BASE",
        "cooldown_max_seconds": "SIDECAR_COOLDOWN_MAX",
        "admin_token": "SIDECAR_ADMIN_TOKEN",
        "encryption_key": "SIDECAR_ENCRYPTION_KEY",
        "log_level": "LOG_LEVEL",
        "primary_wait_ms": "SIDECAR_PRIMARY_WAIT_MS",
        "primary_wait_max_retries": "SIDECAR_PRIMARY_WAIT_MAX_RETRIES",
        "db_path": "SIDECAR_DB_PATH",
        "backup_dir": "SIDECAR_BACKUP_DIR",
    }

    # String spellings (compared case-insensitively) read back as True.
    _TRUE_STRINGS = ("1", "true", "yes", "on")

    @classmethod
    def from_env(cls) -> "Config":
        """Load configuration from environment variables.

        Starts from the class defaults, then applies every variable listed in
        ``_ENV_VARS`` that is present in the environment, converting the raw
        string to the type of the field's default.

        Returns:
            A new ``Config`` instance.

        Raises:
            ValueError: If a numeric variable holds a non-numeric string.
        """
        c = cls()

        # Database paths default to <cwd>/data/... rather than the empty
        # string declared on the field; an env var (even an empty one) wins.
        data_dir = os.path.join(os.getcwd(), "data")
        c.db_path = os.path.join(data_dir, "sidecar_v2.db")
        c.backup_dir = os.path.join(data_dir, "backups")

        for attr, env_name in cls._ENV_VARS.items():
            raw = os.getenv(env_name)
            if raw is not None:
                setattr(c, attr, type(getattr(c, attr))(raw))

        # Log level is normalised to upper case whether or not it was set.
        c.log_level = c.log_level.upper()

        # V1 compatibility: migrate env vars
        c._migrate_v1_env()

        return c

    def _migrate_v1_env(self) -> None:
        """Migrate V1 environment variables to V2 defaults.

        When both ``SIDECAR_API_KEY`` and an encryption key are available,
        copies the V1 API key and upstream URL into the private
        ``_SIDECAR_V1_*`` environment variables. Does nothing otherwise.
        """
        # V1 UPSTREAM endpoint
        upstream = os.getenv("SIDECAR_UPSTREAM")
        api_key = os.getenv("SIDECAR_API_KEY")
        if api_key and self.encryption_key:
            # These will be used during initial migration
            os.environ["_SIDECAR_V1_API_KEY"] = api_key
            os.environ["_SIDECAR_V1_UPSTREAM"] = (
                upstream or "https://integrate.api.nvidia.com/v1"
            )

    def to_db_dict(self) -> dict:
        """Serialize to dict for system_config storage.

        Returns:
            A dict keyed by field name. Booleans become ``"true"``/``"false"``,
            other numbers become their ``str()`` form, and every other value
            is stored unchanged.
        """
        result = {}
        for key, value in asdict(self).items():
            # bool must be tested before int: bool is a subclass of int.
            if isinstance(value, bool):
                result[key] = "true" if value else "false"
            elif isinstance(value, (int, float)):
                result[key] = str(value)
            else:
                result[key] = value
        return result

    @classmethod
    def _coerce_db_value(cls, current, raw):
        """Convert a stored DB value to the type of the field's current value."""
        if isinstance(current, bool):
            # bool("false") is True, so string booleans need explicit parsing
            # to round-trip what to_db_dict() writes.
            if isinstance(raw, str):
                return raw.strip().lower() in cls._TRUE_STRINGS
            return bool(raw)
        return type(current)(raw)

    @classmethod
    def merge_db(cls, base: "Config", db_config: dict) -> "Config":
        """Merge DB config into base config (env vars already applied to base).

        A DB value replaces the attribute on ``base`` only when the
        environment variable mapped to that field in ``_ENV_VARS`` is not
        set, which keeps environment overrides at the highest priority.
        Fields without a mapped variable always take the DB value.

        Args:
            base: Config to update in place.
            db_config: Mapping of field name to stored value.

        Returns:
            The same ``base`` object, after mutation.

        Raises:
            ValueError: If a stored value cannot be converted to the field's
                numeric type.
        """
        # Snapshot the items so assigning attributes never disturbs iteration.
        for key, value in list(vars(base).items()):
            if key not in db_config:
                continue
            env_name = cls._ENV_VARS.get(key)
            if env_name is not None and env_name in os.environ:
                # DB values only apply when no env var override
                continue
            setattr(base, key, cls._coerce_db_value(value, db_config[key]))
        return base
|
|
|
|
|
|
# Singleton: module-level configuration instance, built from environment
# variables once when this module is imported.
config = Config.from_env()