feat: Primary-Wait backoff queuing — BIZ-60

When all primary backends are in cooldown, wait and retry the primary pool
before falling through to fallback/emergency. This reduces unnecessary
spend on paid fallback providers during temporary 429 storms.

Config:
- primary_wait_ms (default 5000, env SIDECAR_PRIMARY_WAIT_MS)
- primary_wait_max_retries (default 6, env SIDECAR_PRIMARY_WAIT_MAX_RETRIES)

Implementation:
- config.py: 2 new config fields + env var loading
- router.py: pick_primary_backend() — primary-pool-only selection
- proxy.py: primary-wait loop between the standard retries and the emergency passthrough

Expected win: the 17% error rate seen under high concurrency drops, and the
emergency passthrough count falls, because requests wait for the NVIDIA pool
to recover instead of immediately routing to the SiliconFlow fallback.
This commit is contained in:
2026-06-25 22:22:02 +08:00
parent 4bdf6ddf32
commit 376ce97d91
3 changed files with 138 additions and 1 deletions
+110 -1
View File
@@ -308,7 +308,116 @@ async def handle_proxy_request(
)
continue
# All pools exhausted — emergency rate-limited passthrough
# --- Primary-Wait: wait for primary pool recovery before fallback/emergency ---
pwl = logger.bind(phase="primary_wait")
for pw_attempt in range(config.primary_wait_max_retries):
await asyncio.sleep(config.primary_wait_ms / 1000.0)
_refresh_cooldowns()
backend = router.pick_primary_backend(canonical_model)
if not backend:
pwl.debug(
"primary_wait_no_backend",
attempt=pw_attempt + 1,
model=canonical_model,
)
continue
try:
resp = await forward_to_backend(
backend=backend,
method=request.method,
path=path,
body=body_bytes if body_bytes else None,
headers=raw_headers,
stream=is_stream,
)
elapsed_ms = int((time.monotonic() - start_time) * 1000)
if resp.status_code == 429:
new_count = backend.consecutive_429_count + 1
start_cooldown(backend.id, new_count)
pwl.warning(
"primary_wait_429",
backend_id=backend.id,
attempt=pw_attempt + 1,
consecutive=new_count,
model=canonical_model,
)
record_usage(
backend_id=backend.id,
model=canonical_model,
prompt_tokens=0,
completion_tokens=0,
cost=0.0,
latency_ms=elapsed_ms,
is_error=True,
)
continue
# Primary recovered — success
resp_json: dict[str, Any] = {}
try:
if not is_stream and resp.content:
resp_json = json.loads(resp.content)
except (ValueError, TypeError):
pass
prompt_tokens, completion_tokens, total_tokens = extract_usage_from_response(
resp, resp_json, canonical_model
)
cost = calculate_cost(
backend, canonical_model, prompt_tokens, completion_tokens
)
record_usage(
backend_id=backend.id,
model=canonical_model,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
cost=cost,
latency_ms=elapsed_ms,
)
logger.info(
"primary_wait_recovery",
backend_id=backend.id,
pool=backend.pool,
model=canonical_model,
status=resp.status_code,
tokens=total_tokens,
cost=round(cost, 6),
elapsed_ms=elapsed_ms,
pw_attempt=pw_attempt + 1,
)
return build_response(resp)
except httpx.TimeoutException:
pwl.warning(
"primary_wait_timeout",
backend_id=backend.id,
attempt=pw_attempt + 1,
model=canonical_model,
)
except (httpx.ConnectError, httpx.RemoteProtocolError) as exc:
pwl.warning(
"primary_wait_connection_error",
backend_id=backend.id,
attempt=pw_attempt + 1,
model=canonical_model,
error=str(exc),
)
except Exception as exc:
pwl.error(
"primary_wait_error",
backend_id=backend.id,
attempt=pw_attempt + 1,
model=canonical_model,
error=str(exc),
)
continue
# All pools exhausted (including primary-wait retries) — emergency rate-limited passthrough
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
if emergency_rpm < 1:
emergency_rpm = 1