fix: add Primary-Wait Prometheus counters + conservative defaults — BIZ-60 review
P0 changes per 4-reviewer consensus (严维序/陆怀瑾/沈路明/梁思筑):

1. Prometheus metrics counters (proxy.py + server.py):
   - sidecar_primary_wait_enter_total: requests entering Primary-Wait
   - sidecar_primary_wait_recovery_total: successful primary recoveries
   - sidecar_primary_wait_exhausted_total: wait exhausted → emergency

2. Conservative default (config.py):
   - primary_wait_max_retries: 6 → 3 (3 retries × 5 s = 15 s total wait, safe start)
   - Observe the recovery rate before increasing back to 6

The counters form a complete funnel (enter - recovery = exhausted), enabling Grafana monitoring and ROI validation per COO/PM/Ops.
This commit is contained in:
@@ -75,7 +75,7 @@ class Config:
|
||||
|
||||
# Primary-Wait: when all primary backends are cooling, wait before fallback
|
||||
primary_wait_ms: int = 5000
|
||||
primary_wait_max_retries: int = 6
|
||||
primary_wait_max_retries: int = 3
|
||||
|
||||
# Request timeout
|
||||
default_request_timeout_seconds: int = 120
|
||||
|
||||
@@ -21,11 +21,28 @@ from storage.usage_store import record_usage
|
||||
# Emergency activation counter (read by metrics endpoint)
|
||||
_emergency_count: int = 0
|
||||
|
||||
# Primary-Wait metrics counters (read by metrics endpoint)
|
||||
_primary_wait_enter_count: int = 0
|
||||
_primary_wait_recovery_count: int = 0
|
||||
_primary_wait_exhausted_count: int = 0
|
||||
|
||||
|
||||
def get_emergency_count() -> int:
    """Return the current value of the module-level emergency activation counter."""
    return _emergency_count
|
||||
|
||||
|
||||
def get_primary_wait_enter_count() -> int:
    """Return the current value of the module-level Primary-Wait enter counter."""
    return _primary_wait_enter_count
|
||||
|
||||
|
||||
def get_primary_wait_recovery_count() -> int:
    """Return the current value of the module-level Primary-Wait recovery counter."""
    return _primary_wait_recovery_count
|
||||
|
||||
|
||||
def get_primary_wait_exhausted_count() -> int:
    """Return the current value of the module-level Primary-Wait exhausted counter."""
    return _primary_wait_exhausted_count
|
||||
|
||||
|
||||
logger: structlog.stdlib.BoundLogger = structlog.get_logger("sidecar_v2.proxy")
|
||||
|
||||
|
||||
@@ -309,6 +326,9 @@ async def handle_proxy_request(
|
||||
continue
|
||||
|
||||
# --- Primary-Wait: wait for primary pool recovery before fallback/emergency ---
|
||||
global _primary_wait_enter_count, _primary_wait_recovery_count, _primary_wait_exhausted_count
|
||||
_primary_wait_enter_count += 1
|
||||
|
||||
pwl = logger.bind(phase="primary_wait")
|
||||
for pw_attempt in range(config.primary_wait_max_retries):
|
||||
await asyncio.sleep(config.primary_wait_ms / 1000.0)
|
||||
@@ -356,6 +376,7 @@ async def handle_proxy_request(
|
||||
continue
|
||||
|
||||
# Primary recovered — success
|
||||
_primary_wait_recovery_count += 1
|
||||
resp_json: dict[str, Any] = {}
|
||||
try:
|
||||
if not is_stream and resp.content:
|
||||
@@ -417,6 +438,9 @@ async def handle_proxy_request(
|
||||
)
|
||||
continue
|
||||
|
||||
# Primary-Wait all retries exhausted
|
||||
_primary_wait_exhausted_count += 1
|
||||
|
||||
# All pools exhausted (including primary-wait retries) — emergency rate-limited passthrough
|
||||
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
|
||||
if emergency_rpm < 1:
|
||||
|
||||
@@ -20,7 +20,13 @@ from crypto import init_crypto, is_initialized
|
||||
from pool_manager import PoolManager
|
||||
from rate_limiter import PerBackendRateLimiter
|
||||
from router import Router
|
||||
from proxy import handle_proxy_request, get_emergency_count
|
||||
from proxy import (
|
||||
handle_proxy_request,
|
||||
get_emergency_count,
|
||||
get_primary_wait_enter_count,
|
||||
get_primary_wait_recovery_count,
|
||||
get_primary_wait_exhausted_count,
|
||||
)
|
||||
|
||||
from storage.db import init_db, create_tables, run_integrity_check, get_connection, _DB_PATH
|
||||
from storage.backend_store import (
|
||||
@@ -383,6 +389,11 @@ async def metrics() -> Response:
|
||||
# Emergency count (from proxy module)
|
||||
lines.append(f"sidecar_emergency_count {get_emergency_count()}")
|
||||
|
||||
# Primary-Wait metrics
|
||||
lines.append(f"sidecar_primary_wait_enter_total {get_primary_wait_enter_count()}")
|
||||
lines.append(f"sidecar_primary_wait_recovery_total {get_primary_wait_recovery_count()}")
|
||||
lines.append(f"sidecar_primary_wait_exhausted_total {get_primary_wait_exhausted_count()}")
|
||||
|
||||
# DB sizes
|
||||
from storage.db import get_db_sizes
|
||||
sizes = get_db_sizes()
|
||||
|
||||
Reference in New Issue
Block a user