fix: add Primary-Wait Prometheus counters + conservative defaults — BIZ-60 review
P0 changes per 4-reviewer consensus (严维序/陆怀瑾/沈路明/梁思筑):

1. Prometheus metrics counters (proxy.py + server.py):
   - sidecar_primary_wait_enter_total: requests entering Primary-Wait
   - sidecar_primary_wait_recovery_total: successful primary recoveries
   - sidecar_primary_wait_exhausted_total: wait exhausted → emergency

2. Conservative default (config.py):
   - primary_wait_max_retries: 6 → 3 (3 retries × 5000 ms = 15s total wait, safe start)
   - Observe recovery rate before increasing to 6

Counters form a complete funnel (enter - recovery = exhausted), enabling Grafana monitoring and ROI validation per COO/PM/Ops.
This commit is contained in:
@@ -75,7 +75,7 @@ class Config:
|
|||||||
|
|
||||||
# Primary-Wait: when all primary backends are cooling, wait before fallback
|
# Primary-Wait: when all primary backends are cooling, wait before fallback
|
||||||
primary_wait_ms: int = 5000
|
primary_wait_ms: int = 5000
|
||||||
primary_wait_max_retries: int = 6
|
primary_wait_max_retries: int = 3
|
||||||
|
|
||||||
# Request timeout
|
# Request timeout
|
||||||
default_request_timeout_seconds: int = 120
|
default_request_timeout_seconds: int = 120
|
||||||
|
|||||||
@@ -21,11 +21,28 @@ from storage.usage_store import record_usage
|
|||||||
# Emergency activation counter (read by metrics endpoint)
|
# Emergency activation counter (read by metrics endpoint)
|
||||||
_emergency_count: int = 0
|
_emergency_count: int = 0
|
||||||
|
|
||||||
|
# Primary-Wait metrics counters (read by metrics endpoint)
|
||||||
|
_primary_wait_enter_count: int = 0
|
||||||
|
_primary_wait_recovery_count: int = 0
|
||||||
|
_primary_wait_exhausted_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
def get_emergency_count() -> int:
|
def get_emergency_count() -> int:
|
||||||
return _emergency_count
|
return _emergency_count
|
||||||
|
|
||||||
|
|
||||||
|
def get_primary_wait_enter_count() -> int:
|
||||||
|
return _primary_wait_enter_count
|
||||||
|
|
||||||
|
|
||||||
|
def get_primary_wait_recovery_count() -> int:
|
||||||
|
return _primary_wait_recovery_count
|
||||||
|
|
||||||
|
|
||||||
|
def get_primary_wait_exhausted_count() -> int:
|
||||||
|
return _primary_wait_exhausted_count
|
||||||
|
|
||||||
|
|
||||||
logger: structlog.stdlib.BoundLogger = structlog.get_logger("sidecar_v2.proxy")
|
logger: structlog.stdlib.BoundLogger = structlog.get_logger("sidecar_v2.proxy")
|
||||||
|
|
||||||
|
|
||||||
@@ -309,6 +326,9 @@ async def handle_proxy_request(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# --- Primary-Wait: wait for primary pool recovery before fallback/emergency ---
|
# --- Primary-Wait: wait for primary pool recovery before fallback/emergency ---
|
||||||
|
global _primary_wait_enter_count, _primary_wait_recovery_count, _primary_wait_exhausted_count
|
||||||
|
_primary_wait_enter_count += 1
|
||||||
|
|
||||||
pwl = logger.bind(phase="primary_wait")
|
pwl = logger.bind(phase="primary_wait")
|
||||||
for pw_attempt in range(config.primary_wait_max_retries):
|
for pw_attempt in range(config.primary_wait_max_retries):
|
||||||
await asyncio.sleep(config.primary_wait_ms / 1000.0)
|
await asyncio.sleep(config.primary_wait_ms / 1000.0)
|
||||||
@@ -356,6 +376,7 @@ async def handle_proxy_request(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Primary recovered — success
|
# Primary recovered — success
|
||||||
|
_primary_wait_recovery_count += 1
|
||||||
resp_json: dict[str, Any] = {}
|
resp_json: dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
if not is_stream and resp.content:
|
if not is_stream and resp.content:
|
||||||
@@ -417,6 +438,9 @@ async def handle_proxy_request(
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Primary-Wait all retries exhausted
|
||||||
|
_primary_wait_exhausted_count += 1
|
||||||
|
|
||||||
# All pools exhausted (including primary-wait retries) — emergency rate-limited passthrough
|
# All pools exhausted (including primary-wait retries) — emergency rate-limited passthrough
|
||||||
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
|
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
|
||||||
if emergency_rpm < 1:
|
if emergency_rpm < 1:
|
||||||
|
|||||||
@@ -20,7 +20,13 @@ from crypto import init_crypto, is_initialized
|
|||||||
from pool_manager import PoolManager
|
from pool_manager import PoolManager
|
||||||
from rate_limiter import PerBackendRateLimiter
|
from rate_limiter import PerBackendRateLimiter
|
||||||
from router import Router
|
from router import Router
|
||||||
from proxy import handle_proxy_request, get_emergency_count
|
from proxy import (
|
||||||
|
handle_proxy_request,
|
||||||
|
get_emergency_count,
|
||||||
|
get_primary_wait_enter_count,
|
||||||
|
get_primary_wait_recovery_count,
|
||||||
|
get_primary_wait_exhausted_count,
|
||||||
|
)
|
||||||
|
|
||||||
from storage.db import init_db, create_tables, run_integrity_check, get_connection, _DB_PATH
|
from storage.db import init_db, create_tables, run_integrity_check, get_connection, _DB_PATH
|
||||||
from storage.backend_store import (
|
from storage.backend_store import (
|
||||||
@@ -383,6 +389,11 @@ async def metrics() -> Response:
|
|||||||
# Emergency count (from proxy module)
|
# Emergency count (from proxy module)
|
||||||
lines.append(f"sidecar_emergency_count {get_emergency_count()}")
|
lines.append(f"sidecar_emergency_count {get_emergency_count()}")
|
||||||
|
|
||||||
|
# Primary-Wait metrics
|
||||||
|
lines.append(f"sidecar_primary_wait_enter_total {get_primary_wait_enter_count()}")
|
||||||
|
lines.append(f"sidecar_primary_wait_recovery_total {get_primary_wait_recovery_count()}")
|
||||||
|
lines.append(f"sidecar_primary_wait_exhausted_total {get_primary_wait_exhausted_count()}")
|
||||||
|
|
||||||
# DB sizes
|
# DB sizes
|
||||||
from storage.db import get_db_sizes
|
from storage.db import get_db_sizes
|
||||||
sizes = get_db_sizes()
|
sizes = get_db_sizes()
|
||||||
|
|||||||
Reference in New Issue
Block a user