fix: add Primary-Wait Prometheus counters + conservative defaults — BIZ-60 review
P0 changes per 4-reviewer consensus (严维序/陆怀瑾/沈路明/梁思筑): (1) Prometheus metrics counters (proxy.py + server.py): sidecar_primary_wait_enter_total — requests entering Primary-Wait; sidecar_primary_wait_recovery_total — successful primary recoveries; sidecar_primary_wait_exhausted_total — wait exhausted → emergency. (2) Conservative default (config.py): primary_wait_max_retries: 6 → 3 (15s total wait, safe start); observe the recovery rate before increasing to 6. The counters form a complete funnel (enter - recovery = exhausted), enabling Grafana monitoring and ROI validation per COO/PM/Ops.
This commit is contained in:
@@ -21,11 +21,28 @@ from storage.usage_store import record_usage
|
||||
# Emergency activation counter (read by metrics endpoint).
# Module-level running total; exposed read-only through get_emergency_count().
|
||||
_emergency_count: int = 0
|
||||
|
||||
# Primary-Wait metrics counters (read by metrics endpoint).
# All three are module-level running totals mutated inside handle_proxy_request
# via a `global` declaration and exposed read-only through the
# get_primary_wait_*_count() accessors below.
|
||||
# Incremented when handle_proxy_request enters the Primary-Wait phase.
_primary_wait_enter_count: int = 0
|
||||
# Incremented when the primary pool recovers during Primary-Wait.
_primary_wait_recovery_count: int = 0
|
||||
# Incremented when all Primary-Wait retries are exhausted, just before the
# emergency rate-limited passthrough path.
_primary_wait_exhausted_count: int = 0
|
||||
|
||||
|
||||
def get_emergency_count() -> int:
    """Return the current value of the module-level emergency activation counter."""
    current: int = _emergency_count
    return current
|
||||
|
||||
|
||||
def get_primary_wait_enter_count() -> int:
    """Return how many times the Primary-Wait phase has been entered so far."""
    entered: int = _primary_wait_enter_count
    return entered
|
||||
|
||||
|
||||
def get_primary_wait_recovery_count() -> int:
    """Return how many Primary-Wait attempts ended with the primary recovering."""
    recovered: int = _primary_wait_recovery_count
    return recovered
|
||||
|
||||
|
||||
def get_primary_wait_exhausted_count() -> int:
    """Return how many times Primary-Wait ran out of retries without recovery."""
    exhausted: int = _primary_wait_exhausted_count
    return exhausted
|
||||
|
||||
|
||||
logger: structlog.stdlib.BoundLogger = structlog.get_logger("sidecar_v2.proxy")
|
||||
|
||||
|
||||
@@ -309,6 +326,9 @@ async def handle_proxy_request(
|
||||
continue
|
||||
|
||||
# --- Primary-Wait: wait for primary pool recovery before fallback/emergency ---
|
||||
global _primary_wait_enter_count, _primary_wait_recovery_count, _primary_wait_exhausted_count
|
||||
_primary_wait_enter_count += 1
|
||||
|
||||
pwl = logger.bind(phase="primary_wait")
|
||||
for pw_attempt in range(config.primary_wait_max_retries):
|
||||
await asyncio.sleep(config.primary_wait_ms / 1000.0)
|
||||
@@ -356,6 +376,7 @@ async def handle_proxy_request(
|
||||
continue
|
||||
|
||||
# Primary recovered — success
|
||||
_primary_wait_recovery_count += 1
|
||||
resp_json: dict[str, Any] = {}
|
||||
try:
|
||||
if not is_stream and resp.content:
|
||||
@@ -417,6 +438,9 @@ async def handle_proxy_request(
|
||||
)
|
||||
continue
|
||||
|
||||
# Primary-Wait all retries exhausted
|
||||
_primary_wait_exhausted_count += 1
|
||||
|
||||
# All pools exhausted (including primary-wait retries) — emergency rate-limited passthrough
|
||||
emergency_rpm = int(config.default_rpm_limit * config.emergency_rpm_fraction)
|
||||
if emergency_rpm < 1:
|
||||
|
||||
Reference in New Issue
Block a user