fix: add Primary-Wait Prometheus counters + conservative defaults — BIZ-60 review
P0 changes per 4-reviewer consensus (严维序/陆怀瑾/沈路明/梁思筑):

1. Prometheus metrics counters (proxy.py + server.py):
   - sidecar_primary_wait_enter_total: requests entering Primary-Wait
   - sidecar_primary_wait_recovery_total: successful primary recoveries
   - sidecar_primary_wait_exhausted_total: wait exhausted → emergency
2. Conservative default (config.py):
   - primary_wait_max_retries: 6 → 3 (15s total wait, safe start)
   - Observe recovery rate before increasing to 6

The counters form a complete funnel (enter - recovery = exhausted), enabling Grafana monitoring and ROI validation per COO/PM/Ops.
This commit is contained in:
@@ -20,7 +20,13 @@ from crypto import init_crypto, is_initialized
|
||||
from pool_manager import PoolManager
|
||||
from rate_limiter import PerBackendRateLimiter
|
||||
from router import Router
|
||||
from proxy import handle_proxy_request, get_emergency_count
|
||||
from proxy import (
|
||||
handle_proxy_request,
|
||||
get_emergency_count,
|
||||
get_primary_wait_enter_count,
|
||||
get_primary_wait_recovery_count,
|
||||
get_primary_wait_exhausted_count,
|
||||
)
|
||||
|
||||
from storage.db import init_db, create_tables, run_integrity_check, get_connection, _DB_PATH
|
||||
from storage.backend_store import (
|
||||
@@ -383,6 +389,11 @@ async def metrics() -> Response:
|
||||
# Emergency count (from proxy module)
|
||||
lines.append(f"sidecar_emergency_count {get_emergency_count()}")
|
||||
|
||||
# Primary-Wait metrics
|
||||
lines.append(f"sidecar_primary_wait_enter_total {get_primary_wait_enter_count()}")
|
||||
lines.append(f"sidecar_primary_wait_recovery_total {get_primary_wait_recovery_count()}")
|
||||
lines.append(f"sidecar_primary_wait_exhausted_total {get_primary_wait_exhausted_count()}")
|
||||
|
||||
# DB sizes
|
||||
from storage.db import get_db_sizes
|
||||
sizes = get_db_sizes()
|
||||
|
||||
Reference in New Issue
Block a user