8a12ff9693
Co-authored-by: multica-agent <github@multica.ai>
438 lines
15 KiB
Python
438 lines
15 KiB
Python
"""
|
||
NVIDIA Sidecar 限流代理 — 令牌桶 + 网关识别模块 (§3.2)
|
||
|
||
从 BIZ-26 rate_limiter.py 提取核心限流逻辑,去除多线程调度器、缓存管理等。
|
||
保留:Priority, TokenBucket, is_nvidia_gateway, normalize_gateway_name。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
import threading
|
||
from enum import IntEnum
|
||
from typing import Any
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Priority enum
# ---------------------------------------------------------------------------
|
||
|
||
class Priority(IntEnum):
    """Request priority; a smaller numeric value means a higher priority."""

    URGENT = 1
    HIGH = 2
    NORMAL = 3
    LOW = 4
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# NVIDIA gateway alias set
# ---------------------------------------------------------------------------
|
||
|
||
# Provider names (lower-case) that are treated as the NVIDIA gateway.
NVIDIA_GATEWAY_ALIASES: set[str] = {
    "nvidia",
    "nvidia-gateway",
    "nvidiavx",
    "nvidiavx18088980513",
}


def is_nvidia_gateway(value: str | None) -> bool:
    """Return whether a gateway name or full model path belongs to the NVIDIA gateway.

    Args:
        value: A gateway name (e.g. ``"nvidia"``) or a full model path whose
            first ``/``-separated segment is the provider
            (e.g. ``"nvidia/deepseek-ai/deepseek-v4-pro"``).
            ``None`` yields ``False``.

    Returns:
        True when the provider segment, lower-cased and stripped of
        surrounding whitespace, is one of ``NVIDIA_GATEWAY_ALIASES``.
    """
    if value is None:
        return False

    # The provider is everything before the first "/" (the whole string when
    # there is no "/").
    provider = value.split("/", 1)[0].lower().strip()
    return provider in NVIDIA_GATEWAY_ALIASES
|
||
|
||
|
||
def normalize_gateway_name(value: str | None) -> str | None:
    """Normalize a gateway name to its lower-case provider prefix.

    Args:
        value: A gateway name or a full model path. ``None`` is passed through.

    Returns:
        The text before the first ``/`` (the whole string when there is no
        ``/``), lower-cased and stripped of surrounding whitespace, or
        ``None`` when ``value`` is ``None``.
    """
    if value is None:
        return None
    return value.split("/", 1)[0].lower().strip()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Token bucket (thread-safe)
# ---------------------------------------------------------------------------
|
||
|
||
class TokenBucket:
    """Thread-safe token bucket.

    Tokens are replenished at a fixed rate, never exceeding ``capacity``, and
    can be consumed without blocking (:meth:`consume`) or with a bounded wait
    (:meth:`try_consume`, :meth:`wait_for_token`). All state is guarded by a
    single ``threading.Lock``.
    """

    # Bounds for one sleep inside try_consume(). The floor avoids a busy spin
    # on tiny deficits; the ceiling keeps waiters responsive to set_rate() and
    # guarantees time.sleep() never receives an infinite or overflowing value.
    _MIN_SLEEP_SECONDS: float = 0.001
    _MAX_SLEEP_SECONDS: float = 1.0

    def __init__(self, rate: float = 40 / 60, capacity: int = 40) -> None:
        """Create a bucket that starts full.

        Args:
            rate: Refill rate in tokens per second. The default 40/60
                (about 0.667 token/s) corresponds to 40 requests per minute.
            capacity: Maximum number of tokens the bucket can hold
                (truncated to ``int``). Defaults to 40.
        """
        self._rate: float = float(rate)
        self._capacity: int = int(capacity)
        # Start full. Derive from the truncated capacity so a non-integer
        # argument cannot leave the bucket holding more than its capacity.
        self._tokens: float = float(self._capacity)
        self._last_refill: float = time.monotonic()
        self._lock: threading.Lock = threading.Lock()

    # ---- internal helpers ----

    def _refill(self) -> None:
        """Credit tokens for the time elapsed since the last refill.

        The caller must hold ``_lock``. The token count is capped at
        ``capacity``; a non-positive rate credits nothing but still advances
        the refill timestamp.
        """
        now = time.monotonic()
        elapsed = now - self._last_refill
        if elapsed > 0 and self._rate > 0:
            new_tokens = elapsed * self._rate
            self._tokens = min(self._tokens + new_tokens, float(self._capacity))
        self._last_refill = now

    # ---- public API ----

    def consume(self, tokens: int = 1) -> bool:
        """Try to consume tokens immediately, without blocking.

        Args:
            tokens: Number of tokens to take. Values <= 0 always succeed
                without touching the bucket.

        Returns:
            True if the tokens were taken, False if not enough were available.
        """
        if tokens <= 0:
            return True

        with self._lock:
            self._refill()
            if self._tokens >= tokens:
                self._tokens -= tokens
                return True
            return False

    def try_consume(self, tokens: int = 1, timeout: float = 2.0) -> bool:
        """Consume tokens, waiting up to ``timeout`` seconds for them.

        Args:
            tokens: Number of tokens to take. Values <= 0 always succeed.
            timeout: Maximum number of seconds to wait. May be
                ``float("inf")`` to wait without a deadline.

        Returns:
            True if the tokens were taken before the deadline; False on
            timeout, or immediately when ``tokens`` exceeds the bucket
            capacity (such a request can never be satisfied).
        """
        if tokens <= 0:
            return True
        if tokens > self._capacity:
            # The bucket never holds more than `capacity` tokens, so waiting
            # would only burn the whole timeout (or hang when it is infinite).
            return False

        deadline = time.monotonic() + timeout
        while True:
            with self._lock:
                self._refill()
                if self._tokens >= tokens:
                    self._tokens -= tokens
                    return True
                # Snapshot under the lock so the sleep estimate is consistent.
                deficit = tokens - self._tokens
                rate = self._rate

            # The lock is released while waiting so other threads can proceed.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False

            if rate > 0:
                # Sleep roughly until the missing tokens have been refilled.
                wait = max(self._MIN_SLEEP_SECONDS, deficit / rate)
            else:
                # Nothing refills at rate <= 0; poll in case set_rate() is
                # called from another thread.
                wait = self._MAX_SLEEP_SECONDS
            time.sleep(min(remaining, wait, self._MAX_SLEEP_SECONDS))

    def wait_for_token(self, timeout: float | None = None) -> bool:
        """Wait for and consume a single token.

        Args:
            timeout: Maximum number of seconds to wait; ``None`` waits without
                a deadline (not recommended).

        Returns:
            True if a token was consumed, False on timeout.
        """
        return self.try_consume(tokens=1, timeout=timeout if timeout is not None else float("inf"))

    def get_status(self) -> dict[str, Any]:
        """Return a snapshot of the bucket state.

        Returns:
            A dict with ``tokens`` (rounded to 2 places), ``capacity``,
            ``rate_per_minute`` (rounded to 1 place) and ``utilization``
            (fraction of capacity currently missing, rounded to 4 places;
            0.0 when capacity is 0).
        """
        with self._lock:
            self._refill()
            rate_per_minute = self._rate * 60.0
            utilization = 0.0 if self._capacity == 0 else (
                (self._capacity - self._tokens) / self._capacity
            )
            return {
                "tokens": round(self._tokens, 2),
                "capacity": self._capacity,
                "rate_per_minute": round(rate_per_minute, 1),
                "utilization": round(utilization, 4),
            }

    # ---- properties ----

    @property
    def rate(self) -> float:
        """Current refill rate in tokens per second."""
        return self._rate

    @property
    def capacity(self) -> int:
        """Maximum number of tokens the bucket can hold."""
        return self._capacity

    # ---- dynamic rate adjustment ----

    def set_rate(self, rate: float) -> None:
        """Change the refill rate (tokens per second).

        Args:
            rate: The new refill rate in tokens per second.
        """
        with self._lock:
            # Credit the time already elapsed at the old rate before switching.
            self._refill()
            self._rate = float(rate)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Back-off mode: AdaptiveTokenBucket (§ADR-009)
# ---------------------------------------------------------------------------
|
||
|
||
class RetreatState:
    """String constants naming the states of the back-off state machine."""

    NORMAL: str = "normal"
    RETREAT: str = "retreat"
    RECOVER: str = "recover"
|
||
|
||
|
||
class AdaptiveTokenBucket(TokenBucket):
    """Token bucket that backs off when the upstream returns 429s (ADR-009).

    Responses are recorded with :meth:`record_response`; each call to
    :meth:`evaluate_retreat` then moves a three-state machine and adjusts the
    refill rate:

    - NORMAL: runs at the base rate. Moves to RETREAT (and slows down once)
      when the 429 rate over ``RETREAT_WINDOW_SECONDS`` reaches
      ``RETREAT_429_THRESHOLD``.
    - RETREAT: every evaluation that sees a 429 rate of at least twice the
      threshold slows down again (rate x ``RETREAT_FACTOR``), never below
      ``RETREAT_MIN_RPM``. Moves to RECOVER once the 429 rate over
      ``RECOVER_WINDOW_SECONDS`` is below ``RECOVER_429_THRESHOLD`` and at
      least ``RECOVER_WINDOW_SECONDS`` have passed since the last state change.
    - RECOVER: every evaluation adds ``RECOVER_INCREMENT_RPM`` until the base
      rate is reached (then NORMAL); falls back to RETREAT if the 429 rate
      reaches the threshold again.

    NOTE(review): the design notes mention a separate ">20% -> minimum rate +
    alert" tier; no alerting is implemented in this class — confirm whether it
    lives elsewhere.

    Inherits the whole public interface of ``TokenBucket``. Back-off state is
    guarded by ``_retreat_lock``; rate changes go through ``set_rate()`` so
    they are also serialized with token accounting.
    """

    # ADR-009 defaults (overridable per instance via the constructor).
    RETREAT_WINDOW_SECONDS: float = 60.0
    RETREAT_429_THRESHOLD: float = 0.05
    RETREAT_FACTOR: float = 0.75
    RETREAT_MIN_RPM: float = 5.0
    RECOVER_WINDOW_SECONDS: float = 120.0
    RECOVER_429_THRESHOLD: float = 0.02
    RECOVER_INCREMENT_RPM: float = 2.0

    def __init__(
        self,
        rate: float = 40 / 60,
        capacity: int = 40,
        *,
        retreat_window_seconds: float = 60.0,
        retreat_429_threshold: float = 0.05,
        retreat_factor: float = 0.75,
        retreat_min_rpm: float = 5.0,
        recover_window_seconds: float = 120.0,
        recover_429_threshold: float = 0.02,
        recover_increment_rpm: float = 2.0,
    ) -> None:
        """Create an adaptive bucket in the NORMAL state.

        Args:
            rate: Base refill rate in tokens per second (default 40/60).
            capacity: Maximum number of tokens (default 40).
            retreat_window_seconds: Sliding-window size, in seconds, used for
                the 429 rate that triggers back-off.
            retreat_429_threshold: 429 rate at which back-off starts.
            retreat_factor: Multiplier applied to the rate on each back-off step.
            retreat_min_rpm: Lowest rate, in requests per minute, back-off may
                reach.
            recover_window_seconds: Observation window, in seconds, used for
                recovery.
            recover_429_threshold: 429 rate below which recovery may start.
            recover_increment_rpm: Requests per minute added on each recovery
                step.
        """
        super().__init__(rate=rate, capacity=capacity)

        # Base rate; never modified after construction.
        self._base_rate: float = float(rate)

        # Per-instance overrides of the class-level defaults.
        self.RETREAT_WINDOW_SECONDS = retreat_window_seconds
        self.RETREAT_429_THRESHOLD = retreat_429_threshold
        self.RETREAT_FACTOR = retreat_factor
        self.RETREAT_MIN_RPM = retreat_min_rpm
        self.RECOVER_WINDOW_SECONDS = recover_window_seconds
        self.RECOVER_429_THRESHOLD = recover_429_threshold
        self.RECOVER_INCREMENT_RPM = recover_increment_rpm

        # Current state of the back-off state machine.
        self._retreat_state: str = RetreatState.NORMAL

        # Sliding window of responses: [(timestamp, is_429), ...]
        self._429_window: list[tuple[float, bool]] = []

        # Monotonic time of the last state change.
        self._last_state_change: float = time.monotonic()

        # Re-entrant because evaluate_retreat() calls get_429_rate() while
        # already holding the lock.
        self._retreat_lock: threading.RLock = threading.RLock()

    # ---- 429 feedback ----

    def record_response(self, is_429: bool) -> None:
        """Record whether one upstream response was a 429.

        Entries older than the larger of the retreat and recover windows are
        dropped on every call.

        Args:
            is_429: True when the upstream answered with HTTP 429.
        """
        with self._retreat_lock:
            # Timestamp inside the lock so entries are appended in time order.
            now = time.monotonic()
            self._429_window.append((now, is_429))
            cutoff = now - max(
                self.RETREAT_WINDOW_SECONDS,
                self.RECOVER_WINDOW_SECONDS,
            )
            self._429_window = [
                (ts, flag) for ts, flag in self._429_window
                if ts >= cutoff
            ]

    def get_429_rate(self, window_seconds: float | None = None) -> float:
        """Return the share of 429 responses within a sliding window.

        Args:
            window_seconds: Window size in seconds; ``None`` selects
                ``RETREAT_WINDOW_SECONDS``.

        Returns:
            A value between 0.0 and 1.0; 0.0 when the window holds no
            responses.
        """
        # `is None` rather than `or`, so an explicit 0 is honored instead of
        # being silently replaced by the default window.
        ws = self.RETREAT_WINDOW_SECONDS if window_seconds is None else window_seconds
        now = time.monotonic()
        with self._retreat_lock:
            in_window = [flag for ts, flag in self._429_window if now - ts <= ws]
            if not in_window:
                return 0.0
            return sum(1 for f in in_window if f) / len(in_window)

    # ---- state evaluation ----

    def evaluate_retreat(self) -> str:
        """Re-evaluate the back-off state and adjust the rate accordingly.

        Returns:
            The state after evaluation: ``"normal"``, ``"retreat"`` or
            ``"recover"``.
        """
        now = time.monotonic()
        with self._retreat_lock:
            retreat_rate = self.get_429_rate(self.RETREAT_WINDOW_SECONDS)
            recover_rate = self.get_429_rate(self.RECOVER_WINDOW_SECONDS)

            if self._retreat_state == RetreatState.NORMAL:
                if retreat_rate >= self.RETREAT_429_THRESHOLD:
                    self._retreat_state = RetreatState.RETREAT
                    self._last_state_change = now
                    self._apply_retreat()

            elif self._retreat_state == RetreatState.RETREAT:
                if retreat_rate >= self.RETREAT_429_THRESHOLD * 2:
                    # 429 rate still at twice the threshold or more: slow
                    # down again unless already at the minimum rate.
                    if self._rate > self.RETREAT_MIN_RPM / 60.0:
                        self._apply_retreat()
                elif recover_rate < self.RECOVER_429_THRESHOLD:
                    # Only start recovering after a full recover window has
                    # passed since the last state change.
                    time_in_low = now - self._last_state_change
                    if time_in_low >= self.RECOVER_WINDOW_SECONDS:
                        self._retreat_state = RetreatState.RECOVER
                        self._last_state_change = now
                        self._apply_recover()

            elif self._retreat_state == RetreatState.RECOVER:
                if retreat_rate >= self.RETREAT_429_THRESHOLD:
                    # 429s came back during recovery: back off again.
                    self._retreat_state = RetreatState.RETREAT
                    self._last_state_change = now
                    self._apply_retreat()
                elif self._rate >= self._base_rate:
                    # Base rate reached: back to normal operation.
                    self.set_rate(self._base_rate)
                    self._retreat_state = RetreatState.NORMAL
                    self._last_state_change = now
                else:
                    # Keep stepping the rate up.
                    self._apply_recover()

            return self._retreat_state

    def _apply_retreat(self) -> None:
        """Apply one back-off step (caller holds ``_retreat_lock``)."""
        # The floor never exceeds the base rate, and the result never exceeds
        # the current rate, so a "retreat" can never speed the bucket up
        # (e.g. when the base rate is already below RETREAT_MIN_RPM).
        floor = min(self._base_rate, self.RETREAT_MIN_RPM / 60.0)
        slowed = max(floor, self._rate * self.RETREAT_FACTOR)
        # set_rate() takes the token lock and settles the refill at the old
        # rate before switching.
        self.set_rate(min(self._rate, slowed))

    def _apply_recover(self) -> None:
        """Apply one recovery step (caller holds ``_retreat_lock``)."""
        increment: float = self.RECOVER_INCREMENT_RPM / 60.0
        # Never recover past the base rate.
        self.set_rate(min(self._base_rate, self._rate + increment))

    # ---- state queries ----

    def get_retreat_state(self) -> str:
        """Return the current state: ``"normal"``, ``"retreat"`` or ``"recover"``."""
        with self._retreat_lock:
            return self._retreat_state

    def get_effective_rate_rpm(self) -> float:
        """Return the current refill rate in requests per minute, including any back-off."""
        with self._lock:
            return self._rate * 60.0

    def get_base_rate_rpm(self) -> float:
        """Return the base (no back-off) rate in requests per minute."""
        return self._base_rate * 60.0

    def reset_to_base(self) -> None:
        """Reset to the base rate and the NORMAL state (manual operator override).

        Also clears the recorded response window.
        """
        with self._retreat_lock:
            self.set_rate(self._base_rate)
            self._retreat_state = RetreatState.NORMAL
            self._last_state_change = time.monotonic()
            self._429_window.clear()