Files
EnterpriseArchitect/services/nvidia_sidecar/rate_limiter.py
T

442 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
NVIDIA Sidecar 限流代理 — 令牌桶 + 网关识别模块 (§3.2)
从 BIZ-26 rate_limiter.py 提取核心限流逻辑,去除多线程调度器、缓存管理等。
保留:Priority, TokenBucket, is_nvidia_gateway, normalize_gateway_name。
"""
from __future__ import annotations
import time
import threading
from enum import IntEnum
from typing import Any
# ---------------------------------------------------------------------------
# 优先级枚举
# ---------------------------------------------------------------------------
class Priority(IntEnum):
"""请求优先级(数值越小优先级越高)。"""
URGENT = 1
HIGH = 2
NORMAL = 3
LOW = 4
# ---------------------------------------------------------------------------
# NVIDIA 网关别名集
# ---------------------------------------------------------------------------
NVIDIA_GATEWAY_ALIASES: set[str] = {
# OpenClaw 配置中全部的 NVIDIA provider 名称
"nvidia",
"nvidia-gateway",
"nvidia98053",
"nvidialiuweicheng84",
"nvidiavx",
"nvidiavx18088980513",
"nvidiavx64391942",
}
def is_nvidia_gateway(value: str | None) -> bool:
"""判断给定网关名/模型全路径是否属于 NVIDIA 网关。
Args:
value: 网关名(如 ``"nvidia"``)或模型全路径前缀
(如 ``"nvidia/deepseek-ai/deepseek-v4-pro"``)。
None 时直接返回 False。
Returns:
True 当 value 的 provider 部分匹配已知 NVIDIA 别名。
"""
if value is None:
return False
# 提取 provider 前缀:取 "/" 前第一个部分
provider = value.split("/", 1)[0].lower().strip()
return provider in NVIDIA_GATEWAY_ALIASES
def normalize_gateway_name(value: str | None) -> str | None:
"""规范化网关名:提取 provider 前缀并转为小写。
Args:
value: 网关名或模型全路径。None 时返回 None。
Returns:
provider 前缀的小写形式,或 None。
"""
if value is None:
return None
return value.split("/", 1)[0].lower().strip()
# ---------------------------------------------------------------------------
# 令牌桶(线程安全)
# ---------------------------------------------------------------------------
class TokenBucket:
"""线程安全的令牌桶实现。
支持固定速率令牌补充和消费,带有溢出保护和可选的阻塞等待。
"""
def __init__(self, rate: float = 40 / 60, capacity: int = 40) -> None:
"""初始化令牌桶。
Args:
rate: 令牌补充速率(令牌/秒)。默认 40/60 ≈ 0.667 token/s40 RPM)。
capacity: 桶最大容量(令牌数)。默认 40。
"""
self._rate: float = float(rate)
self._capacity: int = int(capacity)
self._tokens: float = float(capacity) # 启动时桶满
self._last_refill: float = time.monotonic()
self._lock: threading.Lock = threading.Lock()
# ---- 内部方法 ----
def _refill(self) -> None:
"""补充令牌(调用方需持有 _lock)。
根据距上次补充的时间差计算新增令牌数,不超过 capacity。
"""
now = time.monotonic()
elapsed = now - self._last_refill
if elapsed > 0 and self._rate > 0:
new_tokens = elapsed * self._rate
self._tokens = min(self._tokens + new_tokens, float(self._capacity))
self._last_refill = now
# ---- 公开方法 ----
def consume(self, tokens: int = 1) -> bool:
"""尝试立即消费令牌(非阻塞)。
Args:
tokens: 要消费的令牌数,默认 1。
Returns:
True 消费成功;False 令牌不足。
"""
if tokens <= 0:
return True
with self._lock:
self._refill()
if self._tokens >= tokens:
self._tokens -= tokens
return True
return False
def try_consume(self, tokens: int = 1, timeout: float = 2.0) -> bool:
"""尝试在指定时间内消费令牌(阻塞)。
Args:
tokens: 要消费的令牌数,默认 1。
timeout: 最大等待秒数,默认 2.0。
Returns:
True 在超时前成功消费;False 超时。
"""
if tokens <= 0:
return True
deadline = time.monotonic() + timeout
while True:
with self._lock:
self._refill()
if self._tokens >= tokens:
self._tokens -= tokens
return True
# 释放锁后计算剩余等待时间
remaining = deadline - time.monotonic()
if remaining <= 0:
return False
# 等待到下一个令牌应该补充的时间点
sleep_time = min(remaining, max(0.05, 1.0 / self._rate) if self._rate > 0 else remaining)
time.sleep(sleep_time)
def wait_for_token(self, timeout: float | None = None) -> bool:
"""等待并尝试消费 1 个令牌。
Args:
timeout: 最大等待秒数;None 表示无限等待(不推荐)。
Returns:
True 成功消费;False 超时。
"""
return self.try_consume(tokens=1, timeout=timeout if timeout is not None else float("inf"))
def get_status(self) -> dict[str, Any]:
"""获取令牌桶当前状态。
Returns:
包含 tokens, capacity, rate_per_minute, utilization 的字典。
"""
with self._lock:
self._refill()
rate_per_minute = self._rate * 60.0
utilization = 0.0 if self._capacity == 0 else (
(self._capacity - self._tokens) / self._capacity
)
return {
"tokens": round(self._tokens, 2),
"capacity": self._capacity,
"rate_per_minute": round(rate_per_minute, 1),
"utilization": round(utilization, 4),
}
# ---- 属性 ----
@property
def rate(self) -> float:
"""当前令牌补充速率(令牌/秒)。"""
return self._rate
@property
def capacity(self) -> int:
"""桶容量。"""
return self._capacity
# ---- 动态速率调整(供 AdaptiveTokenBucket 使用) ----
def set_rate(self, rate: float) -> None:
"""动态调整令牌补充速率(令牌/秒)。
Args:
rate: 新速率(令牌/秒)。
"""
with self._lock:
self._refill() # 先补充现有令牌再切换速率
self._rate = float(rate)
# ---------------------------------------------------------------------------
# 避退模式:AdaptiveTokenBucket (§ADR-009)
# ---------------------------------------------------------------------------
class RetreatState:
"""避退状态机常量。"""
NORMAL: str = "normal"
RETREAT: str = "retreat"
RECOVER: str = "recover"
class AdaptiveTokenBucket(TokenBucket):
"""自适应避退令牌桶(ADR-009)。
监控上游 429 率(60s 滑动窗口),自动调整发射速率:
- 429 率 < 5% → NORMAL,保持基准速率
- 429 率 5-10% → RETREAT,速率 × 0.75
- 429 率 10-20% → RETREAT,再次降速
- 429 率 > 20% → RETREAT,最低 5 RPM + 告警
- 连续 120s 429 率 < 2% → RECOVER,逐步 +2 RPM 恢复
线程安全,继承 TokenBucket 的所有公共接口。
"""
# ADR-009 参数(可通过构造函数覆盖)
RETREAT_WINDOW_SECONDS: float = 60.0
RETREAT_429_THRESHOLD: float = 0.05
RETREAT_FACTOR: float = 0.75
RETREAT_MIN_RPM: float = 5.0
RECOVER_WINDOW_SECONDS: float = 120.0
RECOVER_429_THRESHOLD: float = 0.02
RECOVER_INCREMENT_RPM: float = 2.0
def __init__(
self,
rate: float = 40 / 60,
capacity: int = 40,
*,
retreat_window_seconds: float = 60.0,
retreat_429_threshold: float = 0.05,
retreat_factor: float = 0.75,
retreat_min_rpm: float = 5.0,
recover_window_seconds: float = 120.0,
recover_429_threshold: float = 0.02,
recover_increment_rpm: float = 2.0,
) -> None:
"""初始化自适应避退令牌桶。
Args:
rate: 基准令牌补充速率(令牌/秒)。默认 40/60 ≈ 0.667 token/s。
capacity: 桶最大容量。默认 40。
retreat_window_seconds: 429 率滑动窗口大小(秒)。
retreat_429_threshold: 触发避退的 429 率阈值。
retreat_factor: 每次避退速率乘数。
retreat_min_rpm: 避退最低 RPM。
recover_window_seconds: 恢复观察窗口大小(秒)。
recover_429_threshold: 触发恢复的 429 率阈值。
recover_increment_rpm: 每次恢复增加的 RPM。
"""
super().__init__(rate=rate, capacity=capacity)
# 基准速率(不变)
self._base_rate: float = float(rate)
# 避退参数
self.RETREAT_WINDOW_SECONDS = retreat_window_seconds
self.RETREAT_429_THRESHOLD = retreat_429_threshold
self.RETREAT_FACTOR = retreat_factor
self.RETREAT_MIN_RPM = retreat_min_rpm
self.RECOVER_WINDOW_SECONDS = recover_window_seconds
self.RECOVER_429_THRESHOLD = recover_429_threshold
self.RECOVER_INCREMENT_RPM = recover_increment_rpm
# 避退状态机
self._retreat_state: str = RetreatState.NORMAL
# 429 滑动窗口:[(timestamp, is_429), ...]
self._429_window: list[tuple[float, bool]] = []
# 上次状态变更时间
self._last_state_change: float = time.monotonic()
# 避退状态锁(RLock 防止 evaluate_retreat() → get_429_rate() 重入死锁)
self._retreat_lock: threading.RLock = threading.RLock()
# ---- 429 反馈 ----
def record_response(self, is_429: bool) -> None:
"""记录一次上游响应是否为 429。
Args:
is_429: True 表示上游返回了 429。
"""
now = time.monotonic()
with self._retreat_lock:
self._429_window.append((now, is_429))
# 清理超出观察窗口的旧记录
cutoff = now - max(
self.RETREAT_WINDOW_SECONDS,
self.RECOVER_WINDOW_SECONDS,
)
self._429_window = [
(ts, flag) for ts, flag in self._429_window
if ts >= cutoff
]
def get_429_rate(self, window_seconds: float | None = None) -> float:
"""获取指定窗口内的 429 率。
Args:
window_seconds: 滑动窗口大小;None 使用 RETREAT_WINDOW_SECONDS。
Returns:
0.0-1.0 之间的 429 率。
"""
ws = window_seconds or self.RETREAT_WINDOW_SECONDS
now = time.monotonic()
with self._retreat_lock:
in_window = [flag for ts, flag in self._429_window if now - ts <= ws]
if not in_window:
return 0.0
return sum(1 for f in in_window if f) / len(in_window)
# ---- 避退状态评估 ----
def evaluate_retreat(self) -> str:
"""评估并更新避退状态,返回新状态名。
每次调用根据当前 429 率 + 持续时间决定是否进入 RETREAT / RECOVER。
Returns:
"normal" / "retreat" / "recover"。
"""
now = time.monotonic()
with self._retreat_lock:
retreat_rate = self.get_429_rate(self.RETREAT_WINDOW_SECONDS)
recover_rate = self.get_429_rate(self.RECOVER_WINDOW_SECONDS)
if self._retreat_state == RetreatState.NORMAL:
if retreat_rate >= self.RETREAT_429_THRESHOLD:
self._retreat_state = RetreatState.RETREAT
self._last_state_change = now
self._apply_retreat()
elif self._retreat_state == RetreatState.RETREAT:
# 持续高 429 率 → 再次降速
if retreat_rate >= self.RETREAT_429_THRESHOLD * 2:
# 429 > 10%,再次降速
if self._rate > self.RETREAT_MIN_RPM / 60.0:
self._apply_retreat()
elif recover_rate < self.RECOVER_429_THRESHOLD:
time_in_low = now - self._last_state_change
if time_in_low >= self.RECOVER_WINDOW_SECONDS:
self._retreat_state = RetreatState.RECOVER
self._last_state_change = now
self._apply_recover()
elif self._retreat_state == RetreatState.RECOVER:
if retreat_rate >= self.RETREAT_429_THRESHOLD:
# 恢复期间 429 回升,重新进入避退
self._retreat_state = RetreatState.RETREAT
self._last_state_change = now
self._apply_retreat()
elif self._rate >= self._base_rate:
# 已恢复到基准速率
self._rate = self._base_rate
self._retreat_state = RetreatState.NORMAL
self._last_state_change = now
else:
# 继续逐步恢复
self._apply_recover()
return self._retreat_state
def _apply_retreat(self) -> None:
"""执行一次避退降速。"""
new_rate: float = max(
self.RETREAT_MIN_RPM / 60.0,
self._rate * self.RETREAT_FACTOR,
)
self._rate = new_rate
def _apply_recover(self) -> None:
"""执行一次恢复提速。"""
increment: float = self.RECOVER_INCREMENT_RPM / 60.0
new_rate: float = min(self._base_rate, self._rate + increment)
self._rate = new_rate
# ---- 状态查询 ----
def get_retreat_state(self) -> str:
"""获取当前避退状态。
Returns:
"normal" / "retreat" / "recover"。
"""
with self._retreat_lock:
return self._retreat_state
def get_effective_rate_rpm(self) -> float:
"""获取当前实际速率(RPM),考虑避退乘数。
Returns:
当前每分钟速率。
"""
with self._lock:
return self._rate * 60.0
def get_base_rate_rpm(self) -> float:
"""获取基准速率(RPM),即未避退时的速率。
Returns:
基准每分钟速率。
"""
return self._base_rate * 60.0
def reset_to_base(self) -> None:
"""手动重置到基准速率(用于运维干预)。"""
with self._retreat_lock:
self._rate = self._base_rate
self._retreat_state = RetreatState.NORMAL
self._last_state_change = time.monotonic()
self._429_window.clear()