diff --git a/docs/architecture/ADR-006-sidecar-v2-architecture.md b/docs/architecture/ADR-006-sidecar-v2-architecture.md index 5303f83..2636b96 100644 --- a/docs/architecture/ADR-006-sidecar-v2-architecture.md +++ b/docs/architecture/ADR-006-sidecar-v2-architecture.md @@ -4,7 +4,7 @@ |------|-----| | 编号 | ADR-006 | | 日期 | 2026-06-25 | -| 版本 | v2.0(评审修订版) | +| 版本 | v2.1(第二轮评审修订版) | | 状态 | 提议中 | | 作者 | 梁思筑 (architect) | | 关联 | BIZ-48 (父需求), BIZ-51 (本任务) | @@ -291,7 +291,7 @@ Provider 移入冷却池 恢复后首次仍 429 → 回到冷却池,加倍冷却时间 ``` -**紧急通道**:当主池 + Fallback 全部冷却时,选择冷却时间剩余最短的 1 个 Provider,以 10% RPM 继续尝试,避免完全断流。 +**紧急通道**:当主池 + Fallback 全部冷却时,选择冷却时间剩余最短的 1 个 Provider,以 `max(1, max_rpm * 0.1)` RPM 继续尝试(底线至少 1 RPM),避免完全断流。 **冷却预检**:降级到 Fallback 池前,检查主池是否有 Provider 剩余冷却时间 < 10s,可短暂等待其恢复。 @@ -302,7 +302,7 @@ Provider 移入冷却池 | 队列模型 | FIFO + 优先级 | 继承 V1 的四级优先级(URGENT > HIGH > NORMAL > LOW) | | 最大队列深度 | 500 | 超出后触发溢出策略 | | 令牌等待超时 | 30s(默认) | 可配置 `SIDECAR_QUEUE_TIMEOUT` | -| 溢出策略 | REJECT (503) | 队列满时拒绝新请求,返回 503 + Retry-After | +| 溢出策略 | REJECT (503) | 队列满时拒绝新请求,返回 503 + `Retry-After: 30` header | **溢出策略选择**: - REJECT (503):返回 503,触发上游 OpenClaw 重试/降级 — **选用** @@ -861,11 +861,22 @@ find "$BACKUP_DIR" -name "sidecar_v2_*.db" -mtime +7 -delete | `sidecar_db_wal_size_bytes` | Gauge | WAL 文件大小 | | `sidecar_db_integrity_ok` | Gauge | integrity_check 结果 (0/1) | +| `sidecar_backup_success` | Gauge | 最近一次备份是否成功 (0/1),连续 3 天失败触发告警 | + 系统级监控: - 磁盘使用率:> 80% 告警,> 90% 严重告警 - IO 延迟:> 100ms 告警 +- SQLite 文件大小经验上限:~100MB(实用),~500MB(WAL 上限),超出则触发 MySQL 迁移评估 -### 11.5 配置热加载 +### 11.5 启动时加密密钥校验 + +启动时 `crypto.py` 初始化必须执行以下校验: + +1. 检查 `SIDECAR_ENCRYPTION_KEY` 环境变量是否存在 → 不存在则启动失败,输出 `FATAL: SIDECAR_ENCRYPTION_KEY not set` +2. 验证密钥长度 = 64 字符(32 字节 hex)→ 不符合则启动失败,输出 `FATAL: SIDECAR_ENCRYPTION_KEY must be 64 hex chars (32 bytes)` +3. 
尝试解密 `providers` 表中已有的 `api_key` 字段 → 解密失败则输出 WARNING 日志(密钥可能已更换,已有 Key 不可恢复) + +### 11.6 配置热加载 - Provider CRUD 操作自动更新内存缓存(无需重启) - 系统配置项(`system_config` 表)支持热加载 @@ -975,9 +986,28 @@ systemctl start nvidia-sidecar --- -## 14. 核心模块设计 +## 14. 开发实现指引(回复徐聪技术澄清) -### 14.1 模块结构 +### 14.0 徐聪三项技术答疑 + +**Q1: RPM 流控是请求数还是 Token 数?** + +A: 请求数流控。每个 HTTP 请求消耗 1 个 RPM 令牌。Token 用量仅用于事后统计(`provider_usage_logs`)和费用计算(`cost` 字段)。`rate_limiter.acquire(estimated_tokens=0)`——`estimated_tokens` 参数保留但在 V2 中固定为 0,为未来 TPM 流控预留接口。 + +**Q2: SSE 流式响应的用量统计?** + +A: 流式响应在流结束后写入一条 `provider_usage_logs` 记录: +- `total_tokens`:从最后一个 `usage` chunk 或累加 `choices[].delta.content` 长度估算 +- `avg_latency_ms`:记录 TTFT(Time To First Token);非流式记录端到端延迟 +- `request_count`:流式也计为 1 次请求 + +**Q3: `proxy/retry.py` 与 `core/cooldown.py` 的关系?** + +A: 建议采纳。将重试逻辑合并到 `core/cooldown.py`,对外暴露 `should_retry()` 和 `select_fallback()` 接口。简化后的模块结构如下。 + +## 15. 核心模块设计 + +### 15.1 模块结构 ``` sidecar_v2/ @@ -990,12 +1020,11 @@ sidecar_v2/ │ ├── pool.py # 池管理器(main/fallback/cooldown) │ ├── router.py # 负载均衡路由器 │ ├── rate_limiter.py # Per-Provider Token Bucket -│ ├── cooldown.py # 429 冷却管理(含紧急通道) +│ ├── cooldown.py # 429 冷却管理(含紧急通道、重试、降级逻辑) │ └── queue.py # FIFO 优先级队列 │ ├── proxy/ -│ ├── handler.py # 请求代理处理器 -│ └── retry.py # 429 重试 + 降级逻辑 +│ └── handler.py # 请求代理处理器(重试/降级调用 cooldown.py) │ ├── storage/ │ ├── db.py # SQLite/SQLAlchemy 连接管理 @@ -1021,7 +1050,7 @@ sidecar_v2/ └── style.css ``` -### 14.2 并发控制关键代码模式 +### 15.2 并发控制关键代码模式 Provider 选择 + RPM 预占必须原子化: @@ -1067,7 +1096,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 15. 非功能需求 +## 16. 非功能需求 ### 15.1 性能目标 @@ -1100,7 +1129,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 16. 开发排期(修订版) +## 17. 开发排期(最终版 v2.1) ### 新增 P0:文档推送 + 环境准备(3h) @@ -1166,7 +1195,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 17. 协作节点 +## 18. 
协作节点 | 阶段 | 协作 Agent | 事项 | |------|-----------|------| @@ -1177,7 +1206,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 18. 技术决策摘要 +## 19. 技术决策摘要 | 决策 | 选择 | 关键理由 | |------|------|---------| @@ -1198,7 +1227,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 19. 风险评估 +## 20. 风险评估 | 风险 | 概率 | 影响 | 应对措施 | |------|------|------|---------| @@ -1214,7 +1243,7 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- -## 20. 下游交付说明 +## 21. 下游交付说明 ### 20.1 给 costcodev(徐聪) @@ -1239,4 +1268,4 @@ def record_usage(provider_id, tokens, cost, latency, status_code): --- > **架构格言**:好的架构就像城市的下水道——平时看不见,但一旦出问题就是灾难。 -> — 梁思筑,2026-06-25 (v2.0 评审修订版) \ No newline at end of file +> — 梁思筑,2026-06-25 (v2.1 第二轮评审修订版) \ No newline at end of file diff --git a/docs/architecture/sidecar-v2-nvidia-providers.yaml b/docs/architecture/sidecar-v2-nvidia-providers.yaml new file mode 100644 index 0000000..df1bfcf --- /dev/null +++ b/docs/architecture/sidecar-v2-nvidia-providers.yaml @@ -0,0 +1,121 @@ +# NVIDIA Provider Keys Reference for Sidecar V2 +# ============================================= +# ⚠️ SECURITY: This file contains sensitive API key material. +# In Sidecar V2 production deployment, API keys are stored as +# AES-256-GCM ciphertext in SQLite (providers.api_key column). +# The plaintext keys below are for V2 initial provisioning only. +# +# Usage: Import into Sidecar V2 via WebUI Admin or POST /api/v2/providers +# After import, move this file to a secure store (Bitwarden / password +# manager); do NOT keep it in plaintext on disk or commit it to version control.
+# +# Created: 2026-06-25 | By: 梁思筑 (architect) +# Total providers: 11 | Pool: main | RPM each: 40 | Total RPM capacity: 440 + +providers: + - account: bizwings + email: vincent@bizwingsinc.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "主账号" + + - account: "98053" + email: 98053@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: liuweicheng84 + email: liuweicheng84@gmail.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: vx18088980513 + email: vx18088980513@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: "64391942" + email: 64391942@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: cgtest1 + email: cgtest1@bizwingsinc.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "测试账号1" + + - account: cgtest2 + email: cgtest2@bizwingsinc.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "测试账号2" + + - account: "15876517651" + email: 1248106918@qq.com 
+ api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: "19584586741" + email: 414133763@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: "18874954146" + email: 350894172@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + + - account: "2405483110" + email: 2405483110@qq.com + api_key: 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 + endpoint_url: https://integrate.api.nvidia.com/v1 + model_prefix: "nvidia/" + pool: main + rpm_limit: 40 + notes: "" + +# Aggregated stats +summary: + total_providers: 11 + total_rpm_capacity: 440 + pools: + main: 11 + fallback: 0 \ No newline at end of file