From f4191f82f5ee150b90a822cb18998423900f6e02 Mon Sep 17 00:00:00 2001 From: bizwings Date: Tue, 23 Jun 2026 15:56:49 +0800 Subject: [PATCH] BIZ-28: deploy monitoring dashboard + alert config Co-authored-by: multica-agent --- monitoring/config/alertmanager.yml | 50 +++ .../dashboards/agent-health-dashboard.json | 288 ++++++++++++++++++ .../grafana/dashboards/dashboard-provider.yml | 12 + monitoring/config/prometheus.yml | 42 +++ monitoring/docker-compose.yml | 92 ++++++ monitoring/scripts/agent_health_exporter.py | 180 +++++++++++ monitoring/scripts/alert_webhook_bridge.py | 179 +++++++++++ 7 files changed, 843 insertions(+) create mode 100644 monitoring/config/alertmanager.yml create mode 100644 monitoring/config/grafana/dashboards/agent-health-dashboard.json create mode 100644 monitoring/config/grafana/dashboards/dashboard-provider.yml create mode 100644 monitoring/config/prometheus.yml create mode 100644 monitoring/docker-compose.yml create mode 100644 monitoring/scripts/agent_health_exporter.py create mode 100644 monitoring/scripts/alert_webhook_bridge.py diff --git a/monitoring/config/alertmanager.yml b/monitoring/config/alertmanager.yml new file mode 100644 index 0000000..b1f57b9 --- /dev/null +++ b/monitoring/config/alertmanager.yml @@ -0,0 +1,50 @@ +# Alertmanager 配置 +# 告警通知路由到 Feishu + +global: + resolve_timeout: 5m + +route: + receiver: "default" + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + routes: + # 严重告警 → 通知 Vincent + - receiver: "vincent-critical" + match: + severity: critical + repeat_interval: 2h + continue: true + + # 警告告警 → 通知 COO + - receiver: "coo-warning" + match: + severity: warning + repeat_interval: 4h + +receivers: + - name: "default" + webhook_configs: + - url: "http://host.docker.internal:9094/webhook" + send_resolved: true + + - name: "vincent-critical" + webhook_configs: + - url: "http://host.docker.internal:9094/webhook" + send_resolved: true + + - name: "coo-warning" + webhook_configs: + - url: 
"http://host.docker.internal:9094/webhook" + send_resolved: true + +# 抑制规则:严重告警自动抑制同源的警告 +inhibit_rules: + - source_match: + severity: critical + target_match: + severity: warning + equal: + - alertname + - instance \ No newline at end of file diff --git a/monitoring/config/grafana/dashboards/agent-health-dashboard.json b/monitoring/config/grafana/dashboards/agent-health-dashboard.json new file mode 100644 index 0000000..71ee50f --- /dev/null +++ b/monitoring/config/grafana/dashboards/agent-health-dashboard.json @@ -0,0 +1,288 @@ +{ + "title": "OpenClaw Agent Health Dashboard", + "uid": "agent-health", + "version": 1, + "tags": ["openclaw", "agent", "monitoring"], + "timezone": "browser", + "editable": true, + "refresh": "30s", + "panels": [ + { + "title": "系统资源概览", + "type": "row", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0} + }, + { + "id": 1, + "title": "CPU 使用率", + "type": "gauge", + "gridPos": {"h": 8, "w": 6, "x": 0, "y": 1}, + "targets": [ + { + "expr": "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "{{instance}}" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 90} + ] + }, + { + "id": 2, + "title": "内存使用率", + "type": "gauge", + "gridPos": {"h": 8, "w": 6, "x": 6, "y": 1}, + "targets": [ + { + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "legendFormat": "{{instance}}" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 80}, + {"color": "red", "value": 95} + ] + }, + { + "id": 3, + "title": "磁盘使用率", + "type": "gauge", + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 1}, + "targets": [ + { + 
"expr": "max by(instance) ((node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100)", + "legendFormat": "{{instance}}" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 80}, + {"color": "red", "value": 95} + ] + }, + { + "id": 4, + "title": "系统负载", + "type": "stat", + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 1}, + "targets": [ + { + "expr": "node_load1", + "legendFormat": "1min" + }, + { + "expr": "node_load5", + "legendFormat": "5min" + }, + { + "expr": "node_load15", + "legendFormat": "15min" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "textMode": "auto" + } + }, + { + "title": "Agent 健康状态", + "type": "row", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9} + }, + { + "id": 5, + "title": "Agent 心跳状态", + "type": "table", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10}, + "targets": [ + { + "expr": "agent_heartbeat_status", + "legendFormat": "{{agent_label}}" + } + ], + "transformations": [ + {"id": "organize", "options": {"excludeByName": {}, "indexByName": {}, "renameByName": {"Value": "状态"}}} + ], + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", + "displayMode": "color-background" + }, + "mappings": [ + {"type": "value", "options": {"0": {"color": "red", "text": "❌ 超时"}, "1": {"color": "green", "text": "✅ 正常"}}} + ], + "thresholds": [{"color": "green", "value": null}] + } + } + }, + { + "id": 6, + "title": "任务停滞时长", + "type": "bargauge", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10}, + "targets": [ + { + "expr": "agent_task_stagnation_seconds", + "legendFormat": "{{agent_label}}" + } + ], + "options": { + "orientation": "horizontal", + "displayMode": "gradient", + "showUnfilled": true + }, + 
"fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 3600}, + {"color": "red", "value": 14400} + ] + } + } + }, + { + "id": 7, + "title": "待办任务数", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 0, "y": 18}, + "targets": [ + { + "expr": "agent_workboard_pending", + "legendFormat": "待办任务" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "area", + "textMode": "auto" + }, + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 5}, + {"color": "red", "value": 10} + ] + }, + { + "id": 8, + "title": "429 错误计数", + "type": "stat", + "gridPos": {"h": 4, "w": 6, "x": 6, "y": 18}, + "targets": [ + { + "expr": "agent_429_error_rate", + "legendFormat": "429 错误" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "area", + "textMode": "auto" + }, + "thresholds": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 10}, + {"color": "red", "value": 50} + ] + }, + { + "id": 9, + "title": "Prometheus 目标状态", + "type": "table", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 18}, + "targets": [ + { + "expr": "up", + "legendFormat": "{{job}} ({{instance}})" + } + ], + "fieldConfig": { + "defaults": { + "custom": {"align": "center", "displayMode": "color-background"}, + "mappings": [ + {"type": "value", "options": {"0": {"color": "red", "text": "❌ Down"}, "1": {"color": "green", "text": "✅ Up"}}} + ] + } + } + }, + { + "title": "告警状态", + "type": "row", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 26} + }, + { + "id": 10, + "title": "活跃告警", + "type": "table", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 27}, + "targets": [ + { + "expr": "ALERTS{alertstate=\"firing\"}", + "legendFormat": "{{alertname}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": {"align": "left"}, + "mappings": [ + {"type": "value", "options": {"0": 
{"color": "green", "text": "已恢复"}, "1": {"color": "red", "text": "触发中"}}} + ] + } + } + } + ], + "schemaVersion": 38, + "style": "dark", + "tags": ["openclaw", "agent", "monitoring"], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": {"value": "Prometheus"} + } + ] + }, + "annotations": { + "list": [ + { + "name": "告警事件", + "type": "dashboard", + "builtIn": 1, + "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"}, + "enable": true, + "hide": true, + "iconColor": "rgba(255, 96, 96, 1)", + "expr": "ALERTS", + "step": "60s" + } + ] + } +} diff --git a/monitoring/config/grafana/dashboards/dashboard-provider.yml b/monitoring/config/grafana/dashboards/dashboard-provider.yml new file mode 100644 index 0000000..042cb44 --- /dev/null +++ b/monitoring/config/grafana/dashboards/dashboard-provider.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: "Agent Health" + orgId: 1 + folder: "OpenClaw" + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + options: + path: /etc/grafana/provisioning/dashboards diff --git a/monitoring/config/prometheus.yml b/monitoring/config/prometheus.yml new file mode 100644 index 0000000..eb92bde --- /dev/null +++ b/monitoring/config/prometheus.yml @@ -0,0 +1,42 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +# Alertmanager 配置 +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + +# 规则文件 +rule_files: + - "agent_alerts.yml" + +# 抓取配置 +scrape_configs: + # Prometheus 自监控 + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Node Exporter - 系统指标 + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter:9100'] + + # Agent Health Exporter - 自定义 Agent 监控指标 + - job_name: 'agent-health' + scrape_interval: 30s + static_configs: + - targets: ['agent-exporter:9999'] + relabel_configs: + - source_labels: [__address__] + target_label: instance + 
replacement: 'openclaw-agents' + + # OpenClaw Gateway Metrics(待启用) + # - job_name: 'openclaw-gateway' + # metrics_path: '/metrics' + # static_configs: + # - targets: ['host.docker.internal:18789'] diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000..6175580 --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,92 @@ +version: '3.8' + +services: + prometheus: + image: m.daocloud.io/docker.io/prom/prometheus:v2.52.0 + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./config/prometheus.yml:/etc/prometheus/prometheus.yml + - ./config/agent_alerts.yml:/etc/prometheus/agent_alerts.yml + - ./data/prometheus:/prometheus + extra_hosts: + - "host.docker.internal:host-gateway" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.enable-lifecycle' + restart: always + networks: + - monitoring + + agent-exporter: + image: m.daocloud.io/docker.io/python:3.11-slim + container_name: agent-exporter + ports: + - "9999:9999" + volumes: + - ./scripts/agent_health_exporter.py:/app/exporter.py:ro + command: python3 /app/exporter.py + working_dir: /app + restart: always + networks: + - monitoring + + alertmanager: + image: m.daocloud.io/docker.io/prom/alertmanager:v0.27.0 + container_name: alertmanager + ports: + - "9093:9093" + volumes: + - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ./data/alertmanager:/alertmanager + extra_hosts: + - "host.docker.internal:host-gateway" + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + - '--web.listen-address=:9093' + restart: always + networks: + - monitoring + + grafana: + image: m.daocloud.io/docker.io/grafana/grafana:11.0.0 + container_name: grafana + ports: + - "3001:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=*** + - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-piechart-panel + volumes: + - 
#!/usr/bin/env python3
"""OpenClaw Agent Health Exporter v2.1.

Keeps agent runtime metrics in memory and exposes them for Prometheus to
scrape.

Design principles:
- The HTTP handler never blocks on collection: it only reads the in-memory
  store, which a background thread refreshes.
- A failed collection must not affect service availability.
- Cached values are served to avoid frequent external calls.
"""

import http.server
import json
import os
import sys
import threading
import time
import urllib.parse
from datetime import datetime, timezone

# ============================================================
# Metric store (guarded by _metrics_lock)
# ============================================================

_metrics_lock = threading.Lock()
# One dict per metric family: {label key -> sample value}.
_metrics = {
    "agent_task_stagnation_seconds": {},
    "agent_429_error_rate": {},
    "agent_response_time_seconds": {},
    "agent_heartbeat_status": {},
    "agent_workboard_pending": {},
    "http_requests_total": {},
}

# Cache bookkeeping
_cache_updated = 0  # epoch seconds of the last refresh; 0 = never refreshed
_CACHE_TTL = 60  # seconds between background refreshes

# Agent id -> display name (emitted as the ``agent_label`` label)
AGENTS = {
    "opengineer": "严维序",
    "secretary": "刘诗妮",
    "projectmanager": "胡蓉",
    "productmanager": "沈路明",
    "architect": "梁思筑",
    "costcodev": "徐聪",
    "designer": "苏绘锦",
    "coo": "陆怀瑾",
}

# ============================================================
# Background collection
# ============================================================


def collect_metrics_background():
    """Refresh the in-memory metric store and stamp the cache time.

    NOTE(review): this function does not query any data source. Every
    refresh writes heartbeat=1, stagnation=0 and response_time=0 for each
    agent in ``AGENTS``, so alerts built on these series cannot fire until
    real collection is wired in. ``agent_429_error_rate`` and
    ``agent_workboard_pending`` are never populated here.
    """
    global _cache_updated

    with _metrics_lock:
        for agent in AGENTS:
            _metrics["agent_heartbeat_status"][agent] = 1
            _metrics["agent_task_stagnation_seconds"][agent] = 0
            _metrics["agent_response_time_seconds"][agent] = 0

        # Create the request counter once; later refreshes must not reset it.
        _metrics["http_requests_total"].setdefault(("200",), 0)

    _cache_updated = time.time()


def _escape_label_value(value):
    """Escape a label value for the Prometheus text exposition format.

    Backslash, double quote and line feed are the characters the format
    requires to be escaped inside a quoted label value.
    """
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _sample(name, labels, value):
    """Render one sample line: ``name{k="v",...} value``."""
    rendered = ",".join(f'{key}="{_escape_label_value(val)}"' for key, val in labels)
    return f"{name}{{{rendered}}} {value}"


def generate_prometheus_metrics():
    """Return the metric store rendered as Prometheus exposition text.

    Only reads from memory while holding ``_metrics_lock``; performs no
    collection. The returned string always ends with a newline.
    """

    def agent_with_label(agent):
        return (("agent_name", agent), ("agent_label", AGENTS.get(agent, agent)))

    def agent_only(agent):
        return (("agent_name", agent),)

    # (family name, HELP text, TYPE, store key -> label pairs), in output order.
    families = (
        ("agent_task_stagnation_seconds",
         "Agent task stagnation duration in seconds", "gauge", agent_with_label),
        ("agent_429_error_rate",
         "429 error count", "gauge", agent_only),
        ("agent_response_time_seconds",
         "Agent response time in seconds", "gauge", agent_with_label),
        ("agent_heartbeat_status",
         "Agent heartbeat status (1=healthy, 0=stale)", "gauge", agent_with_label),
        ("agent_workboard_pending",
         "Pending workboard task count", "gauge", lambda key: (("type", key),)),
        # Counter keys are 1-tuples holding the HTTP status string.
        ("http_requests_total",
         "Total HTTP requests", "counter", lambda key: (("status", key[0]),)),
    )

    lines = []
    with _metrics_lock:
        for name, help_text, metric_type, labels_for in families:
            lines.append(f"# HELP {name} {help_text}")
            lines.append(f"# TYPE {name} {metric_type}")
            for key, value in sorted(_metrics[name].items()):
                lines.append(_sample(name, labels_for(key), value))

    return "\n".join(lines) + "\n"


# ============================================================
# HTTP handler
# ============================================================


class MetricsHandler(http.server.BaseHTTPRequestHandler):
    """Serves ``/metrics`` (Prometheus text) and ``/health`` (JSON)."""

    def _reply(self, code, content_type, body):
        """Send a complete response with an explicit Content-Length."""
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        # Match on the path component only so a query string does not 404.
        route = urllib.parse.urlsplit(self.path).path

        if route == "/metrics":
            # Cheap counter bump. The lock is released before rendering,
            # because generate_prometheus_metrics() takes the same
            # non-reentrant lock.
            with _metrics_lock:
                counter = _metrics["http_requests_total"]
                counter[("200",)] = counter.get(("200",), 0) + 1

            self._reply(
                200,
                "text/plain; version=0.0.4; charset=utf-8",
                generate_prometheus_metrics().encode("utf-8"),
            )

        elif route == "/health":
            payload = json.dumps({
                "status": "ok",
                "cache_age": time.time() - _cache_updated,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }).encode()
            self._reply(200, "application/json", payload)

        else:
            self.send_response(404)
            self.send_header("Content-Length", "0")
            self.end_headers()

    def log_message(self, format, *args):
        # Per-request access logging is intentionally silenced.
        pass


# ============================================================
# Startup
# ============================================================


def _refresh_loop(interval=_CACHE_TTL):
    """Refresh the store every *interval* seconds, forever.

    A failing refresh is logged and retried on the next tick instead of
    killing the thread, so the exporter keeps serving the last values.
    """
    while True:
        time.sleep(interval)
        try:
            collect_metrics_background()
        except Exception as exc:  # thread boundary: log and keep going
            print(f"[exporter] refresh failed: {exc}", file=sys.stderr)


def main():
    """Initialise metrics, start the refresher thread and serve HTTP."""
    port = int(os.environ.get("EXPORTER_PORT", 9999))

    collect_metrics_background()
    threading.Thread(target=_refresh_loop, daemon=True).start()

    # Threaded server so one slow client cannot stall a Prometheus scrape.
    server = http.server.ThreadingHTTPServer(("0.0.0.0", port), MetricsHandler)
    print(f"Agent Health Exporter v2.1 started on port {port}")
    print(f" - Agents: {len(AGENTS)}")
    print(f" - Refresh interval: {_CACHE_TTL}s")
    server.serve_forever()


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Alertmanager -> Feishu webhook bridge (v2).

Receives Prometheus Alertmanager webhook notifications and forwards each
alert as a Feishu interactive card.

Intended to run on the host (not inside a container) so that the
``openclaw`` CLI is available for sending Feishu messages.

Routing:
- severity=critical -> Vincent (Feishu ou_8782990ad09c2bd7732a5ef6b23b8508)
- severity=warning  -> COO     (Feishu ou_9f73b4e54af59f038e2b754793ea0908)
- any other severity is logged and dropped.
"""

import http.server
import json
import os
import subprocess
import sys
import urllib.parse
import urllib.request
from datetime import datetime, timezone

# Optional Feishu custom-bot webhook URLs; empty string disables that channel.
FEISHU_WEBHOOK_CRITICAL = os.environ.get("FEISHU_WEBHOOK_CRITICAL", "")
FEISHU_WEBHOOK_WARNING = os.environ.get("FEISHU_WEBHOOK_WARNING", "")

# Recipient open IDs
VINCENT_OPEN_ID = "ou_8782990ad09c2bd7732a5ef6b23b8508"
COO_OPEN_ID = "ou_9f73b4e54af59f038e2b754793ea0908"

# Dashboard link embedded in every card; overridable per deployment.
GRAFANA_URL = os.environ.get(
    "GRAFANA_URL", "http://192.168.1.99:3001/d/agent-health"
)


def build_feishu_card(title, content, severity, status="firing"):
    """Build the Feishu interactive-card dict for one alert.

    Args:
        title: Header text; an icon is prefixed to it.
        content: Markdown body of the card.
        severity: ``critical`` / ``warning`` / ``info``; selects the header
            colour, unknown values fall back to blue.
        status: Alertmanager alert status. ``"resolved"`` renders a green
            header with a check mark, so a recovery is not shown as a red
            alarm; any other value keeps the severity colour.

    Returns:
        A JSON-serialisable dict with ``config``, ``header`` and ``elements``.
    """
    color_map = {
        "critical": "red",
        "warning": "yellow",
        "info": "blue",
    }
    if status == "resolved":
        color, icon = "green", "✅"
    else:
        color, icon = color_map.get(severity, "blue"), "🚨"

    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    return {
        "config": {"wide_screen_mode": True},
        "header": {
            "title": {"tag": "plain_text", "content": f"{icon} {title}"},
            "template": color,
        },
        "elements": [
            {"tag": "markdown", "content": content},
            {
                "tag": "note",
                "elements": [
                    {"tag": "plain_text", "content": f"BIZ-28 监控告警 | {stamp}"}
                ],
            },
        ],
    }


def build_webhook_payload(card):
    """Serialise *card* as a Feishu custom-bot request body (UTF-8 bytes).

    The custom-bot webhook takes the card object under the ``card`` key.
    A JSON-encoded string under ``content`` is the IM send-message shape,
    not the bot webhook shape.
    """
    return json.dumps({"msg_type": "interactive", "card": card}).encode("utf-8")


def send_feishu_message_via_openclaw(open_id, title, content_block, severity,
                                     status="firing"):
    """Send the alert card to *open_id* through the ``openclaw`` CLI.

    Best effort: a non-zero exit code, a timeout or a missing binary is
    logged to stderr and never raised.
    """
    card = build_feishu_card(title, content_block, severity, status)
    # NOTE(review): the CLI's expected --message format is not visible from
    # this file; the payload shape is kept exactly as it was.
    payload = json.dumps({
        "receive_id": open_id,
        "msg_type": "interactive",
        "content": json.dumps(card),
    })
    command = [
        "openclaw", "message", "send",
        "--channel", "feishu",
        "--target", open_id,
        "--message", payload,
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except Exception as e:  # best effort: one failed send must not abort the batch
        print(f"[bridge] Feishu exception: {e}", file=sys.stderr)
        return

    if result.returncode == 0:
        print(f"[bridge] Feishu sent to {open_id[:20]}...")
    else:
        print(f"[bridge] Feishu error: {result.stderr[:200]}", file=sys.stderr)


def send_feishu_webhook(webhook_url, title, content_block, severity,
                        status="firing"):
    """POST the alert card to a Feishu custom-bot webhook URL.

    Does nothing when *webhook_url* is empty. Best effort: transport
    errors are logged to stderr and never raised.
    """
    if not webhook_url:
        return

    card = build_feishu_card(title, content_block, severity, status)
    req = urllib.request.Request(
        webhook_url,
        data=build_webhook_payload(card),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            http_status = resp.status
            raw = resp.read()
    except Exception as e:  # best effort: one failed send must not abort the batch
        print(f"[bridge] Webhook error: {e}", file=sys.stderr)
        return

    print(f"[bridge] Webhook sent: {http_status}")
    # NOTE(review): presumably the bot can answer HTTP 200 with a non-zero
    # "code" in the JSON body when it rejects a message - confirm against
    # the Feishu docs. Surfacing it is harmless either way.
    try:
        reply = json.loads(raw)
    except ValueError:
        return
    if isinstance(reply, dict) and reply.get("code") not in (None, 0):
        print(f"[bridge] Webhook rejected: {raw[:200]!r}", file=sys.stderr)


def _route_for(severity):
    """Return ``(webhook_url, open_id)`` for *severity*, or ``None`` if unrouted."""
    if severity == "critical":
        return FEISHU_WEBHOOK_CRITICAL, VINCENT_OPEN_ID
    if severity == "warning":
        return FEISHU_WEBHOOK_WARNING, COO_OPEN_ID
    return None


def handle_alert(alert_data):
    """Turn an Alertmanager webhook payload into Feishu notifications.

    Args:
        alert_data: Decoded webhook JSON object; its ``alerts`` list is
            iterated. Missing or ``null`` fields fall back to defaults.

    Returns:
        The number of alerts for which a notification was attempted.
    """
    dispatched = 0
    for alert in alert_data.get("alerts") or []:
        # "or {}" also covers an explicit JSON null, which .get's default misses.
        labels = alert.get("labels") or {}
        annotations = alert.get("annotations") or {}
        status = alert.get("status", "firing")
        severity = labels.get("severity", "warning")
        alertname = labels.get("alertname", "Unknown")
        summary = annotations.get("summary", alertname)
        description = annotations.get("description", "")

        route = _route_for(severity)
        if route is None:
            # Make dropped alerts visible instead of vanishing silently.
            print(
                f"[bridge] No route for severity {severity!r}; "
                f"alert {alertname!r} not forwarded",
                file=sys.stderr,
            )
            continue

        title = f"[{severity.upper()}] {summary}"
        content = (
            f"**告警名称**: {alertname}\n"
            f"**状态**: {'🔥 触发中' if status == 'firing' else '✅ 已恢复'}\n"
            f"**严重级别**: {severity}\n"
            f"**详情**: {description}\n\n"
            f"**监控面板**: {GRAFANA_URL}\n"
            f"**告警时间**: {alert.get('startsAt', '')}"
        )

        webhook_url, open_id = route
        if webhook_url:
            send_feishu_webhook(webhook_url, title, content, severity, status)
        send_feishu_message_via_openclaw(open_id, title, content, severity, status)
        dispatched += 1

    return dispatched


class WebhookHandler(http.server.BaseHTTPRequestHandler):
    """Accepts Alertmanager webhook POSTs and serves ``GET /health``."""

    def _send_json(self, code, payload):
        """Send *payload* as a JSON response with an explicit Content-Length."""
        body = json.dumps(payload).encode()
        self.send_response(code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_POST(self):
        # Any POST path is accepted, as before; Alertmanager is configured
        # with /webhook.
        try:
            length = int(self.headers.get("Content-Length", 0))
            if length < 0:
                raise ValueError("negative Content-Length")
            alert_data = json.loads(self.rfile.read(length))
            if not isinstance(alert_data, dict):
                raise ValueError("payload must be a JSON object")
        except ValueError as e:
            # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            # 400 instead of 500 so Alertmanager does not keep retrying a
            # request that can never succeed.
            print(f"[bridge] Bad request: {e}", file=sys.stderr)
            self._send_json(400, {"status": "bad request"})
            return

        try:
            handle_alert(alert_data)
        except Exception as e:  # request boundary: log and report 500
            print(f"[bridge] Handler error: {e}", file=sys.stderr)
            self._send_json(500, {"status": "error"})
            return

        self._send_json(200, {"status": "ok"})

    def do_GET(self):
        # Match on the path component only so a query string does not 404.
        if urllib.parse.urlsplit(self.path).path == "/health":
            self._send_json(200, {"status": "ok"})
        else:
            self.send_response(404)
            self.send_header("Content-Length", "0")
            self.end_headers()

    def log_message(self, format, *args):
        # Per-request access logging is intentionally silenced.
        pass


def main():
    """Start the bridge HTTP server on ``WEBHOOK_PORT`` (default 9094)."""
    port = int(os.environ.get("WEBHOOK_PORT", 9094))
    # Threaded so /health stays responsive while a slow Feishu send
    # (up to 10 s per channel) is in flight.
    server = http.server.ThreadingHTTPServer(("0.0.0.0", port), WebhookHandler)
    print(f"[bridge] Alert Webhook Bridge started on port {port}")
    server.serve_forever()


if __name__ == "__main__":
    main()