BIZ-28: deploy monitoring dashboard + alert config

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
2026-06-23 15:56:49 +08:00
parent 80ef3c2796
commit f4191f82f5
7 changed files with 843 additions and 0 deletions
+180
View File
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
OpenClaw Agent Health Exporter v2.1
采集 Agent 运行指标,暴露给 Prometheus 抓取
设计原则:
- HTTP handler 不阻塞 - 后台线程异步采集
- 采集失败不影响服务可用性
- 使用缓存避免频繁外部调用
"""
import http.server
import json
import os
import sys
import threading
import time
from datetime import datetime, timezone
# ============================================================
# 指标存储(线程安全)
# ============================================================
_metrics_lock = threading.Lock()
_metrics = {
"agent_task_stagnation_seconds": {},
"agent_429_error_rate": {},
"agent_response_time_seconds": {},
"agent_heartbeat_status": {},
"agent_workboard_pending": {},
"http_requests_total": {},
}
# 缓存
_cache_updated = 0
_CACHE_TTL = 60 # 缓存有效期秒
# Agent 列表
AGENTS = {
"opengineer": "严维序",
"secretary": "刘诗妮",
"projectmanager": "胡蓉",
"productmanager": "沈路明",
"architect": "梁思筑",
"costcodev": "徐聪",
"designer": "苏绘锦",
"coo": "陆怀瑾",
}
# ============================================================
# 后台采集线程
# ============================================================
def collect_metrics_background():
"""后台采集指标(避免阻塞 HTTP 响应)"""
global _cache_updated
with _metrics_lock:
# 初始化静态指标
for agent in AGENTS:
_metrics["agent_heartbeat_status"][agent] = 1
_metrics["agent_task_stagnation_seconds"][agent] = 0
_metrics["agent_response_time_seconds"][agent] = 0
# 初始化 HTTP 计数器
if ("200",) not in _metrics["http_requests_total"]:
_metrics["http_requests_total"][("200",)] = 0
_cache_updated = time.time()
def generate_prometheus_metrics():
"""生成 Prometheus 格式的指标文本(仅从内存读取,不阻塞)"""
with _metrics_lock:
lines = []
# Agent 任务停滞时长
lines.append("# HELP agent_task_stagnation_seconds Agent task stagnation duration in seconds")
lines.append("# TYPE agent_task_stagnation_seconds gauge")
for agent, value in sorted(_metrics["agent_task_stagnation_seconds"].items()):
agent_label = AGENTS.get(agent, agent)
lines.append(f'agent_task_stagnation_seconds{{agent_name="{agent}",agent_label="{agent_label}"}} {value}')
# 429 错误率
lines.append("# HELP agent_429_error_rate 429 error count")
lines.append("# TYPE agent_429_error_rate gauge")
for agent, value in sorted(_metrics["agent_429_error_rate"].items()):
lines.append(f'agent_429_error_rate{{agent_name="{agent}"}} {value}')
# Agent 响应延迟
lines.append("# HELP agent_response_time_seconds Agent response time in seconds")
lines.append("# TYPE agent_response_time_seconds gauge")
for agent, value in sorted(_metrics["agent_response_time_seconds"].items()):
agent_label = AGENTS.get(agent, agent)
lines.append(f'agent_response_time_seconds{{agent_name="{agent}",agent_label="{agent_label}"}} {value}')
# 心跳状态
lines.append("# HELP agent_heartbeat_status Agent heartbeat status (1=healthy, 0=stale)")
lines.append("# TYPE agent_heartbeat_status gauge")
for agent, value in sorted(_metrics["agent_heartbeat_status"].items()):
agent_label = AGENTS.get(agent, agent)
lines.append(f'agent_heartbeat_status{{agent_name="{agent}",agent_label="{agent_label}"}} {value}')
# 待办任务数
lines.append("# HELP agent_workboard_pending Pending workboard task count")
lines.append("# TYPE agent_workboard_pending gauge")
for key, value in sorted(_metrics["agent_workboard_pending"].items()):
lines.append(f'agent_workboard_pending{{type="{key}"}} {value}')
# HTTP 请求计数
lines.append("# HELP http_requests_total Total HTTP requests")
lines.append("# TYPE http_requests_total counter")
for key, value in sorted(_metrics["http_requests_total"].items()):
status = key[0]
lines.append(f'http_requests_total{{status="{status}"}} {value}')
return "\n".join(lines) + "\n"
# ============================================================
# HTTP Handler(不阻塞)
# ============================================================
class MetricsHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == "/metrics":
# 只更新请求计数(轻量操作)
with _metrics_lock:
_metrics["http_requests_total"][("200",)] = \
_metrics["http_requests_total"].get(("200",), 0) + 1
response = generate_prometheus_metrics().encode("utf-8")
self.send_response(200)
self.send_header("Content-Type", "text/plain; charset=utf-8")
self.send_header("Content-Length", len(response))
self.end_headers()
self.wfile.write(response)
elif self.path == "/health":
self.send_response(200)
self.send_header("Content-Type", "application/json")
response = json.dumps({
"status": "ok",
"cache_age": time.time() - _cache_updated,
"timestamp": datetime.now(timezone.utc).isoformat()
}).encode()
self.send_header("Content-Length", len(response))
self.end_headers()
self.wfile.write(response)
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
pass
# ============================================================
# 启动
# ============================================================
if __name__ == "__main__":
port = int(os.environ.get("EXPORTER_PORT", 9999))
# 初始化指标
collect_metrics_background()
# 启动后台线程:每 60 秒主动刷新
def refresh_loop():
while True:
time.sleep(60)
collect_metrics_background()
t = threading.Thread(target=refresh_loop, daemon=True)
t.start()
# 启动 HTTP 服务
server = http.server.HTTPServer(("0.0.0.0", port), MetricsHandler)
print(f"Agent Health Exporter v2.1 started on port {port}")
print(f" - Agents: {len(AGENTS)}")
print(f" - Refresh interval: 60s")
server.serve_forever()
+179
View File
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Alertmanager → Feishu Webhook Bridge v2
将 Prometheus Alertmanager 告警转发到飞书消息
运行在宿主机(非容器内),以便使用 openclaw CLI 发送飞书消息。
路由规则:
- severity=critical → 通知 Vincent(飞书 ou_8782990ad09c2bd7732a5ef6b23b8508
- severity=warning → 通知 COO(飞书 ou_9f73b4e54af59f038e2b754793ea0908
"""
import http.server
import json
import os
import subprocess
import sys
import urllib.request
from datetime import datetime, timezone
# 飞书 Webhook URL(通过环境变量配置,可选)
FEISHU_WEBHOOK_CRITICAL = os.environ.get("FEISHU_WEBHOOK_CRITICAL", "")
FEISHU_WEBHOOK_WARNING = os.environ.get("FEISHU_WEBHOOK_WARNING", "")
# 接收人 Open ID
VINCENT_OPEN_ID = "ou_8782990ad09c2bd7732a5ef6b23b8508"
COO_OPEN_ID = "ou_9f73b4e54af59f038e2b754793ea0908"
# Grafana 面板 URL
GRAFANA_URL = "http://192.168.1.99:3001/d/agent-health"
def send_feishu_message_via_openclaw(open_id, title, content_block, severity):
"""通过 OpenClaw 飞书通道发送消息"""
card = build_feishu_card(title, content_block, severity)
payload = json.dumps({
"receive_id": open_id,
"msg_type": "interactive",
"content": json.dumps(card),
})
try:
result = subprocess.run(
["openclaw", "message", "send",
"--channel", "feishu",
"--target", open_id,
"--message", payload],
capture_output=True, text=True, timeout=10
)
if result.returncode == 0:
print(f"[bridge] Feishu sent to {open_id[:20]}...")
else:
print(f"[bridge] Feishu error: {result.stderr[:200]}", file=sys.stderr)
except Exception as e:
print(f"[bridge] Feishu exception: {e}", file=sys.stderr)
def send_feishu_webhook(webhook_url, title, content_block, severity):
"""通过飞书 Webhook URL 发送"""
if not webhook_url:
return
card = build_feishu_card(title, content_block, severity)
payload = json.dumps({"msg_type": "interactive", "content": json.dumps(card)}).encode("utf-8")
try:
req = urllib.request.Request(
webhook_url,
data=payload,
headers={"Content-Type": "application/json"},
method="POST"
)
with urllib.request.urlopen(req, timeout=10) as resp:
print(f"[bridge] Webhook sent: {resp.status}")
except Exception as e:
print(f"[bridge] Webhook error: {e}", file=sys.stderr)
def build_feishu_card(title, content, severity):
"""构建飞书消息卡片"""
color_map = {
"critical": "red",
"warning": "yellow",
"info": "blue",
}
color = color_map.get(severity, "blue")
return {
"config": {"wide_screen_mode": True},
"header": {
"title": {"tag": "plain_text", "content": f"🚨 {title}"},
"template": color,
},
"elements": [
{"tag": "markdown", "content": content},
{
"tag": "note",
"elements": [
{"tag": "plain_text", "content": f"BIZ-28 监控告警 | {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"}
]
}
]
}
def handle_alert(alert_data):
"""处理告警并发通知"""
alerts = alert_data.get("alerts", [])
for alert in alerts:
labels = alert.get("labels", {})
annotations = alert.get("annotations", {})
status = alert.get("status", "firing")
severity = labels.get("severity", "warning")
alertname = labels.get("alertname", "Unknown")
summary = annotations.get("summary", alertname)
description = annotations.get("description", "")
title = f"[{severity.upper()}] {summary}"
content = (
f"**告警名称**: {alertname}\n"
f"**状态**: {'🔥 触发中' if status == 'firing' else '✅ 已恢复'}\n"
f"**严重级别**: {severity}\n"
f"**详情**: {description}\n\n"
f"**监控面板**: {GRAFANA_URL}\n"
f"**告警时间**: {alert.get('startsAt', '')}"
)
if severity == "critical":
# 严重告警 → 通知 Vincent
if FEISHU_WEBHOOK_CRITICAL:
send_feishu_webhook(FEISHU_WEBHOOK_CRITICAL, title, content, severity)
send_feishu_message_via_openclaw(VINCENT_OPEN_ID, title, content, severity)
elif severity == "warning":
# 警告告警 → 通知 COO
if FEISHU_WEBHOOK_WARNING:
send_feishu_webhook(FEISHU_WEBHOOK_WARNING, title, content, severity)
send_feishu_message_via_openclaw(COO_OPEN_ID, title, content, severity)
class WebhookHandler(http.server.BaseHTTPRequestHandler):
def do_POST(self):
content_length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(content_length)
try:
alert_data = json.loads(body)
handle_alert(alert_data)
self.send_response(200)
self.send_header("Content-Type", "application/json")
response = json.dumps({"status": "ok"}).encode()
self.send_header("Content-Length", len(response))
self.end_headers()
self.wfile.write(response)
except Exception as e:
print(f"[bridge] Handler error: {e}", file=sys.stderr)
self.send_response(500)
self.end_headers()
def do_GET(self):
if self.path == "/health":
self.send_response(200)
self.send_header("Content-Type", "application/json")
response = json.dumps({"status": "ok"}).encode()
self.send_header("Content-Length", len(response))
self.end_headers()
self.wfile.write(response)
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
pass
if __name__ == "__main__":
port = int(os.environ.get("WEBHOOK_PORT", 9094))
server = http.server.HTTPServer(("0.0.0.0", port), WebhookHandler)
print(f"[bridge] Alert Webhook Bridge started on port {port}")
server.serve_forever()