Add Docker container health detector

Monitors for containers stuck in restart loops or unhealthy states:
- Critical: restart loop (≥3 restarts)
- Warning: restarting, exited abnormally, or unhealthy

Disabled by default in config.json.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-03 20:54:08 -06:00
parent b99ac96ffa
commit da6613ada3
2 changed files with 188 additions and 0 deletions

178
detectors/docker.py Normal file
View File

@@ -0,0 +1,178 @@
"""
Docker Container Health Detector
Monitors for containers stuck in restart loops or unhealthy states.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 60)
RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
"""
import json
import os
import subprocess
import time
import requests
# Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")  # base URL of the event aggregator service
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))  # seconds between polling cycles
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))  # restarts at/above this => critical restart loop
CONTAINERS = os.environ.get("CONTAINERS", "")  # optional comma-separated allowlist; empty => monitor all containers
def get_container_status():
    """List every container (running or not) via the Docker CLI.

    Returns:
        A list of dicts, one per container, parsed from
        ``docker ps -a --format '{{json .}}'`` (one JSON object per line),
        or ``None`` if the CLI is missing, times out, or fails.
    """
    try:
        proc = subprocess.run(
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if proc.returncode != 0:
            print(f"[ERROR] Docker command failed: {proc.stderr}")
            return None
        # One JSON object per non-empty output line; a parse failure is
        # caught by the broad handler below and reported as a fetch error.
        return [json.loads(entry) for entry in proc.stdout.splitlines() if entry]
    except FileNotFoundError:
        print("[ERROR] Docker CLI not found")
        return None
    except subprocess.TimeoutExpired:
        print("[ERROR] Docker command timed out")
        return None
    except Exception as e:
        print(f"[ERROR] Failed to get container status: {e}")
        return None
def get_restart_count(container_name):
    """Return Docker's RestartCount for *container_name* (0 on any failure)."""
    cmd = ["docker", "inspect", "--format", "{{.RestartCount}}", container_name]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        if proc.returncode == 0:
            return int(proc.stdout.strip())
    except Exception:
        # Best effort: missing CLI, timeout, or unparseable output all
        # fall through to the default of zero.
        pass
    return 0
def send_event(event_id, priority, message):
    """POST an event to the aggregator.

    The TTL is set to twice the check interval so the alert expires on its
    own if this detector stops sending heartbeats.
    """
    payload = {
        "id": event_id,
        "priority": priority,
        "message": message,
        "ttl": CHECK_INTERVAL * 2,
    }
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {payload['ttl']}s) -> {resp.status_code}")
def clear_event(event_id):
    """Ask the aggregator to drop an active event; failures are logged only."""
    try:
        resp = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {event_id}")
def main():
    """Poll Docker on a fixed interval and raise/clear aggregator alerts.

    Each cycle: list all containers, send a critical alert for restart
    loops, a warning for restarting/abnormally-exited/unhealthy containers,
    and clear alerts for containers that recovered since the last cycle.
    Runs forever; intended to be launched as a long-lived process.
    """
    # Parse container filter
    filter_containers = None
    if CONTAINERS:
        # Lower-cased, stripped names for case-insensitive matching below.
        filter_containers = set(s.strip().lower() for s in CONTAINERS.split(",") if s.strip())
    print(f"Docker Container Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Restart threshold: {RESTART_THRESHOLD}")
    if filter_containers:
        print(f" Monitoring: {', '.join(filter_containers)}")
    else:
        print(f" Monitoring: all containers")
    print()
    # Event IDs alerted in the previous cycle; used to clear recovered ones.
    active_alerts = set()
    while True:
        containers = get_container_status()
        if containers is None:
            # Docker unreachable this cycle; existing aggregator events
            # expire via their TTL rather than being cleared here.
            print("[WARN] Could not fetch container status, skipping check")
            time.sleep(CHECK_INTERVAL)
            continue
        current_alerts = set()
        for container in containers:
            name = container.get("Names", "unknown")
            state = container.get("State", "").lower()
            status = container.get("Status", "")
            # Apply filter if specified
            if filter_containers and name.lower() not in filter_containers:
                continue
            # Stable per-container event ID ('/' can appear in Docker names).
            event_id = f"docker_{name.replace('/', '_')}"
            # Check for restarting state
            if state == "restarting":
                restart_count = get_restart_count(name)
                if restart_count >= RESTART_THRESHOLD:
                    # Priority 1 = critical: stuck in a restart loop.
                    send_event(event_id, 1, f"Container '{name}' restart loop ({restart_count}x)")
                    current_alerts.add(event_id)
                else:
                    # Priority 2 = warning: restarting but below loop threshold.
                    send_event(event_id, 2, f"Container '{name}' restarting ({restart_count}x)")
                    current_alerts.add(event_id)
            # Check for exited/dead containers (warning)
            elif state in ("exited", "dead"):
                # Only alert if it exited abnormally (non-zero exit code in status)
                if "Exited (0)" not in status:
                    send_event(event_id, 2, f"Container '{name}' {state}")
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' exited cleanly")
            # Check for unhealthy containers
            elif "unhealthy" in status.lower():
                send_event(event_id, 2, f"Container '{name}' unhealthy")
                current_alerts.add(event_id)
            else:
                print(f"[OK] Container '{name}' is {state}")
        # Clear alerts for containers that are now healthy
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)
        active_alerts = current_alerts
        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()