From da6613ada320acec17e831946c129e68eb5daf23 Mon Sep 17 00:00:00 2001
From: Spencer Grimes
Date: Tue, 3 Feb 2026 20:54:08 -0600
Subject: [PATCH] Add Docker container health detector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Monitors for containers stuck in restart loops or unhealthy states:
- Critical: restart loop (≥3 restarts)
- Warning: restarting, exited abnormally, or unhealthy

Disabled by default in config.json.

Co-Authored-By: Claude Opus 4.5
---
Review note: detectors/docker.py was revised during review (restart-count
and clear failures are now logged, per-container classification moved into
a helper). The blob id on its index line predates that revision.

 config.json         |  10 +++
 detectors/docker.py | 201 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 211 insertions(+)
 create mode 100644 detectors/docker.py

diff --git a/config.json b/config.json
index 41587f7..f8046b7 100644
--- a/config.json
+++ b/config.json
@@ -52,6 +52,16 @@
         "HOSTS": "8.8.8.8,google.com",
         "TIMEOUT": "5"
       }
+    },
+    {
+      "name": "docker",
+      "enabled": false,
+      "script": "detectors/docker.py",
+      "env": {
+        "CHECK_INTERVAL": "60",
+        "RESTART_THRESHOLD": "3",
+        "CONTAINERS": ""
+      }
     }
   ]
 }
diff --git a/detectors/docker.py b/detectors/docker.py
new file mode 100644
index 0000000..269b33d
--- /dev/null
+++ b/detectors/docker.py
@@ -0,0 +1,201 @@
+"""
+Docker Container Health Detector
+Monitors for containers stuck in restart loops or unhealthy states.
+
+Environment variables:
+  AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
+  CHECK_INTERVAL - Seconds between checks (default: 60)
+  RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
+  CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
+"""
+
+import json
+import os
+import subprocess
+import time
+
+import requests
+
+# Configuration from environment
+AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
+CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
+RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))
+CONTAINERS = os.environ.get("CONTAINERS", "")
+
+
+def get_container_status():
+    """Get status of all containers using docker CLI.
+
+    Returns:
+        A list with one dict per container, parsed from the JSON lines
+        printed by `docker ps -a`, or None if the command failed, timed
+        out, or produced output that could not be parsed.
+    """
+    try:
+        # Get container info as JSON (one object per output line)
+        result = subprocess.run(
+            ["docker", "ps", "-a", "--format", "{{json .}}"],
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+
+        if result.returncode != 0:
+            print(f"[ERROR] Docker command failed: {result.stderr}")
+            return None
+
+        return [json.loads(line) for line in result.stdout.strip().split('\n') if line]
+    except FileNotFoundError:
+        print("[ERROR] Docker CLI not found")
+        return None
+    except subprocess.TimeoutExpired:
+        print("[ERROR] Docker command timed out")
+        return None
+    except Exception as e:
+        print(f"[ERROR] Failed to get container status: {e}")
+        return None
+
+
+def get_restart_count(container_name):
+    """Get restart count for a specific container.
+
+    Returns:
+        The RestartCount reported by `docker inspect`, or 0 if it could
+        not be determined. Failed lookups are logged, not silently dropped.
+    """
+    try:
+        result = subprocess.run(
+            ["docker", "inspect", "--format", "{{.RestartCount}}", container_name],
+            capture_output=True,
+            text=True,
+            timeout=10
+        )
+        if result.returncode == 0:
+            return int(result.stdout.strip())
+        print(f"[WARN] docker inspect failed for '{container_name}': {result.stderr.strip()}")
+    except (OSError, subprocess.SubprocessError, ValueError) as e:
+        # Best effort: one failed lookup must not abort the whole check cycle
+        print(f"[WARN] Could not get restart count for '{container_name}': {e}")
+    return 0
+
+
+def send_event(event_id, priority, message):
+    """Send an event to the aggregator with heartbeat TTL."""
+    # TTL spans two check intervals; active alerts are re-sent on every check
+    ttl = CHECK_INTERVAL * 2
+    try:
+        response = requests.post(
+            f"{AGGREGATOR_URL}/event",
+            json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
+            timeout=5
+        )
+        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
+    except requests.RequestException as e:
+        print(f"[ERROR] Failed to send event: {e}")
+
+
+def clear_event(event_id):
+    """Clear an event from the aggregator."""
+    try:
+        response = requests.post(
+            f"{AGGREGATOR_URL}/clear",
+            json={"id": event_id},
+            timeout=5
+        )
+        if response.status_code == 200:
+            print(f"[CLEAR] {event_id}")
+        else:
+            # Surface rejected clears instead of dropping them silently
+            print(f"[WARN] Clear of {event_id} returned {response.status_code}")
+    except requests.RequestException as e:
+        print(f"[ERROR] Failed to clear event: {e}")
+
+
+def _check_container(name, state, status):
+    """Classify one container.
+
+    Returns:
+        A (priority, message) tuple when the container needs an alert
+        (priority 1 for a restart loop, 2 for a warning), or None when it
+        is healthy or exited cleanly.
+    """
+    if state == "restarting":
+        restart_count = get_restart_count(name)
+        if restart_count >= RESTART_THRESHOLD:
+            return 1, f"Container '{name}' restart loop ({restart_count}x)"
+        return 2, f"Container '{name}' restarting ({restart_count}x)"
+
+    if state in ("exited", "dead"):
+        # Only alert if it exited abnormally (non-zero exit code in status)
+        if "Exited (0)" not in status:
+            return 2, f"Container '{name}' {state}"
+        print(f"[OK] Container '{name}' exited cleanly")
+        return None
+
+    if "unhealthy" in status.lower():
+        return 2, f"Container '{name}' unhealthy"
+
+    print(f"[OK] Container '{name}' is {state}")
+    return None
+
+
+def main():
+    """Poll Docker every CHECK_INTERVAL seconds and sync alerts to the aggregator."""
+    # Parse container filter
+    filter_containers = None
+    if CONTAINERS:
+        filter_containers = {s.strip().lower() for s in CONTAINERS.split(",") if s.strip()}
+
+    print("Docker Container Detector started")
+    print(f" Aggregator: {AGGREGATOR_URL}")
+    print(f" Interval: {CHECK_INTERVAL}s")
+    print(f" Restart threshold: {RESTART_THRESHOLD}")
+    if filter_containers:
+        # Sorted so the startup banner is the same on every run
+        print(f" Monitoring: {', '.join(sorted(filter_containers))}")
+    else:
+        print(" Monitoring: all containers")
+    print()
+
+    active_alerts = set()
+
+    while True:
+        containers = get_container_status()
+
+        if containers is None:
+            print("[WARN] Could not fetch container status, skipping check")
+            time.sleep(CHECK_INTERVAL)
+            continue
+
+        current_alerts = set()
+
+        for container in containers:
+            name = container.get("Names", "unknown")
+            state = container.get("State", "").lower()
+            status = container.get("Status", "")
+
+            # Apply filter if specified
+            if filter_containers and name.lower() not in filter_containers:
+                continue
+
+            alert = _check_container(name, state, status)
+            if alert is None:
+                continue
+
+            priority, message = alert
+            event_id = f"docker_{name.replace('/', '_')}"
+            send_event(event_id, priority, message)
+            current_alerts.add(event_id)
+
+        # Clear alerts for containers that are now healthy
+        for event_id in active_alerts - current_alerts:
+            clear_event(event_id)
+
+        active_alerts = current_alerts
+
+        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
+        time.sleep(CHECK_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()