"""
Docker Container Health Detector

Monitors for containers stuck in restart loops or unhealthy states.

Environment variables:
    AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
    CHECK_INTERVAL - Seconds between checks (default: 60)
    RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
    CONTAINERS - Comma-separated container names to monitor
                 (optional, monitors all if empty)
"""

import json
import os
import subprocess
import time

import requests

# Configuration from environment.
# A trailing slash is stripped so that f"{AGGREGATOR_URL}/event" never
# produces a double slash in the request path.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000").rstrip("/")
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))
CONTAINERS = os.environ.get("CONTAINERS", "")

# Priority values sent to the aggregator: 1 is used for a confirmed restart
# loop, 2 for every other alert raised by this detector.
_PRIORITY_RESTART_LOOP = 1
_PRIORITY_WARNING = 2

# First word of the human-readable ``Status`` column -> container state.
# NOTE(review): assumes the docker CLI wording ("Up 2 hours",
# "Exited (1) 3 minutes ago", "Restarting (1) 5 seconds ago", ...) —
# confirm against the Docker version in use.
_STATUS_PREFIX_STATES = {
    "up": "running",
    "restarting": "restarting",
    "exited": "exited",
    "dead": "dead",
    "created": "created",
}


def get_container_status():
    """Get status of all containers using the docker CLI.

    Runs ``docker ps -a --format '{{json .}}'`` and parses each non-empty
    output line as one JSON object.

    Returns:
        A list of dicts (one per container), or ``None`` if the docker
        command could not be run, failed, timed out, or produced output
        that could not be parsed. Every failure is logged to stdout.
    """
    try:
        result = subprocess.run(
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except FileNotFoundError:
        print("[ERROR] Docker CLI not found")
        return None
    except subprocess.TimeoutExpired:
        print("[ERROR] Docker command timed out")
        return None
    except Exception as e:
        print(f"[ERROR] Failed to get container status: {e}")
        return None

    if result.returncode != 0:
        print(f"[ERROR] Docker command failed: {result.stderr}")
        return None

    try:
        # One JSON document per line; blank lines are skipped.
        return [
            json.loads(line)
            for line in result.stdout.splitlines()
            if line.strip()
        ]
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"[ERROR] Failed to get container status: {e}")
        return None


def get_restart_count(container_name):
    """Get the restart count for a specific container.

    Args:
        container_name: Name passed to ``docker inspect``.

    Returns:
        The integer ``RestartCount`` reported by ``docker inspect``, or 0 if
        the command fails or its output is not an integer. Failures are
        logged rather than silently ignored, but never raised: the count is
        a best-effort detail of the alert message.
    """
    try:
        result = subprocess.run(
            ["docker", "inspect", "--format", "{{.RestartCount}}", container_name],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"[WARN] Could not inspect '{container_name}': {e}")
        return 0

    if result.returncode != 0:
        print(
            f"[WARN] docker inspect failed for '{container_name}': "
            f"{result.stderr.strip()}"
        )
        return 0

    try:
        return int(result.stdout.strip())
    except ValueError:
        print(
            f"[WARN] Unexpected restart count for '{container_name}': "
            f"{result.stdout.strip()!r}"
        )
        return 0


def send_event(event_id, priority, message):
    """Send an event to the aggregator with heartbeat TTL.

    POSTs ``{id, priority, message, ttl}`` as JSON to
    ``{AGGREGATOR_URL}/event``. The TTL is twice ``CHECK_INTERVAL``.
    NOTE(review): presumably the aggregator expires the event once the TTL
    elapses without a refresh — confirm against the aggregator.

    Args:
        event_id: Identifier of the event.
        priority: Priority value forwarded to the aggregator.
        message: Human-readable description of the event.

    Request errors are logged and swallowed so the monitor loop keeps running.
    """
    ttl = CHECK_INTERVAL * 2
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/event",
            json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
            timeout=5,
        )
        print(
            f"[EVENT] {event_id}: {message} "
            f"(priority {priority}, ttl {ttl}s) -> {response.status_code}"
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")


def clear_event(event_id):
    """Clear an event from the aggregator.

    POSTs ``{id}`` as JSON to ``{AGGREGATOR_URL}/clear`` and logs a
    ``[CLEAR]`` line when the aggregator answers with HTTP 200.

    Args:
        event_id: Identifier of the event to clear.

    Request errors are logged and swallowed so the monitor loop keeps running.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
        if response.status_code == 200:
            print(f"[CLEAR] {event_id}")
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")


def _state_from_status(status):
    """Derive a container state from the human-readable ``Status`` text.

    Used only when the ``State`` field is absent from the ``docker ps``
    output. Returns "" when the first word of *status* is not recognised.
    """
    words = status.split(None, 1)
    if not words:
        return ""
    return _STATUS_PREFIX_STATES.get(words[0].lower(), "")


def _check_container(container, filter_containers):
    """Evaluate one ``docker ps`` entry and send an event if it needs one.

    Args:
        container: Dict parsed from one line of ``docker ps`` JSON output.
        filter_containers: Set of lower-cased names to monitor, or ``None``
            (or empty) to monitor everything.

    Returns:
        The event id an alert was sent for, or ``None`` when the container
        is filtered out or considered healthy.
    """
    # ``or`` guards against keys that are present but null.
    name = container.get("Names") or "unknown"
    status = container.get("Status") or ""
    state = (container.get("State") or "").lower()
    if not state:
        # NOTE(review): some docker CLI versions may not emit "State" in
        # `docker ps` JSON; fall back to the Status text so restarting or
        # exited containers are not silently reported as OK.
        state = _state_from_status(status)

    if filter_containers and name.lower() not in filter_containers:
        return None

    event_id = f"docker_{name.replace('/', '_')}"

    if state == "restarting":
        restart_count = get_restart_count(name)
        if restart_count >= RESTART_THRESHOLD:
            send_event(
                event_id,
                _PRIORITY_RESTART_LOOP,
                f"Container '{name}' restart loop ({restart_count}x)",
            )
        else:
            send_event(
                event_id,
                _PRIORITY_WARNING,
                f"Container '{name}' restarting ({restart_count}x)",
            )
        return event_id

    if state in ("exited", "dead"):
        # Only alert if it exited abnormally (non-zero exit code in status).
        if "Exited (0)" in status:
            print(f"[OK] Container '{name}' exited cleanly")
            return None
        send_event(event_id, _PRIORITY_WARNING, f"Container '{name}' {state}")
        return event_id

    if "unhealthy" in status.lower():
        send_event(event_id, _PRIORITY_WARNING, f"Container '{name}' unhealthy")
        return event_id

    print(f"[OK] Container '{name}' is {state}")
    return None


def main():
    """Run the monitoring loop forever.

    Every ``CHECK_INTERVAL`` seconds the container list is fetched, an event
    is (re-)sent for each problematic container, and events raised in the
    previous cycle that no longer apply are cleared. A cycle in which the
    container list cannot be fetched is skipped without touching the set of
    active alerts.
    """
    # Parse container filter.
    filter_containers = None
    if CONTAINERS:
        filter_containers = {
            s.strip().lower() for s in CONTAINERS.split(",") if s.strip()
        }

    print("Docker Container Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Restart threshold: {RESTART_THRESHOLD}")
    if filter_containers:
        # Sorted so the startup banner is stable across runs.
        print(f" Monitoring: {', '.join(sorted(filter_containers))}")
    else:
        print(" Monitoring: all containers")
    print()

    active_alerts = set()

    while True:
        containers = get_container_status()

        if containers is None:
            print("[WARN] Could not fetch container status, skipping check")
            time.sleep(CHECK_INTERVAL)
            continue

        current_alerts = set()
        for container in containers:
            event_id = _check_container(container, filter_containers)
            if event_id is not None:
                current_alerts.add(event_id)

        # Clear alerts for containers that are now healthy.
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)

        active_alerts = current_alerts

        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()