Add Docker container health detector

Monitors for containers stuck in restart loops or unhealthy states:
- Critical: restart loop (≥3 restarts)
- Warning: restarting, exited abnormally, or unhealthy

Disabled by default in config.json.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-03 20:54:08 -06:00
parent b99ac96ffa
commit da6613ada3
2 changed files with 188 additions and 0 deletions

178
detectors/docker.py Normal file
View File

@@ -0,0 +1,178 @@
"""
Docker Container Health Detector
Monitors for containers stuck in restart loops or unhealthy states.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 60)
RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
"""
import json
import os
import subprocess
import time
import requests
# Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")  # base URL of the event aggregator service
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))  # seconds between polling cycles
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))  # restarts at/above this => critical restart loop
CONTAINERS = os.environ.get("CONTAINERS", "")  # optional comma-separated allowlist; empty => monitor all containers
def get_container_status():
    """List every container (running or not) via the Docker CLI.

    Returns:
        A list of dicts, one per container, parsed from
        ``docker ps -a --format '{{json .}}'`` (one JSON object per line),
        or ``None`` if the CLI is missing, times out, or fails.
    """
    try:
        proc = subprocess.run(
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        if proc.returncode != 0:
            print(f"[ERROR] Docker command failed: {proc.stderr}")
            return None
        # One JSON object per non-empty output line; a parse failure is
        # caught by the broad handler below and reported as a fetch error.
        return [json.loads(entry) for entry in proc.stdout.splitlines() if entry]
    except FileNotFoundError:
        print("[ERROR] Docker CLI not found")
        return None
    except subprocess.TimeoutExpired:
        print("[ERROR] Docker command timed out")
        return None
    except Exception as e:
        print(f"[ERROR] Failed to get container status: {e}")
        return None
def get_restart_count(container_name):
    """Return Docker's RestartCount for *container_name* (0 on any failure)."""
    cmd = ["docker", "inspect", "--format", "{{.RestartCount}}", container_name]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        if proc.returncode == 0:
            return int(proc.stdout.strip())
    except Exception:
        # Best effort: missing CLI, timeout, or unparseable output all
        # fall through to the default of zero.
        pass
    return 0
def send_event(event_id, priority, message):
    """POST an event to the aggregator.

    The TTL is set to twice the check interval so the alert expires on its
    own if this detector stops sending heartbeats.
    """
    payload = {
        "id": event_id,
        "priority": priority,
        "message": message,
        "ttl": CHECK_INTERVAL * 2,
    }
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {payload['ttl']}s) -> {resp.status_code}")
def clear_event(event_id):
    """Ask the aggregator to drop an active event; failures are logged only."""
    try:
        resp = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {event_id}")
def main():
    """Poll Docker on a fixed interval and raise/clear aggregator alerts.

    Each cycle: list all containers, send a critical alert for restart
    loops, a warning for restarting/abnormally-exited/unhealthy containers,
    and clear alerts for containers that recovered since the last cycle.
    Runs forever; intended to be launched as a long-lived process.
    """
    # Parse container filter
    filter_containers = None
    if CONTAINERS:
        # Lower-cased, stripped names for case-insensitive matching below.
        filter_containers = set(s.strip().lower() for s in CONTAINERS.split(",") if s.strip())
    print(f"Docker Container Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Restart threshold: {RESTART_THRESHOLD}")
    if filter_containers:
        print(f" Monitoring: {', '.join(filter_containers)}")
    else:
        print(f" Monitoring: all containers")
    print()
    # Event IDs alerted in the previous cycle; used to clear recovered ones.
    active_alerts = set()
    while True:
        containers = get_container_status()
        if containers is None:
            # Docker unreachable this cycle; existing aggregator events
            # expire via their TTL rather than being cleared here.
            print("[WARN] Could not fetch container status, skipping check")
            time.sleep(CHECK_INTERVAL)
            continue
        current_alerts = set()
        for container in containers:
            name = container.get("Names", "unknown")
            state = container.get("State", "").lower()
            status = container.get("Status", "")
            # Apply filter if specified
            if filter_containers and name.lower() not in filter_containers:
                continue
            # Stable per-container event ID ('/' can appear in Docker names).
            event_id = f"docker_{name.replace('/', '_')}"
            # Check for restarting state
            if state == "restarting":
                restart_count = get_restart_count(name)
                if restart_count >= RESTART_THRESHOLD:
                    # Priority 1 = critical: stuck in a restart loop.
                    send_event(event_id, 1, f"Container '{name}' restart loop ({restart_count}x)")
                    current_alerts.add(event_id)
                else:
                    # Priority 2 = warning: restarting but below loop threshold.
                    send_event(event_id, 2, f"Container '{name}' restarting ({restart_count}x)")
                    current_alerts.add(event_id)
            # Check for exited/dead containers (warning)
            elif state in ("exited", "dead"):
                # Only alert if it exited abnormally (non-zero exit code in status)
                if "Exited (0)" not in status:
                    send_event(event_id, 2, f"Container '{name}' {state}")
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' exited cleanly")
            # Check for unhealthy containers
            elif "unhealthy" in status.lower():
                send_event(event_id, 2, f"Container '{name}' unhealthy")
                current_alerts.add(event_id)
            else:
                print(f"[OK] Container '{name}' is {state}")
        # Clear alerts for containers that are now healthy
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)
        active_alerts = current_alerts
        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()