- Extract shared send_event/clear_event into detectors/base.py, removing ~150 lines of duplication across all 6 detectors - Fix default aggregator URL from port 5000 to 5100 in all detectors - Standardize cpu.py and memory.py to use active_alerts set pattern - Fix immediate emote rotation on startup (last_emote_change = time.time()) - Extract magic numbers to named constants in aggregator - Protect write_status() with try/except OSError - Fix notify event ID collision with monotonic counter - Replace blocking stream_output() with background daemon threads in kao.py Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
161 lines · 5.6 KiB · Python
"""
|
|
Docker Container Health Detector
|
|
Monitors for containers stuck in restart loops or unhealthy states.
|
|
|
|
Environment variables:
|
|
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
|
|
CHECK_INTERVAL - Seconds between checks (default: 60)
|
|
RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
|
|
CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import time
|
|
|
|
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
|
|
|
|
# Configuration from environment (see module docstring for details)
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)  # aggregator endpoint URL
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))  # seconds between polls
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))  # restart-count increase between checks that flags a loop
CONTAINERS = os.environ.get("CONTAINERS", "")  # comma-separated container names; empty = monitor all
|
|
|
|
|
|
def get_container_status():
    """Return a list of container-info dicts from ``docker ps -a``, or None on failure.

    Each dict is one parsed line of ``docker ps -a --format '{{json .}}'``
    output (keys such as "Names", "State", "Status").

    Returns:
        list[dict]: one entry per container (possibly empty), or
        None when the docker CLI is missing, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except FileNotFoundError:
        print("[ERROR] Docker CLI not found")
        return None
    except subprocess.TimeoutExpired:
        print("[ERROR] Docker command timed out")
        return None
    except Exception as e:
        print(f"[ERROR] Failed to get container status: {e}")
        return None

    if result.returncode != 0:
        print(f"[ERROR] Docker command failed: {result.stderr}")
        return None

    containers = []
    for line in result.stdout.strip().split('\n'):
        if not line:
            continue
        try:
            containers.append(json.loads(line))
        except json.JSONDecodeError as e:
            # One garbled line must not discard the whole snapshot
            # (previously any parse error fell into the broad except
            # and returned None, skipping the entire check cycle).
            print(f"[WARN] Skipping unparseable docker ps line: {e}")

    return containers
|
|
|
|
|
|
def get_restart_count(container_name):
    """Return the cumulative restart count for *container_name*.

    Uses ``docker inspect --format '{{.RestartCount}}'``. Best-effort:
    returns 0 on any failure (missing CLI, unknown container, timeout,
    unparseable output) so a flaky docker daemon never kills the detector.

    Args:
        container_name: container name or ID as shown by ``docker ps``.

    Returns:
        int: restart count, or 0 when it cannot be determined.
    """
    try:
        result = subprocess.run(
            ["docker", "inspect", "--format", "{{.RestartCount}}", container_name],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0:
            return int(result.stdout.strip())
    # Narrowed from a bare `except Exception`: OSError covers a missing
    # docker binary, SubprocessError covers timeouts, ValueError covers
    # non-numeric inspect output. Anything else should surface loudly.
    except (OSError, subprocess.SubprocessError, ValueError):
        pass
    return 0
|
|
|
|
|
|
def main():
    """Poll Docker container state forever, forwarding alerts to the aggregator.

    Each cycle: snapshot all containers, raise severity-1 events for restart
    loops, severity-2 events for unexpected exits / restarts / unhealthy
    containers, and clear events for containers that recovered.
    """
    # Parse optional comma-separated container-name filter (case-insensitive).
    filter_containers = None
    if CONTAINERS:
        filter_containers = {s.strip().lower() for s in CONTAINERS.split(",") if s.strip()}

    print("Docker Container Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Restart threshold: {RESTART_THRESHOLD}")
    if filter_containers:
        print(f" Monitoring: {', '.join(filter_containers)}")
    else:
        print(" Monitoring: all containers")
    print()

    active_alerts = set()       # event_ids currently raised at the aggregator
    last_restart_counts = {}    # container name -> RestartCount at last check

    while True:
        containers = get_container_status()

        if containers is None:
            print("[WARN] Could not fetch container status, skipping check")
            time.sleep(CHECK_INTERVAL)
            continue

        current_alerts = set()
        seen_names = set()

        for container in containers:
            name = container.get("Names", "unknown")
            state = container.get("State", "").lower()
            status = container.get("Status", "")

            # Apply filter if specified
            if filter_containers and name.lower() not in filter_containers:
                continue
            seen_names.add(name)

            event_id = f"docker_{name.replace('/', '_')}"

            if state in ("running", "restarting"):
                # The "restarting" state is too transient to catch reliably,
                # so we also track RestartCount increases between checks.
                restart_count = get_restart_count(name)
                prev_count = last_restart_counts.get(name, restart_count)
                new_restarts = restart_count - prev_count
                last_restart_counts[name] = restart_count

                if state == "restarting" or new_restarts >= RESTART_THRESHOLD:
                    send_event(AGGREGATOR_URL, event_id, 1, f"Container '{name}' restart loop ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                elif new_restarts > 0:
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' restarting ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                # BUGFIX: unhealthy containers report state "running" (health
                # shows only in Status, e.g. "Up 5 minutes (unhealthy)"), so
                # the old trailing elif never fired — check health here.
                elif "unhealthy" in status.lower():
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' is {state}")

            # Check for exited/dead containers (warning)
            elif state in ("exited", "dead"):
                # Only alert if it exited abnormally (non-zero exit code in status)
                if "Exited (0)" not in status:
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' {state}", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' exited cleanly")

            # Unhealthy in any other state (e.g. "paused")
            elif "unhealthy" in status.lower():
                send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
                current_alerts.add(event_id)

            else:
                print(f"[OK] Container '{name}' is {state}")

        # Drop restart-count state for containers that no longer exist so
        # the dict doesn't grow without bound as containers come and go.
        for gone in set(last_restart_counts) - seen_names:
            del last_restart_counts[gone]

        # Clear alerts for containers that are now healthy
        for event_id in active_alerts - current_alerts:
            clear_event(AGGREGATOR_URL, event_id)

        active_alerts = current_alerts

        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()
|