- Extract shared send_event/clear_event into detectors/base.py, removing ~150 lines of duplication across all 6 detectors - Fix default aggregator URL from port 5000 to 5100 in all detectors - Standardize cpu.py and memory.py to use active_alerts set pattern - Fix immediate emote rotation on startup (last_emote_change = time.time()) - Extract magic numbers to named constants in aggregator - Protect write_status() with try/except OSError - Fix notify event ID collision with monotonic counter - Replace blocking stream_output() with background daemon threads in kao.py Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
161 lines · 5.6 KiB · Python
"""
|
|
Docker Container Health Detector
|
|
Monitors for containers stuck in restart loops or unhealthy states.
|
|
|
|
Environment variables:
|
|
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
|
|
CHECK_INTERVAL - Seconds between checks (default: 60)
|
|
RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
|
|
CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import time
|
|
|
|
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
|
|
|
|
# Configuration from environment (see module docstring for details)
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)  # aggregator endpoint URL
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))  # seconds between polls
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))  # restart-count increase between checks that flags a loop
CONTAINERS = os.environ.get("CONTAINERS", "")  # comma-separated container names; empty = monitor all
|
|
|
|
|
|
def get_container_status():
    """Return a list of container-info dicts from ``docker ps -a``, or None on failure.

    Each dict is one parsed line of ``docker ps -a --format '{{json .}}'``
    output (keys such as "Names", "State", "Status").

    Returns:
        list[dict]: one entry per container (possibly empty), or
        None when the docker CLI is missing, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ["docker", "ps", "-a", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except FileNotFoundError:
        print("[ERROR] Docker CLI not found")
        return None
    except subprocess.TimeoutExpired:
        print("[ERROR] Docker command timed out")
        return None
    except Exception as e:
        print(f"[ERROR] Failed to get container status: {e}")
        return None

    if result.returncode != 0:
        print(f"[ERROR] Docker command failed: {result.stderr}")
        return None

    containers = []
    for line in result.stdout.strip().split('\n'):
        if not line:
            continue
        try:
            containers.append(json.loads(line))
        except json.JSONDecodeError as e:
            # One garbled line must not discard the whole snapshot
            # (previously any parse error fell into the broad except
            # and returned None, skipping the entire check cycle).
            print(f"[WARN] Skipping unparseable docker ps line: {e}")

    return containers
|
|
|
|
|
|
def get_restart_count(container_name):
    """Return the cumulative restart count for *container_name*.

    Uses ``docker inspect --format '{{.RestartCount}}'``. Best-effort:
    returns 0 on any failure (missing CLI, unknown container, timeout,
    unparseable output) so a flaky docker daemon never kills the detector.

    Args:
        container_name: container name or ID as shown by ``docker ps``.

    Returns:
        int: restart count, or 0 when it cannot be determined.
    """
    try:
        result = subprocess.run(
            ["docker", "inspect", "--format", "{{.RestartCount}}", container_name],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode == 0:
            return int(result.stdout.strip())
    # Narrowed from a bare `except Exception`: OSError covers a missing
    # docker binary, SubprocessError covers timeouts, ValueError covers
    # non-numeric inspect output. Anything else should surface loudly.
    except (OSError, subprocess.SubprocessError, ValueError):
        pass
    return 0
|
|
|
|
|
|
def main():
    """Poll Docker container state forever, forwarding alerts to the aggregator.

    Each cycle: snapshot all containers, raise severity-1 events for restart
    loops, severity-2 events for unexpected exits / restarts / unhealthy
    containers, and clear events for containers that recovered.
    """
    # Parse optional comma-separated container-name filter (case-insensitive).
    filter_containers = None
    if CONTAINERS:
        filter_containers = {s.strip().lower() for s in CONTAINERS.split(",") if s.strip()}

    print("Docker Container Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Restart threshold: {RESTART_THRESHOLD}")
    if filter_containers:
        print(f" Monitoring: {', '.join(filter_containers)}")
    else:
        print(" Monitoring: all containers")
    print()

    active_alerts = set()       # event_ids currently raised at the aggregator
    last_restart_counts = {}    # container name -> RestartCount at last check

    while True:
        containers = get_container_status()

        if containers is None:
            print("[WARN] Could not fetch container status, skipping check")
            time.sleep(CHECK_INTERVAL)
            continue

        current_alerts = set()
        seen_names = set()

        for container in containers:
            name = container.get("Names", "unknown")
            state = container.get("State", "").lower()
            status = container.get("Status", "")

            # Apply filter if specified
            if filter_containers and name.lower() not in filter_containers:
                continue
            seen_names.add(name)

            event_id = f"docker_{name.replace('/', '_')}"

            if state in ("running", "restarting"):
                # The "restarting" state is too transient to catch reliably,
                # so we also track RestartCount increases between checks.
                restart_count = get_restart_count(name)
                prev_count = last_restart_counts.get(name, restart_count)
                new_restarts = restart_count - prev_count
                last_restart_counts[name] = restart_count

                if state == "restarting" or new_restarts >= RESTART_THRESHOLD:
                    send_event(AGGREGATOR_URL, event_id, 1, f"Container '{name}' restart loop ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                elif new_restarts > 0:
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' restarting ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                # BUGFIX: unhealthy containers report state "running" (health
                # shows only in Status, e.g. "Up 5 minutes (unhealthy)"), so
                # the old trailing elif never fired — check health here.
                elif "unhealthy" in status.lower():
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' is {state}")

            # Check for exited/dead containers (warning)
            elif state in ("exited", "dead"):
                # Only alert if it exited abnormally (non-zero exit code in status)
                if "Exited (0)" not in status:
                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' {state}", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' exited cleanly")

            # Unhealthy in any other state (e.g. "paused")
            elif "unhealthy" in status.lower():
                send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
                current_alerts.add(event_id)

            else:
                print(f"[OK] Container '{name}' is {state}")

        # Drop restart-count state for containers that no longer exist so
        # the dict doesn't grow without bound as containers come and go.
        for gone in set(last_restart_counts) - seen_names:
            del last_restart_counts[gone]

        # Clear alerts for containers that are now healthy
        for event_id in active_alerts - current_alerts:
            clear_event(AGGREGATOR_URL, event_id)

        active_alerts = current_alerts

        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()
|