Bump to v1.5.0: deduplicate detectors, fix aggregator bugs, fix blocking I/O

- Extract shared send_event/clear_event into detectors/base.py, removing ~150 lines of duplication across all 6 detectors
- Fix default aggregator URL from port 5000 to 5100 in all detectors
- Standardize cpu.py and memory.py to use active_alerts set pattern
- Fix immediate emote rotation on startup (last_emote_change = time.time())
- Extract magic numbers to named constants in aggregator
- Protect write_status() with try/except OSError
- Fix notify event ID collision with monotonic counter
- Replace blocking stream_output() with background daemon threads in kao.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 12:17:17 -06:00
parent c3ceb74ce8
commit dd8bf6005b
12 changed files with 126 additions and 236 deletions
--- a/detectors/docker.py
+++ b/detectors/docker.py
@@ -3,7 +3,7 @@ Docker Container Health Detector
 Monitors for containers stuck in restart loops or unhealthy states.

 Environment variables:
-    AGGREGATOR_URL      - URL of the aggregator (default: http://localhost:5000)
+    AGGREGATOR_URL      - URL of the aggregator (default: http://localhost:5100)
    CHECK_INTERVAL      - Seconds between checks (default: 60)
    RESTART_THRESHOLD   - Number of restarts to consider a loop (default: 3)
    CONTAINERS          - Comma-separated container names to monitor (optional, monitors all if empty)
@@ -13,10 +13,11 @@ import json
 import os
 import subprocess
 import time
-import requests
+
+from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event

 # Configuration from environment
-AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
+AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
 CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
 RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))
 CONTAINERS = os.environ.get("CONTAINERS", "")
@@ -70,34 +71,6 @@ def get_restart_count(container_name):
    return 0


-def send_event(event_id, priority, message):
-    """Send an event to the aggregator with heartbeat TTL."""
-    ttl = CHECK_INTERVAL * 2
-    try:
-        response = requests.post(
-            f"{AGGREGATOR_URL}/event",
-            json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
-            timeout=5
-        )
-        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
-    except requests.RequestException as e:
-        print(f"[ERROR] Failed to send event: {e}")
-
-
-def clear_event(event_id):
-    """Clear an event from the aggregator."""
-    try:
-        response = requests.post(
-            f"{AGGREGATOR_URL}/clear",
-            json={"id": event_id},
-            timeout=5
-        )
-        if response.status_code == 200:
-            print(f"[CLEAR] {event_id}")
-    except requests.RequestException as e:
-        print(f"[ERROR] Failed to clear event: {e}")
-
-
 def main():
    # Parse container filter
    filter_containers = None
@@ -148,10 +121,10 @@ def main():
                last_restart_counts[name] = restart_count

                if state == "restarting" or new_restarts >= RESTART_THRESHOLD:
-                    send_event(event_id, 1, f"Container '{name}' restart loop ({restart_count}x)")
+                    send_event(AGGREGATOR_URL, event_id, 1, f"Container '{name}' restart loop ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                elif new_restarts > 0:
-                    send_event(event_id, 2, f"Container '{name}' restarting ({restart_count}x)")
+                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' restarting ({restart_count}x)", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' is {state}")
@@ -160,14 +133,14 @@ def main():
            elif state in ("exited", "dead"):
                # Only alert if it exited abnormally (non-zero exit code in status)
                if "Exited (0)" not in status:
-                    send_event(event_id, 2, f"Container '{name}' {state}")
+                    send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' {state}", CHECK_INTERVAL)
                    current_alerts.add(event_id)
                else:
                    print(f"[OK] Container '{name}' exited cleanly")

            # Check for unhealthy containers
            elif "unhealthy" in status.lower():
-                send_event(event_id, 2, f"Container '{name}' unhealthy")
+                send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
                current_alerts.add(event_id)

            else:
@@ -175,7 +148,7 @@ def main():

        # Clear alerts for containers that are now healthy
        for event_id in active_alerts - current_alerts:
-            clear_event(event_id)
+            clear_event(AGGREGATOR_URL, event_id)

        active_alerts = current_alerts