Bump to v1.5.0: deduplicate detectors, fix aggregator bugs, fix blocking I/O

- Extract shared send_event/clear_event into detectors/base.py, removing ~150 lines of duplication across all 6 detectors
- Fix default aggregator URL from port 5000 to 5100 in all detectors
- Standardize cpu.py and memory.py to use active_alerts set pattern
- Fix immediate emote rotation on startup (last_emote_change = time.time())
- Extract magic numbers to named constants in aggregator
- Protect write_status() with try/except OSError
- Fix notify event ID collision with monotonic counter
- Replace blocking stream_output() with background daemon threads in kao.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 12:17:17 -06:00
parent c3ceb74ce8
commit dd8bf6005b
12 changed files with 126 additions and 236 deletions
--- a/detectors/cpu.py
+++ b/detectors/cpu.py
@@ -3,7 +3,7 @@ CPU Usage Detector
 Monitors CPU usage and reports to the aggregator when thresholds are exceeded.

 Environment variables:
-    AGGREGATOR_URL      - URL of the aggregator (default: http://localhost:5000)
+    AGGREGATOR_URL      - URL of the aggregator (default: http://localhost:5100)
    CHECK_INTERVAL      - Seconds between checks (default: 30)
    THRESHOLD_CRITICAL  - Percent usage for critical alert (default: 95)
    THRESHOLD_WARNING   - Percent usage for warning alert (default: 85)
@@ -12,10 +12,11 @@ Environment variables:
 import os
 import time
 import psutil
-import requests
+
+from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event

 # Configuration from environment
-AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
+AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
 CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
 THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
 THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
@@ -23,34 +24,6 @@ THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
 EVENT_ID = "cpu_usage"


-def send_event(priority, message):
-    """Send an event to the aggregator with heartbeat TTL."""
-    ttl = CHECK_INTERVAL * 2
-    try:
-        response = requests.post(
-            f"{AGGREGATOR_URL}/event",
-            json={"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl},
-            timeout=5
-        )
-        print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
-    except requests.RequestException as e:
-        print(f"[ERROR] Failed to send event: {e}")
-
-
-def clear_event():
-    """Clear the event from the aggregator."""
-    try:
-        response = requests.post(
-            f"{AGGREGATOR_URL}/clear",
-            json={"id": EVENT_ID},
-            timeout=5
-        )
-        if response.status_code == 200:
-            print(f"[CLEAR] {EVENT_ID}")
-    except requests.RequestException as e:
-        print(f"[ERROR] Failed to clear event: {e}")
-
-
 def main():
    print(f"CPU Usage Detector started")
    print(f"  Aggregator: {AGGREGATOR_URL}")
@@ -58,23 +31,27 @@ def main():
    print(f"  Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

-    alert_active = False
+    active_alerts = set()

    while True:
        # Get CPU usage over a 1-second sample
        cpu_percent = psutil.cpu_percent(interval=1)
+        current_alerts = set()

        if cpu_percent >= THRESHOLD_CRITICAL:
-            send_event(1, f"CPU at {cpu_percent:.0f}%")
-            alert_active = True
+            send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
+            current_alerts.add(EVENT_ID)
        elif cpu_percent >= THRESHOLD_WARNING:
-            send_event(2, f"CPU at {cpu_percent:.0f}%")
-            alert_active = True
+            send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
+            current_alerts.add(EVENT_ID)
        else:
            print(f"[OK] CPU: {cpu_percent:.0f}%")
-            if alert_active:
-                clear_event()
-                alert_active = False
+
+        # Clear alerts that are no longer active
+        for eid in active_alerts - current_alerts:
+            clear_event(AGGREGATOR_URL, eid)
+
+        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL - 1)  # Account for 1s sample time