Bump to v1.5.0: deduplicate detectors, fix aggregator bugs, fix blocking I/O

- Extract shared send_event/clear_event into detectors/base.py, removing
  ~150 lines of duplication across all 6 detectors
- Fix default aggregator URL from port 5000 to 5100 in all detectors
- Standardize cpu.py and memory.py to use active_alerts set pattern
- Fix immediate emote rotation on startup (last_emote_change = time.time())
- Extract magic numbers to named constants in aggregator
- Protect write_status() with try/except OSError
- Fix notify event ID collision with monotonic counter
- Replace blocking stream_output() with background daemon threads in kao.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-06 12:17:17 -06:00
parent c3ceb74ce8
commit dd8bf6005b
12 changed files with 126 additions and 236 deletions

View File

@@ -3,7 +3,7 @@ Network/Ping Detector
Monitors if hosts are reachable via ping.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 60)
HOSTS - Comma-separated list of hosts to ping (required)
Example: "8.8.8.8,google.com,192.168.1.1"
@@ -15,10 +15,11 @@ import sys
import time
import platform
import subprocess
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
HOSTS = os.environ.get("HOSTS", "")
TIMEOUT = int(os.environ.get("TIMEOUT", 5))
@@ -44,34 +45,6 @@ def ping(host):
return False
def send_event(event_id, priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event(event_id):
"""Clear the event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": event_id},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {event_id}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main():
if not HOSTS:
print("ERROR: HOSTS environment variable is required")
@@ -99,12 +72,12 @@ def main():
if ping(host):
print(f"[OK] Host '{host}' is reachable")
else:
send_event(event_id, 1, f"Host '{host}' is unreachable")
send_event(AGGREGATOR_URL, event_id, 1, f"Host '{host}' is unreachable", CHECK_INTERVAL)
current_alerts.add(event_id)
# Clear alerts for hosts that are now reachable
for event_id in active_alerts - current_alerts:
clear_event(event_id)
clear_event(AGGREGATOR_URL, event_id)
active_alerts = current_alerts