Bump to v1.5.0: deduplicate detectors, fix aggregator bugs, fix blocking I/O

- Extract shared send_event/clear_event into detectors/base.py, removing
  ~150 lines of duplication across all 6 detectors
- Fix default aggregator URL from port 5000 to 5100 in all detectors
- Standardize cpu.py and memory.py to use active_alerts set pattern
- Fix immediate emote rotation on startup (last_emote_change = time.time())
- Extract magic numbers to named constants in aggregator
- Protect write_status() with try/except OSError
- Fix notify event ID collision with monotonic counter
- Replace blocking stream_output() with background daemon threads in kao.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-06 12:17:17 -06:00
parent c3ceb74ce8
commit dd8bf6005b
12 changed files with 126 additions and 236 deletions

View File

@@ -180,6 +180,7 @@ Use in automations:
├── config.json # Runtime configuration ├── config.json # Runtime configuration
├── openapi.yaml # API documentation (OpenAPI 3.0) ├── openapi.yaml # API documentation (OpenAPI 3.0)
├── detectors/ ├── detectors/
│ ├── base.py
│ ├── disk_space.py │ ├── disk_space.py
│ ├── cpu.py │ ├── cpu.py
│ ├── memory.py │ ├── memory.py

View File

@@ -19,6 +19,9 @@ ROOT_DIR = Path(__file__).parent
STATUS_FILE = Path(__file__).parent / "status.json" STATUS_FILE = Path(__file__).parent / "status.json"
DEFAULT_NOTIFY_TTL = 10 # Default TTL for Priority 3 (Notify) events DEFAULT_NOTIFY_TTL = 10 # Default TTL for Priority 3 (Notify) events
CELEBRATION_DURATION = 5 # Seconds to show celebration after recovery CELEBRATION_DURATION = 5 # Seconds to show celebration after recovery
EMOTE_ROTATION_INTERVAL = 300 # Seconds between emote rotations
IDLE_EXPRESSION_CHANCE = 0.15 # Chance of a brief blink/wink on rotation
DEFAULT_NOTIFY_DURATION = 5 # Default duration for /notify events
# Emote variations with paired animations # Emote variations with paired animations
OPTIMAL_EMOTES = [ OPTIMAL_EMOTES = [
@@ -53,10 +56,13 @@ celebrating_until = 0
blinking_until = 0 blinking_until = 0
blink_emote = None blink_emote = None
blink_animation = None blink_animation = None
last_emote_change = 0 last_emote_change = time.time()
current_optimal_emote = OPTIMAL_EMOTES[0][0] current_optimal_emote = OPTIMAL_EMOTES[0][0]
current_optimal_animation = OPTIMAL_EMOTES[0][1] current_optimal_animation = OPTIMAL_EMOTES[0][1]
# Notify counter for unique IDs
_notify_counter = 0
# Sleep mode # Sleep mode
is_sleeping = False is_sleeping = False
SLEEP_EMOTE = "( -_-)zzZ" SLEEP_EMOTE = "( -_-)zzZ"
@@ -134,11 +140,11 @@ def get_current_state():
animation = blink_animation animation = blink_animation
else: else:
# Rotate optimal emotes every 5 minutes # Rotate optimal emotes every 5 minutes
if now - last_emote_change > 300: if now - last_emote_change > EMOTE_ROTATION_INTERVAL:
last_emote_change = now last_emote_change = now
current_optimal_emote, current_optimal_animation = random.choice(OPTIMAL_EMOTES) current_optimal_emote, current_optimal_animation = random.choice(OPTIMAL_EMOTES)
# 15% chance of a brief blink/wink # Brief blink/wink chance on rotation
if random.random() < 0.15: if random.random() < IDLE_EXPRESSION_CHANCE:
blink_emote, blink_animation = random.choice(IDLE_EMOTES) blink_emote, blink_animation = random.choice(IDLE_EMOTES)
blinking_until = now + random.uniform(1, 2) blinking_until = now + random.uniform(1, 2)
emote = current_optimal_emote emote = current_optimal_emote
@@ -163,8 +169,11 @@ def get_current_state():
def write_status(): def write_status():
"""Write current state to status.json.""" """Write current state to status.json."""
state = get_current_state() state = get_current_state()
try:
with open(STATUS_FILE, "w") as f: with open(STATUS_FILE, "w") as f:
json.dump(state, f, indent="\t") json.dump(state, f, indent="\t")
except OSError as e:
print(f"[ERROR] Failed to write status file: {e}")
return state return state
@@ -272,12 +281,14 @@ def notify():
"sound": "chime" # optional: chime, alert, warning, critical, success, none "sound": "chime" # optional: chime, alert, warning, critical, success, none
} }
""" """
global _notify_counter
data = request.get_json(force=True) if request.data else {} data = request.get_json(force=True) if request.data else {}
message = data.get("message", "") message = data.get("message", "")
duration = int(data.get("duration", 5)) duration = int(data.get("duration", DEFAULT_NOTIFY_DURATION))
# Generate unique ID to avoid conflicts # Generate unique ID to avoid conflicts
event_id = f"ha_notify_{int(time.time() * 1000)}" _notify_counter += 1
event_id = f"notify_{int(time.time())}_{_notify_counter}"
event = { event = {
"priority": 3, # Notify priority "priority": 3, # Notify priority

35
detectors/base.py Normal file
View File

@@ -0,0 +1,35 @@
"""
Shared utilities for Kao detectors.
"""
import requests
DEFAULT_AGGREGATOR_URL = "http://localhost:5100"
def send_event(url: str, event_id: str, priority: int, message: str, check_interval: int) -> None:
    """Send (or refresh) an event on the aggregator with a heartbeat TTL.

    Args:
        url: Base URL of the aggregator (no trailing path), e.g. "http://localhost:5100".
        event_id: Stable identifier for this alert; re-sending the same id refreshes it.
        priority: Alert priority (1 = critical, 2 = warning per the detectors' usage).
        message: Human-readable alert text shown by the aggregator.
        check_interval: Detector polling period in seconds; the event TTL is set to
            twice this so the event auto-expires if the detector stops reporting.

    Network failures are logged and swallowed so a detector loop never dies
    because the aggregator is briefly unreachable.
    """
    # TTL = 2x the check interval: one missed heartbeat is tolerated, two expire the event.
    ttl = check_interval * 2
    try:
        response = requests.post(
            f"{url}/event",
            json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
            timeout=5,
        )
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
    except requests.RequestException as e:
        # Best-effort delivery: log and continue; the caller's loop keeps running.
        print(f"[ERROR] Failed to send event: {e}")
def clear_event(url: str, event_id: str) -> None:
    """Clear a previously-sent event from the aggregator.

    Args:
        url: Base URL of the aggregator, e.g. "http://localhost:5100".
        event_id: Identifier of the event to remove.

    Only a 200 response is logged as a successful clear; other status codes are
    silently ignored (NOTE(review): a non-200 clear is not reported — confirm
    that is intentional). Network failures are logged and swallowed so detector
    loops survive a temporarily unreachable aggregator.
    """
    try:
        response = requests.post(
            f"{url}/clear",
            json={"id": event_id},
            timeout=5,
        )
        if response.status_code == 200:
            print(f"[CLEAR] {event_id}")
    except requests.RequestException as e:
        # Best-effort cleanup: log and continue.
        print(f"[ERROR] Failed to clear event: {e}")

View File

@@ -3,7 +3,7 @@ CPU Usage Detector
Monitors CPU usage and reports to the aggregator when thresholds are exceeded. Monitors CPU usage and reports to the aggregator when thresholds are exceeded.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 30) CHECK_INTERVAL - Seconds between checks (default: 30)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95) THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85) THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
@@ -12,10 +12,11 @@ Environment variables:
import os import os
import time import time
import psutil import psutil
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95)) THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85)) THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
@@ -23,34 +24,6 @@ THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
EVENT_ID = "cpu_usage" EVENT_ID = "cpu_usage"
def send_event(priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event():
"""Clear the event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": EVENT_ID},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {EVENT_ID}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
print(f"CPU Usage Detector started") print(f"CPU Usage Detector started")
print(f" Aggregator: {AGGREGATOR_URL}") print(f" Aggregator: {AGGREGATOR_URL}")
@@ -58,23 +31,27 @@ def main():
print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%") print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
print() print()
alert_active = False active_alerts = set()
while True: while True:
# Get CPU usage over a 1-second sample # Get CPU usage over a 1-second sample
cpu_percent = psutil.cpu_percent(interval=1) cpu_percent = psutil.cpu_percent(interval=1)
current_alerts = set()
if cpu_percent >= THRESHOLD_CRITICAL: if cpu_percent >= THRESHOLD_CRITICAL:
send_event(1, f"CPU at {cpu_percent:.0f}%") send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
alert_active = True current_alerts.add(EVENT_ID)
elif cpu_percent >= THRESHOLD_WARNING: elif cpu_percent >= THRESHOLD_WARNING:
send_event(2, f"CPU at {cpu_percent:.0f}%") send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
alert_active = True current_alerts.add(EVENT_ID)
else: else:
print(f"[OK] CPU: {cpu_percent:.0f}%") print(f"[OK] CPU: {cpu_percent:.0f}%")
if alert_active:
clear_event() # Clear alerts that are no longer active
alert_active = False for eid in active_alerts - current_alerts:
clear_event(AGGREGATOR_URL, eid)
active_alerts = current_alerts
time.sleep(CHECK_INTERVAL - 1) # Account for 1s sample time time.sleep(CHECK_INTERVAL - 1) # Account for 1s sample time

View File

@@ -3,7 +3,7 @@ Disk Space Detector
Monitors all drives and reports to the aggregator when thresholds are exceeded. Monitors all drives and reports to the aggregator when thresholds are exceeded.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 300) CHECK_INTERVAL - Seconds between checks (default: 300)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95) THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85) THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
@@ -12,10 +12,11 @@ Environment variables:
import os import os
import time import time
import shutil import shutil
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 300)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 300))
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95)) THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85)) THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
@@ -85,34 +86,6 @@ def check_disk(drive):
return None, None, None return None, None, None
def send_event(event_id, priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2 # Event expires if not refreshed
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event(event_id):
"""Clear an event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": event_id},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {event_id}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
print(f"Disk Space Detector started") print(f"Disk Space Detector started")
print(f" Aggregator: {AGGREGATOR_URL}") print(f" Aggregator: {AGGREGATOR_URL}")
@@ -139,18 +112,18 @@ def main():
if percent >= THRESHOLD_CRITICAL: if percent >= THRESHOLD_CRITICAL:
message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)" message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"
send_event(event_id, 1, message) send_event(AGGREGATOR_URL, event_id, 1, message, CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
elif percent >= THRESHOLD_WARNING: elif percent >= THRESHOLD_WARNING:
message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)" message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"
send_event(event_id, 2, message) send_event(AGGREGATOR_URL, event_id, 2, message, CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
else: else:
print(f"[OK] {drive}: {percent:.0f}%") print(f"[OK] {drive}: {percent:.0f}%")
# Clear alerts that are no longer active # Clear alerts that are no longer active
for event_id in active_alerts - current_alerts: for event_id in active_alerts - current_alerts:
clear_event(event_id) clear_event(AGGREGATOR_URL, event_id)
active_alerts = current_alerts active_alerts = current_alerts

View File

@@ -3,7 +3,7 @@ Docker Container Health Detector
Monitors for containers stuck in restart loops or unhealthy states. Monitors for containers stuck in restart loops or unhealthy states.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 60) CHECK_INTERVAL - Seconds between checks (default: 60)
RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3) RESTART_THRESHOLD - Number of restarts to consider a loop (default: 3)
CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty) CONTAINERS - Comma-separated container names to monitor (optional, monitors all if empty)
@@ -13,10 +13,11 @@ import json
import os import os
import subprocess import subprocess
import time import time
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3)) RESTART_THRESHOLD = int(os.environ.get("RESTART_THRESHOLD", 3))
CONTAINERS = os.environ.get("CONTAINERS", "") CONTAINERS = os.environ.get("CONTAINERS", "")
@@ -70,34 +71,6 @@ def get_restart_count(container_name):
return 0 return 0
def send_event(event_id, priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event(event_id):
"""Clear an event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": event_id},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {event_id}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
# Parse container filter # Parse container filter
filter_containers = None filter_containers = None
@@ -148,10 +121,10 @@ def main():
last_restart_counts[name] = restart_count last_restart_counts[name] = restart_count
if state == "restarting" or new_restarts >= RESTART_THRESHOLD: if state == "restarting" or new_restarts >= RESTART_THRESHOLD:
send_event(event_id, 1, f"Container '{name}' restart loop ({restart_count}x)") send_event(AGGREGATOR_URL, event_id, 1, f"Container '{name}' restart loop ({restart_count}x)", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
elif new_restarts > 0: elif new_restarts > 0:
send_event(event_id, 2, f"Container '{name}' restarting ({restart_count}x)") send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' restarting ({restart_count}x)", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
else: else:
print(f"[OK] Container '{name}' is {state}") print(f"[OK] Container '{name}' is {state}")
@@ -160,14 +133,14 @@ def main():
elif state in ("exited", "dead"): elif state in ("exited", "dead"):
# Only alert if it exited abnormally (non-zero exit code in status) # Only alert if it exited abnormally (non-zero exit code in status)
if "Exited (0)" not in status: if "Exited (0)" not in status:
send_event(event_id, 2, f"Container '{name}' {state}") send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' {state}", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
else: else:
print(f"[OK] Container '{name}' exited cleanly") print(f"[OK] Container '{name}' exited cleanly")
# Check for unhealthy containers # Check for unhealthy containers
elif "unhealthy" in status.lower(): elif "unhealthy" in status.lower():
send_event(event_id, 2, f"Container '{name}' unhealthy") send_event(AGGREGATOR_URL, event_id, 2, f"Container '{name}' unhealthy", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
else: else:
@@ -175,7 +148,7 @@ def main():
# Clear alerts for containers that are now healthy # Clear alerts for containers that are now healthy
for event_id in active_alerts - current_alerts: for event_id in active_alerts - current_alerts:
clear_event(event_id) clear_event(AGGREGATOR_URL, event_id)
active_alerts = current_alerts active_alerts = current_alerts

View File

@@ -3,7 +3,7 @@ Memory Usage Detector
Monitors RAM usage and reports to the aggregator when thresholds are exceeded. Monitors RAM usage and reports to the aggregator when thresholds are exceeded.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 30) CHECK_INTERVAL - Seconds between checks (default: 30)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95) THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85) THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
@@ -12,10 +12,11 @@ Environment variables:
import os import os
import time import time
import psutil import psutil
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95)) THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85)) THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
@@ -23,34 +24,6 @@ THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
EVENT_ID = "memory_usage" EVENT_ID = "memory_usage"
def send_event(priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event():
"""Clear the event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": EVENT_ID},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {EVENT_ID}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
print(f"Memory Usage Detector started") print(f"Memory Usage Detector started")
print(f" Aggregator: {AGGREGATOR_URL}") print(f" Aggregator: {AGGREGATOR_URL}")
@@ -58,25 +31,29 @@ def main():
print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%") print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
print() print()
alert_active = False active_alerts = set()
while True: while True:
mem = psutil.virtual_memory() mem = psutil.virtual_memory()
mem_percent = mem.percent mem_percent = mem.percent
used_gb = mem.used / (1024 ** 3) used_gb = mem.used / (1024 ** 3)
total_gb = mem.total / (1024 ** 3) total_gb = mem.total / (1024 ** 3)
current_alerts = set()
if mem_percent >= THRESHOLD_CRITICAL: if mem_percent >= THRESHOLD_CRITICAL:
send_event(1, f"Memory at {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)") send_event(AGGREGATOR_URL, EVENT_ID, 1, f"Memory at {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)", CHECK_INTERVAL)
alert_active = True current_alerts.add(EVENT_ID)
elif mem_percent >= THRESHOLD_WARNING: elif mem_percent >= THRESHOLD_WARNING:
send_event(2, f"Memory at {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)") send_event(AGGREGATOR_URL, EVENT_ID, 2, f"Memory at {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)", CHECK_INTERVAL)
alert_active = True current_alerts.add(EVENT_ID)
else: else:
print(f"[OK] Memory: {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)") print(f"[OK] Memory: {mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)")
if alert_active:
clear_event() # Clear alerts that are no longer active
alert_active = False for eid in active_alerts - current_alerts:
clear_event(AGGREGATOR_URL, eid)
active_alerts = current_alerts
time.sleep(CHECK_INTERVAL) time.sleep(CHECK_INTERVAL)

View File

@@ -3,7 +3,7 @@ Network/Ping Detector
Monitors if hosts are reachable via ping. Monitors if hosts are reachable via ping.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 60) CHECK_INTERVAL - Seconds between checks (default: 60)
HOSTS - Comma-separated list of hosts to ping (required) HOSTS - Comma-separated list of hosts to ping (required)
Example: "8.8.8.8,google.com,192.168.1.1" Example: "8.8.8.8,google.com,192.168.1.1"
@@ -15,10 +15,11 @@ import sys
import time import time
import platform import platform
import subprocess import subprocess
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
HOSTS = os.environ.get("HOSTS", "") HOSTS = os.environ.get("HOSTS", "")
TIMEOUT = int(os.environ.get("TIMEOUT", 5)) TIMEOUT = int(os.environ.get("TIMEOUT", 5))
@@ -44,34 +45,6 @@ def ping(host):
return False return False
def send_event(event_id, priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event(event_id):
"""Clear the event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": event_id},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {event_id}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
if not HOSTS: if not HOSTS:
print("ERROR: HOSTS environment variable is required") print("ERROR: HOSTS environment variable is required")
@@ -99,12 +72,12 @@ def main():
if ping(host): if ping(host):
print(f"[OK] Host '{host}' is reachable") print(f"[OK] Host '{host}' is reachable")
else: else:
send_event(event_id, 1, f"Host '{host}' is unreachable") send_event(AGGREGATOR_URL, event_id, 1, f"Host '{host}' is unreachable", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
# Clear alerts for hosts that are now reachable # Clear alerts for hosts that are now reachable
for event_id in active_alerts - current_alerts: for event_id in active_alerts - current_alerts:
clear_event(event_id) clear_event(AGGREGATOR_URL, event_id)
active_alerts = current_alerts active_alerts = current_alerts

View File

@@ -3,7 +3,7 @@ Service Health Detector
Monitors if specific processes/services are running. Monitors if specific processes/services are running.
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 30) CHECK_INTERVAL - Seconds between checks (default: 30)
SERVICES - Comma-separated list of process names to monitor (required) SERVICES - Comma-separated list of process names to monitor (required)
Example: "nginx,postgres,redis" Example: "nginx,postgres,redis"
@@ -13,10 +13,11 @@ import os
import sys import sys
import time import time
import psutil import psutil
import requests
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
# Configuration from environment # Configuration from environment
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30)) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
SERVICES = os.environ.get("SERVICES", "") SERVICES = os.environ.get("SERVICES", "")
@@ -37,34 +38,6 @@ def get_running_processes():
return running return running
def send_event(event_id, priority, message):
"""Send an event to the aggregator with heartbeat TTL."""
ttl = CHECK_INTERVAL * 2
try:
response = requests.post(
f"{AGGREGATOR_URL}/event",
json={"id": event_id, "priority": priority, "message": message, "ttl": ttl},
timeout=5
)
print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
except requests.RequestException as e:
print(f"[ERROR] Failed to send event: {e}")
def clear_event(event_id):
"""Clear the event from the aggregator."""
try:
response = requests.post(
f"{AGGREGATOR_URL}/clear",
json={"id": event_id},
timeout=5
)
if response.status_code == 200:
print(f"[CLEAR] {event_id}")
except requests.RequestException as e:
print(f"[ERROR] Failed to clear event: {e}")
def main(): def main():
if not SERVICES: if not SERVICES:
print("ERROR: SERVICES environment variable is required") print("ERROR: SERVICES environment variable is required")
@@ -90,14 +63,14 @@ def main():
event_id = f"service_{service}" event_id = f"service_{service}"
if service not in running: if service not in running:
send_event(event_id, 1, f"Service '{service}' is not running") send_event(AGGREGATOR_URL, event_id, 1, f"Service '{service}' is not running", CHECK_INTERVAL)
current_alerts.add(event_id) current_alerts.add(event_id)
else: else:
print(f"[OK] Service '{service}' is running") print(f"[OK] Service '{service}' is running")
# Clear alerts for services that are now running # Clear alerts for services that are now running
for event_id in active_alerts - current_alerts: for event_id in active_alerts - current_alerts:
clear_event(event_id) clear_event(AGGREGATOR_URL, event_id)
active_alerts = current_alerts active_alerts = current_alerts

View File

@@ -217,7 +217,7 @@
const emoteEl = document.getElementById("emote"); const emoteEl = document.getElementById("emote");
const messageEl = document.getElementById("message"); const messageEl = document.getElementById("message");
const POLL_INTERVAL = 2000; const POLL_INTERVAL = 2000;
const VERSION = "v1.4.0"; const VERSION = "v1.5.0";
// Sound system // Sound system
let audioCtx = null; let audioCtx = null;

21
kao.py
View File

@@ -11,6 +11,7 @@ import os
import signal import signal
import subprocess import subprocess
import sys import sys
import threading
import time import time
from pathlib import Path from pathlib import Path
@@ -59,11 +60,19 @@ class KaoManager:
universal_newlines=True, universal_newlines=True,
) )
print(f"[{name}] Started (PID {process.pid})") print(f"[{name}] Started (PID {process.pid})")
# Read output in a background thread to avoid blocking the main loop
thread = threading.Thread(target=self._read_output, args=(name, process), daemon=True)
thread.start()
return process return process
except Exception as e: except Exception as e:
print(f"[{name}] Failed to start: {e}") print(f"[{name}] Failed to start: {e}")
return None return None
def _read_output(self, name, process):
    """Read and print output from a process in a background thread.

    Runs as a daemon thread started alongside the child process; blocks on
    the child's stdout and echoes each line prefixed with the process name,
    then returns (ending the thread) when the child closes its stdout.
    Assumes the process was launched with a piped stdout in text mode, so
    ``process.stdout`` is a readable text stream — TODO confirm at call site.
    """
    for line in process.stdout:
        # rstrip() drops the trailing newline so the prefix formatting stays clean.
        print(f"[{name}] {line.rstrip()}")
def wait_for_aggregator(self, url, timeout=AGGREGATOR_STARTUP_TIMEOUT): def wait_for_aggregator(self, url, timeout=AGGREGATOR_STARTUP_TIMEOUT):
"""Wait for the aggregator to become available.""" """Wait for the aggregator to become available."""
print(f"[aggregator] Waiting for service at {url}...") print(f"[aggregator] Waiting for service at {url}...")
@@ -80,15 +89,6 @@ class KaoManager:
print(f"[aggregator] Timeout waiting for service") print(f"[aggregator] Timeout waiting for service")
return False return False
def stream_output(self, name, process):
"""Read and print output from a process (non-blocking)."""
if process.stdout:
while True:
line = process.stdout.readline()
if not line:
break
print(f"[{name}] {line.rstrip()}")
def get_aggregator_url(self): def get_aggregator_url(self):
"""Get aggregator URL from config port.""" """Get aggregator URL from config port."""
port = self.config.get("port", 5100) port = self.config.get("port", 5100)
@@ -135,9 +135,6 @@ class KaoManager:
for name, info in list(self.processes.items()): for name, info in list(self.processes.items()):
process = info["process"] process = info["process"]
# Stream any available output
self.stream_output(name, process)
# Check if process has exited # Check if process has exited
if process.poll() is not None: if process.poll() is not None:
print(f"[{name}] Exited with code {process.returncode}, restarting in {RESTART_DELAY}s...") print(f"[{name}] Exited with code {process.returncode}, restarting in {RESTART_DELAY}s...")

View File

@@ -14,7 +14,7 @@ info:
## TTL/Heartbeat Pattern ## TTL/Heartbeat Pattern
Events can have a TTL (time-to-live) that auto-expires them. Detectors typically send Events can have a TTL (time-to-live) that auto-expires them. Detectors typically send
heartbeat events that expire if not refreshed, indicating loss of communication. heartbeat events that expire if not refreshed, indicating loss of communication.
version: 1.4.0 version: 1.5.0
license: license:
name: MIT name: MIT