- Extract shared send_event/clear_event into detectors/base.py, removing ~150 lines of duplication across all 6 detectors - Fix default aggregator URL from port 5000 to 5100 in all detectors - Standardize cpu.py and memory.py to use active_alerts set pattern - Fix immediate emote rotation on startup (last_emote_change = time.time()) - Extract magic numbers to named constants in aggregator - Protect write_status() with try/except OSError - Fix notify event ID collision with monotonic counter - Replace blocking stream_output() with background daemon threads in kao.py Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
"""
|
|
Service Health Detector
|
|
Monitors whether specific processes/services are running.
|
|
|
|
Environment variables:
|
|
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
|
|
CHECK_INTERVAL - Seconds between checks (default: 30)
|
|
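SERVICES - Comma-separated list of process names to monitor (required).
           Names are matched case-insensitively, and a running process
           named "foo.exe" also matches the service name "foo".
           Example: "nginx,postgres,redis"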
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import psutil
|
|
|
|
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
|
|
|
|
# Runtime settings, read once from the environment at import time
AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))  # seconds between checks
SERVICES = os.getenv("SERVICES", "")  # raw comma-separated names; split in main()
|
|
|
|
|
|
def get_running_processes():
    """Return the lowercase names of all processes psutil can enumerate.

    A process whose name ends in ``.exe`` contributes two entries: the full
    lowercase name and the same name with the extension removed, so callers
    can look up either form. Processes with an empty or missing name are
    skipped.
    """
    names = set()
    for process in psutil.process_iter(['name']):
        try:
            raw_name = process.info['name']
            if not raw_name:
                continue
            lowered = raw_name.lower()
            names.add(lowered)
            # Also record the bare name so "foo" matches "foo.exe"
            if lowered.endswith('.exe'):
                names.add(lowered[:-4])
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Process vanished or is off-limits; ignore it and move on
            continue
    return names
|
|
|
|
|
|
def main():
    """Run the service health detector loop.

    Splits the comma-separated SERVICES setting into lowercase process
    names, then repeatedly compares them against the set of running
    process names, sleeping CHECK_INTERVAL seconds between passes.

    For every monitored service that is not running, an event with id
    ``service_<name>`` is sent to the aggregator on each pass. When a
    service that was alerted on the previous pass is running again, its
    event is cleared.

    Exits with status 1 if SERVICES is unset or yields no usable names
    (for example a value made only of commas or whitespace). Otherwise
    this function does not return.
    """
    # Validate the parsed list, not the raw string: a value such as ","
    # or "  " is truthy but names nothing, which would otherwise leave the
    # detector looping forever while monitoring no services.
    services = [s.strip().lower() for s in SERVICES.split(",") if s.strip()]
    if not services:
        print("ERROR: SERVICES environment variable is required")
        print("Example: SERVICES=nginx,postgres,redis python detectors/service.py")
        sys.exit(1)

    print("Service Health Detector started")
    print(f"  Aggregator: {AGGREGATOR_URL}")
    print(f"  Interval: {CHECK_INTERVAL}s")
    print(f"  Monitoring: {', '.join(services)}")
    print()

    # Event ids that were raised on the previous pass
    active_alerts = set()

    while True:
        running = get_running_processes()
        current_alerts = set()

        for service in services:
            event_id = f"service_{service}"

            if service not in running:
                # NOTE(review): the literal 1 and the trailing CHECK_INTERVAL
                # are passed through to send_event as-is; their meaning is
                # defined in detectors.base -- confirm there before changing.
                send_event(AGGREGATOR_URL, event_id, 1, f"Service '{service}' is not running", CHECK_INTERVAL)
                current_alerts.add(event_id)
            else:
                print(f"[OK] Service '{service}' is running")

        # Clear alerts for services that were down last pass but are up now
        for event_id in active_alerts - current_alerts:
            clear_event(AGGREGATOR_URL, event_id)

        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()
|