""" Service Health Detector Monitors if specific processes/services are running. Environment variables: AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000) CHECK_INTERVAL - Seconds between checks (default: 30) SERVICES - Comma-separated list of process names to monitor (required) Example: "nginx,postgres,redis" """ import os import sys import time import psutil import requests # Configuration from environment AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000") CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30)) SERVICES = os.environ.get("SERVICES", "") def get_running_processes(): """Get set of running process names.""" running = set() for proc in psutil.process_iter(['name']): try: name = proc.info['name'] if name: # Store both with and without common extensions running.add(name.lower()) if name.lower().endswith('.exe'): running.add(name.lower()[:-4]) except (psutil.NoSuchProcess, psutil.AccessDenied): pass return running def send_event(event_id, priority, message): """Send an event to the aggregator with heartbeat TTL.""" ttl = CHECK_INTERVAL * 2 try: response = requests.post( f"{AGGREGATOR_URL}/event", json={"id": event_id, "priority": priority, "message": message, "ttl": ttl}, timeout=5 ) print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}") except requests.RequestException as e: print(f"[ERROR] Failed to send event: {e}") def clear_event(event_id): """Clear the event from the aggregator.""" try: response = requests.post( f"{AGGREGATOR_URL}/clear", json={"id": event_id}, timeout=5 ) if response.status_code == 200: print(f"[CLEAR] {event_id}") except requests.RequestException as e: print(f"[ERROR] Failed to clear event: {e}") def main(): if not SERVICES: print("ERROR: SERVICES environment variable is required") print("Example: SERVICES=nginx,postgres,redis python detectors/service.py") sys.exit(1) services = [s.strip().lower() for s in SERVICES.split(",") if s.strip()] print(f"Service Health Detector started") print(f" Aggregator: {AGGREGATOR_URL}") print(f" Interval: {CHECK_INTERVAL}s") print(f" Monitoring: {', '.join(services)}") print() # Track which services have active alerts active_alerts = set() while True: running = get_running_processes() current_alerts = set() for service in services: event_id = f"service_{service}" if service not in running: send_event(event_id, 1, f"Service '{service}' is not running") current_alerts.add(event_id) else: print(f"[OK] Service '{service}' is running") # Clear alerts for services that are now running for event_id in active_alerts - current_alerts: clear_event(event_id) active_alerts = current_alerts time.sleep(CHECK_INTERVAL) if __name__ == "__main__": main()