Initial commit: Sentry-Emote system monitor
- Aggregator: Flask-based event broker with priority queue - Frontend: OLED-optimized UI with animations - Detectors: disk, cpu, memory, service, network - Unified entry point (sentry.py) with process management - Heartbeat TTL system for auto-clearing stale events Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
83
detectors/cpu.py
Normal file
83
detectors/cpu.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
CPU Usage Detector
|
||||
Monitors CPU usage and reports to the aggregator when thresholds are exceeded.
|
||||
|
||||
Environment variables:
|
||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
|
||||
CHECK_INTERVAL - Seconds between checks (default: 30)
|
||||
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
|
||||
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import psutil
|
||||
import requests
|
||||
|
||||
# Configuration from environment
|
||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
|
||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
|
||||
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
|
||||
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
|
||||
|
||||
EVENT_ID = "cpu_usage"
|
||||
|
||||
|
||||
def send_event(priority, message):
    """Report the CPU event to the aggregator.

    The TTL is twice the check interval, so the aggregator drops the
    event automatically if this detector stops refreshing it.
    """
    ttl = 2 * CHECK_INTERVAL
    payload = {"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl}
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {resp.status_code}")
|
||||
|
||||
|
||||
def clear_event():
    """Ask the aggregator to drop this detector's event."""
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/clear", json={"id": EVENT_ID}, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {EVENT_ID}")
|
||||
|
||||
|
||||
def main():
    """Run the CPU check loop forever.

    Samples CPU usage over one second, sends a priority-1 event at or
    above THRESHOLD_CRITICAL, priority-2 at or above THRESHOLD_WARNING,
    and clears the alert once usage drops back below both thresholds.
    """
    print("CPU Usage Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    alert_active = False  # True while an alert we sent is still standing

    while True:
        # Get CPU usage over a 1-second sample
        cpu_percent = psutil.cpu_percent(interval=1)

        if cpu_percent >= THRESHOLD_CRITICAL:
            send_event(1, f"CPU at {cpu_percent:.0f}%")
            alert_active = True
        elif cpu_percent >= THRESHOLD_WARNING:
            send_event(2, f"CPU at {cpu_percent:.0f}%")
            alert_active = True
        else:
            print(f"[OK] CPU: {cpu_percent:.0f}%")
            if alert_active:
                clear_event()
                alert_active = False

        # Account for the 1s sample; clamp so CHECK_INTERVAL <= 1 cannot
        # produce a negative argument (time.sleep raises ValueError on < 0).
        time.sleep(max(0, CHECK_INTERVAL - 1))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
159
detectors/disk_space.py
Normal file
159
detectors/disk_space.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
Disk Space Detector
|
||||
Monitors all drives and reports to the aggregator when thresholds are exceeded.
|
||||
|
||||
Environment variables:
|
||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
|
||||
CHECK_INTERVAL - Seconds between checks (default: 300)
|
||||
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
|
||||
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import shutil
|
||||
import requests
|
||||
|
||||
# Configuration from environment
|
||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
|
||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 300))
|
||||
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
|
||||
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
|
||||
|
||||
|
||||
def get_all_drives():
    """Return the drive roots / mount points worth monitoring.

    Windows: every drive letter that answers a disk_usage() probe.
    Linux: real (/dev/-backed) filesystems from /proc/mounts, with
    bind-mount duplicates collapsed.  If /proc/mounts is missing
    (e.g. macOS), fall back to a short list of common mount points.
    """
    import platform

    def _usable(path):
        # Probe the path; anything we cannot stat is not monitorable.
        try:
            shutil.disk_usage(path)
            return True
        except (FileNotFoundError, PermissionError, OSError):
            return False

    if platform.system() == "Windows":
        import string
        return [f"{c}:\\" for c in string.ascii_uppercase if _usable(f"{c}:\\")]

    drives = []
    try:
        with open("/proc/mounts", "r") as mounts:
            seen_devices = set()
            for entry in mounts:
                fields = entry.split()
                if len(fields) < 2:
                    continue
                device, mount_point = fields[0], fields[1]
                # Skip virtual filesystems and duplicate devices (bind mounts).
                if not device.startswith("/dev/") or device in seen_devices:
                    continue
                seen_devices.add(device)
                if _usable(mount_point):
                    drives.append(mount_point)
    except FileNotFoundError:
        # No /proc/mounts: probe a few common mount points instead.
        drives = [m for m in ("/", "/home", "/var") if os.path.exists(m) and _usable(m)]

    return drives
|
||||
|
||||
|
||||
def check_disk(drive):
    """Return (percent_used, total_gb, used_gb) for *drive*.

    Returns a (None, None, None) triple if the drive cannot be statted
    (or reports a zero total), matching the caller's skip convention.
    """
    gib = 1024 ** 3
    try:
        usage = shutil.disk_usage(drive)
        return (usage.used / usage.total) * 100, usage.total / gib, usage.used / gib
    except Exception:
        return None, None, None
|
||||
|
||||
|
||||
def send_event(event_id, priority, message):
    """Send one disk event to the aggregator.

    TTL is twice the check interval so an event expires on its own if
    this detector stops refreshing it.
    """
    ttl = 2 * CHECK_INTERVAL
    body = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=body, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {resp.status_code}")
|
||||
|
||||
|
||||
def clear_event(event_id):
    """Ask the aggregator to drop the event with the given id."""
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/clear", json={"id": event_id}, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {event_id}")
|
||||
|
||||
|
||||
def main():
    """Scan all drives every CHECK_INTERVAL seconds and alert on usage.

    Sends a priority-1 event per drive at or above THRESHOLD_CRITICAL,
    priority-2 at or above THRESHOLD_WARNING, and clears events for
    drives that have returned to normal (or disappeared).
    """
    print("Disk Space Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    # Track active alerts to know when to clear
    active_alerts = set()

    while True:
        drives = get_all_drives()
        print(f"[CHECK] Scanning {len(drives)} drive(s)...")

        current_alerts = set()

        for drive in drives:
            percent, total_gb, used_gb = check_disk(drive)
            if percent is None:
                continue

            # Build a clean event ID from the drive path.  Computed outside
            # the f-string: a backslash inside an f-string expression is a
            # SyntaxError on Python < 3.12.
            slug = drive.replace(":", "").replace("/", "_").replace("\\", "").strip("_")
            event_id = f"disk_{slug or 'root'}"

            if percent >= THRESHOLD_CRITICAL:
                message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"
                send_event(event_id, 1, message)
                current_alerts.add(event_id)
            elif percent >= THRESHOLD_WARNING:
                message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"
                send_event(event_id, 2, message)
                current_alerts.add(event_id)
            else:
                print(f"[OK] {drive}: {percent:.0f}%")

        # Clear alerts that are no longer active
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)

        active_alerts = current_alerts

        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
85
detectors/memory.py
Normal file
85
detectors/memory.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Memory Usage Detector
|
||||
Monitors RAM usage and reports to the aggregator when thresholds are exceeded.
|
||||
|
||||
Environment variables:
|
||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
|
||||
CHECK_INTERVAL - Seconds between checks (default: 30)
|
||||
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
|
||||
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import psutil
|
||||
import requests
|
||||
|
||||
# Configuration from environment
|
||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
|
||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
|
||||
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
|
||||
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
|
||||
|
||||
EVENT_ID = "memory_usage"
|
||||
|
||||
|
||||
def send_event(priority, message):
    """Send the memory event with a heartbeat TTL of 2x the interval."""
    ttl = 2 * CHECK_INTERVAL
    payload = {"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl}
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {resp.status_code}")
|
||||
|
||||
|
||||
def clear_event():
    """Ask the aggregator to drop this detector's event."""
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/clear", json={"id": EVENT_ID}, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {EVENT_ID}")
|
||||
|
||||
|
||||
def main():
    """Check RAM usage every CHECK_INTERVAL seconds and raise/clear alerts.

    Priority 1 at or above THRESHOLD_CRITICAL, priority 2 at or above
    THRESHOLD_WARNING; the alert clears once usage returns to normal.
    """
    print("Memory Usage Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    alert_active = False  # True while an alert we sent is still standing

    while True:
        mem = psutil.virtual_memory()
        gib = 1024 ** 3
        used_gb = mem.used / gib
        total_gb = mem.total / gib
        detail = f"{mem.percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"

        if mem.percent >= THRESHOLD_CRITICAL:
            send_event(1, f"Memory at {detail}")
            alert_active = True
        elif mem.percent >= THRESHOLD_WARNING:
            send_event(2, f"Memory at {detail}")
            alert_active = True
        else:
            print(f"[OK] Memory: {detail}")
            if alert_active:
                clear_event()
                alert_active = False

        time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
115
detectors/network.py
Normal file
115
detectors/network.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Network/Ping Detector
|
||||
Monitors if hosts are reachable via ping.
|
||||
|
||||
Environment variables:
|
||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
|
||||
CHECK_INTERVAL - Seconds between checks (default: 60)
|
||||
HOSTS - Comma-separated list of hosts to ping (required)
|
||||
Example: "8.8.8.8,google.com,192.168.1.1"
|
||||
TIMEOUT - Ping timeout in seconds (default: 5)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import platform
|
||||
import subprocess
|
||||
import requests
|
||||
|
||||
# Configuration from environment
|
||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
|
||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
|
||||
HOSTS = os.environ.get("HOSTS", "")
|
||||
TIMEOUT = int(os.environ.get("TIMEOUT", 5))
|
||||
|
||||
|
||||
def ping(host):
    """Return True if *host* answers a single ping within TIMEOUT seconds."""
    on_windows = platform.system().lower() == "windows"
    # Windows ping: -n <count>, -w <ms>; POSIX ping: -c <count>, -W <s>
    count_flag = "-n" if on_windows else "-c"
    wait_flag = "-w" if on_windows else "-W"
    wait_value = str(TIMEOUT * 1000) if on_windows else str(TIMEOUT)

    command = ["ping", count_flag, "1", wait_flag, wait_value, host]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=TIMEOUT + 2,
        )
    except Exception:
        # TimeoutExpired, missing ping binary, etc. all count as unreachable.
        return False
    return completed.returncode == 0
|
||||
|
||||
|
||||
def send_event(event_id, priority, message):
    """Send one ping event; TTL = 2x the interval so stale events expire."""
    ttl = 2 * CHECK_INTERVAL
    body = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=body, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {resp.status_code}")
|
||||
|
||||
|
||||
def clear_event(event_id):
    """Ask the aggregator to drop the event with the given id."""
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/clear", json={"id": event_id}, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {event_id}")
|
||||
|
||||
|
||||
def main():
    """Validate configuration, then ping every host each CHECK_INTERVAL seconds.

    An unreachable host raises a priority-1 event; the event clears once
    the host answers again.
    """
    if not HOSTS:
        print("ERROR: HOSTS environment variable is required")
        print("Example: HOSTS=8.8.8.8,google.com python detectors/network.py")
        sys.exit(1)

    hosts = [part.strip() for part in HOSTS.split(",") if part.strip()]

    print("Network/Ping Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Timeout: {TIMEOUT}s")
    print(f" Monitoring: {', '.join(hosts)}")
    print()

    # Track which hosts have active alerts
    active_alerts = set()

    while True:
        current_alerts = set()

        for host in hosts:
            event_id = "ping_" + host.replace(".", "_").replace(":", "_")

            if ping(host):
                print(f"[OK] Host '{host}' is reachable")
            else:
                send_event(event_id, 1, f"Host '{host}' is unreachable")
                current_alerts.add(event_id)

        # Clear alerts for hosts that are now reachable
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)

        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
108
detectors/service.py
Normal file
108
detectors/service.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
Service Health Detector
|
||||
Monitors if specific processes/services are running.
|
||||
|
||||
Environment variables:
|
||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
|
||||
CHECK_INTERVAL - Seconds between checks (default: 30)
|
||||
SERVICES - Comma-separated list of process names to monitor (required)
|
||||
Example: "nginx,postgres,redis"
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import psutil
|
||||
import requests
|
||||
|
||||
# Configuration from environment
|
||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
|
||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
|
||||
SERVICES = os.environ.get("SERVICES", "")
|
||||
|
||||
|
||||
def get_running_processes():
    """Return a lowercase set of running process names.

    Windows executables are stored both with and without the ".exe"
    suffix so that "nginx" matches "nginx.exe".
    """
    names = set()
    for proc in psutil.process_iter(['name']):
        try:
            raw = proc.info['name']
            if not raw:
                continue
            lowered = raw.lower()
            names.add(lowered)
            if lowered.endswith('.exe'):
                names.add(lowered[:-4])
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return names
|
||||
|
||||
|
||||
def send_event(event_id, priority, message):
    """Send one service event; TTL = 2x the interval so stale events expire."""
    ttl = 2 * CHECK_INTERVAL
    body = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/event", json=body, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {resp.status_code}")
|
||||
|
||||
|
||||
def clear_event(event_id):
    """Ask the aggregator to drop the event with the given id."""
    try:
        resp = requests.post(f"{AGGREGATOR_URL}/clear", json={"id": event_id}, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return
    if resp.status_code == 200:
        print(f"[CLEAR] {event_id}")
|
||||
|
||||
|
||||
def main():
    """Validate configuration, then check all services each CHECK_INTERVAL seconds.

    A service missing from the process table raises a priority-1 event;
    the event clears once the process reappears.
    """
    if not SERVICES:
        print("ERROR: SERVICES environment variable is required")
        print("Example: SERVICES=nginx,postgres,redis python detectors/service.py")
        sys.exit(1)

    services = [part.strip().lower() for part in SERVICES.split(",") if part.strip()]

    print("Service Health Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Monitoring: {', '.join(services)}")
    print()

    # Track which services have active alerts
    active_alerts = set()

    while True:
        running = get_running_processes()
        current_alerts = set()

        for service in services:
            event_id = "service_" + service

            if service in running:
                print(f"[OK] Service '{service}' is running")
            else:
                send_event(event_id, 1, f"Service '{service}' is not running")
                current_alerts.add(event_id)

        # Clear alerts for services that are now running
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)

        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user