Initial commit: Sentry-Emote system monitor

- Aggregator: Flask-based event broker with priority queue
- Frontend: OLED-optimized UI with animations
- Detectors: disk, cpu, memory, service, network
- Unified entry point (sentry.py) with process management
- Heartbeat TTL system for auto-clearing stale events

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-02 21:04:02 -06:00
commit 11896919e4
13 changed files with 1405 additions and 0 deletions

83
detectors/cpu.py Normal file
View File

@@ -0,0 +1,83 @@
"""
CPU Usage Detector
Monitors CPU usage and reports to the aggregator when thresholds are exceeded.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 30)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
"""
import os
import time
import psutil
import requests
# Configuration from environment
# Numeric settings are parsed with int(), so a non-integer value raises
# ValueError as soon as the module is loaded.
# Base URL of the aggregator; "/event" and "/clear" are appended to it.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
# Seconds between checks; also used to derive the event TTL (2x this value).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
# CPU percentage at or above which a priority-1 event is sent.
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
# CPU percentage at or above which a priority-2 event is sent.
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
# Single event id used for every CPU event sent or cleared by this detector.
EVENT_ID = "cpu_usage"
def send_event(priority, message):
    """Post the CPU event to the aggregator's /event endpoint.

    The payload carries a heartbeat TTL of twice CHECK_INTERVAL. The
    outcome (HTTP status or request error) is printed; nothing is raised
    for request failures and nothing is returned.

    Args:
        priority: Priority value forwarded to the aggregator.
        message: Human-readable text forwarded to the aggregator.
    """
    ttl = CHECK_INTERVAL * 2
    payload = {"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl}
    endpoint = f"{AGGREGATOR_URL}/event"
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
def clear_event():
    """Ask the aggregator to clear the CPU event.

    Prints a confirmation on HTTP 200 and an error line for any other
    status, so a rejected clear shows up in the log. Request failures are
    printed as well; nothing is raised for them and nothing is returned.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": EVENT_ID},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return

    if response.status_code == 200:
        print(f"[CLEAR] {EVENT_ID}")
    else:
        # Surface non-200 replies instead of ignoring them silently.
        print(f"[ERROR] Failed to clear event: {EVENT_ID} -> {response.status_code}")
def main():
    """Run the CPU check loop; never returns.

    Each cycle samples CPU usage over one second, sends a priority-1
    event at or above THRESHOLD_CRITICAL, a priority-2 event at or above
    THRESHOLD_WARNING, and otherwise clears a previously raised alert.
    """
    print("CPU Usage Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    # The 1 s sampling window counts toward the interval. Clamp at zero:
    # time.sleep() raises ValueError for a negative argument, which would
    # kill the detector whenever CHECK_INTERVAL is configured below 1.
    pause = max(0, CHECK_INTERVAL - 1)

    alert_active = False
    while True:
        # Get CPU usage over a 1-second sample
        cpu_percent = psutil.cpu_percent(interval=1)

        if cpu_percent >= THRESHOLD_CRITICAL:
            priority = 1
        elif cpu_percent >= THRESHOLD_WARNING:
            priority = 2
        else:
            priority = None

        if priority is not None:
            send_event(priority, f"CPU at {cpu_percent:.0f}%")
            alert_active = True
        else:
            print(f"[OK] CPU: {cpu_percent:.0f}%")
            if alert_active:
                clear_event()
                alert_active = False

        time.sleep(pause)


if __name__ == "__main__":
    main()

159
detectors/disk_space.py Normal file
View File

@@ -0,0 +1,159 @@
"""
Disk Space Detector
Monitors all drives and reports to the aggregator when thresholds are exceeded.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 300)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
"""
import os
import time
import shutil
import requests
# Configuration from environment
# Numeric settings are parsed with int(), so a non-integer value raises
# ValueError as soon as the module is loaded.
# Base URL of the aggregator; "/event" and "/clear" are appended to it.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
# Seconds between scans; also used to derive the event TTL (2x this value).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 300))
# Disk usage percentage at or above which a priority-1 event is sent.
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
# Disk usage percentage at or above which a priority-2 event is sent.
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
# Octal escapes that can appear in /proc/mounts fields. The backslash
# entry must stay last so a decoded backslash is never re-interpreted as
# the start of another escape.
_MOUNT_ESCAPES = (
    ("\\040", " "),
    ("\\011", "\t"),
    ("\\012", "\n"),
    ("\\043", "#"),
    ("\\134", "\\"),
)


def _decode_mount_path(field):
    """Return a /proc/mounts field with its octal escapes decoded."""
    for escaped, plain in _MOUNT_ESCAPES:
        field = field.replace(escaped, plain)
    return field


def _usage_readable(path):
    """Return True when shutil.disk_usage() succeeds for *path*."""
    try:
        shutil.disk_usage(path)
    except OSError:  # covers FileNotFoundError and PermissionError too
        return False
    return True


def get_all_drives():
    """Get list of mounted drives/partitions whose usage can be read.

    On Windows every drive letter A-Z is probed. On other systems
    /proc/mounts is parsed and one readable mount point is kept per
    /dev/ device; if that file cannot be opened, a few common mount
    points are probed instead.

    Returns:
        list[str]: Paths for which shutil.disk_usage() succeeded.
    """
    import platform

    if platform.system() == "Windows":
        import string

        candidates = (f"{letter}:\\" for letter in string.ascii_uppercase)
        return [drive for drive in candidates if _usage_readable(drive)]

    drives = []
    seen_devices = set()
    try:
        with open("/proc/mounts", "r") as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue
                device = parts[0]
                # Skip virtual filesystems and devices already reported
                # (e.g. bind mounts).
                if not device.startswith("/dev/") or device in seen_devices:
                    continue
                # Mount points containing spaces are written as "\040" in
                # /proc/mounts; decode them or disk_usage() cannot find
                # the path and the mount is silently dropped.
                mount = _decode_mount_path(parts[1])
                if _usage_readable(mount):
                    # Mark the device only once a mount point works, so an
                    # unreadable first mount does not hide a readable one.
                    seen_devices.add(device)
                    drives.append(mount)
    except OSError:
        # Fallback for macOS or systems without a readable /proc/mounts
        drives = [
            mount
            for mount in ("/", "/home", "/var")
            if os.path.exists(mount) and _usage_readable(mount)
        ]
    return drives
def check_disk(drive):
    """Measure how full *drive* is.

    Args:
        drive: Path handed to shutil.disk_usage().

    Returns:
        A (percent_used, total_gb, used_gb) tuple of floats, where the GB
        figures are bytes divided by 1024**3. If anything raises while
        reading or computing the figures, (None, None, None) is returned.
    """
    bytes_per_gb = 1024 ** 3
    try:
        stats = shutil.disk_usage(drive)
        used_fraction = stats.used / stats.total
        return used_fraction * 100, stats.total / bytes_per_gb, stats.used / bytes_per_gb
    except Exception:
        return None, None, None
def send_event(event_id, priority, message):
    """Post an event to the aggregator's /event endpoint.

    The payload carries a heartbeat TTL of twice CHECK_INTERVAL. The
    outcome (HTTP status or request error) is printed; nothing is raised
    for request failures and nothing is returned.

    Args:
        event_id: Identifier sent as the event's "id".
        priority: Priority value forwarded to the aggregator.
        message: Human-readable text forwarded to the aggregator.
    """
    # Event expires if not refreshed
    ttl = CHECK_INTERVAL * 2
    payload = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    endpoint = f"{AGGREGATOR_URL}/event"
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
def clear_event(event_id):
    """Ask the aggregator to clear the event identified by *event_id*.

    Prints a confirmation on HTTP 200 and an error line for any other
    status, so a rejected clear shows up in the log. Request failures are
    printed as well; nothing is raised for them and nothing is returned.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return

    if response.status_code == 200:
        print(f"[CLEAR] {event_id}")
    else:
        # Surface non-200 replies instead of ignoring them silently.
        print(f"[ERROR] Failed to clear event: {event_id} -> {response.status_code}")
def _drive_event_id(drive):
    """Create a clean event id from a drive path (e.g. "/" -> "disk_root")."""
    # Built outside the f-string on purpose: a backslash inside an
    # f-string replacement field is a SyntaxError before Python 3.12,
    # which made the whole module fail to load on older interpreters.
    slug = drive.replace(":", "").replace("/", "_").replace("\\", "").strip("_")
    return f"disk_{slug or 'root'}"


def main():
    """Run the disk scan loop; never returns.

    Each cycle lists the drives, sends a priority-1 event for every drive
    at or above THRESHOLD_CRITICAL and a priority-2 event for every drive
    at or above THRESHOLD_WARNING, then clears the events of drives that
    alerted in the previous cycle but not in this one.
    """
    print("Disk Space Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    # Track active alerts to know when to clear
    active_alerts = set()
    while True:
        drives = get_all_drives()
        print(f"[CHECK] Scanning {len(drives)} drive(s)...")

        current_alerts = set()
        for drive in drives:
            percent, total_gb, used_gb = check_disk(drive)
            if percent is None:
                continue

            if percent >= THRESHOLD_CRITICAL:
                priority = 1
            elif percent >= THRESHOLD_WARNING:
                priority = 2
            else:
                print(f"[OK] {drive}: {percent:.0f}%")
                continue

            event_id = _drive_event_id(drive)
            message = f"{drive} at {percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"
            send_event(event_id, priority, message)
            current_alerts.add(event_id)

        # Clear alerts that are no longer active
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)
        active_alerts = current_alerts

        print(f"[SLEEP] Next check in {CHECK_INTERVAL}s\n")
        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()

85
detectors/memory.py Normal file
View File

@@ -0,0 +1,85 @@
"""
Memory Usage Detector
Monitors RAM usage and reports to the aggregator when thresholds are exceeded.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 30)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
"""
import os
import time
import psutil
import requests
# Configuration from environment
# Numeric settings are parsed with int(), so a non-integer value raises
# ValueError as soon as the module is loaded.
# Base URL of the aggregator; "/event" and "/clear" are appended to it.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
# Seconds between checks; also used to derive the event TTL (2x this value).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
# Memory percentage at or above which a priority-1 event is sent.
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
# Memory percentage at or above which a priority-2 event is sent.
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
# Single event id used for every memory event sent or cleared by this detector.
EVENT_ID = "memory_usage"
def send_event(priority, message):
    """Post the memory event to the aggregator's /event endpoint.

    The payload carries a heartbeat TTL of twice CHECK_INTERVAL. The
    outcome (HTTP status or request error) is printed; nothing is raised
    for request failures and nothing is returned.

    Args:
        priority: Priority value forwarded to the aggregator.
        message: Human-readable text forwarded to the aggregator.
    """
    ttl = CHECK_INTERVAL * 2
    payload = {"id": EVENT_ID, "priority": priority, "message": message, "ttl": ttl}
    endpoint = f"{AGGREGATOR_URL}/event"
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
def clear_event():
    """Ask the aggregator to clear the memory event.

    Prints a confirmation on HTTP 200 and an error line for any other
    status, so a rejected clear shows up in the log. Request failures are
    printed as well; nothing is raised for them and nothing is returned.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": EVENT_ID},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return

    if response.status_code == 200:
        print(f"[CLEAR] {EVENT_ID}")
    else:
        # Surface non-200 replies instead of ignoring them silently.
        print(f"[ERROR] Failed to clear event: {EVENT_ID} -> {response.status_code}")
def main():
    """Run the memory check loop; never returns.

    Each cycle reads psutil.virtual_memory(), sends a priority-1 event at
    or above THRESHOLD_CRITICAL, a priority-2 event at or above
    THRESHOLD_WARNING, and otherwise clears a previously raised alert.
    """
    print("Memory Usage Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    bytes_per_gb = 1024 ** 3
    alert_active = False
    while True:
        mem = psutil.virtual_memory()
        mem_percent = mem.percent
        # psutil documents `percent` as (total - available) / total, while
        # `used` is computed differently per platform. Report the same
        # quantity in GB so the percentage and the GB figure agree.
        used_gb = (mem.total - mem.available) / bytes_per_gb
        total_gb = mem.total / bytes_per_gb
        summary = f"{mem_percent:.0f}% ({used_gb:.1f}/{total_gb:.1f} GB)"

        if mem_percent >= THRESHOLD_CRITICAL:
            send_event(1, f"Memory at {summary}")
            alert_active = True
        elif mem_percent >= THRESHOLD_WARNING:
            send_event(2, f"Memory at {summary}")
            alert_active = True
        else:
            print(f"[OK] Memory: {summary}")
            if alert_active:
                clear_event()
                alert_active = False

        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()

115
detectors/network.py Normal file
View File

@@ -0,0 +1,115 @@
"""
Network/Ping Detector
Monitors if hosts are reachable via ping.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 60)
HOSTS - Comma-separated list of hosts to ping (required)
Example: "8.8.8.8,google.com,192.168.1.1"
TIMEOUT - Ping timeout in seconds (default: 5)
"""
import os
import sys
import time
import platform
import subprocess
import requests
# Configuration from environment
# Numeric settings are parsed with int(), so a non-integer value raises
# ValueError as soon as the module is loaded.
# Base URL of the aggregator; "/event" and "/clear" are appended to it.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
# Seconds between ping rounds; also used to derive the event TTL (2x this value).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 60))
# Raw comma-separated host list; split and validated in main().
HOSTS = os.environ.get("HOSTS", "")
# Ping timeout in seconds; the subprocess itself is given TIMEOUT + 2 seconds.
TIMEOUT = int(os.environ.get("TIMEOUT", 5))
def _ping_command(host, system, timeout):
    """Build the one-packet ping argv for *system* (a platform.system() value)."""
    system = system.lower()
    if system == "windows":
        # Windows ping: -n is the count, -w the reply timeout in milliseconds.
        return ["ping", "-n", "1", "-w", str(timeout * 1000), host]
    if system in ("darwin", "freebsd"):
        # BSD-derived ping takes -W in milliseconds; passing seconds here
        # would give a few-millisecond timeout and false "unreachable" alerts.
        return ["ping", "-c", "1", "-W", str(timeout * 1000), host]
    # Linux (iputils) ping: -W is the reply timeout in seconds.
    return ["ping", "-c", "1", "-W", str(timeout), host]


def ping(host):
    """Ping a host once. Returns True if reachable.

    Args:
        host: Host name or address passed to the system ping command.

    Returns:
        True when ping exits with status 0; False on a non-zero exit, when
        the subprocess exceeds TIMEOUT + 2 seconds, or when ping could not
        be run at all (that last case is also printed).
    """
    command = _ping_command(host, platform.system(), TIMEOUT)
    try:
        result = subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=TIMEOUT + 2,
        )
    except subprocess.TimeoutExpired:
        return False
    except Exception as e:
        # Still best-effort, but say why: a missing ping binary would
        # otherwise look exactly like every host being down.
        print(f"[ERROR] Could not run ping for '{host}': {e}")
        return False
    return result.returncode == 0
def send_event(event_id, priority, message):
    """Post an event to the aggregator's /event endpoint.

    The payload carries a heartbeat TTL of twice CHECK_INTERVAL. The
    outcome (HTTP status or request error) is printed; nothing is raised
    for request failures and nothing is returned.

    Args:
        event_id: Identifier sent as the event's "id".
        priority: Priority value forwarded to the aggregator.
        message: Human-readable text forwarded to the aggregator.
    """
    ttl = CHECK_INTERVAL * 2
    payload = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    endpoint = f"{AGGREGATOR_URL}/event"
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
def clear_event(event_id):
    """Ask the aggregator to clear the event identified by *event_id*.

    Prints a confirmation on HTTP 200 and an error line for any other
    status, so a rejected clear shows up in the log. Request failures are
    printed as well; nothing is raised for them and nothing is returned.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return

    if response.status_code == 200:
        print(f"[CLEAR] {event_id}")
    else:
        # Surface non-200 replies instead of ignoring them silently.
        print(f"[ERROR] Failed to clear event: {event_id} -> {response.status_code}")
def main():
    """Run the ping loop; never returns unless HOSTS is unusable.

    Exits with status 1 when HOSTS yields no host names. Otherwise each
    cycle pings every host, sends a priority-1 event for each unreachable
    one, and clears the events of hosts that were unreachable in the
    previous cycle but answered in this one.
    """
    # Validate the parsed list, not the raw string: a value such as ","
    # is non-empty yet contains no hosts, and the loop below would then
    # run forever while monitoring nothing.
    hosts = [h.strip() for h in HOSTS.split(",") if h.strip()]
    if not hosts:
        print("ERROR: HOSTS environment variable is required")
        print("Example: HOSTS=8.8.8.8,google.com python detectors/network.py")
        sys.exit(1)

    print("Network/Ping Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Timeout: {TIMEOUT}s")
    print(f" Monitoring: {', '.join(hosts)}")
    print()

    # Track which hosts have active alerts
    active_alerts = set()
    while True:
        current_alerts = set()
        for host in hosts:
            if ping(host):
                print(f"[OK] Host '{host}' is reachable")
                continue
            event_id = f"ping_{host.replace('.', '_').replace(':', '_')}"
            send_event(event_id, 1, f"Host '{host}' is unreachable")
            current_alerts.add(event_id)

        # Clear alerts for hosts that are now reachable
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)
        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()

108
detectors/service.py Normal file
View File

@@ -0,0 +1,108 @@
"""
Service Health Detector
Monitors if specific processes/services are running.
Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5000)
CHECK_INTERVAL - Seconds between checks (default: 30)
SERVICES - Comma-separated list of process names to monitor (required)
Example: "nginx,postgres,redis"
"""
import os
import sys
import time
import psutil
import requests
# Configuration from environment
# CHECK_INTERVAL is parsed with int(), so a non-integer value raises
# ValueError as soon as the module is loaded.
# Base URL of the aggregator; "/event" and "/clear" are appended to it.
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", "http://localhost:5000")
# Seconds between checks; also used to derive the event TTL (2x this value).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
# Raw comma-separated process-name list; split, lower-cased and validated in main().
SERVICES = os.environ.get("SERVICES", "")
def get_running_processes():
    """Collect the lower-cased names of all running processes.

    A name ending in ".exe" is recorded both with and without that
    suffix. Processes that raise NoSuchProcess or AccessDenied while
    being inspected are ignored.

    Returns:
        set[str]: Lower-cased process names.
    """
    names = set()
    for proc in psutil.process_iter(['name']):
        try:
            raw_name = proc.info['name']
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
        if not raw_name:
            continue
        lowered = raw_name.lower()
        names.add(lowered)
        # Store both with and without common extensions
        if lowered.endswith('.exe'):
            names.add(lowered[:-4])
    return names
def send_event(event_id, priority, message):
    """Post an event to the aggregator's /event endpoint.

    The payload carries a heartbeat TTL of twice CHECK_INTERVAL. The
    outcome (HTTP status or request error) is printed; nothing is raised
    for request failures and nothing is returned.

    Args:
        event_id: Identifier sent as the event's "id".
        priority: Priority value forwarded to the aggregator.
        message: Human-readable text forwarded to the aggregator.
    """
    ttl = CHECK_INTERVAL * 2
    payload = {"id": event_id, "priority": priority, "message": message, "ttl": ttl}
    endpoint = f"{AGGREGATOR_URL}/event"
    try:
        response = requests.post(endpoint, json=payload, timeout=5)
    except requests.RequestException as e:
        print(f"[ERROR] Failed to send event: {e}")
    else:
        print(f"[EVENT] {event_id}: {message} (priority {priority}, ttl {ttl}s) -> {response.status_code}")
def clear_event(event_id):
    """Ask the aggregator to clear the event identified by *event_id*.

    Prints a confirmation on HTTP 200 and an error line for any other
    status, so a rejected clear shows up in the log. Request failures are
    printed as well; nothing is raised for them and nothing is returned.
    """
    try:
        response = requests.post(
            f"{AGGREGATOR_URL}/clear",
            json={"id": event_id},
            timeout=5,
        )
    except requests.RequestException as e:
        print(f"[ERROR] Failed to clear event: {e}")
        return

    if response.status_code == 200:
        print(f"[CLEAR] {event_id}")
    else:
        # Surface non-200 replies instead of ignoring them silently.
        print(f"[ERROR] Failed to clear event: {event_id} -> {response.status_code}")
def main():
    """Run the service check loop; never returns unless SERVICES is unusable.

    Exits with status 1 when SERVICES yields no process names. Otherwise
    each cycle lists the running processes, sends a priority-1 event for
    each monitored name that is missing, and clears the events of
    services that were missing in the previous cycle but run in this one.
    """
    # Validate the parsed list, not the raw string: a value such as ","
    # is non-empty yet contains no names, and the loop below would then
    # run forever while monitoring nothing.
    services = [s.strip().lower() for s in SERVICES.split(",") if s.strip()]
    if not services:
        print("ERROR: SERVICES environment variable is required")
        print("Example: SERVICES=nginx,postgres,redis python detectors/service.py")
        sys.exit(1)

    print("Service Health Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s")
    print(f" Monitoring: {', '.join(services)}")
    print()

    # Track which services have active alerts
    active_alerts = set()
    while True:
        running = get_running_processes()
        current_alerts = set()
        for service in services:
            if service in running:
                print(f"[OK] Service '{service}' is running")
                continue
            event_id = f"service_{service}"
            send_event(event_id, 1, f"Service '{service}' is not running")
            current_alerts.add(event_id)

        # Clear alerts for services that are now running
        for event_id in active_alerts - current_alerts:
            clear_event(event_id)
        active_alerts = current_alerts

        time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    main()