From 4f4f5da14c00bf3b7383a6bf3e620db0511ec903 Mon Sep 17 00:00:00 2001 From: Spencer Grimes Date: Mon, 9 Mar 2026 15:44:03 -0500 Subject: [PATCH] Rewrote CPU Detector to expire correctly --- detectors/base.py | 9 ++++++--- detectors/cpu.py | 39 +++++++++++++++------------------------ 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/detectors/base.py b/detectors/base.py index 6a7fbfb..b1570aa 100644 --- a/detectors/base.py +++ b/detectors/base.py @@ -7,9 +7,12 @@ import requests DEFAULT_AGGREGATOR_URL = "http://localhost:5100" -def send_event(url, event_id, priority, message, check_interval): - """Send an event to the aggregator with heartbeat TTL.""" - ttl = check_interval * 2 +def send_event(url, event_id, priority, message, check_interval=None, ttl=None): + """Send an event to the aggregator. + Pass ttl= (seconds) directly, or check_interval= to use heartbeat default (check_interval * 2). + """ + if ttl is None: + ttl = check_interval * 2 try: response = requests.post( f"{url}/event", diff --git a/detectors/cpu.py b/detectors/cpu.py index 310eae1..cb0251e 100644 --- a/detectors/cpu.py +++ b/detectors/cpu.py @@ -1,10 +1,13 @@ """ CPU Usage Detector -Monitors CPU usage and reports to the aggregator when thresholds are exceeded. +Polls CPU usage every CHECK_INTERVAL seconds (default 30). Sends a heartbeat event while usage is at or above +THRESHOLD_WARNING; when usage drops below it, no event is sent and the previous event +expires naturally via its TTL (default 30 seconds). 
Environment variables: AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100) CHECK_INTERVAL - Seconds between checks (default: 30) + TTL - Event lifetime in seconds (default: 30) THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95) THRESHOLD_WARNING - Percent usage for warning alert (default: 85) """ @@ -13,11 +16,11 @@ import os import time import psutil -from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event +from detectors.base import DEFAULT_AGGREGATOR_URL, send_event -# Configuration from environment AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL) CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30)) +TTL = int(os.environ.get("TTL", 30)) THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95)) THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85)) @@ -25,35 +28,23 @@ EVENT_ID = "cpu_usage" def main(): - print(f"CPU Usage Detector started") + print("CPU Usage Detector started") print(f" Aggregator: {AGGREGATOR_URL}") - print(f" Interval: {CHECK_INTERVAL}s") + print(f" Interval: {CHECK_INTERVAL}s, TTL: {TTL}s") print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%") print() - active_alerts = set() - while True: - # Get CPU usage over a 1-second sample - cpu_percent = psutil.cpu_percent(interval=1) - current_alerts = set() + cpu = psutil.cpu_percent(interval=1) # 1-second blocking sample - if cpu_percent >= THRESHOLD_CRITICAL: - send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL) - current_alerts.add(EVENT_ID) - elif cpu_percent >= THRESHOLD_WARNING: - send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL) - current_alerts.add(EVENT_ID) + if cpu >= THRESHOLD_CRITICAL: + send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU {cpu:.0f}%", ttl=TTL) + elif cpu >= THRESHOLD_WARNING: + send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU {cpu:.0f}%", ttl=TTL) else: - print(f"[OK] CPU: 
{cpu_percent:.0f}%") + print(f"[OK] CPU: {cpu:.0f}%") - # Clear alerts that are no longer active - for eid in active_alerts - current_alerts: - clear_event(AGGREGATOR_URL, eid) - - active_alerts = current_alerts - - time.sleep(CHECK_INTERVAL - 1) # Account for 1s sample time + time.sleep(CHECK_INTERVAL - 1) # Subtract the 1s sample so each cycle takes CHECK_INTERVAL seconds if __name__ == "__main__":