Rewrote CPU Detector to expire correctly

This commit is contained in:
2026-03-09 15:44:03 -05:00
parent 1ecd7607f0
commit 4f4f5da14c
2 changed files with 21 additions and 27 deletions

View File

@@ -7,9 +7,12 @@ import requests
DEFAULT_AGGREGATOR_URL = "http://localhost:5100" DEFAULT_AGGREGATOR_URL = "http://localhost:5100"
def send_event(url, event_id, priority, message, check_interval): def send_event(url, event_id, priority, message, check_interval=None, ttl=None):
"""Send an event to the aggregator with heartbeat TTL.""" """Send an event to the aggregator.
ttl = check_interval * 2 Pass ttl= (seconds) directly, or check_interval= to use heartbeat default (check_interval * 2).
"""
if ttl is None:
ttl = check_interval * 2
try: try:
response = requests.post( response = requests.post(
f"{url}/event", f"{url}/event",

View File

@@ -1,10 +1,13 @@
""" """
CPU Usage Detector CPU Usage Detector
Monitors CPU usage and reports to the aggregator when thresholds are exceeded. Polls CPU usage every CHECK_INTERVAL seconds (default: 30). Sends a heartbeat event while usage is at or
above THRESHOLD_WARNING; if usage drops below it, no event is sent and the previous event
expires naturally via its TTL (default: 30 seconds).
Environment variables: Environment variables:
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100) AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
CHECK_INTERVAL - Seconds between checks (default: 30) CHECK_INTERVAL - Seconds between checks (default: 30)
TTL - Event lifetime in seconds (default: 30)
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95) THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
THRESHOLD_WARNING - Percent usage for warning alert (default: 85) THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
""" """
@@ -13,11 +16,11 @@ import os
import time import time
import psutil import psutil
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event from detectors.base import DEFAULT_AGGREGATOR_URL, send_event
# Configuration, read once from the environment when the module is imported.
# Numeric settings are parsed with int(); a non-numeric value raises ValueError.
AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))  # seconds between checks
TTL = int(os.getenv("TTL", "30"))  # event lifetime in seconds
THRESHOLD_CRITICAL = int(os.getenv("THRESHOLD_CRITICAL", "95"))  # percent usage
THRESHOLD_WARNING = int(os.getenv("THRESHOLD_WARNING", "85"))  # percent usage
@@ -25,35 +28,23 @@ EVENT_ID = "cpu_usage"
def main():
    """Run the CPU usage detector loop; never returns.

    Each cycle takes a 1-second blocking CPU sample. While usage is at or
    above THRESHOLD_WARNING an event is (re)sent under EVENT_ID with a
    lifetime of TTL seconds: priority 1 at or above THRESHOLD_CRITICAL,
    priority 2 otherwise. Below the warning threshold nothing is sent and
    no explicit clear is issued, so the last event is left to lapse through
    its TTL.
    """
    print("CPU Usage Detector started")
    print(f" Aggregator: {AGGREGATOR_URL}")
    print(f" Interval: {CHECK_INTERVAL}s, TTL: {TTL}s")
    print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
    print()

    # The sample below already blocks for one second, so sleep for the rest
    # of the interval. Clamp at zero: time.sleep() raises ValueError on a
    # negative argument, which CHECK_INTERVAL=0 would otherwise produce.
    # NOTE(review): with TTL equal to CHECK_INTERVAL (both default to 30) the
    # event may lapse just before the next heartbeat arrives -- confirm
    # against the aggregator's expiry handling.
    pause = max(CHECK_INTERVAL - 1, 0)

    while True:
        cpu = psutil.cpu_percent(interval=1)  # 1-second blocking sample

        if cpu >= THRESHOLD_CRITICAL:
            priority = 1
        elif cpu >= THRESHOLD_WARNING:
            priority = 2
        else:
            priority = None

        if priority is None:
            print(f"[OK] CPU: {cpu:.0f}%")
        else:
            send_event(AGGREGATOR_URL, EVENT_ID, priority, f"CPU {cpu:.0f}%", ttl=TTL)

        time.sleep(pause)
if __name__ == "__main__": if __name__ == "__main__":