Rewrote CPU Detector to expire correctly
This commit is contained in:
@@ -7,9 +7,12 @@ import requests
|
|||||||
DEFAULT_AGGREGATOR_URL = "http://localhost:5100"
|
DEFAULT_AGGREGATOR_URL = "http://localhost:5100"
|
||||||
|
|
||||||
|
|
||||||
def send_event(url, event_id, priority, message, check_interval):
|
def send_event(url, event_id, priority, message, check_interval=None, ttl=None):
|
||||||
"""Send an event to the aggregator with heartbeat TTL."""
|
"""Send an event to the aggregator.
|
||||||
ttl = check_interval * 2
|
Pass ttl= (seconds) directly, or check_interval= to use the heartbeat default TTL (check_interval * 2); at least one of the two must be given.
|
||||||
|
"""
|
||||||
|
if ttl is None:
|
||||||
|
ttl = check_interval * 2
|
||||||
try:
|
try:
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
f"{url}/event",
|
f"{url}/event",
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
"""
|
"""
|
||||||
CPU Usage Detector
|
CPU Usage Detector
|
||||||
Monitors CPU usage and reports to the aggregator when thresholds are exceeded.
|
Polls CPU usage every CHECK_INTERVAL seconds (default 30). Sends a heartbeat event while usage is above
|
||||||
|
threshold; if usage drops below THRESHOLD_WARNING (default 85%), no event is sent and the previous event
|
||||||
|
expires naturally via its TTL (default 30 seconds).
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
|
AGGREGATOR_URL - URL of the aggregator (default: http://localhost:5100)
|
||||||
CHECK_INTERVAL - Seconds between checks (default: 30)
|
CHECK_INTERVAL - Seconds between checks (default: 30)
|
||||||
|
TTL - Event lifetime in seconds (default: 30)
|
||||||
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
|
THRESHOLD_CRITICAL - Percent usage for critical alert (default: 95)
|
||||||
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
|
THRESHOLD_WARNING - Percent usage for warning alert (default: 85)
|
||||||
"""
|
"""
|
||||||
@@ -13,11 +16,11 @@ import os
|
|||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
|
|
||||||
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event, clear_event
|
from detectors.base import DEFAULT_AGGREGATOR_URL, send_event
|
||||||
|
|
||||||
# Configuration from environment
|
|
||||||
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
|
AGGREGATOR_URL = os.environ.get("AGGREGATOR_URL", DEFAULT_AGGREGATOR_URL)
|
||||||
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
|
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", 30))
|
||||||
|
TTL = int(os.environ.get("TTL", 30))
|
||||||
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
|
THRESHOLD_CRITICAL = int(os.environ.get("THRESHOLD_CRITICAL", 95))
|
||||||
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
|
THRESHOLD_WARNING = int(os.environ.get("THRESHOLD_WARNING", 85))
|
||||||
|
|
||||||
@@ -25,35 +28,23 @@ EVENT_ID = "cpu_usage"
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print(f"CPU Usage Detector started")
|
print("CPU Usage Detector started")
|
||||||
print(f" Aggregator: {AGGREGATOR_URL}")
|
print(f" Aggregator: {AGGREGATOR_URL}")
|
||||||
print(f" Interval: {CHECK_INTERVAL}s")
|
print(f" Interval: {CHECK_INTERVAL}s, TTL: {TTL}s")
|
||||||
print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
|
print(f" Thresholds: Warning={THRESHOLD_WARNING}%, Critical={THRESHOLD_CRITICAL}%")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
active_alerts = set()
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
# Get CPU usage over a 1-second sample
|
cpu = psutil.cpu_percent(interval=1) # 1-second blocking sample
|
||||||
cpu_percent = psutil.cpu_percent(interval=1)
|
|
||||||
current_alerts = set()
|
|
||||||
|
|
||||||
if cpu_percent >= THRESHOLD_CRITICAL:
|
if cpu >= THRESHOLD_CRITICAL:
|
||||||
send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
|
send_event(AGGREGATOR_URL, EVENT_ID, 1, f"CPU {cpu:.0f}%", ttl=TTL)
|
||||||
current_alerts.add(EVENT_ID)
|
elif cpu >= THRESHOLD_WARNING:
|
||||||
elif cpu_percent >= THRESHOLD_WARNING:
|
send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU {cpu:.0f}%", ttl=TTL)
|
||||||
send_event(AGGREGATOR_URL, EVENT_ID, 2, f"CPU at {cpu_percent:.0f}%", CHECK_INTERVAL)
|
|
||||||
current_alerts.add(EVENT_ID)
|
|
||||||
else:
|
else:
|
||||||
print(f"[OK] CPU: {cpu_percent:.0f}%")
|
print(f"[OK] CPU: {cpu:.0f}%")
|
||||||
|
|
||||||
# Clear alerts that are no longer active
|
time.sleep(CHECK_INTERVAL - 1) # 29s sleep + 1s sample = 30s total
|
||||||
for eid in active_alerts - current_alerts:
|
|
||||||
clear_event(AGGREGATOR_URL, eid)
|
|
||||||
|
|
||||||
active_alerts = current_alerts
|
|
||||||
|
|
||||||
time.sleep(CHECK_INTERVAL - 1) # Account for 1s sample time
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user