Files
TTN-VEGAPULS-Air-exporter/prometheus-alerts.yml
2026-01-04 11:07:38 +01:00

205 lines
7.5 KiB
YAML

# Prometheus Alert Rules for VEGAPULS Air Sensors
#
# Installation:
# 1. Copy this file to /etc/prometheus/rules/vegapuls-alerts.yml
# 2. Add to prometheus.yml:
# rule_files:
# - /etc/prometheus/rules/vegapuls-alerts.yml
# 3. Reload Prometheus: systemctl reload prometheus
groups:
# Alerting rules for the VEGAPULS Air exporter and the sensors it reports.
# Sequences are written flush with their parent key ("- " at the key's column).
- name: ttn_vegapuls_air_alerts
  # Every rule in this group is evaluated once per minute.
  interval: 60s
  rules:
  # === Exporter Health ===
  # Fires when the scrape target of job "vegapuls-air" has reported up == 0
  # for 5 minutes.
  # NOTE(review): `up == 0` yields no series when the target is missing from
  # the scrape configuration, so that case is not covered by this rule.
  - alert: VEGAPULSExporterDown
    expr: up{job="vegapuls-air"} == 0
    for: 5m
    labels:
      severity: critical
      component: exporter
    annotations:
      summary: "VEGAPULS Air exporter is down"
      description: "The VEGAPULS Air Prometheus exporter has been down for more than 5 minutes. Check the service status."
      runbook: "Check systemctl status vegapuls-exporter and journalctl -u vegapuls-exporter"

  # === Device Online Status ===
  # Fires per device once its online flag has been 0 for 10 minutes.
  # NOTE(review): the "19 hours" in the description is presumably the
  # exporter's own offline threshold - confirm against the exporter config.
  - alert: VEGAPULSSensorOffline
    expr: vegapulsair_device_online == 0
    for: 10m
    labels:
      severity: warning
      component: sensor
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} is offline"
      description: "Sensor {{ $labels.device_id }} has not sent an uplink for more than 19 hours and is considered offline."
      runbook: "Check sensor battery, LoRaWAN coverage, and TTN Console for error messages"

  # Fires per device when the last uplink is more than 24h (86400 s) old,
  # sustained for 30 minutes.
  # The metric is compared with the threshold directly: it is treated as the
  # elapsed seconds since the last uplink (as its name and the
  # humanizeDuration annotation below imply - confirm against the exporter).
  # Subtracting an elapsed duration from time() would give a value close to
  # the current Unix timestamp, which always exceeds 86400.
  - alert: VEGAPULSSensorMissing
    expr: vegapulsair_last_uplink_seconds_ago > 86400
    for: 30m
    labels:
      severity: critical
      component: sensor
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} missing for over 24h"
      description: "Sensor {{ $labels.device_id }} has not transmitted for over 24 hours. Last uplink: {{ $value | humanizeDuration }} ago."
      runbook: "Physical inspection required. Check sensor power and installation."

  # === Battery Monitoring ===
  # Three tiers on the same gauge: < 10 % (critical, 1h), < 20 % (warning, 6h)
  # and < 30 % (info, 12h). The thresholds overlap, so a sensor below 10 %
  # satisfies all three expressions.
  - alert: VEGAPULSBatteryCritical
    expr: vegapulsair_battery_percent < 10
    for: 1h
    labels:
      severity: critical
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery critically low"
      description: "Battery level at {{ $value }}%. Sensor will stop functioning soon. Immediate replacement required."
      runbook: "Schedule urgent battery replacement"

  - alert: VEGAPULSBatteryLow
    expr: vegapulsair_battery_percent < 20
    for: 6h
    labels:
      severity: warning
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery low"
      description: "Battery level at {{ $value }}%. Plan battery replacement soon."
      runbook: "Schedule battery replacement within 2-4 weeks"

  - alert: VEGAPULSBatteryWarning
    expr: vegapulsair_battery_percent < 30
    for: 12h
    labels:
      severity: info
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery below 30%"
      description: "Battery level at {{ $value }}%. Monitor and plan replacement."
      runbook: "Add to maintenance schedule for next quarter"

  # === Signal Quality ===
  # Fires per device when RSSI stays below -120 dBm for one hour.
  - alert: VEGAPULSWeakSignal
    expr: vegapulsair_rssi_dbm < -120
    for: 1h
    labels:
      severity: warning
      component: network
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} has weak signal"
      description: "RSSI is {{ $value }} dBm (very weak). May indicate coverage issues or antenna problems."
      runbook: "Check gateway coverage, sensor placement, and antenna connection"

  # Fires per device when SNR stays below -15 dB for one hour.
  - alert: VEGAPULSPoorSNR
    expr: vegapulsair_snr_db < -15
    for: 1h
    labels:
      severity: warning
      component: network
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} has poor SNR"
      description: "Signal-to-Noise Ratio is {{ $value }} dB. Signal quality is degraded."
      runbook: "Check for interference, gateway issues, or repositioning sensor"

  # === Temperature Monitoring ===
  # Fires per device when the temperature is above 60 or below -20 for
  # 30 minutes. `or` is the PromQL set union of the two filtered vectors.
  - alert: VEGAPULSTemperatureExtreme
    expr: |
      vegapulsair_temperature_celsius > 60 or
      vegapulsair_temperature_celsius < -20
    for: 30m
    labels:
      severity: warning
      component: environment
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} extreme temperature"
      description: "Temperature is {{ $value }}°C, outside normal operating range."
      runbook: "Check sensor location and environmental conditions"

  # === Data Quality ===
  # Fires when the exporter's request counter is increasing while it reports
  # zero known devices, sustained for 15 minutes. `and` keeps left-hand
  # series that have a right-hand series with identical labels.
  - alert: VEGAPULSNoDataReceived
    expr: |
      rate(vegapulsair_exporter_requests_total[5m]) > 0 and
      vegapulsair_devices_total == 0
    for: 15m
    labels:
      severity: warning
      component: integration
    annotations:
      summary: "VEGAPULS exporter receiving no device data"
      description: "Exporter is running and being scraped, but no device data is available. Check MQTT connection and TTN configuration."
      runbook: "Check exporter logs, TTN Console live data, and MQTT credentials"

  # Fires when at least one device is known but none is online for 30 minutes.
  # $value is taken from the left-hand side, i.e. the total device count.
  - alert: VEGAPULSAllDevicesOffline
    expr: |
      vegapulsair_devices_total > 0 and
      vegapulsair_devices_online == 0
    for: 30m
    labels:
      severity: critical
      component: system
    annotations:
      summary: "All VEGAPULS sensors are offline"
      description: "{{ $value }} devices are registered but none are online. System-wide issue suspected."
      runbook: "Check TTN gateway status, network connectivity, and power supply"

  # === Performance Monitoring ===
  # Fires when the exporter's request counter grows by more than 2 per second
  # (5-minute rate) for 10 minutes.
  - alert: VEGAPULSHighScrapeRate
    expr: rate(vegapulsair_exporter_requests_total[5m]) > 2
    for: 10m
    labels:
      severity: info
      component: performance
    annotations:
      summary: "High scrape rate on VEGAPULS exporter"
      description: "Prometheus is scraping at {{ $value }} requests/second. Consider increasing scrape_interval."
      runbook: "Review Prometheus configuration and adjust scrape_interval if needed"
# === Recording Rules for Easier Querying ===
# Precomputed series derived from the exporter's gauges, evaluated once per
# minute. Sequences are written flush with their parent key.
- name: vegapuls_air_recording_rules
  interval: 60s
  rules:
  # Battery drain rate (percent per day).
  # The battery level is a gauge that normally decreases, so deriv() (the
  # per-second slope from a linear regression over the window) is used.
  # rate() is meant for counters and interprets every decrease as a counter
  # reset, which does not describe a draining battery.
  # Multiplying by -86400 converts to "percent lost per day", positive while
  # the battery is draining.
  - record: vegapulsair_battery_drain_rate_percent_per_day
    expr: |
      deriv(vegapulsair_battery_percent[7d]) * -86400

  # Average signal strength per device (7 day).
  - record: vegapulsair_rssi_avg_7d
    expr: |
      avg_over_time(vegapulsair_rssi_dbm[7d])

  # Uplink frequency (uplinks per day): one day in seconds divided by the
  # 7-day mean of the "seconds since last uplink" gauge.
  # NOTE(review): if that gauge climbs from 0 to the uplink interval between
  # uplinks, its mean is roughly half the interval and this value would be
  # about twice the real frequency - confirm against the exporter.
  - record: vegapulsair_uplink_frequency_per_day
    expr: |
      86400 / avg_over_time(vegapulsair_last_uplink_seconds_ago[7d])

  # Device availability percentage (24h): mean of the 0/1 online flag over
  # the last 24 hours, scaled to percent.
  - record: vegapulsair_device_availability_percent_24h
    expr: |
      avg_over_time(vegapulsair_device_online[24h]) * 100
# === Usage Examples ===
#
# Query battery drain rate:
# vegapulsair_battery_drain_rate_percent_per_day
#
# Query devices with availability < 95%:
# vegapulsair_device_availability_percent_24h < 95
#
# Query average RSSI over 7 days:
# vegapulsair_rssi_avg_7d
#
# Query uplink frequency:
# vegapulsair_uplink_frequency_per_day