# Prometheus Alert Rules for VEGAPULS Air Sensors
#
# Installation:
# 1. Copy this file to /etc/prometheus/rules/vegapuls-alerts.yml
# 2. Add to prometheus.yml:
#      rule_files:
#        - /etc/prometheus/rules/vegapuls-alerts.yml
# 3. Reload Prometheus: systemctl reload prometheus

groups:
# Alerting rules for the VEGAPULS Air exporter and the sensors it reports.
# The group is evaluated every 60s; each rule's `for:` is how long its
# expression must keep matching before the alert fires.
- name: ttn_vegapuls_air_alerts
  interval: 60s
  rules:
  # === Exporter Health ===

  # The scrape target of job "vegapuls-air" has reported up == 0 for 5m.
  - alert: VEGAPULSExporterDown
    expr: up{job="vegapuls-air"} == 0
    for: 5m
    labels:
      severity: critical
      component: exporter
    annotations:
      summary: "VEGAPULS Air exporter is down"
      description: "The VEGAPULS Air Prometheus exporter has been down for more than 5 minutes. Check the service status."
      runbook: "Check systemctl status vegapuls-exporter and journalctl -u vegapuls-exporter"

  # === Device Online Status ===

  # A sensor's online flag has been 0 for 10m.
  # NOTE(review): the "19 hours" in the description is not derived from this
  # rule; presumably it is the exporter's own offline threshold - confirm.
  - alert: VEGAPULSSensorOffline
    expr: vegapulsair_device_online == 0
    for: 10m
    labels:
      severity: warning
      component: sensor
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} is offline"
      description: "Sensor {{ $labels.device_id }} has not sent an uplink for more than 19 hours and is considered offline."
      runbook: "Check sensor battery, LoRaWAN coverage, and TTN Console for error messages"

  # A sensor's last uplink is more than 24h (86400 s) old, sustained for 30m.
  # The metric is compared directly because it is treated as an age in
  # seconds (per its "_seconds_ago" name and the humanizeDuration annotation
  # below). Subtracting an age from time() yields a value near the current
  # Unix timestamp, which is always > 86400 and would make the alert fire
  # permanently.
  # NOTE(review): assumes the exporter exposes an age, not a Unix
  # timestamp - confirm against the exporter.
  - alert: VEGAPULSSensorMissing
    expr: vegapulsair_last_uplink_seconds_ago > 86400
    for: 30m
    labels:
      severity: critical
      component: sensor
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} missing for over 24h"
      description: "Sensor {{ $labels.device_id }} has not transmitted for over 24 hours. Last uplink: {{ $value | humanizeDuration }} ago."
      runbook: "Physical inspection required. Check sensor power and installation."

  # === Battery Monitoring ===
  # Three tiers with nested thresholds (<10, <20, <30): a battery below 10%
  # also satisfies the two lower-severity expressions.
  # NOTE(review): consider Alertmanager inhibit rules if the duplicate
  # notifications are unwanted.

  # Battery below 10% for 1h.
  - alert: VEGAPULSBatteryCritical
    expr: vegapulsair_battery_percent < 10
    for: 1h
    labels:
      severity: critical
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery critically low"
      description: "Battery level at {{ $value }}%. Sensor will stop functioning soon. Immediate replacement required."
      runbook: "Schedule urgent battery replacement"

  # Battery below 20% for 6h.
  - alert: VEGAPULSBatteryLow
    expr: vegapulsair_battery_percent < 20
    for: 6h
    labels:
      severity: warning
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery low"
      description: "Battery level at {{ $value }}%. Plan battery replacement soon."
      runbook: "Schedule battery replacement within 2-4 weeks"

  # Battery below 30% for 12h.
  - alert: VEGAPULSBatteryWarning
    expr: vegapulsair_battery_percent < 30
    for: 12h
    labels:
      severity: info
      component: battery
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} battery below 30%"
      description: "Battery level at {{ $value }}%. Monitor and plan replacement."
      runbook: "Add to maintenance schedule for next quarter"

  # === Signal Quality ===

  # RSSI below -120 dBm for 1h.
  - alert: VEGAPULSWeakSignal
    expr: vegapulsair_rssi_dbm < -120
    for: 1h
    labels:
      severity: warning
      component: network
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} has weak signal"
      description: "RSSI is {{ $value }} dBm (very weak). May indicate coverage issues or antenna problems."
      runbook: "Check gateway coverage, sensor placement, and antenna connection"

  # SNR below -15 dB for 1h.
  - alert: VEGAPULSPoorSNR
    expr: vegapulsair_snr_db < -15
    for: 1h
    labels:
      severity: warning
      component: network
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} has poor SNR"
      description: "Signal-to-Noise Ratio is {{ $value }} dB. Signal quality is degraded."
      runbook: "Check for interference, gateway issues, or repositioning sensor"

  # === Temperature Monitoring ===

  # Temperature above 60 or below -20 for 30m. `or` is a set union here: a
  # series can only satisfy one side, so $value is the offending reading.
  - alert: VEGAPULSTemperatureExtreme
    expr: |
      vegapulsair_temperature_celsius > 60 or
      vegapulsair_temperature_celsius < -20
    for: 30m
    labels:
      severity: warning
      component: environment
    annotations:
      summary: "VEGAPULS sensor {{ $labels.device_id }} extreme temperature"
      description: "Temperature is {{ $value }}°C, outside normal operating range."
      runbook: "Check sensor location and environmental conditions"

  # === Data Quality ===

  # The exporter's request counter is increasing while it reports zero
  # devices, sustained for 15m.
  # NOTE(review): `and` matches on the full label set; this assumes both
  # metrics carry identical labels - confirm, or add `on(job, instance)`.
  - alert: VEGAPULSNoDataReceived
    expr: |
      rate(vegapulsair_exporter_requests_total[5m]) > 0 and
      vegapulsair_devices_total == 0
    for: 15m
    labels:
      severity: warning
      component: integration
    annotations:
      summary: "VEGAPULS exporter receiving no device data"
      description: "Exporter is running and being scraped, but no device data is available. Check MQTT connection and TTN configuration."
      runbook: "Check exporter logs, TTN Console live data, and MQTT credentials"

  # Devices are registered but the online count is 0 for 30m. The result of
  # `and` keeps the left-hand sample, so $value is vegapulsair_devices_total.
  - alert: VEGAPULSAllDevicesOffline
    expr: |
      vegapulsair_devices_total > 0 and
      vegapulsair_devices_online == 0
    for: 30m
    labels:
      severity: critical
      component: system
    annotations:
      summary: "All VEGAPULS sensors are offline"
      description: "{{ $value }} devices are registered but none are online. System-wide issue suspected."
      runbook: "Check TTN gateway status, network connectivity, and power supply"

  # === Performance Monitoring ===

  # The exporter's request counter grows by more than 2 per second
  # (5m average) for 10m.
  - alert: VEGAPULSHighScrapeRate
    expr: rate(vegapulsair_exporter_requests_total[5m]) > 2
    for: 10m
    labels:
      severity: info
      component: performance
    annotations:
      summary: "High scrape rate on VEGAPULS exporter"
      description: "Prometheus is scraping at {{ $value }} requests/second. Consider increasing scrape_interval."
      runbook: "Review Prometheus configuration and adjust scrape_interval if needed"

# === Recording Rules for Easier Querying ===

# Precomputed per-series aggregates, evaluated every 60s.
- name: vegapuls_air_recording_rules
  interval: 60s
  rules:
  # Battery drain rate (percent per day).
  # vegapulsair_battery_percent is a gauge that falls over time, so deriv()
  # (per-second slope from a linear regression) is used rather than rate():
  # rate() is meant for counters and would treat every decrease as a counter
  # reset. Multiplying by -86400 converts the per-second slope into a
  # positive percent-per-day drain.
  - record: vegapulsair_battery_drain_rate_percent_per_day
    expr: deriv(vegapulsair_battery_percent[7d]) * -86400

  # Average signal strength per device (7 day)
  - record: vegapulsair_rssi_avg_7d
    expr: avg_over_time(vegapulsair_rssi_dbm[7d])

  # Uplink frequency (uplinks per day): seconds per day divided by the
  # 7-day mean age of the last uplink.
  # NOTE(review): if the gauge ramps from 0 up to the uplink interval between
  # uplinks, its mean is about half the interval and this over-estimates the
  # frequency by roughly 2x - confirm against the exporter's behavior.
  - record: vegapulsair_uplink_frequency_per_day
    expr: 86400 / avg_over_time(vegapulsair_last_uplink_seconds_ago[7d])

  # Device availability percentage (24h): mean of the online flag over the
  # last 24h, scaled by 100.
  - record: vegapulsair_device_availability_percent_24h
    expr: avg_over_time(vegapulsair_device_online[24h]) * 100

# === Usage Examples ===
#
# Query battery drain rate:
#   vegapulsair_battery_drain_rate_percent_per_day
#
# Query devices with availability < 95%:
#   vegapulsair_device_availability_percent_24h < 95
#
# Query average RSSI over 7 days:
#   vegapulsair_rssi_avg_7d
#
# Query uplink frequency:
#   vegapulsair_uplink_frequency_per_day