diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..576a78f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+# syntax=docker/dockerfile:1
+FROM python:3.11-slim
+
+# Set metadata
+LABEL maintainer="mail@hendrikschutter.com" \
+      description="Prometheus exporter for VEGAPULS Air sensors via The Things Network" \
+      version="2.0"
+
+# Flush stdout/stderr immediately so log output shows up in `docker logs` without delay
+ENV PYTHONUNBUFFERED=1
+
+# Create app directory
+WORKDIR /app
+
+# Create the non-root user first and hand it the (still empty) app directory, so the
+# files copied below can be owned via COPY --chown instead of a recursive chown layer
+RUN useradd -r -u 1000 -g users exporter && \
+    chown exporter:users /app
+
+# Install dependencies; requirements.txt is copied on its own so this layer is only
+# rebuilt when the dependency list changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application files. The script is named ttn-vegapulsair-exporter.py in this
+# repository; a COPY of a non-existent source aborts the build.
+# NOTE: config.py holds the TTN API key (ttn_key). Keep real credentials out of the
+# image and bind-mount a configured config.py over /app/config.py at runtime.
+COPY --chown=exporter:users ttn-vegapulsair-exporter.py config.py ./
+
+# Switch to non-root user
+USER exporter
+
+# Expose metrics port. The server binds to config.hostName, whose shipped default is
+# 127.0.0.1; set it to "0.0.0.0" or the port is unreachable from outside the container.
+EXPOSE 9106
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
+    CMD python -c 'import urllib.request; urllib.request.urlopen("http://localhost:9106/health")' || exit 1
+
+# Run the exporter
+CMD ["python", "ttn-vegapulsair-exporter.py"]
diff --git a/README.md b/README.md
index 073a6e2..09cc2e9 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,232 @@
-# The Things Network Exporter for VEGAPULS Air
+# TTN VEGAPULS Air Prometheus Exporter
 
-Export metrics of a VEGAPULS Air connected via TTN as a prometheus service.
+A robust Prometheus exporter for VEGAPULS Air sensors connected via The Things Network (TTN). This exporter provides reliable monitoring with automatic reconnection, uplink caching, and timeout detection.
 
-## Install ## +## Features -- `zypper install python311-paho-mqtt` -- `mkdir /opt/ttn-vegapulsair-exporter/` -- `cd /opt/ttn-vegapulsair-exporter/` -- import `ttn-vegapulsair-exporter.py` and `config.py` -- Set the constants in `config.py` -- `chmod +x /opt/ttn-vegapulsair-exporter/ttn-vegapulsair-exporter.py` -- `chown -R prometheus /opt/ttn-vegapulsair-exporter/` -- `nano /etc/systemd/system/ttn-vegapulsair-exporter.service` -- `systemctl daemon-reload && systemctl enable --now ttn-vegapulsair-exporter.service` +- **Uplink Caching**: Stores historical data with timestamps for each device +- **Timeout Detection**: Automatically detects offline sensors (configurable, default 19 hours) +- **Better Error Handling**: Comprehensive logging and error recovery +- **Multiple Device Support**: Automatically handles multiple sensors + +## Metrics Exported + +### Exporter Metrics +- `vegapulsair_exporter_uptime_seconds` - Exporter uptime in seconds +- `vegapulsair_exporter_requests_total` - Total number of metrics requests +- `vegapulsair_devices_total` - Total number of known devices +- `vegapulsair_devices_online` - Number of currently online devices + +### Per-Device Metrics +All device metrics include a `device_id` label: + +#### Status Metrics +- `vegapulsair_device_online{device_id="..."}` - Device online status (1=online, 0=offline) +- `vegapulsair_last_uplink_seconds_ago{device_id="..."}` - Seconds since last uplink + +#### Sensor Measurements +- `vegapulsair_distance_mm{device_id="..."}` - Distance measurement in millimeters +- `vegapulsair_temperature_celsius{device_id="..."}` - Temperature in Celsius +- `vegapulsair_inclination_degrees{device_id="..."}` - Inclination in degrees +- `vegapulsair_linear_percent{device_id="..."}` - Linear percentage +- `vegapulsair_percent{device_id="..."}` - Percentage value +- `vegapulsair_scaled_value{device_id="..."}` - Scaled measurement value +- `vegapulsair_battery_percent{device_id="..."}` - Remaining battery percentage + 
+#### LoRaWAN Metadata +- `vegapulsair_rssi_dbm{device_id="..."}` - RSSI in dBm +- `vegapulsair_channel_rssi_dbm{device_id="..."}` - Channel RSSI in dBm +- `vegapulsair_snr_db{device_id="..."}` - Signal-to-Noise Ratio in dB + +## Requirements + +- Python 3.7 or higher +- `paho-mqtt` library + +## Installation + +### Option 1: Manual Installation + +1. **Install Python dependencies:** + ```bash + pip install paho-mqtt --break-system-packages + # Or use a virtual environment: + python3 -m venv venv + source venv/bin/activate + pip install paho-mqtt + ``` + +2. **Create installation directory:** + ```bash + sudo mkdir -p /opt/ttn-vegapuls-exporter + cd /opt/ttn-vegapuls-exporter + ``` + +3. **Copy files:** + ```bash + sudo cp ttn-vegapuls-exporter.py /opt/ttn-vegapuls-exporter/ + sudo cp config.py /opt/ttn-vegapuls-exporter/ + sudo chmod +x /opt/ttn-vegapuls-exporter/ttn-vegapuls-exporter.py + ``` + +4. **Configure the exporter:** + ```bash + sudo nano /opt/ttn-vegapuls-exporter/config.py + ``` + + Set the following required parameters: + - `ttn_user`: Your TTN application ID (format: `your-app-id@ttn`) + - `ttn_key`: Your TTN API key (get from TTN Console) + - `ttn_region`: Your TTN region (EU1, NAM1, AU1, etc.) + +5. **Set permissions:** + ```bash + sudo useradd -r -s /bin/false prometheus # If user doesn't exist + sudo chown -R prometheus:prometheus /opt/ttn-vegapuls-exporter + ``` + +6. **Install systemd service:** + ```bash + sudo cp ttn-vegapuls-exporter.service /etc/systemd/system/ + sudo systemctl daemon-reload + sudo systemctl enable ttn-vegapuls-exporter.service + sudo systemctl start ttn-vegapuls-exporter.service + ``` + +7. **Check status:** + ```bash + sudo systemctl status ttn-vegapuls-exporter.service + sudo journalctl -u ttn-vegapuls-exporter.service -f + ``` + +### Option 2: Docker Installation +See `docker-compose.yml`. 
+ +## Configuration + +Edit `config.py` to customize the exporter: + +```python +# HTTP Server configuration +hostName = "0.0.0.0" # Listen address +serverPort = 9106 # Port for metrics endpoint + +# TTN Configuration +ttn_user = "your-app@ttn" +ttn_key = "NNSXS...." # From TTN Console +ttn_region = "EU1" + +# Timeout configuration +sensor_timeout_hours = 19 # Mark sensor offline after N hours + +# Logging +log_level = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL +``` + +### Getting TTN Credentials + +1. Log in to [TTN Console](https://console.cloud.thethings.network/) +2. Select your application +3. Go to **Integrations** → **MQTT** +4. Copy the following: + - **Username**: Your application ID (format: `your-app-id@ttn`) + - **Password**: Generate an API key with "Read application traffic" permission + - **Region**: Your cluster region (visible in the URL, e.g., `eu1`) + +## Prometheus Configuration + +Add to your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: 'vegapuls-air' + static_configs: + - targets: ['localhost:9106'] + scrape_interval: 60s + scrape_timeout: 10s +``` + +### Example Prometheus Alerts + +See `prometheus-alerts.yml`. + +## Troubleshooting + +### No Metrics Appearing + +1. **Check MQTT connection:** + ```bash + sudo journalctl -u ttn-vegapuls-exporter.service | grep MQTT + ``` + + You should see: `Successfully connected to TTN MQTT broker` + +2. **Verify TTN credentials:** + - Ensure `ttn_user` format is correct: `your-app-id@ttn` + - Verify API key has "Read application traffic" permission + - Check region matches your TTN cluster + +3. **Test metrics endpoint:** + ```bash + curl http://localhost:9106/metrics + ``` + +### MQTT Disconnections + +The exporter now handles disconnections automatically with exponential backoff. 
Check logs: + +```bash +sudo journalctl -u ttn-vegapuls-exporter.service -f +``` + +If disconnections persist: +- Check network connectivity to TTN +- Verify firewall allows outbound port 8883 +- Ensure system time is correct (TLS certificates) + +### Devices Not Appearing + +1. **Verify devices are sending uplinks:** + - Check TTN Console → Applications → Your App → Live Data + - Ensure devices are joined and transmitting + +2. **Check user ID:** + - `ttn_user` must match your TTN application ID exactly + +3. **Verify payload decoder:** + - Devices must have decoded payload in TTN + - Check TTN Payload Formatter is configured + +### Debug Mode + +Enable debug logging in `config.py`: + +```python +log_level = "DEBUG" +``` + +This will show: +- All MQTT messages received +- Cache updates +- Device status changes +- Detailed error information + +### Data Flow + +``` +VEGAPULS Air Sensor + ↓ +LoRaWAN Gateway + ↓ +The Things Network + ↓ +MQTT Broker (TLS) + ↓ +Exporter (caches data) + ↓ +Prometheus (scrapes metrics) +``` + +## License + +See [LICENSE](LICENSE) file for details. 
\ No newline at end of file diff --git a/__pycache__/config.cpython-311.pyc b/__pycache__/config.cpython-311.pyc deleted file mode 100644 index 7cf87be..0000000 Binary files a/__pycache__/config.cpython-311.pyc and /dev/null differ diff --git a/config.py b/config.py index 51d3308..59f4c20 100644 --- a/config.py +++ b/config.py @@ -1,12 +1,39 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" Author: Hendrik Schutter, mail@hendrikschutter.com +""" +Configuration for TTN VEGAPULS Air Prometheus Exporter +Author: Hendrik Schutter, mail@hendrikschutter.com """ +# HTTP Server configuration hostName = "127.0.0.1" serverPort = 9106 exporter_prefix = "vegapulsair_" -ttn_user = "appid@ttn" -ttn_key = "THE APP API KEY FROM TTN CONSOLE" -ttn_region = "EU1" \ No newline at end of file +# TTN MQTT Configuration +# Get your credentials from TTN Console -> Applications -> Your App -> Integrations -> MQTT +ttn_user = "appid@ttn" # Your application ID +ttn_key = "THE APP API KEY FROM TTN CONSOLE" # Your API key +ttn_region = "EU1" # TTN region: EU1, NAM1, AU1, etc. 
+
+# Integration method: "mqtt" or "http"
+# - mqtt: Subscribe to TTN MQTT broker (recommended for real-time updates)
+# - http: HTTP Integration webhook (not implemented yet: the exporter only logs a warning and starts no data source)
+integration_method = "mqtt"
+
+# Timeout configuration
+# Time in hours after which a sensor is considered offline if no uplink is received
+sensor_timeout_hours = 19
+
+# MQTT specific settings
+mqtt_keepalive = 60  # MQTT keepalive interval in seconds
+mqtt_reconnect_delay = 5  # Delay between reconnection attempts in seconds
+mqtt_reconnect_max_delay = 300  # Maximum delay between reconnection attempts
+
+# Logging configuration
+log_level = "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
+log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Cache configuration
+cache_cleanup_interval = 3600  # Cleanup old cache entries every hour
+max_cache_age_hours = 72  # Remove cache entries older than 72 hours
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..1ffe6e1
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,75 @@
+version: '3.8'
+
+services:
+  ttn-vegapuls-exporter:
+    image: python:3.11-slim
+    container_name: ttn-vegapuls-exporter
+    restart: unless-stopped
+
+    # Install dependencies and run exporter.
+    # The container runs as UID 1000 (see "user" below), which can neither write to the
+    # image's site-packages nor to a home directory, so the dependency is installed into
+    # /tmp and made importable through PYTHONPATH. The version range mirrors
+    # requirements.txt; "exec" lets Python replace the shell so it receives the stop signal.
+    entrypoint:
+      - "sh"
+      - "-c"
+      - >-
+        pip install --no-cache-dir --target /tmp/site-packages 'paho-mqtt>=2.0.0,<3.0.0'
+        && exec python ttn-vegapulsair-exporter.py
+
+    working_dir: /app
+
+    # Expose metrics port (only reachable if config.py sets hostName = "0.0.0.0";
+    # the shipped default 127.0.0.1 accepts connections from inside the container only)
+    ports:
+      - "9106:9106"
+
+    # Mount application files (read-only)
+    volumes:
+      - ./ttn-vegapulsair-exporter.py:/app/ttn-vegapulsair-exporter.py:ro
+      - ./config.py:/app/config.py:ro
+
+    # Environment variables
+    environment:
+      - PYTHONUNBUFFERED=1
+      - PYTHONPATH=/tmp/site-packages
+
+    # Health check
+    healthcheck:
+      test: ["CMD-SHELL", "python -c 'import urllib.request; urllib.request.urlopen(\"http://localhost:9106/health\")' || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+
+    # Resource limits
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+          cpus: '0.05'
+        reservations:
+          memory: 64M
+
+    # Logging configuration
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+    # Network configuration
+    networks:
+      - monitoring
+
+    # Security options
+    security_opt:
+      - no-new-privileges:true
+
+    # Run as non-root user
+    user: "1000:1000"
+
+networks:
+  monitoring:
+    driver: bridge
diff --git a/prometheus-alerts.yml b/prometheus-alerts.yml
new file mode 100644
index 0000000..5670485
--- /dev/null
+++ b/prometheus-alerts.yml
@@ -0,0 +1,204 @@
+# Prometheus Alert Rules for VEGAPULS Air Sensors
+#
+# Installation:
+# 1. Copy this file to /etc/prometheus/rules/vegapuls-alerts.yml
+# 2. Add to prometheus.yml:
+#    rule_files:
+#      - /etc/prometheus/rules/vegapuls-alerts.yml
+# 3. Reload Prometheus: systemctl reload prometheus
+
+groups:
+  - name: ttn_vegapuls_air_alerts
+    interval: 60s
+    rules:
+      # === Exporter Health ===
+
+      - alert: VEGAPULSExporterDown
+        expr: up{job="vegapuls-air"} == 0
+        for: 5m
+        labels:
+          severity: critical
+          component: exporter
+        annotations:
+          summary: "VEGAPULS Air exporter is down"
+          description: "The VEGAPULS Air Prometheus exporter has been down for more than 5 minutes. Check the service status."
+          runbook: "Check systemctl status vegapuls-exporter and journalctl -u vegapuls-exporter"
+
+      # === Device Online Status ===
+
+      - alert: VEGAPULSSensorOffline
+        expr: vegapulsair_device_online == 0
+        for: 10m
+        labels:
+          severity: warning
+          component: sensor
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} is offline"
+          description: "Sensor {{ $labels.device_id }} has not sent an uplink for more than 19 hours and is considered offline."
+          runbook: "Check sensor battery, LoRaWAN coverage, and TTN Console for error messages"
+
+      - alert: VEGAPULSSensorMissing
+        # The metric already holds the age of the last uplink in seconds, so it is compared directly
+        expr: vegapulsair_last_uplink_seconds_ago > 86400
+        for: 30m
+        labels:
+          severity: critical
+          component: sensor
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} missing for over 24h"
+          description: "Sensor {{ $labels.device_id }} has not transmitted for over 24 hours. Last uplink: {{ $value | humanizeDuration }} ago."
+          runbook: "Physical inspection required. Check sensor power and installation."
+
+      # === Battery Monitoring ===
+
+      - alert: VEGAPULSBatteryCritical
+        expr: vegapulsair_battery_percent < 10
+        for: 1h
+        labels:
+          severity: critical
+          component: battery
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} battery critically low"
+          description: "Battery level at {{ $value }}%. Sensor will stop functioning soon. Immediate replacement required."
+          runbook: "Schedule urgent battery replacement"
+
+      - alert: VEGAPULSBatteryLow
+        expr: vegapulsair_battery_percent < 20
+        for: 6h
+        labels:
+          severity: warning
+          component: battery
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} battery low"
+          description: "Battery level at {{ $value }}%. Plan battery replacement soon."
+          runbook: "Schedule battery replacement within 2-4 weeks"
+
+      - alert: VEGAPULSBatteryWarning
+        expr: vegapulsair_battery_percent < 30
+        for: 12h
+        labels:
+          severity: info
+          component: battery
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} battery below 30%"
+          description: "Battery level at {{ $value }}%. Monitor and plan replacement."
+          runbook: "Add to maintenance schedule for next quarter"
+
+      # === Signal Quality ===
+
+      - alert: VEGAPULSWeakSignal
+        expr: vegapulsair_rssi_dbm < -120
+        for: 1h
+        labels:
+          severity: warning
+          component: network
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} has weak signal"
+          description: "RSSI is {{ $value }} dBm (very weak). May indicate coverage issues or antenna problems."
+          runbook: "Check gateway coverage, sensor placement, and antenna connection"
+
+      - alert: VEGAPULSPoorSNR
+        expr: vegapulsair_snr_db < -15
+        for: 1h
+        labels:
+          severity: warning
+          component: network
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} has poor SNR"
+          description: "Signal-to-Noise Ratio is {{ $value }} dB. Signal quality is degraded."
+          runbook: "Check for interference, gateway issues, or repositioning sensor"
+
+      # === Temperature Monitoring ===
+
+      - alert: VEGAPULSTemperatureExtreme
+        expr: |
+          vegapulsair_temperature_celsius > 60 or
+          vegapulsair_temperature_celsius < -20
+        for: 30m
+        labels:
+          severity: warning
+          component: environment
+        annotations:
+          summary: "VEGAPULS sensor {{ $labels.device_id }} extreme temperature"
+          description: "Temperature is {{ $value }}°C, outside normal operating range."
+          runbook: "Check sensor location and environmental conditions"
+
+      # === Data Quality ===
+
+      - alert: VEGAPULSNoDataReceived
+        expr: |
+          rate(vegapulsair_exporter_requests_total[5m]) > 0 and
+          vegapulsair_devices_total == 0
+        for: 15m
+        labels:
+          severity: warning
+          component: integration
+        annotations:
+          summary: "VEGAPULS exporter receiving no device data"
+          description: "Exporter is running and being scraped, but no device data is available. Check MQTT connection and TTN configuration."
+          runbook: "Check exporter logs, TTN Console live data, and MQTT credentials"
+
+      - alert: VEGAPULSAllDevicesOffline
+        expr: |
+          vegapulsair_devices_total > 0 and
+          vegapulsair_devices_online == 0
+        for: 30m
+        labels:
+          severity: critical
+          component: system
+        annotations:
+          summary: "All VEGAPULS sensors are offline"
+          description: "{{ $value }} devices are registered but none are online. System-wide issue suspected."
+          runbook: "Check TTN gateway status, network connectivity, and power supply"
+
+      # === Performance Monitoring ===
+
+      - alert: VEGAPULSHighScrapeRate
+        expr: rate(vegapulsair_exporter_requests_total[5m]) > 2
+        for: 10m
+        labels:
+          severity: info
+          component: performance
+        annotations:
+          summary: "High scrape rate on VEGAPULS exporter"
+          description: "Prometheus is scraping at {{ $value }} requests/second. Consider increasing scrape_interval."
+          runbook: "Review Prometheus configuration and adjust scrape_interval if needed"
+
+# === Recording Rules for Easier Querying ===
+
+  - name: vegapuls_air_recording_rules
+    interval: 60s
+    rules:
+      # Battery drain rate (percent per day); deriv() because the battery level is a gauge, not a counter
+      - record: vegapulsair_battery_drain_rate_percent_per_day
+        expr: |
+          deriv(vegapulsair_battery_percent[7d]) * -86400
+
+      # Average signal strength per device (7 day)
+      - record: vegapulsair_rssi_avg_7d
+        expr: |
+          avg_over_time(vegapulsair_rssi_dbm[7d])
+
+      # Uplink frequency (uplinks per day)
+      - record: vegapulsair_uplink_frequency_per_day
+        expr: |
+          86400 / avg_over_time(vegapulsair_last_uplink_seconds_ago[7d])
+
+      # Device availability percentage (24h)
+      - record: vegapulsair_device_availability_percent_24h
+        expr: |
+          avg_over_time(vegapulsair_device_online[24h]) * 100
+
+# === Usage Examples ===
+#
+# Query battery drain rate:
+#   vegapulsair_battery_drain_rate_percent_per_day
+#
+# Query devices with availability < 95%:
+#   vegapulsair_device_availability_percent_24h < 95
+#
+# Query average RSSI over 7 days:
+#   vegapulsair_rssi_avg_7d
+#
+# Query uplink frequency:
+#   vegapulsair_uplink_frequency_per_day
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cedd4d5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+# TTN VEGAPULS Air Exporter - Python Dependencies
+
+# MQTT client for connecting to The Things Network
+paho-mqtt>=2.0.0,<3.0.0
diff --git a/ttn-vegapulsair-exporter.py b/ttn-vegapulsair-exporter.py
index cce64a5..d5aa6fb 100644
--- 
a/ttn-vegapulsair-exporter.py +++ b/ttn-vegapulsair-exporter.py @@ -1,286 +1,595 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" Author: Hendrik Schutter, mail@hendrikschutter.com +""" +TTN VEGAPULS Air Prometheus Exporter +Exports metrics from VEGAPULS Air sensors connected via The Things Network + +Author: Hendrik Schutter, mail@hendrikschutter.com """ -from http.server import BaseHTTPRequestHandler, HTTPServer -import paho.mqtt.client as mqtt -from datetime import datetime, timedelta -import threading -import time -import json import sys -import config +import json +import time +import threading import logging import ssl +from datetime import datetime, timedelta +from http.server import BaseHTTPRequestHandler, HTTPServer +from typing import Dict, Optional, Any + +import paho.mqtt.client as mqtt + +import config -scrape_healthy = True -startTime = datetime.now() -lastMqttReception = datetime.now() -node_metrics = list() -mutex = threading.Lock() -request_count = 0 +class SensorDataCache: + """Thread-safe cache for sensor uplink data with timeout tracking""" -mqtt_client = None -mqtt_connected = False -mqtt_lock = threading.Lock() + def __init__(self, timeout_hours: int = 19): + self._data: Dict[str, Dict[str, Any]] = {} + self._lock = threading.RLock() + self.timeout_hours = timeout_hours -def monitor_timeout(): - global scrape_healthy - global lastMqttReception - global mqtt_connected + def update( + self, device_id: str, payload: Dict, metadata: list, timestamp: datetime + ): + """ + Update cached data for a device - while True: - time_since_last_reception = datetime.now() - lastMqttReception - if time_since_last_reception > timedelta(hours=config.ttn_timeout): - with mutex: - scrape_healthy = False - mqtt_connected = False - time.sleep(60) # Check timeout every minute + Args: + device_id: Unique device identifier + payload: Decoded payload from TTN + metadata: RX metadata from TTN + timestamp: Timestamp of the uplink + """ + with self._lock: + 
self._data[device_id] = { + "payload": payload, + "metadata": metadata, + "timestamp": timestamp, + "is_online": True, + } + logging.info(f"Updated cache for device {device_id}") -def reconnect_mqtt(): - global mqtt_client - global mqtt_connected + def get_all_devices(self) -> Dict[str, Dict[str, Any]]: + """ + Get all cached device data - while True: - if not mqtt_connected: - with mqtt_lock: - try: - if mqtt_client is None: - print("MQTT client is None, creating a new client...") - mqtt_client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION2) - mqtt_client.on_connect = on_connect - mqtt_client.on_message = on_message - mqtt_client.on_disconnect = on_disconnect - mqtt_client.username_pw_set(config.ttn_user, config.ttn_key) - mqtt_client.tls_set() + Returns: + Dictionary of device data + """ + with self._lock: + return dict(self._data) - print("Attempting to reconnect to MQTT broker...") - mqtt_client.connect( - config.ttn_region.lower() + ".cloud.thethings.network", 8883, 60 + def check_timeouts(self): + """Check all devices for timeout and mark offline ones""" + with self._lock: + now = datetime.now() + timeout_threshold = timedelta(hours=self.timeout_hours) + + for device_id, data in self._data.items(): + time_since_update = now - data["timestamp"] + was_online = data["is_online"] + data["is_online"] = time_since_update < timeout_threshold + + if was_online and not data["is_online"]: + logging.warning( + f"Device {device_id} marked as OFFLINE " + f"(no uplink for {time_since_update.total_seconds()/3600:.1f} hours)" ) - except Exception as e: - print(f"MQTT reconnect failed: {e}") - time.sleep(60) # Retry every 10 seconds + elif not was_online and data["is_online"]: + logging.info(f"Device {device_id} is back ONLINE") + + def cleanup_old_entries(self, max_age_hours: int = 72): + """Remove entries older than max_age_hours""" + with self._lock: + now = datetime.now() + max_age = timedelta(hours=max_age_hours) + + devices_to_remove = [ + device_id + for device_id, 
data in self._data.items() + if now - data["timestamp"] > max_age + ] + + for device_id in devices_to_remove: + del self._data[device_id] + logging.info(f"Removed stale cache entry for device {device_id}") -class RequestHandler(BaseHTTPRequestHandler): - def log_message(self, format, *args): - pass +class TTNMQTTClient: + """Manages MQTT connection to TTN with automatic reconnection""" - def get_metrics(self): - global request_count - global node_metrics - global mutex - mutex.acquire() - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - self.wfile.write( - bytes( - config.exporter_prefix - + "exporter_duration_seconds_sum " - + str(int((datetime.now() - startTime).total_seconds())) - + "\n", - "utf-8", - ) - ) - self.wfile.write( - bytes( - config.exporter_prefix - + "exporter_request_count " - + str(request_count) - + "\n", - "utf-8", - ) - ) - self.wfile.write( - bytes( - config.exporter_prefix - + "exporter_scrape_healthy " - + str(int(scrape_healthy)) - + "\n", - "utf-8", - ) - ) + def __init__(self, cache: SensorDataCache, config_module): + self.cache = cache + self.config = config_module + self.client: Optional[mqtt.Client] = None + self.connected = False + self._lock = threading.Lock() + self._should_run = True - for metric in node_metrics: - self.wfile.write(bytes(config.exporter_prefix + metric + "\n", "utf-8")) + # Setup logging + self.logger = logging.getLogger("TTNMQTTClient") - mutex.release() + def _on_connect(self, client, userdata, flags, reason_code, properties): + """Callback when connected to MQTT broker""" + if reason_code == 0: + self.logger.info("Successfully connected to TTN MQTT broker") + self.connected = True - def do_GET(self): - global request_count - request_count += 1 - if self.path.startswith("/metrics"): - self.get_metrics() + # Subscribe to uplink messages + topic = f"v3/{self.config.ttn_user}/devices/+/up" + client.subscribe(topic, qos=1) + self.logger.info(f"Subscribed to topic: 
{topic}") else: - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - self.wfile.write(bytes("", "utf-8")) - self.wfile.write( - bytes("
Exporter for VEGAPULS Air sensors connected via The Things Network
+ + + + """ + self.wfile.write(html.encode("utf-8")) - # Set up TLS/SSL - client.tls_set( - cert_reqs=ssl.CERT_REQUIRED, - tls_version=ssl.PROTOCOL_TLSv1_2, # Enforce TLS 1.2 + else: + self.send_response(404) + self.end_headers() + + return RequestHandler + + +class TimeoutMonitor: + """Background thread to monitor device timeouts""" + + def __init__(self, cache: SensorDataCache, config_module): + self.cache = cache + self.config = config_module + self._should_run = True + self.logger = logging.getLogger("TimeoutMonitor") + + def run(self): + """Main monitoring loop""" + while self._should_run: + try: + self.cache.check_timeouts() + + # Also cleanup old entries periodically + if hasattr(self.config, "cache_cleanup_interval"): + self.cache.cleanup_old_entries(self.config.max_cache_age_hours) + + except Exception as e: + self.logger.error(f"Error in timeout monitoring: {e}", exc_info=True) + + # Check every minute + time.sleep(60) + + def stop(self): + """Stop the monitor""" + self._should_run = False + + +def setup_logging(config_module): + """Configure logging""" + log_level = getattr(logging, config_module.log_level.upper(), logging.INFO) + log_format = getattr( + config_module, + "log_format", + "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + logging.basicConfig( + level=log_level, format=log_format, handlers=[logging.StreamHandler(sys.stdout)] ) - client.tls_insecure_set(False) # Enforce strict certificate validation - return client def main(): - global mqtt_client + """Main application entry point""" + # Setup logging + setup_logging(config) + logger = logging.getLogger("Main") - # Start timeout monitoring thread - timeout_thread = threading.Thread(target=monitor_timeout, daemon=True) - timeout_thread.start() + logger.info("=" * 60) + logger.info("TTN VEGAPULS Air Prometheus Exporter") + logger.info("=" * 60) + logger.info(f"Integration Method: {config.integration_method}") + logger.info(f"Sensor Timeout: {config.sensor_timeout_hours} hours") + 
logger.info(f"HTTP Server: {config.hostName}:{config.serverPort}") + logger.info("=" * 60) - # Start MQTT reconnect thread - reconnect_thread = threading.Thread(target=reconnect_mqtt, daemon=True) - reconnect_thread.start() + # Create sensor data cache + cache = SensorDataCache(timeout_hours=config.sensor_timeout_hours) - while True: - mqtt_client = configure_mqtt_client() - try: - # Connect to TTN broker - broker_url = f"{config.ttn_region.lower()}.cloud.thethings.network" - mqtt_client.connect(broker_url, 8883, 60) - - # Subscribe to all topics - mqtt_client.subscribe("#", 1) - logging.info(f"Subscribed to all topics.") + # Start timeout monitor + timeout_monitor = TimeoutMonitor(cache, config) + monitor_thread = threading.Thread( + target=timeout_monitor.run, daemon=True, name="TimeoutMonitor" + ) + monitor_thread.start() + logger.info("Started timeout monitor") - poll_mqtt_thread = threading.Thread(target=poll_mqtt, args=((mqtt_client,))) - poll_mqtt_thread.start() - except Exception as e: - logging.error(f"Error occurred: {e}") - mqtt_client.loop_stop() + # Start MQTT client if configured + mqtt_client = None + mqtt_thread = None + if config.integration_method.lower() == "mqtt": + mqtt_client = TTNMQTTClient(cache, config) + mqtt_thread = threading.Thread( + target=mqtt_client.run_with_reconnect, daemon=True, name="MQTTClient" + ) + mqtt_thread.start() + logger.info("Started MQTT client") + else: + logger.warning(f"Unsupported integration method: {config.integration_method}") + logger.warning("Only 'mqtt' is currently supported") - webServer = HTTPServer((config.hostName, config.serverPort), RequestHandler) - print("Server started http://%s:%s" % (config.hostName, config.serverPort)) + # Start HTTP server + metrics_server = MetricsServer(cache, config) + handler = metrics_server.create_handler() - try: - webServer.serve_forever() - except KeyboardInterrupt: - sys.exit(-1) + try: + http_server = HTTPServer((config.hostName, config.serverPort), handler) + 
logger.info( + f"HTTP server started at http://{config.hostName}:{config.serverPort}" + ) + logger.info("Press Ctrl+C to stop") + + http_server.serve_forever() + + except KeyboardInterrupt: + logger.info("\nShutdown requested by user") + except Exception as e: + logger.error(f"Fatal error: {e}", exc_info=True) + finally: + # Cleanup + logger.info("Shutting down...") + + if mqtt_client: + mqtt_client.stop() + + timeout_monitor.stop() + + logger.info("Shutdown complete") + sys.exit(0) - webServer.server_close() - print("Server stopped.") - poll_mqtt_thread.join() - except Exception as e: - print(e) - time.sleep(60) if __name__ == "__main__": main() diff --git a/ttn-vegapulsair-exporter.service b/ttn-vegapulsair-exporter.service index fb9868c..d072a0c 100644 --- a/ttn-vegapulsair-exporter.service +++ b/ttn-vegapulsair-exporter.service @@ -1,16 +1,45 @@ [Unit] -Description=TTN Exporter for VEGAPULS Air -After=syslog.target -After=network.target +Description=TTN VEGAPULS Air Prometheus Exporter +Documentation=https://git.mosad.xyz/localhorst/TTN-VEGAPULS-Air-exporter +After=network-online.target +Wants=network-online.target [Service] -Restart=on-failure -RestartSec=2s Type=simple User=prometheus Group=prometheus + +# Working directory WorkingDirectory=/opt/ttn-vegapulsair-exporter/ + +# Execution ExecStart=/usr/bin/python3 /opt/ttn-vegapulsair-exporter/ttn-vegapulsair-exporter.py +# Restart configuration +Restart=always +RestartSec=10 + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=ttn-vegapuls-exporter + +# Security settings +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths=/opt/ttn-vegapulsair-exporter/ +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true + +# Resource limits +MemoryLimit=256M +CPUQuota=5% + +# Environment +Environment="PYTHONUNBUFFERED=1" + [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target