Reliability:
- Add swap role (2GB, swappiness=10, idempotent via /etc/fstab)
- Add mem_limit to plane-worker (512m) and plane-beat (256m)
- Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki)

Code quality:
- Remove Traefik Docker labels (file provider used, labels were dead code)
- Add comment explaining file provider architecture

Observability:
- Add AlertManager with Telegram notifications
- Add Prometheus alert rules: CPU, RAM, disk, swap, container health
- Add Loki + Promtail for centralized log aggregation
- Add Loki datasource to Grafana
- Enable Traefik /ping endpoint for health checks

Backups:
- Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data
- 7-day retention, daily cron at 03:00
- Backup script at /usr/local/bin/backup-services

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
31 lines
648 B
Django/Jinja
31 lines
648 B
Django/Jinja
# Generated by Ansible — do not edit manually
# Prometheus server configuration (Jinja template; `domain_base` is
# substituted at render time).

# Defaults applied to every scrape job and to rule evaluation.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels attached to series/alerts sent to external systems.
  external_labels:
    instance: "{{ domain_base }}"

# Where firing alerts are delivered.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Alerting/recording rule files loaded by the server.
rule_files:
  - /etc/prometheus/rules/*.yml

# Scrape targets, one job per exporter/service.
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: node-exporter
    static_configs:
      - targets: ["node-exporter:9100"]

  - job_name: cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]

  - job_name: alertmanager
    static_configs:
      - targets: ["alertmanager:9093"]