Reliability: - Add swap role (2GB, swappiness=10, idempotent via /etc/fstab) - Add mem_limit to plane-worker (512m) and plane-beat (256m) - Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki) Code quality: - Remove Traefik Docker labels (file provider used, labels were dead code) - Add comment explaining file provider architecture Observability: - Add AlertManager with Telegram notifications - Add Prometheus alert rules: CPU, RAM, disk, swap, container health - Add Loki + Promtail for centralized log aggregation - Add Loki datasource to Grafana - Enable Traefik /ping endpoint for health checks Backups: - Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data - 7-day retention, daily cron at 03:00 - Backup script at /usr/local/bin/backup-services Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
28 lines
883 B
Django/Jinja
28 lines
883 B
Django/Jinja
# Generated by Ansible — do not edit manually

# Defaults that apply to the whole Alertmanager instance.
global:
  resolve_timeout: 5m

# Root route: every alert is grouped and delivered to the single
# "telegram" receiver defined in this file.
route:
  # Alerts sharing both labels are batched into one notification.
  group_by:
    - alertname
    - severity
  # Delay before the first notification of a new group.
  group_wait: 30s
  # Delay before notifying about changes to an already-notified group.
  group_interval: 5m
  # How often a still-firing group is re-sent.
  repeat_interval: 4h
  receiver: telegram
# Single receiver: all notifications go to one Telegram chat.
# The token and chat id are supplied by Ansible variables.
receivers:
  - name: telegram
    telegram_configs:
      - bot_token: "{{ alertmanager_telegram_token }}"
        # Intentionally unquoted: Alertmanager expects chat_id to be an
        # integer, so the variable must render as a bare number.
        chat_id: {{ alertmanager_telegram_chat_id }}
        # HTML instead of legacy Markdown: with Markdown, any unbalanced
        # "_", "*", "`" or "[" in an alert name or annotation (for example
        # a container called plane_worker) makes Telegram reject the whole
        # message. In HTML mode Alertmanager escapes the substituted values.
        parse_mode: HTML
        # The Go-template delimiters below are emitted through Jinja string
        # literals so that Ansible passes them through to Alertmanager
        # instead of trying to evaluate them itself.
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} <b>{{ '{{' }} .Labels.alertname {{ '}}' }}</b>
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}
# While a critical alert is firing, mute the warning-level alert that
# carries the same alertname so only the more severe one is delivered.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal:
      - alertname