Reliability: - Add swap role (2GB, swappiness=10, idempotent via /etc/fstab) - Add mem_limit to plane-worker (512m) and plane-beat (256m) - Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki) Code quality: - Remove Traefik Docker labels (file provider used, labels were dead code) - Add comment explaining file provider architecture Observability: - Add AlertManager with Telegram notifications - Add Prometheus alert rules: CPU, RAM, disk, swap, container health - Add Loki + Promtail for centralized log aggregation - Add Loki datasource to Grafana - Enable Traefik /ping endpoint for health checks Backups: - Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data - 7-day retention, daily cron at 03:00 - Backup script at /usr/local/bin/backup-services Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
86 lines
4.1 KiB
Django/Jinja
86 lines
4.1 KiB
Django/Jinja
# Generated by Ansible — do not edit manually
|
|
groups:
|
|
- name: host
|
|
rules:
|
|
- alert: HighCPULoad
|
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "CPU загружен более 85% на протяжении 5 минут."
|
|
|
|
- alert: HighMemoryUsage
|
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "Использование RAM превысило 85%."
|
|
|
|
- alert: CriticalMemoryUsage
|
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "RAM заполнена на 95%+. Возможны OOM kills."
|
|
|
|
- alert: DiskSpaceWarning
|
|
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
|
|
|
- alert: DiskSpaceCritical
|
|
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
|
|
|
- alert: SwapUsageHigh
|
|
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
|
description: "Swap используется более чем на 50% — RAM под давлением."
|
|
|
|
- name: containers
|
|
rules:
|
|
- alert: ContainerDown
|
|
expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
|
|
description: "Контейнер не отвечает более 2 минут."
|
|
|
|
- alert: ContainerHighMemory
|
|
expr: (container_memory_usage_bytes{name=~".+"} / container_spec_memory_limit_bytes{name=~".+", container_spec_memory_limit_bytes > 0}) * 100 > 90
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
|
|
description: "Контейнер близок к mem_limit — возможен OOM kill."
|
|
|
|
- alert: ContainerRestarting
|
|
expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
|
|
for: 0m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
|
|
description: "Контейнер не активен — проверьте docker ps."
|