infra/roles/services/templates/prometheus/rules/alerts.yml.j2
jack 6ebd237894
Some checks failed
CI/CD / deploy (push) Has been cancelled
CI/CD / syntax-check (push) Successful in 1m7s
feat: major infrastructure improvements
Reliability:
- Add swap role (2GB, swappiness=10, idempotent via /etc/fstab)
- Add mem_limit to plane-worker (512m) and plane-beat (256m)
- Add health checks to all services (traefik, vaultwarden, forgejo,
  plane-*, syncthing, prometheus, grafana, loki)

Code quality:
- Remove Traefik Docker labels (file provider used, labels were dead code)
- Add comment explaining file provider architecture

Observability:
- Add AlertManager with Telegram notifications
- Add Prometheus alert rules: CPU, RAM, disk, swap, container health
- Add Loki + Promtail for centralized log aggregation
- Add Loki datasource to Grafana
- Enable Traefik /ping endpoint for health checks

Backups:
- Add backup role: pg_dump for forgejo + plane DBs, tar for
  vaultwarden and forgejo data
- 7-day retention, daily cron at 03:00
- Backup script at /usr/local/bin/backup-services

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-22 03:28:16 +07:00

86 lines
4.1 KiB
Django/Jinja

# Generated by Ansible — do not edit manually
groups:
# Host-level alerts built from node_* metrics.
# Annotation texts are Russian and are emitted verbatim. Prometheus template
# expressions inside them are wrapped in Jinja raw blocks so that Ansible
# renders the braces through unchanged.
- name: host
  rules:
  # Non-idle CPU share, averaged per instance over 5m, above 85% for 5 minutes.
  - alert: HighCPULoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокая нагрузка CPU ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'CPU загружен более 85% на протяжении 5 минут.'

  # Used memory (1 - MemAvailable / MemTotal) above 85% for 5 minutes.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокое использование RAM ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'Использование RAM превысило 85%.'

  # Same ratio as HighMemoryUsage, but above 95% for 2 minutes and critical.
  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Критическое использование RAM ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'RAM заполнена на 95%+. Возможны OOM kills.'

  # Filesystem usage above 75% for 5 minutes; tmpfs, overlay and aufs
  # filesystems are excluded by the fstype matcher.
  # Folded scalar: the line break becomes one space, so the expression
  # string is identical to the single-line form.
  - alert: DiskSpaceWarning
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Заканчивается место на диске ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: '{% raw %}Диск {{ $labels.mountpoint }} занят на {{ $value | printf "%.0f" }}%.{% endraw %}'

  # Same ratio as DiskSpaceWarning, but above 90% for 2 minutes and critical.
  - alert: DiskSpaceCritical
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Критически мало места на диске ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: '{% raw %}Диск {{ $labels.mountpoint }} занят на {{ $value | printf "%.0f" }}%.{% endraw %}'

  # Swap in use (1 - SwapFree / SwapTotal) above 50% for 5 minutes.
  - alert: SwapUsageHigh
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокое использование swap ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'Swap используется более чем на 50% — RAM под давлением.'
# Container-level alerts built from container_* metrics, restricted to series
# that carry a non-empty name label.
# Annotation texts are Russian and are emitted verbatim. Prometheus template
# expressions inside them are wrapped in Jinja raw blocks so that Ansible
# renders the braces through unchanged.
- name: containers
  rules:
  # Fires when no named container series exists at all, or when a container's
  # last-seen timestamp is more than 60 seconds old, for 2 minutes.
  # NOTE(review): the absent() branch produces a series without a name label,
  # so the summary below renders with an empty container name in that case.
  - alert: ContainerDown
    expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} недоступен{% endraw %}'
      description: 'Контейнер не отвечает более 2 минут.'

  # Memory usage above 90% of the configured limit for 5 minutes.
  # The "> 0" filter is applied to the limit series outside the selector:
  # a comparison is not allowed inside a label-matcher list, and the filter
  # also drops containers whose limit is 0 so they are never divided by.
  - alert: ContainerHighMemory
    expr: >-
      (container_memory_usage_bytes{name=~".+"}
      / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} использует 90%+ памяти{% endraw %}'
      description: 'Контейнер близок к mem_limit — возможен OOM kill.'

  # Fires immediately when a container's last-seen timestamp has not advanced
  # in 5 minutes and its CPU usage rate over the same window is zero.
  # "ignoring (cpu)" is needed because "and" only matches series with equal
  # label sets; presumably the CPU counter carries an extra cpu label that
  # container_last_seen lacks (verify against the deployed cAdvisor). The
  # modifier has no effect if that label is absent.
  - alert: ContainerRestarting
    expr: >-
      increase(container_last_seen{name=~".+"}[5m]) == 0
      and ignoring (cpu) rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} возможно перезапускается{% endraw %}'
      description: 'Контейнер не активен — проверьте docker ps.'