infra/roles/tools/templates/prometheus/rules/alerts.yml.j2
jack fde51352d7 feat: migrate monitoring to tools server, fix Outline S3 uploads
Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma)
moved from main to tools server. Prometheus now scrapes main exporters
over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100.
Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot
PROMETHEUS_URL updated to http://ip_tools:9090.

Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support
per-object ACLs — caused upload failures). Add CORS configuration task
for browser-side presigned uploads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 04:10:28 +07:00

86 lines
4.1 KiB
Django/Jinja

# Generated by Ansible — do not edit manually
groups:
- name: host
rules:
- alert: HighCPULoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
severity: warning
annotations:
summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "CPU загружен более 85% на протяжении 5 минут."
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Использование RAM превысило 85%."
- alert: CriticalMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 2m
labels:
severity: critical
annotations:
summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "RAM заполнена на 95%+. Возможны OOM kills."
- alert: DiskSpaceWarning
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
for: 5m
labels:
severity: warning
annotations:
summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
- alert: DiskSpaceCritical
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
for: 2m
labels:
severity: critical
annotations:
summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
- alert: SwapUsageHigh
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Swap используется более чем на 50% — RAM под давлением."
- name: containers
rules:
- alert: ContainerDown
expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
for: 2m
labels:
severity: critical
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
description: "Контейнер не отвечает более 2 минут."
- alert: ContainerHighMemory
expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
description: "Контейнер близок к mem_limit — возможен OOM kill."
- alert: ContainerRestarting
expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
for: 0m
labels:
severity: warning
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
description: "Контейнер не активен — проверьте docker ps."