infra/roles/services/templates/prometheus/rules/alerts.yml.j2
jack 1f03022086
Some checks failed
CI/CD / syntax-check (push) Successful in 53s
CI/CD / deploy (push) Failing after 57s
fix: correct invalid PromQL in ContainerHighMemory alert rule
Cannot use comparison operators inside label matchers {}.
Move the > 0 filter outside braces as a scalar filter on the
denominator — idiomatic Prometheus way to exclude unlimited containers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-22 03:59:56 +07:00

86 lines
4.1 KiB
Django/Jinja

# Generated by Ansible — do not edit manually
groups:
# Rule group "host": node-level CPU, memory, disk and swap alerts built on
# node_* metrics. Annotation texts are user-facing and kept verbatim; the
# quoted-brace Jinja expressions in them render literal Prometheus template
# delimiters into the final rules file.
- name: host
  rules:
  # Non-idle CPU share, averaged per instance over 5m, stays above 85%.
  - alert: HighCPULoad
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "CPU загружен более 85% на протяжении 5 минут."

  # Used memory (1 - MemAvailable / MemTotal) above 85% for 5 minutes.
  - alert: HighMemoryUsage
    expr: >-
      (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
      > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "Использование RAM превысило 85%."

  # Same ratio as HighMemoryUsage with a 95% threshold and a shorter 2m hold.
  - alert: CriticalMemoryUsage
    expr: >-
      (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
      > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "RAM заполнена на 95%+. Возможны OOM kills."

  # Filesystem usage above 75%; tmpfs, overlay and aufs mounts are excluded.
  - alert: DiskSpaceWarning
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

  # Same expression as DiskSpaceWarning with a 90% threshold and a 2m hold.
  - alert: DiskSpaceCritical
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

  # Swap in use (1 - SwapFree / SwapTotal) above 50% for 5 minutes.
  - alert: SwapUsageHigh
    expr: >-
      (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100
      > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
      description: "Swap используется более чем на 50% — RAM под давлением."
# Rule group "containers": per-container alerts built on container_* metrics,
# restricted to series that carry a non-empty `name` label. Annotation texts
# are user-facing and kept verbatim.
- name: containers
  rules:
  # Fires when no named container series exists at all, or when a container's
  # last-seen timestamp is more than 60 seconds old.
  # NOTE(review): the absent() branch presumably returns a series without a
  # `name` label, so the summary would render an empty name for it - confirm.
  - alert: ContainerDown
    expr: >-
      absent(container_last_seen{name=~".+"})
      or time() - container_last_seen{name=~".+"} > 60
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
      description: "Контейнер не отвечает более 2 минут."

  # Memory usage above 90% of the configured limit. The `> 0` filter on the
  # denominator drops containers whose limit is reported as 0.
  - alert: ContainerHighMemory
    expr: >-
      (container_memory_usage_bytes{name=~".+"}
      / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
      description: "Контейнер близок к mem_limit — возможен OOM kill."

  # Last-seen timestamp has not advanced in 5m and CPU usage rate is zero.
  # `and` intersects on identical label sets; the CPU counter carries an extra
  # `cpu` label that container_last_seen lacks, so without ignoring(cpu) the
  # two sides never match and the alert cannot fire.
  - alert: ContainerRestarting
    expr: >-
      increase(container_last_seen{name=~".+"}[5m]) == 0
      and ignoring(cpu)
      rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
      description: "Контейнер не активен — проверьте docker ps."