Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma) moved from main to tools server. Prometheus now scrapes main exporters over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100. Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot PROMETHEUS_URL updated to http://ip_tools:9090. Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support per-object ACLs — caused upload failures). Add CORS configuration task for browser-side presigned uploads. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
86 lines
4.1 KiB
Django/Jinja
86 lines
4.1 KiB
Django/Jinja
# Generated by Ansible — do not edit manually
groups:
# Host-level alert group built on node_* metrics (presumably scraped from
# node_exporter - confirm against the Prometheus scrape config).
#
# Annotation values that contain Prometheus Go-template expressions are wrapped
# in a Jinja raw block so the Ansible template step emits them verbatim and
# Prometheus expands them when the alert fires.
- name: host
  rules:
  # Non-idle CPU share, averaged per instance over 5m, above 85%.
  - alert: HighCPULoad
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокая нагрузка CPU ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'CPU загружен более 85% на протяжении 5 минут.'

  # Share of memory that is not "available" above 85%.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокое использование RAM ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'Использование RAM превысило 85%.'

  # Same ratio as HighMemoryUsage with a 95% threshold and a shorter hold time.
  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Критическое использование RAM ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'RAM заполнена на 95%+. Возможны OOM kills.'

  # Used share per filesystem above 75%; tmpfs/overlay/aufs are filtered out.
  # The folded scalar joins both lines with one space into a single-line query.
  - alert: DiskSpaceWarning
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Заканчивается место на диске ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: '{% raw %}Диск {{ $labels.mountpoint }} занят на {{ $value | printf "%.0f" }}%.{% endraw %}'

  # Same ratio as DiskSpaceWarning with a 90% threshold and a shorter hold time.
  - alert: DiskSpaceCritical
    expr: >-
      (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Критически мало места на диске ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: '{% raw %}Диск {{ $labels.mountpoint }} занят на {{ $value | printf "%.0f" }}%.{% endraw %}'

  # Share of swap that is not free above 50%.
  - alert: SwapUsageHigh
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Высокое использование swap ({{ $value | printf "%.0f" }}%){% endraw %}'
      description: 'Swap используется более чем на 50% — RAM под давлением.'
# Container-level alert group built on container_* metrics (presumably exported
# by cAdvisor - confirm against the Prometheus scrape config). Only series with
# a non-empty "name" label are considered.
#
# Annotation values that contain Prometheus Go-template expressions are wrapped
# in a Jinja raw block so the Ansible template step emits them verbatim and
# Prometheus expands them when the alert fires.
- name: containers
  rules:
  # Fires when no named container series exists at all, or when a container's
  # last-seen timestamp is more than 60s old.
  # NOTE(review): the absent() branch yields a series without a "name" label,
  # so the summary renders with an empty container name in that case.
  - alert: ContainerDown
    expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} недоступен{% endraw %}'
      description: 'Контейнер не отвечает более 2 минут.'

  # Memory relative to the configured limit above 90%. Containers without a
  # limit are dropped by the "> 0" filter on the denominator.
  # Uses the working set rather than total usage: total usage also counts
  # reclaimable page cache, so cache-heavy containers would sit near the limit
  # permanently without being at risk of the OOM kill this alert warns about.
  - alert: ContainerHighMemory
    expr: >-
      (container_memory_working_set_bytes{name=~".+"}
      / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} использует 90%+ памяти{% endraw %}'
      description: 'Контейнер близок к mem_limit — возможен OOM kill.'

  # Fires immediately (for: 0m) when the last-seen timestamp did not move and
  # no CPU time was consumed during the last 5 minutes.
  # NOTE(review): expression kept as-is, but it likely needs rework:
  #   - increase() is intended for counters, while container_last_seen looks
  #     like a timestamp gauge;
  #   - "and" without on()/ignoring() only matches series whose label sets are
  #     identical; if the CPU metric carries extra labels (for example a
  #     per-CPU label) the alert can never fire. Verify in the expression
  #     browser before relying on it.
  - alert: ContainerRestarting
    expr: >-
      increase(container_last_seen{name=~".+"}[5m]) == 0
      and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: '{% raw %}Контейнер {{ $labels.name }} возможно перезапускается{% endraw %}'
      description: 'Контейнер не активен — проверьте docker ps.'