diff --git a/roles/services/templates/env.j2 b/roles/services/templates/env.j2 index 04f2ece..a246209 100644 --- a/roles/services/templates/env.j2 +++ b/roles/services/templates/env.j2 @@ -8,7 +8,6 @@ DOMAIN_GIT={{ domain_git }} DOMAIN_PLANE={{ domain_plane }} DOMAIN_TRAEFIK={{ domain_traefik }} FORGEJO_RUNNER_TOKEN={{ forgejo_runner_token }} -GRAFANA_ADMIN_PASSWORD={{ grafana_admin_password }} CROWDSEC_BOUNCER_KEY={{ crowdsec_bouncer_key }} # Cloudflare DNS-01 ACME challenge CF_DNS_API_TOKEN={{ cloudflare_dns_api_token }} diff --git a/roles/services/templates/prometheus/alertmanager.yml.j2 b/roles/services/templates/prometheus/alertmanager.yml.j2 deleted file mode 100644 index 877527c..0000000 --- a/roles/services/templates/prometheus/alertmanager.yml.j2 +++ /dev/null @@ -1,38 +0,0 @@ -# Generated by Ansible — do not edit manually -global: - resolve_timeout: 5m - -route: - group_by: [alertname, severity] - group_wait: 30s - group_interval: 5m - repeat_interval: 4h - receiver: all - -receivers: - - name: all - telegram_configs: - - bot_token: "{{ alertmanager_telegram_token }}" - chat_id: {{ alertmanager_telegram_chat_id }} - message: | - {{ '{{' }} range .Alerts {{ '}}' }} - {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}* - {{ '{{' }} .Annotations.summary {{ '}}' }} - {{ '{{' }} .Annotations.description {{ '}}' }} - {{ '{{' }} end {{ '}}' }} - parse_mode: Markdown - discord_configs: - - webhook_url: "{{ discord_webhook_alerts }}" - title: >- - {{ '{{' }} if eq (index .Alerts 0).Status "firing" {{ '}}' }}🔴 Alert{{ '{{' }} else {{ '}}' }}🟢 Resolved{{ '{{' }} end {{ '}}' }} - message: | - {{ '{{' }} range .Alerts {{ '}}' }} - **{{ '{{' }} .Labels.alertname {{ '}}' }}** - {{ '{{' }} .Annotations.summary {{ '}}' }} - {{ '{{' }} .Annotations.description {{ '}}' }} - {{ '{{' }} end {{ '}}' }} - -inhibit_rules: - - source_matchers: [severity="critical"] - target_matchers: [severity="warning"] - equal: [alertname] diff --git a/roles/services/templates/prometheus/prometheus.yml.j2 b/roles/services/templates/prometheus/prometheus.yml.j2 deleted file mode 100644 index b7d3e91..0000000 --- a/roles/services/templates/prometheus/prometheus.yml.j2 +++ /dev/null @@ -1,31 +0,0 @@ -# Generated by Ansible — do not edit manually -global: - scrape_interval: 15s - evaluation_interval: 15s - external_labels: - instance: "{{ domain_base }}" - -alerting: - alertmanagers: - - static_configs: - - targets: ["alertmanager:9093"] - -rule_files: - - /etc/prometheus/rules/*.yml - -scrape_configs: - - job_name: prometheus - static_configs: - - targets: ["localhost:9090"] - - - job_name: node-exporter - static_configs: - - targets: ["node-exporter:9100"] - - - job_name: cadvisor - static_configs: - - targets: ["cadvisor:8080"] - - - job_name: alertmanager - static_configs: - - targets: ["alertmanager:9093"] diff --git a/roles/services/templates/prometheus/rules/alerts.yml.j2 b/roles/services/templates/prometheus/rules/alerts.yml.j2 deleted file mode 100644 index 4e8a5c2..0000000 --- a/roles/services/templates/prometheus/rules/alerts.yml.j2 +++ /dev/null @@ -1,86 +0,0 @@ -# Generated by Ansible — do not edit manually -groups: - - name: host - rules: - - alert: HighCPULoad - expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "CPU загружен более 85% на протяжении 5 минут." - - - alert: HighMemoryUsage - expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 - for: 5m - labels: - severity: warning - annotations: - summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "Использование RAM превысило 85%." - - - alert: CriticalMemoryUsage - expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 - for: 2m - labels: - severity: critical - annotations: - summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "RAM заполнена на 95%+. Возможны OOM kills." - - - alert: DiskSpaceWarning - expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75 - for: 5m - labels: - severity: warning - annotations: - summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%." - - - alert: DiskSpaceCritical - expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90 - for: 2m - labels: - severity: critical - annotations: - summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%." - - - alert: SwapUsageHigh - expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50 - for: 5m - labels: - severity: warning - annotations: - summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" - description: "Swap используется более чем на 50% — RAM под давлением." - - - name: containers - rules: - - alert: ContainerDown - expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60 - for: 2m - labels: - severity: critical - annotations: - summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен" - description: "Контейнер не отвечает более 2 минут." - - - alert: ContainerHighMemory - expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90 - for: 5m - labels: - severity: warning - annotations: - summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти" - description: "Контейнер близок к mem_limit — возможен OOM kill." - - - alert: ContainerRestarting - expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0 - for: 0m - labels: - severity: warning - annotations: - summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается" - description: "Контейнер не активен — проверьте docker ps."