feat: migrate monitoring to tools server, fix Outline S3 uploads

Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma) moved from main to tools server. Prometheus now scrapes main exporters over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100. Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot PROMETHEUS_URL updated to http://ip_tools:9090. Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support per-object ACLs — caused upload failures). Add CORS configuration task for browser-side presigned uploads. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 04:10:28 +07:00 · 2026-03-27 04:10:28 +07:00 · fde51352d7
commit fde51352d7
parent d6015b76a3
20 changed files with 17172 additions and 210 deletions
--- a/roles/services/tasks/configs.yml
+++ b/roles/services/tasks/configs.yml
@ -53,77 +53,19 @@
    mode: "0644"
  notify: Restart stack

- name: Deploy Prometheus config
-  ansible.builtin.template:
-    src: prometheus/prometheus.yml.j2
-    dest: "{{ services_root }}/prometheus/prometheus.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy Grafana datasource provisioning
-  ansible.builtin.template:
-    src: grafana/provisioning/datasources/prometheus.yml.j2
-    dest: "{{ services_root }}/grafana/provisioning/datasources/prometheus.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy Grafana dashboard provisioning config
-  ansible.builtin.template:
-    src: grafana/provisioning/dashboards/dashboards.yml.j2
-    dest: "{{ services_root }}/grafana/provisioning/dashboards/dashboards.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy Node Exporter Full dashboard JSON
-  ansible.builtin.copy:
-    src: grafana/dashboards/node-exporter-full.json
-    dest: "{{ services_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy cAdvisor dashboard JSON
-  ansible.builtin.copy:
-    src: grafana/dashboards/cadvisor.json
-    dest: "{{ services_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy Prometheus alert rules
-  ansible.builtin.template:
-    src: prometheus/rules/alerts.yml.j2
-    dest: "{{ services_root }}/prometheus/rules/alerts.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy AlertManager config
-  ansible.builtin.template:
-    src: prometheus/alertmanager.yml.j2
-    dest: "{{ services_root }}/prometheus/alertmanager.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
- name: Deploy Loki config
-  ansible.builtin.template:
-    src: loki/loki.yml.j2
-    dest: "{{ services_root }}/loki/loki.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
+# Applies the CORS policy that lets browsers on the wiki origin upload straight
+# to the walava-outline bucket (presigned uploads). Runs the AWS CLI from a
+# throwaway container against the Timeweb S3 endpoint.
+#
+# Credentials are handed to the docker client through the task environment and
+# forwarded with value-less `-e NAME` flags, so they never appear in the process
+# argv, and `no_log` keeps them out of Ansible output. `command` + `argv` avoids
+# shell parsing entirely (keys containing shell metacharacters cannot break or
+# inject into the command line).
+# NOTE(review): the image tag is still `latest`; consider pinning a version.
+- name: Configure CORS on walava-outline S3 bucket (required for browser uploads)
+  ansible.builtin.command:
+    argv:
+      - docker
+      - run
+      - --rm
+      - -e
+      - AWS_ACCESS_KEY_ID
+      - -e
+      - AWS_SECRET_ACCESS_KEY
+      - -e
+      - AWS_DEFAULT_REGION=ru-1
+      - amazon/aws-cli:latest
+      - --endpoint-url
+      - https://s3.timeweb.cloud
+      - s3api
+      - put-bucket-cors
+      - --bucket
+      - walava-outline
+      - --cors-configuration
+      # to_json keeps the rendered value a JSON string (no dict auto-conversion).
+      - "{{ outline_cors_policy | to_json }}"
+  environment:
+    AWS_ACCESS_KEY_ID: "{{ s3_access_key }}"
+    AWS_SECRET_ACCESS_KEY: "{{ s3_secret_key }}"
+  vars:
+    outline_cors_policy:
+      CORSRules:
+        - AllowedOrigins: ["https://{{ domain_wiki }}"]
+          AllowedMethods: ["GET", "PUT", "POST", "DELETE", "HEAD"]
+          AllowedHeaders: ["*"]
+          ExposeHeaders: ["ETag"]
+          MaxAgeSeconds: 3000
+  register: outline_cors_result
+  no_log: true
+  # The call is idempotent on the bucket, so it is never reported as a change.
+  changed_when: false
+  # Best effort: a CORS failure must not abort the deploy, but it is reported below.
+  ignore_errors: true
+
+- name: Report failed CORS update on walava-outline S3 bucket
+  ansible.builtin.debug:
+    msg: >-
+      put-bucket-cors failed (rc={{ outline_cors_result.rc | default('n/a') }});
+      browser uploads to Outline may be rejected until CORS is configured.
+  when: outline_cors_result is failed

 - name: Deploy Promtail config
  ansible.builtin.template:
@ -134,15 +76,6 @@
    mode: "0644"
  notify: Restart stack

- name: Deploy Grafana Loki datasource
-  ansible.builtin.template:
-    src: grafana/provisioning/datasources/loki.yml.j2
-    dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
-    owner: "{{ deploy_user }}"
-    group: "{{ deploy_group }}"
-    mode: "0644"
-  notify: Restart stack
-
 - name: Deploy CrowdSec acquisition config
  ansible.builtin.template:
    src: crowdsec/acquis.yaml.j2
--- a/roles/services/tasks/directories.yml
+++ b/roles/services/tasks/directories.yml
@ -22,12 +22,6 @@
    - plane/pgdata
    - plane/media
    - act_runner
-    - prometheus
-    - grafana/provisioning/datasources
-    - grafana/provisioning/dashboards
-    - grafana/provisioning/dashboards/json
-    - prometheus/rules
    - loki
    - traefik/logs
    - crowdsec
-    - authelia
--- a/roles/services/tasks/main.yml
+++ b/roles/services/tasks/main.yml
@ -16,15 +16,10 @@
    - "{{ plane_redis_image }}"
    - "{{ plane_minio_image }}"
    - "{{ act_runner_image }}"
-    - "{{ prometheus_image }}"
    - "{{ node_exporter_image }}"
    - "{{ cadvisor_image }}"
-    - "{{ grafana_image }}"
-    - "{{ alertmanager_image }}"
-    - "{{ loki_image }}"
    - "{{ promtail_image }}"
    - "{{ crowdsec_image }}"
-    - "{{ uptime_kuma_image }}"
    - "{{ outline_image }}"
    - "{{ outline_db_image }}"
    - "{{ outline_redis_image }}"
@ -35,6 +30,21 @@
  delay: 30
  until: pull_result.rc == 0

+# ── UFW: allow tools Prometheus to scrape exporters on main ──────────────────
+- name: Allow tools server to scrape node-exporter
+  community.general.ufw:
+    rule: allow
+    port: "9100"  # node-exporter port published in docker-compose.yml.j2
+    proto: tcp
+    src: "{{ ip_tools }}"  # only the tools host is allowed in; NOTE(review): Docker-published ports may bypass UFW — confirm the port is not open to other sources
+
+- name: Allow tools server to scrape cAdvisor
+  community.general.ufw:
+    rule: allow
+    port: "8080"  # cAdvisor port published in docker-compose.yml.j2
+    proto: tcp
+    src: "{{ ip_tools }}"  # same source restriction as the node-exporter rule
+
 - name: Remove legacy SMTP relay UFW rule (port 1025)
  community.general.ufw:
    rule: allow
--- a/roles/services/templates/docker-compose.yml.j2
+++ b/roles/services/templates/docker-compose.yml.j2
@ -40,11 +40,7 @@ volumes:
  plane_minio_data:
  plane_media:
  act_runner_data:
-  prometheus_data:
-  grafana_data:
-  loki_data:
  crowdsec_data:
-  uptime_kuma_data:
  outline_db_data:
  outline_redis_data:
  n8n_data:
@ -381,52 +377,16 @@ services:
      - backend
      - runner-jobs

-  # ── Monitoring Stack ───────────────────────────────────────────────────────
-  prometheus:
-    image: {{ prometheus_image }}
-    container_name: prometheus
-    restart: unless-stopped
-    networks:
-      - monitoring
-    volumes:
-      - prometheus_data:/prometheus
-      - {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
-      - {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
-    command:
-      - "--config.file=/etc/prometheus/prometheus.yml"
-      - "--storage.tsdb.path=/prometheus"
-      - "--storage.tsdb.retention.time=30d"
-      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
-      - "--web.console.templates=/usr/share/prometheus/consoles"
-    healthcheck:
-      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-
-  alertmanager:
-    image: {{ alertmanager_image }}
-    container_name: alertmanager
-    restart: unless-stopped
-    networks:
-      - monitoring
-    volumes:
-      - {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
-    command:
-      - "--config.file=/etc/alertmanager/alertmanager.yml"
-      - "--storage.path=/alertmanager"
-    healthcheck:
-      test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-
+  # ── Monitoring exporters (metrics scraped by tools Prometheus over network) ──
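+  # Ports published on this (main) host: UFW on main must allow ip_tools to reach 9100/8080.
+  # NOTE(review): Docker-published ports are not filtered by UFW by default — confirm 9100/8080 are not reachable from other hosts.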
  node-exporter:
    image: {{ node_exporter_image }}
    container_name: node-exporter
    restart: unless-stopped
    networks:
      - monitoring
+    ports:
+      - "9100:9100"
    pid: host
    volumes:
      - /proc:/host/proc:ro
@ -443,6 +403,8 @@ services:
    restart: unless-stopped
    networks:
      - monitoring
+    ports:
+      - "8080:8080"
    privileged: true
    devices:
      - /dev/kmsg
@ -453,50 +415,7 @@ services:
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro

-  grafana:
-    image: {{ grafana_image }}
-    container_name: grafana
-    restart: unless-stopped
-    security_opt:
-      - no-new-privileges:true
-    depends_on:
-      - prometheus
-    networks:
-      - backend
-      - monitoring
-    volumes:
-      - grafana_data:/var/lib/grafana
-      - {{ services_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
-    environment:
-      - GF_SECURITY_ADMIN_USER=admin
-      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
-      - GF_USERS_ALLOW_SIGN_UP=false
-      - GF_SERVER_DOMAIN={{ domain_dashboard }}
-      - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
-      - GF_AUTH_ANONYMOUS_ENABLED=false
-    healthcheck:
-      test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-
-  # ── Logging Stack ──────────────────────────────────────────────────────────
-  loki:
-    image: {{ loki_image }}
-    container_name: loki
-    restart: unless-stopped
-    networks:
-      - monitoring
-    volumes:
-      - loki_data:/loki
-      - {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
-    command: -config.file=/etc/loki/local-config.yaml
-    healthcheck:
-      test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-
+  # ── Logging (Promtail pushes to Loki on tools server) ─────────────────────
  promtail:
    image: {{ promtail_image }}
    container_name: promtail
@ -544,12 +463,11 @@ services:
      FORGEJO_TOKEN:  "${FORGEJO_RUNNER_TOKEN}"
      FORGEJO_URL:    "https://{{ domain_git }}"
      FORGEJO_REPO:   "jack/infra"
-      PROMETHEUS_URL: "http://prometheus:9090"
+      PROMETHEUS_URL: "http://{{ ip_tools }}:9090"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
-      - proxy       # Discord API (internet)
-      - monitoring  # Prometheus metrics
+      - proxy  # Discord API (internet) + reach tools server over public IP

  # ── Walava Landing ─────────────────────────────────────────────────────────
  # Landing page for walava.io — image built by walava-web repo CI/CD
@ -560,26 +478,6 @@ services:
    networks:
      - proxy

-  # ── Uptime Kuma ────────────────────────────────────────────────────────────
-  # Мониторинг доступности сервисов + публичная статус-страница
-  # Доступен по адресу: https://{{ domain_status }}
-  uptime-kuma:
-    image: {{ uptime_kuma_image }}
-    container_name: uptime-kuma
-    restart: unless-stopped
-    security_opt:
-      - no-new-privileges:true
-    networks:
-      - backend
-      - proxy  # needs internet access for Discord/Telegram notifications
-    volumes:
-      - uptime_kuma_data:/app/data
-    healthcheck:
-      test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-

  # ── Outline wiki ────────────────────────────────────────────────────────────
  outline:
--- a/roles/services/templates/env.outline.j2
+++ b/roles/services/templates/env.outline.j2
@ -21,7 +21,6 @@ AWS_REGION=ru-1
 AWS_S3_UPLOAD_BUCKET_NAME=walava-outline
 AWS_S3_UPLOAD_BUCKET_URL=https://s3.timeweb.cloud
 AWS_S3_FORCE_PATH_STYLE=true
-AWS_S3_ACL=private
 FILE_STORAGE=s3

 # Auth
--- a/roles/services/templates/loki/promtail.yml.j2
+++ b/roles/services/templates/loki/promtail.yml.j2
@ -7,7 +7,7 @@ positions:
  filename: /tmp/positions.yaml

 clients:
-  - url: http://loki:3100/loki/api/v1/push
+  - url: http://{{ ip_tools }}:3100/loki/api/v1/push

 scrape_configs:
  - job_name: docker
--- a/roles/services/templates/traefik/dynamic/routes.yml.j2
+++ b/roles/services/templates/traefik/dynamic/routes.yml.j2
@ -134,12 +134,12 @@ http:
    grafana:
      loadBalancer:
        servers:
-          - url: "http://grafana:3000"
+          - url: "http://{{ ip_tools }}:3000"

    uptime-kuma:
      loadBalancer:
        servers:
-          - url: "http://uptime-kuma:3001"
+          - url: "http://{{ ip_tools }}:3001"

    walava-landing:
      loadBalancer:
--- a/roles/tools/defaults/main.yml
+++ b/roles/tools/defaults/main.yml
@ -1,2 +1,11 @@
 ---
 tools_root: /opt/tools
+
+# Image versions (mirrors services role — keep in sync)
+prometheus_image:    "prom/prometheus:v3.4.0"
+node_exporter_image: "prom/node-exporter:v1.9.1"
+cadvisor_image:      "gcr.io/cadvisor/cadvisor:v0.52.1"
+grafana_image:       "grafana/grafana:11.6.1"
+alertmanager_image:  "prom/alertmanager:v0.28.1"
+loki_image:          "grafana/loki:3.4.3"
+uptime_kuma_image:   "louislam/uptime-kuma:1"  # floating major tag, unlike the exact versions pinned above
--- a/roles/tools/files/grafana/dashboards/cadvisor.json
+++ b/roles/tools/files/grafana/dashboards/cadvisor.json
@ -0,0 +1,817 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "Prometheus as the datasource is obligatory",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "7.4.5"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "table",
+      "name": "Table",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": 14282,
+  "graphTooltip": 0,
+  "id": null,
+  "iteration": 1617715580880,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 8,
+      "panels": [],
+      "title": "CPU",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 7,
+        "w": 24,
+        "x": 0,
+        "y": 1
+      },
+      "hiddenSeries": false,
+      "id": 15,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null as zero",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.5",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "CPU Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:606",
+          "format": "percent",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:607",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "collapsed": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 8
+      },
+      "id": 11,
+      "panels": [],
+      "title": "Memory",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 9
+      },
+      "hiddenSeries": false,
+      "id": 9,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null as zero",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.5",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:606",
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:607",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 9
+      },
+      "hiddenSeries": false,
+      "id": 14,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null as zero",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.5",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": true,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Memory Cached",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:606",
+          "format": "bytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:607",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "collapsed": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 17
+      },
+      "id": 2,
+      "panels": [],
+      "title": "Network",
+      "type": "row"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      },
+      "hiddenSeries": false,
+      "id": 4,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "hideEmpty": false,
+        "hideZero": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "sideWidth": null,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.5",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Received Network Traffic",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:674",
+          "format": "Bps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:675",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 18
+      },
+      "hiddenSeries": false,
+      "id": 6,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.5",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Sent Network Traffic",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:832",
+          "format": "Bps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "$$hashKey": "object:833",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "collapsed": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 26
+      },
+      "id": 19,
+      "panels": [],
+      "title": "Misc",
+      "type": "row"
+    },
+    {
+      "datasource": "${DS_PROMETHEUS}",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "align": null,
+            "filterable": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "id"
+            },
+            "properties": [
+              {
+                "id": "custom.width",
+                "value": 260
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Running"
+            },
+            "properties": [
+              {
+                "id": "unit",
+                "value": "d"
+              },
+              {
+                "id": "decimals",
+                "value": 1
+              },
+              {
+                "id": "custom.displayMode",
+                "value": "color-text"
+              },
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "dark-green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 10,
+        "w": 24,
+        "x": 0,
+        "y": 27
+      },
+      "id": 17,
+      "options": {
+        "showHeader": true,
+        "sortBy": []
+      },
+      "pluginVersion": "7.4.5",
+      "targets": [
+        {
+          "expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400",
+          "format": "table",
+          "instant": true,
+          "interval": "",
+          "legendFormat": "{{name}}",
+          "refId": "A"
+        }
+      ],
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "Containers Info",
+      "transformations": [
+        {
+          "id": "filterFieldsByName",
+          "options": {
+            "include": {
+              "names": [
+                "container_label_com_docker_compose_project",
+                "container_label_com_docker_compose_project_working_dir",
+                "image",
+                "instance",
+                "name",
+                "Value",
+                "container_label_com_docker_compose_service"
+              ]
+            }
+          }
+        },
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {},
+            "indexByName": {},
+            "renameByName": {
+              "Value": "Running",
+              "container_label_com_docker_compose_project": "Label",
+              "container_label_com_docker_compose_project_working_dir": "Working dir",
+              "container_label_com_docker_compose_service": "Service",
+              "image": "Registry Image",
+              "instance": "Instance",
+              "name": "Name"
+            }
+          }
+        }
+      ],
+      "type": "table"
+    }
+  ],
+  "schemaVersion": 27,
+  "style": "dark",
+  "tags": [
+    "cadvisor",
+    "docker"
+  ],
+  "templating": {
+    "list": [
+      {
+        "allValue": ".*",
+        "current": {},
+        "datasource": "${DS_PROMETHEUS}",
+        "definition": "label_values({__name__=~\"container.*\"},instance)",
+        "description": null,
+        "error": null,
+        "hide": 0,
+        "includeAll": true,
+        "label": "Host",
+        "multi": false,
+        "name": "host",
+        "options": [],
+        "query": {
+          "query": "label_values({__name__=~\"container.*\"},instance)",
+          "refId": "Prometheus-host-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 5,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".*",
+        "current": {},
+        "datasource": "${DS_PROMETHEUS}",
+        "definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
+        "description": null,
+        "error": null,
+        "hide": 0,
+        "includeAll": true,
+        "label": "Container",
+        "multi": false,
+        "name": "container",
+        "options": [],
+        "query": {
+          "query": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
+          "refId": "Prometheus-container-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "Cadvisor exporter",
+  "uid": "pMEd7m0Mz",
+  "version": 1,
+  "description": "Simple exporter for cadvisor only"
+}
--- a/roles/tools/files/grafana/dashboards/node-exporter-full.json
+++ b/roles/tools/files/grafana/dashboards/node-exporter-full.json
--- a/roles/tools/tasks/main.yml
+++ b/roles/tools/tasks/main.yml
@ -7,6 +7,29 @@
    group: "{{ deploy_group }}"
    mode: "0750"

+# Lay out the directory tree that the config-deploy tasks further down in
+# this file write into (Prometheus, Grafana provisioning, Loki).
+- name: Create tools subdirectories
+  loop:
+    - prometheus
+    - prometheus/rules
+    - grafana/provisioning/datasources
+    - grafana/provisioning/dashboards
+    - grafana/provisioning/dashboards/json
+    - loki
+  ansible.builtin.file:
+    state: directory
+    path: "{{ tools_root }}/{{ item }}"
+    mode: "0755"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+# Render the compose .env file (owner-only, mode 0600). The template carries
+# the Grafana admin password, so diff output is disabled for this task to
+# keep the secret out of `--diff` runs and their logs.
+- name: Deploy .env file
+  ansible.builtin.template:
+    src: env.j2
+    dest: "{{ tools_root }}/.env"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+    mode: "0600"
+  diff: false
+
 - name: Deploy docker-compose.yml
  ansible.builtin.template:
    src: docker-compose.yml.j2
@ -15,8 +38,130 @@
    group: "{{ deploy_group }}"
    mode: "0640"

+# Render the Prometheus scrape config, its alert rules, the AlertManager
+# routing config and the Loki config into the tools directory tree.
+# NOTE(review): none of these tasks notifies a handler — confirm that the
+# running containers pick up a changed file (restart/reload) some other way.
+- name: Deploy Prometheus config
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/prometheus/prometheus.yml"
+    src: prometheus/prometheus.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy Prometheus alert rules
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/prometheus/rules/alerts.yml"
+    src: prometheus/rules/alerts.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy AlertManager config
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/prometheus/alertmanager.yml"
+    src: prometheus/alertmanager.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy Loki config
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/loki/loki.yml"
+    src: loki/loki.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+# Grafana provisioning: two datasource definitions and the dashboard provider
+# are rendered from templates; the dashboard JSON files are copied verbatim.
+- name: Deploy Grafana Prometheus datasource
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/grafana/provisioning/datasources/prometheus.yml"
+    src: grafana/provisioning/datasources/prometheus.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy Grafana Loki datasource
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/grafana/provisioning/datasources/loki.yml"
+    src: grafana/provisioning/datasources/loki.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy Grafana dashboard provisioning config
+  ansible.builtin.template:
+    dest: "{{ tools_root }}/grafana/provisioning/dashboards/dashboards.yml"
+    src: grafana/provisioning/dashboards/dashboards.yml.j2
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+# Static files: copied, not templated.
+- name: Deploy Node Exporter Full dashboard JSON
+  ansible.builtin.copy:
+    dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
+    src: grafana/dashboards/node-exporter-full.json
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+- name: Deploy cAdvisor dashboard JSON
+  ansible.builtin.copy:
+    dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
+    src: grafana/dashboards/cadvisor.json
+    mode: "0644"
+    owner: "{{ deploy_user }}"
+    group: "{{ deploy_group }}"
+
+# Pull each image up front, retrying a failed pull up to 5 times with a 30 s
+# pause. The task only reports "changed" when docker says a newer image was
+# actually downloaded.
+- name: Pull monitoring images
+  ansible.builtin.command:
+    cmd: "docker pull {{ item }}"
+  loop:
+    - "{{ prometheus_image }}"
+    - "{{ alertmanager_image }}"
+    - "{{ node_exporter_image }}"
+    - "{{ cadvisor_image }}"
+    - "{{ grafana_image }}"
+    - "{{ loki_image }}"
+    - "{{ uptime_kuma_image }}"
+  register: pull_result
+  until: pull_result.rc == 0
+  retries: 5
+  delay: 30
+  changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
+
+# ── UFW: allow main server to reach monitoring services ───────────────────────
+# One rule per service port; each admits TCP traffic whose source is ip_main.
+# NOTE(review): ports published by Docker are commonly reachable regardless of
+# UFW rules — confirm these rules really are what gates access to the ports.
+- name: Allow main server to reach Loki (Promtail log push)
+  community.general.ufw:
+    rule: allow
+    src: "{{ ip_main }}"
+    proto: tcp
+    port: "3100"
+
+- name: Allow main server to reach Prometheus (discord-bot metrics)
+  community.general.ufw:
+    rule: allow
+    src: "{{ ip_main }}"
+    proto: tcp
+    port: "9090"
+
+- name: Allow main Traefik to reach Grafana
+  community.general.ufw:
+    rule: allow
+    src: "{{ ip_main }}"
+    proto: tcp
+    port: "3000"
+
+- name: Allow main Traefik to reach Uptime Kuma
+  community.general.ufw:
+    rule: allow
+    src: "{{ ip_main }}"
+    proto: tcp
+    port: "3001"
+
+# Bring the compose project in tools_root up and drop orphaned containers.
+# Images are not pulled here (pull: never); the task is retried up to 3 times,
+# 15 s apart, until it reports success.
 - name: Start tools stack
   community.docker.docker_compose_v2:
     project_src: "{{ tools_root }}"
     state: present
+    pull: never
     remove_orphans: true
+  retries: 3
+  delay: 15
+  register: compose_result
+  until: compose_result is succeeded
--- a/roles/tools/templates/docker-compose.yml.j2
+++ b/roles/tools/templates/docker-compose.yml.j2
@ -1,10 +1,157 @@
 # Tools stack — generated by Ansible
 # Do not edit manually; re-run ansible-playbook playbooks/tools.yml
-# All app services (Outline, n8n) have been migrated to main server.
-# Monitoring stack (Grafana, Prometheus, Loki, Alertmanager) will be added here.
+# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor

 networks:
-  front:
+  monitoring:
    driver: bridge

-services: {}
+# Named volumes holding each service's persistent data (default settings).
+volumes:
+  grafana_data: {}
+  loki_data: {}
+  prometheus_data: {}
+  uptime_kuma_data: {}
+
+services:
+
+  # ── Prometheus ─────────────────────────────────────────────────────────────
+  # Metrics store with 30 days of TSDB retention; config and rules are mounted
+  # read-only from the tools directory.
+  prometheus:
+    image: {{ prometheus_image }}
+    container_name: prometheus
+    restart: unless-stopped
+    networks:
+      - monitoring
+    ports:
+      # Published on the host's external interfaces so the main server
+      # (discord-bot) can connect; a 127.0.0.1 binding would only be reachable
+      # from the tools host itself.
+      # NOTE(review): Docker-published ports are commonly not filtered by UFW —
+      # confirm that access is actually limited to the main server.
+      - "9090:9090"
+    volumes:
+      - prometheus_data:/prometheus
+      - {{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - {{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--storage.tsdb.retention.time=30d"
+      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
+      - "--web.console.templates=/usr/share/prometheus/consoles"
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
+  # Alert routing; only reachable on the internal monitoring network
+  # (no host port is published).
+  alertmanager:
+    container_name: alertmanager
+    image: {{ alertmanager_image }}
+    restart: unless-stopped
+    command:
+      - "--config.file=/etc/alertmanager/alertmanager.yml"
+      - "--storage.path=/alertmanager"
+    volumes:
+      - {{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    networks:
+      - monitoring
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
+      retries: 3
+      timeout: 5s
+      interval: 30s
+
+  # ── Exporters (monitor the tools host itself) ───────────────────────────────
+  # Host metrics exporter. The host's /proc, /sys and / are mounted read-only
+  # and each is handed to node_exporter through its matching --path.* flag.
+  node-exporter:
+    image: {{ node_exporter_image }}
+    container_name: node-exporter
+    restart: unless-stopped
+    networks:
+      - monitoring
+    pid: host
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - "--path.procfs=/host/proc"
+      - "--path.sysfs=/host/sys"
+      # Point the exporter at the mounted host root; without this flag the
+      # /rootfs mount is unused and filesystem stats are resolved inside the
+      # container's own mount namespace instead of the host's.
+      - "--path.rootfs=/rootfs"
+      # "$$" is the compose escape for a literal "$" in the regex.
+      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
+
+  # Container metrics exporter; runs privileged with read-only views of the
+  # host filesystem, /sys, the Docker data directory and /dev/disk.
+  cadvisor:
+    container_name: cadvisor
+    image: {{ cadvisor_image }}
+    restart: unless-stopped
+    privileged: true
+    devices:
+      - /dev/kmsg
+    networks:
+      - monitoring
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      - /dev/disk:/dev/disk:ro
+
+  # ── Grafana ─────────────────────────────────────────────────────────────────
+  # Dashboard UI on host port 3000. Additional environment is loaded from .env;
+  # the provisioning tree is mounted read-only.
+  grafana:
+    container_name: grafana
+    image: {{ grafana_image }}
+    restart: unless-stopped
+    depends_on:
+      - prometheus
+    security_opt:
+      - no-new-privileges:true
+    networks:
+      - monitoring
+    ports:
+      - "3000:3000"
+    env_file: .env
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_SERVER_DOMAIN={{ domain_dashboard }}
+      - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
+      - GF_AUTH_ANONYMOUS_ENABLED=false
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - {{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
+      retries: 3
+      timeout: 5s
+      interval: 30s
+
+  # ── Loki ────────────────────────────────────────────────────────────────────
+  # Log store; the rendered loki.yml is mounted over the image's default
+  # config path and passed explicitly via -config.file.
+  loki:
+    container_name: loki
+    image: {{ loki_image }}
+    restart: unless-stopped
+    command: -config.file=/etc/loki/local-config.yaml
+    ports:
+      - "3100:3100"  # exposed to main for Promtail log ingestion
+    networks:
+      - monitoring
+    volumes:
+      - loki_data:/loki
+      - {{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
+    healthcheck:
+      test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
+      retries: 3
+      timeout: 5s
+      interval: 30s
+
+  # ── Uptime Kuma ─────────────────────────────────────────────────────────────
+  # Status page on host port 3001 with its data kept in a named volume.
+  uptime-kuma:
+    container_name: uptime-kuma
+    image: {{ uptime_kuma_image }}
+    restart: unless-stopped
+    security_opt:
+      - no-new-privileges:true
+    ports:
+      - "3001:3001"
+    networks:
+      - monitoring
+    volumes:
+      - uptime_kuma_data:/app/data
+    healthcheck:
+      test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
+      retries: 3
+      timeout: 5s
+      interval: 30s
--- a/roles/tools/templates/env.j2
+++ b/roles/tools/templates/env.j2
@ -0,0 +1,2 @@
+# Generated by Ansible — do not edit manually
+# Grafana admin password for the stack's .env file.
+# NOTE(review): the value is rendered unquoted — confirm the password contains
+# no characters the env-file parser treats specially (e.g. '#', '$', spaces).
+GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}
--- a/roles/tools/templates/grafana/provisioning/dashboards/dashboards.yml.j2
+++ b/roles/tools/templates/grafana/provisioning/dashboards/dashboards.yml.j2
@ -0,0 +1,13 @@
+# Generated by Ansible — do not edit manually
+# File-based dashboard provider: dashboards are read from the JSON directory
+# below, re-scanned every 30 seconds, and cannot be edited from the UI.
+apiVersion: 1
+
+providers:
+  - name: default
+    type: file
+    orgId: 1
+    folder: ""
+    options:
+      path: /etc/grafana/provisioning/dashboards/json
+    updateIntervalSeconds: 30
+    allowUiUpdates: false
+    disableDeletion: false
--- a/roles/tools/templates/grafana/provisioning/datasources/loki.yml.j2
+++ b/roles/tools/templates/grafana/provisioning/datasources/loki.yml.j2
@ -0,0 +1,10 @@
+# Generated by Ansible — do not edit manually
+# Read-only (non-editable) Loki datasource, queried through the Grafana
+# backend at the compose service name.
+apiVersion: 1
+
+datasources:
+  - type: loki
+    name: Loki
+    url: http://loki:3100
+    access: proxy
+    editable: false
+    isDefault: false
--- a/roles/tools/templates/grafana/provisioning/datasources/prometheus.yml.j2
+++ b/roles/tools/templates/grafana/provisioning/datasources/prometheus.yml.j2
@ -0,0 +1,10 @@
+# Generated by Ansible — do not edit manually
+# Default, non-editable Prometheus datasource, queried through the Grafana
+# backend at the compose service name.
+apiVersion: 1
+
+datasources:
+  - type: prometheus
+    name: Prometheus
+    url: http://prometheus:9090
+    access: proxy
+    editable: false
+    isDefault: true
--- a/roles/tools/templates/loki/loki.yml.j2
+++ b/roles/tools/templates/loki/loki.yml.j2
@ -0,0 +1,36 @@
+# Generated by Ansible — do not edit manually
+# Single-instance Loki: filesystem storage under /loki, in-memory ring,
+# no multi-tenant auth.
+auth_enabled: false
+
+server:
+  grpc_listen_port: 9096
+  http_listen_port: 3100
+
+common:
+  path_prefix: /loki
+  instance_addr: 127.0.0.1
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+  storage:
+    filesystem:
+      rules_directory: /loki/rules
+      chunks_directory: /loki/chunks
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      schema: v13
+      store: tsdb
+      object_store: filesystem
+      index:
+        period: 24h
+        prefix: index_
+
+# Retention: 30 days, with retention enabled in the compactor section below.
+limits_config:
+  retention_period: 30d
+
+compactor:
+  retention_enabled: true
+  delete_request_store: filesystem
+  working_directory: /loki/retention
--- a/roles/tools/templates/prometheus/alertmanager.yml.j2
+++ b/roles/tools/templates/prometheus/alertmanager.yml.j2
@ -0,0 +1,38 @@
+# Generated by Ansible — do not edit manually
+global:
+  resolve_timeout: 5m
+
+# Single catch-all route: alerts are grouped by alertname and severity and
+# every group is delivered to the "all" receiver.
+route:
+  group_by: [alertname, severity]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  receiver: all
+
+# One receiver that fans out to Telegram and Discord. The message bodies are
+# notification templates whose delimiters are escaped so that Jinja emits them
+# literally into the rendered file.
+receivers:
+  - name: all
+    telegram_configs:
+      - bot_token: "{{ alertmanager_telegram_token }}"
+        chat_id: {{ alertmanager_telegram_chat_id }}
+        message: |
+          {{ '{{' }} range .Alerts {{ '}}' }}
+          {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
+          {{ '{{' }} .Annotations.summary {{ '}}' }}
+          {{ '{{' }} .Annotations.description {{ '}}' }}
+          {{ '{{' }} end {{ '}}' }}
+        parse_mode: Markdown
+    discord_configs:
+      - webhook_url: "{{ discord_webhook_alerts }}"
+        title: >-
+          {{ '{{' }} if eq (index .Alerts 0).Status "firing" {{ '}}' }}🔴 Alert{{ '{{' }} else {{ '}}' }}🟢 Resolved{{ '{{' }} end {{ '}}' }}
+        message: |
+          {{ '{{' }} range .Alerts {{ '}}' }}
+          **{{ '{{' }} .Labels.alertname {{ '}}' }}**
+          {{ '{{' }} .Annotations.summary {{ '}}' }}
+          {{ '{{' }} .Annotations.description {{ '}}' }}
+          {{ '{{' }} end {{ '}}' }}
+
+# Mute warning alerts while a critical alert with the same alertname fires.
+# NOTE(review): the deployed alert rules give warning/critical pairs different
+# alert names (e.g. DiskSpaceWarning vs DiskSpaceCritical), so this rule may
+# never match — confirm which labels "equal" should compare.
+inhibit_rules:
+  - source_matchers: [severity="critical"]
+    target_matchers: [severity="warning"]
+    equal: [alertname]
--- a/roles/tools/templates/prometheus/prometheus.yml.j2
+++ b/roles/tools/templates/prometheus/prometheus.yml.j2
@ -0,0 +1,49 @@
+# Generated by Ansible — do not edit manually
+# Scrape and rule evaluation both run every 15 seconds.
+global:
+  evaluation_interval: 15s
+  scrape_interval: 15s
+  external_labels:
+    instance: "{{ domain_base }}"
+
+rule_files:
+  - /etc/prometheus/rules/*.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - "alertmanager:9093"
+
+scrape_configs:
+  # Prometheus scraping itself
+  - job_name: prometheus
+    static_configs:
+      - targets:
+          - "localhost:9090"
+
+  # tools server metrics (exporters addressed by compose service name)
+  - job_name: node-exporter-tools
+    static_configs:
+      - labels:
+          host: tools
+        targets:
+          - "node-exporter:9100"
+
+  - job_name: cadvisor-tools
+    static_configs:
+      - labels:
+          host: tools
+        targets:
+          - "cadvisor:8080"
+
+  - job_name: alertmanager
+    static_configs:
+      - targets:
+          - "alertmanager:9093"
+
+  # main server metrics (scraped over network)
+  - job_name: node-exporter-main
+    static_configs:
+      - labels:
+          host: main
+        targets:
+          - "{{ ip_main }}:9100"
+
+  - job_name: cadvisor-main
+    static_configs:
+      - labels:
+          host: main
+        targets:
+          - "{{ ip_main }}:8080"
--- a/roles/tools/templates/prometheus/rules/alerts.yml.j2
+++ b/roles/tools/templates/prometheus/rules/alerts.yml.j2
@ -0,0 +1,86 @@
+# Generated by Ansible — do not edit manually
+groups:
+  # Host-level alerts built on node_exporter metrics. Annotation texts are
+  # user-facing (Russian) and embed escaped alert-template expressions.
+  - name: host
+    rules:
+      # Non-idle CPU share, averaged per instance over a 5m rate window,
+      # above 85% for 5 minutes.
+      - alert: HighCPULoad
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "CPU загружен более 85% на протяжении 5 минут."
+
+      # Used memory (1 - MemAvailable/MemTotal) above 85% for 5 minutes.
+      - alert: HighMemoryUsage
+        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "Использование RAM превысило 85%."
+
+      # Same ratio as HighMemoryUsage, critical above 95% for 2 minutes.
+      - alert: CriticalMemoryUsage
+        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "RAM заполнена на 95%+. Возможны OOM kills."
+
+      # Filesystem usage above 75% for 5 minutes; tmpfs, overlay and aufs
+      # filesystems are excluded.
+      - alert: DiskSpaceWarning
+        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
+
+      # Same ratio as DiskSpaceWarning, critical above 90% for 2 minutes.
+      - alert: DiskSpaceCritical
+        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
+
+      # Used swap (1 - SwapFree/SwapTotal) above 50% for 5 minutes.
+      - alert: SwapUsageHigh
+        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
+          description: "Swap используется более чем на 50% — RAM под давлением."
+
+  # Container-level alerts built on cAdvisor metrics; only series with a
+  # non-empty "name" label are considered. Annotation texts are user-facing
+  # (Russian) and embed escaped alert-template expressions.
+  - name: containers
+    rules:
+      # No named container series exists at all, or a named container was
+      # last seen more than 60 seconds ago, for 2 minutes.
+      - alert: ContainerDown
+        expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
+          description: "Контейнер не отвечает более 2 минут."
+
+      # Memory usage above 90% of the configured limit for 5 minutes;
+      # containers without a limit (limit == 0) are filtered out.
+      - alert: ContainerHighMemory
+        expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
+          description: "Контейнер близок к mem_limit — возможен OOM kill."
+
+      # last_seen did not advance and no CPU time was consumed over 5 minutes.
+      # The two sides are matched on instance and name only: a bare "and"
+      # requires identical label sets, and the CPU counter typically carries
+      # an extra "cpu" label that container_last_seen lacks, so the unscoped
+      # form could never match.
+      - alert: ContainerRestarting
+        expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and on (instance, name) rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
+          description: "Контейнер не активен — проверьте docker ps."