feat: migrate monitoring to tools server, fix Outline S3 uploads

Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma)
moved from main to tools server. Prometheus now scrapes main exporters
over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100.
Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot
PROMETHEUS_URL updated to http://ip_tools:9090.

Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support
per-object ACLs — caused upload failures). Add CORS configuration task
for browser-side presigned uploads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
jack 2026-03-27 04:10:28 +07:00
parent d6015b76a3
commit fde51352d7
20 changed files with 17172 additions and 210 deletions

View file

@ -53,77 +53,19 @@
mode: "0644"
notify: Restart stack
- name: Deploy Prometheus config
ansible.builtin.template:
src: prometheus/prometheus.yml.j2
dest: "{{ services_root }}/prometheus/prometheus.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Grafana datasource provisioning
ansible.builtin.template:
src: grafana/provisioning/datasources/prometheus.yml.j2
dest: "{{ services_root }}/grafana/provisioning/datasources/prometheus.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Grafana dashboard provisioning config
ansible.builtin.template:
src: grafana/provisioning/dashboards/dashboards.yml.j2
dest: "{{ services_root }}/grafana/provisioning/dashboards/dashboards.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Node Exporter Full dashboard JSON
ansible.builtin.copy:
src: grafana/dashboards/node-exporter-full.json
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy cAdvisor dashboard JSON
ansible.builtin.copy:
src: grafana/dashboards/cadvisor.json
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Prometheus alert rules
ansible.builtin.template:
src: prometheus/rules/alerts.yml.j2
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy AlertManager config
ansible.builtin.template:
src: prometheus/alertmanager.yml.j2
dest: "{{ services_root }}/prometheus/alertmanager.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Loki config
ansible.builtin.template:
src: loki/loki.yml.j2
dest: "{{ services_root }}/loki/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Configure CORS on walava-outline S3 bucket (required for browser uploads)
ansible.builtin.shell: |
docker run --rm \
-e AWS_ACCESS_KEY_ID={{ s3_access_key }} \
-e AWS_SECRET_ACCESS_KEY={{ s3_secret_key }} \
-e AWS_DEFAULT_REGION=ru-1 \
amazon/aws-cli:latest \
--endpoint-url https://s3.timeweb.cloud \
s3api put-bucket-cors \
--bucket walava-outline \
--cors-configuration '{"CORSRules":[{"AllowedOrigins":["https://{{ domain_wiki }}"],"AllowedMethods":["GET","PUT","POST","DELETE","HEAD"],"AllowedHeaders":["*"],"ExposeHeaders":["ETag"],"MaxAgeSeconds":3000}]}'
changed_when: false
ignore_errors: true
- name: Deploy Promtail config
ansible.builtin.template:
@ -134,15 +76,6 @@
mode: "0644"
notify: Restart stack
- name: Deploy Grafana Loki datasource
ansible.builtin.template:
src: grafana/provisioning/datasources/loki.yml.j2
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy CrowdSec acquisition config
ansible.builtin.template:
src: crowdsec/acquis.yaml.j2

View file

@ -22,12 +22,6 @@
- plane/pgdata
- plane/media
- act_runner
- prometheus
- grafana/provisioning/datasources
- grafana/provisioning/dashboards
- grafana/provisioning/dashboards/json
- prometheus/rules
- loki
- traefik/logs
- crowdsec
- authelia

View file

@ -16,15 +16,10 @@
- "{{ plane_redis_image }}"
- "{{ plane_minio_image }}"
- "{{ act_runner_image }}"
- "{{ prometheus_image }}"
- "{{ node_exporter_image }}"
- "{{ cadvisor_image }}"
- "{{ grafana_image }}"
- "{{ alertmanager_image }}"
- "{{ loki_image }}"
- "{{ promtail_image }}"
- "{{ crowdsec_image }}"
- "{{ uptime_kuma_image }}"
- "{{ outline_image }}"
- "{{ outline_db_image }}"
- "{{ outline_redis_image }}"
@ -35,6 +30,21 @@
delay: 30
until: pull_result.rc == 0
# ── UFW: allow tools Prometheus to scrape exporters on main ──────────────────
- name: Allow tools server to scrape node-exporter
community.general.ufw:
rule: allow
port: "9100"
proto: tcp
src: "{{ ip_tools }}"
- name: Allow tools server to scrape cAdvisor
community.general.ufw:
rule: allow
port: "8080"
proto: tcp
src: "{{ ip_tools }}"
- name: Remove legacy SMTP relay UFW rule (port 1025)
community.general.ufw:
rule: allow

View file

@ -40,11 +40,7 @@ volumes:
plane_minio_data:
plane_media:
act_runner_data:
prometheus_data:
grafana_data:
loki_data:
crowdsec_data:
uptime_kuma_data:
outline_db_data:
outline_redis_data:
n8n_data:
@ -381,52 +377,16 @@ services:
- backend
- runner-jobs
# ── Monitoring Stack ───────────────────────────────────────────────────────
prometheus:
image: {{ prometheus_image }}
container_name: prometheus
restart: unless-stopped
networks:
- monitoring
volumes:
- prometheus_data:/prometheus
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
alertmanager:
image: {{ alertmanager_image }}
container_name: alertmanager
restart: unless-stopped
networks:
- monitoring
volumes:
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
# ── Monitoring exporters (metrics scraped by tools Prometheus over network) ──
# Ports exposed: tools server must have UFW rules allowing ip_main:9100/8080
node-exporter:
image: {{ node_exporter_image }}
container_name: node-exporter
restart: unless-stopped
networks:
- monitoring
ports:
- "9100:9100"
pid: host
volumes:
- /proc:/host/proc:ro
@ -443,6 +403,8 @@ services:
restart: unless-stopped
networks:
- monitoring
ports:
- "8080:8080"
privileged: true
devices:
- /dev/kmsg
@ -453,50 +415,7 @@ services:
- /var/lib/docker:/var/lib/docker:ro
- /dev/disk:/dev/disk:ro
grafana:
image: {{ grafana_image }}
container_name: grafana
restart: unless-stopped
security_opt:
- no-new-privileges:true
depends_on:
- prometheus
networks:
- backend
- monitoring
volumes:
- grafana_data:/var/lib/grafana
- {{ services_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_DOMAIN={{ domain_dashboard }}
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
- GF_AUTH_ANONYMOUS_ENABLED=false
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
interval: 30s
timeout: 5s
retries: 3
# ── Logging Stack ──────────────────────────────────────────────────────────
loki:
image: {{ loki_image }}
container_name: loki
restart: unless-stopped
networks:
- monitoring
volumes:
- loki_data:/loki
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
command: -config.file=/etc/loki/local-config.yaml
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
# ── Logging (Promtail pushes to Loki on tools server) ─────────────────────
promtail:
image: {{ promtail_image }}
container_name: promtail
@ -544,12 +463,11 @@ services:
FORGEJO_TOKEN: "${FORGEJO_RUNNER_TOKEN}"
FORGEJO_URL: "https://{{ domain_git }}"
FORGEJO_REPO: "jack/infra"
PROMETHEUS_URL: "http://prometheus:9090"
PROMETHEUS_URL: "http://{{ ip_tools }}:9090"
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- proxy # Discord API (internet)
- monitoring # Prometheus metrics
- proxy # Discord API (internet) + reach tools server over public IP
# ── Walava Landing ─────────────────────────────────────────────────────────
# Landing page for walava.io — image built by walava-web repo CI/CD
@ -560,26 +478,6 @@ services:
networks:
- proxy
# ── Uptime Kuma ────────────────────────────────────────────────────────────
# Мониторинг доступности сервисов + публичная статус-страница
# Доступен по адресу: https://{{ domain_status }}
uptime-kuma:
image: {{ uptime_kuma_image }}
container_name: uptime-kuma
restart: unless-stopped
security_opt:
- no-new-privileges:true
networks:
- backend
- proxy # needs internet access for Discord/Telegram notifications
volumes:
- uptime_kuma_data:/app/data
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
interval: 30s
timeout: 5s
retries: 3
# ── Outline wiki ────────────────────────────────────────────────────────────
outline:

View file

@ -21,7 +21,6 @@ AWS_REGION=ru-1
AWS_S3_UPLOAD_BUCKET_NAME=walava-outline
AWS_S3_UPLOAD_BUCKET_URL=https://s3.timeweb.cloud
AWS_S3_FORCE_PATH_STYLE=true
AWS_S3_ACL=private
FILE_STORAGE=s3
# Auth

View file

@ -7,7 +7,7 @@ positions:
filename: /tmp/positions.yaml
clients:
- url: http://loki:3100/loki/api/v1/push
- url: http://{{ ip_tools }}:3100/loki/api/v1/push
scrape_configs:
- job_name: docker

View file

@ -134,12 +134,12 @@ http:
grafana:
loadBalancer:
servers:
- url: "http://grafana:3000"
- url: "http://{{ ip_tools }}:3000"
uptime-kuma:
loadBalancer:
servers:
- url: "http://uptime-kuma:3001"
- url: "http://{{ ip_tools }}:3001"
walava-landing:
loadBalancer:

View file

@ -1,2 +1,11 @@
---
tools_root: /opt/tools
# Image versions (mirrors services role — keep in sync)
prometheus_image: "prom/prometheus:v3.4.0"
node_exporter_image: "prom/node-exporter:v1.9.1"
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1"
grafana_image: "grafana/grafana:11.6.1"
alertmanager_image: "prom/alertmanager:v0.28.1"
loki_image: "grafana/loki:3.4.3"
uptime_kuma_image: "louislam/uptime-kuma:1"

View file

@ -0,0 +1,817 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "Prometheus as the datasource is obligatory",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.4.5"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "table",
"name": "Table",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": 14282,
"graphTooltip": 0,
"id": null,
"iteration": 1617715580880,
"links": [],
"panels": [
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 8,
"panels": [],
"title": "CPU",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 1
},
"hiddenSeries": false,
"id": 15,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
"hide": false,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:606",
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:607",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 8
},
"id": 11,
"panels": [],
"title": "Memory",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"hiddenSeries": false,
"id": 9,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"hide": false,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:606",
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:607",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 9
},
"hiddenSeries": false,
"id": 14,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null as zero",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
"hide": false,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Memory Cached",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:606",
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:607",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 17
},
"id": 2,
"panels": [],
"title": "Network",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 18
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"hide": false,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Received Network Traffic",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:674",
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:675",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 18
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Sent Network Traffic",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:832",
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:833",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": false,
"datasource": "${DS_PROMETHEUS}",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 26
},
"id": 19,
"panels": [],
"title": "Misc",
"type": "row"
},
{
"datasource": "${DS_PROMETHEUS}",
"fieldConfig": {
"defaults": {
"custom": {
"align": null,
"filterable": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "id"
},
"properties": [
{
"id": "custom.width",
"value": 260
}
]
},
{
"matcher": {
"id": "byName",
"options": "Running"
},
"properties": [
{
"id": "unit",
"value": "d"
},
{
"id": "decimals",
"value": 1
},
{
"id": "custom.displayMode",
"value": "color-text"
},
{
"id": "color",
"value": {
"fixedColor": "dark-green",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 27
},
"id": 17,
"options": {
"showHeader": true,
"sortBy": []
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "{{name}}",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Containers Info",
"transformations": [
{
"id": "filterFieldsByName",
"options": {
"include": {
"names": [
"container_label_com_docker_compose_project",
"container_label_com_docker_compose_project_working_dir",
"image",
"instance",
"name",
"Value",
"container_label_com_docker_compose_service"
]
}
}
},
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"Value": "Running",
"container_label_com_docker_compose_project": "Label",
"container_label_com_docker_compose_project_working_dir": "Working dir",
"container_label_com_docker_compose_service": "Service",
"image": "Registry Image",
"instance": "Instance",
"name": "Name"
}
}
}
],
"type": "table"
}
],
"schemaVersion": 27,
"style": "dark",
"tags": [
"cadvisor",
"docker"
],
"templating": {
"list": [
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values({__name__=~\"container.*\"},instance)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": "Host",
"multi": false,
"name": "host",
"options": [],
"query": {
"query": "label_values({__name__=~\"container.*\"},instance)",
"refId": "Prometheus-host-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 5,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
"description": null,
"error": null,
"hide": 0,
"includeAll": true,
"label": "Container",
"multi": false,
"name": "container",
"options": [],
"query": {
"query": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
"refId": "Prometheus-container-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Cadvisor exporter",
"uid": "pMEd7m0Mz",
"version": 1,
"description": "Simple exporter for cadvisor only"
}

File diff suppressed because it is too large Load diff

View file

@ -7,6 +7,29 @@
group: "{{ deploy_group }}"
mode: "0750"
- name: Create tools subdirectories
ansible.builtin.file:
path: "{{ tools_root }}/{{ item }}"
state: directory
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0755"
loop:
- prometheus
- prometheus/rules
- grafana/provisioning/datasources
- grafana/provisioning/dashboards
- grafana/provisioning/dashboards/json
- loki
- name: Deploy .env file
ansible.builtin.template:
src: env.j2
dest: "{{ tools_root }}/.env"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0600"
- name: Deploy docker-compose.yml
ansible.builtin.template:
src: docker-compose.yml.j2
@ -15,8 +38,130 @@
group: "{{ deploy_group }}"
mode: "0640"
- name: Deploy Prometheus config
ansible.builtin.template:
src: prometheus/prometheus.yml.j2
dest: "{{ tools_root }}/prometheus/prometheus.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Prometheus alert rules
ansible.builtin.template:
src: prometheus/rules/alerts.yml.j2
dest: "{{ tools_root }}/prometheus/rules/alerts.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy AlertManager config
ansible.builtin.template:
src: prometheus/alertmanager.yml.j2
dest: "{{ tools_root }}/prometheus/alertmanager.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Loki config
ansible.builtin.template:
src: loki/loki.yml.j2
dest: "{{ tools_root }}/loki/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Grafana Prometheus datasource
ansible.builtin.template:
src: grafana/provisioning/datasources/prometheus.yml.j2
dest: "{{ tools_root }}/grafana/provisioning/datasources/prometheus.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Grafana Loki datasource
ansible.builtin.template:
src: grafana/provisioning/datasources/loki.yml.j2
dest: "{{ tools_root }}/grafana/provisioning/datasources/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Grafana dashboard provisioning config
ansible.builtin.template:
src: grafana/provisioning/dashboards/dashboards.yml.j2
dest: "{{ tools_root }}/grafana/provisioning/dashboards/dashboards.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy Node Exporter Full dashboard JSON
ansible.builtin.copy:
src: grafana/dashboards/node-exporter-full.json
dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Deploy cAdvisor dashboard JSON
ansible.builtin.copy:
src: grafana/dashboards/cadvisor.json
dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
- name: Pull monitoring images
ansible.builtin.command: docker pull {{ item }}
loop:
- "{{ prometheus_image }}"
- "{{ alertmanager_image }}"
- "{{ node_exporter_image }}"
- "{{ cadvisor_image }}"
- "{{ grafana_image }}"
- "{{ loki_image }}"
- "{{ uptime_kuma_image }}"
register: pull_result
changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
retries: 5
delay: 30
until: pull_result.rc == 0
# ── UFW: allow main server to reach monitoring services ───────────────────────
- name: Allow main server to reach Loki (Promtail log push)
community.general.ufw:
rule: allow
port: "3100"
proto: tcp
src: "{{ ip_main }}"
- name: Allow main server to reach Prometheus (discord-bot metrics)
community.general.ufw:
rule: allow
port: "9090"
proto: tcp
src: "{{ ip_main }}"
- name: Allow main Traefik to reach Grafana
community.general.ufw:
rule: allow
port: "3000"
proto: tcp
src: "{{ ip_main }}"
- name: Allow main Traefik to reach Uptime Kuma
community.general.ufw:
rule: allow
port: "3001"
proto: tcp
src: "{{ ip_main }}"
- name: Start tools stack
community.docker.docker_compose_v2:
project_src: "{{ tools_root }}"
state: present
pull: never
remove_orphans: true
retries: 3
delay: 15
register: compose_result
until: compose_result is succeeded

View file

@ -1,10 +1,157 @@
# Tools stack — generated by Ansible
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
# All app services (Outline, n8n) have been migrated to main server.
# Monitoring stack (Grafana, Prometheus, Loki, Alertmanager) will be added here.
# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor
networks:
front:
monitoring:
driver: bridge
services: {}
volumes:
prometheus_data:
grafana_data:
loki_data:
uptime_kuma_data:
services:
# ── Prometheus ─────────────────────────────────────────────────────────────
prometheus:
image: {{ prometheus_image }}
container_name: prometheus
restart: unless-stopped
networks:
- monitoring
ports:
- "127.0.0.1:9090:9090" # exposed to main via UFW rule for discord-bot
volumes:
- prometheus_data:/prometheus
- {{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- {{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
alertmanager:
image: {{ alertmanager_image }}
container_name: alertmanager
restart: unless-stopped
networks:
- monitoring
volumes:
- {{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
# ── Exporters (monitor the tools host itself) ───────────────────────────────
node-exporter:
image: {{ node_exporter_image }}
container_name: node-exporter
restart: unless-stopped
networks:
- monitoring
pid: host
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
cadvisor:
image: {{ cadvisor_image }}
container_name: cadvisor
restart: unless-stopped
networks:
- monitoring
privileged: true
devices:
- /dev/kmsg
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
- /dev/disk:/dev/disk:ro
# ── Grafana ─────────────────────────────────────────────────────────────────
grafana:
image: {{ grafana_image }}
container_name: grafana
restart: unless-stopped
security_opt:
- no-new-privileges:true
depends_on:
- prometheus
networks:
- monitoring
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- {{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
env_file: .env
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_DOMAIN={{ domain_dashboard }}
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
- GF_AUTH_ANONYMOUS_ENABLED=false
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
interval: 30s
timeout: 5s
retries: 3
# ── Loki ────────────────────────────────────────────────────────────────────
loki:
image: {{ loki_image }}
container_name: loki
restart: unless-stopped
networks:
- monitoring
ports:
- "3100:3100" # exposed to main for Promtail log ingestion
volumes:
- loki_data:/loki
- {{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
command: -config.file=/etc/loki/local-config.yaml
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
# ── Uptime Kuma ─────────────────────────────────────────────────────────────
uptime-kuma:
image: {{ uptime_kuma_image }}
container_name: uptime-kuma
restart: unless-stopped
security_opt:
- no-new-privileges:true
networks:
- monitoring
ports:
- "3001:3001"
volumes:
- uptime_kuma_data:/app/data
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
interval: 30s
timeout: 5s
retries: 3

View file

@ -0,0 +1,2 @@
# Generated by Ansible — do not edit manually
GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}

View file

@ -0,0 +1,13 @@
# Generated by Ansible — do not edit manually
apiVersion: 1
providers:
- name: default
orgId: 1
folder: ""
type: file
disableDeletion: false
updateIntervalSeconds: 30
allowUiUpdates: false
options:
path: /etc/grafana/provisioning/dashboards/json

View file

@ -0,0 +1,10 @@
# Generated by Ansible — do not edit manually
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
isDefault: false
editable: false

View file

@ -0,0 +1,10 @@
# Generated by Ansible — do not edit manually
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: false

View file

@ -0,0 +1,36 @@
# Generated by Ansible — do not edit manually
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
instance_addr: 127.0.0.1
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2020-10-24
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
limits_config:
retention_period: 30d
compactor:
working_directory: /loki/retention
delete_request_store: filesystem
retention_enabled: true

View file

@ -0,0 +1,38 @@
# Generated by Ansible — do not edit manually
global:
resolve_timeout: 5m
route:
group_by: [alertname, severity]
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
receiver: all
receivers:
- name: all
telegram_configs:
- bot_token: "{{ alertmanager_telegram_token }}"
chat_id: {{ alertmanager_telegram_chat_id }}
message: |
{{ '{{' }} range .Alerts {{ '}}' }}
{{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
{{ '{{' }} .Annotations.summary {{ '}}' }}
{{ '{{' }} .Annotations.description {{ '}}' }}
{{ '{{' }} end {{ '}}' }}
parse_mode: Markdown
discord_configs:
- webhook_url: "{{ discord_webhook_alerts }}"
title: >-
{{ '{{' }} if eq (index .Alerts 0).Status "firing" {{ '}}' }}🔴 Alert{{ '{{' }} else {{ '}}' }}🟢 Resolved{{ '{{' }} end {{ '}}' }}
message: |
{{ '{{' }} range .Alerts {{ '}}' }}
**{{ '{{' }} .Labels.alertname {{ '}}' }}**
{{ '{{' }} .Annotations.summary {{ '}}' }}
{{ '{{' }} .Annotations.description {{ '}}' }}
{{ '{{' }} end {{ '}}' }}
inhibit_rules:
- source_matchers: [severity="critical"]
target_matchers: [severity="warning"]
equal: [alertname]

View file

@ -0,0 +1,49 @@
# Generated by Ansible — do not edit manually
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
instance: "{{ domain_base }}"
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
- job_name: prometheus
static_configs:
- targets: ["localhost:9090"]
# tools server metrics
- job_name: node-exporter-tools
static_configs:
- targets: ["node-exporter:9100"]
labels:
host: tools
- job_name: cadvisor-tools
static_configs:
- targets: ["cadvisor:8080"]
labels:
host: tools
- job_name: alertmanager
static_configs:
- targets: ["alertmanager:9093"]
# main server metrics (scraped over network)
- job_name: node-exporter-main
static_configs:
- targets: ["{{ ip_main }}:9100"]
labels:
host: main
- job_name: cadvisor-main
static_configs:
- targets: ["{{ ip_main }}:8080"]
labels:
host: main

View file

@ -0,0 +1,86 @@
# Generated by Ansible — do not edit manually
groups:
- name: host
rules:
- alert: HighCPULoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
severity: warning
annotations:
summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "CPU загружен более 85% на протяжении 5 минут."
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Использование RAM превысило 85%."
- alert: CriticalMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 2m
labels:
severity: critical
annotations:
summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "RAM заполнена на 95%+. Возможны OOM kills."
- alert: DiskSpaceWarning
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
for: 5m
labels:
severity: warning
annotations:
summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
- alert: DiskSpaceCritical
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
for: 2m
labels:
severity: critical
annotations:
summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
- alert: SwapUsageHigh
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
description: "Swap используется более чем на 50% — RAM под давлением."
- name: containers
rules:
- alert: ContainerDown
expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
for: 2m
labels:
severity: critical
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
description: "Контейнер не отвечает более 2 минут."
- alert: ContainerHighMemory
expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
description: "Контейнер близок к mem_limit — возможен OOM kill."
- alert: ContainerRestarting
expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
for: 0m
labels:
severity: warning
annotations:
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
description: "Контейнер не активен — проверьте docker ps."