feat: migrate monitoring to tools server, fix Outline S3 uploads
Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma) moved from main to tools server. Prometheus now scrapes main exporters over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100. Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot PROMETHEUS_URL updated to http://ip_tools:9090. Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support per-object ACLs — caused upload failures). Add CORS configuration task for browser-side presigned uploads. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d6015b76a3
commit
fde51352d7
20 changed files with 17172 additions and 210 deletions
|
|
@ -53,77 +53,19 @@
|
|||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Prometheus config
|
||||
ansible.builtin.template:
|
||||
src: prometheus/prometheus.yml.j2
|
||||
dest: "{{ services_root }}/prometheus/prometheus.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Grafana datasource provisioning
|
||||
ansible.builtin.template:
|
||||
src: grafana/provisioning/datasources/prometheus.yml.j2
|
||||
dest: "{{ services_root }}/grafana/provisioning/datasources/prometheus.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Grafana dashboard provisioning config
|
||||
ansible.builtin.template:
|
||||
src: grafana/provisioning/dashboards/dashboards.yml.j2
|
||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/dashboards.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Node Exporter Full dashboard JSON
|
||||
ansible.builtin.copy:
|
||||
src: grafana/dashboards/node-exporter-full.json
|
||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy cAdvisor dashboard JSON
|
||||
ansible.builtin.copy:
|
||||
src: grafana/dashboards/cadvisor.json
|
||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Prometheus alert rules
|
||||
ansible.builtin.template:
|
||||
src: prometheus/rules/alerts.yml.j2
|
||||
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy AlertManager config
|
||||
ansible.builtin.template:
|
||||
src: prometheus/alertmanager.yml.j2
|
||||
dest: "{{ services_root }}/prometheus/alertmanager.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Loki config
|
||||
ansible.builtin.template:
|
||||
src: loki/loki.yml.j2
|
||||
dest: "{{ services_root }}/loki/loki.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
# Applies a CORS policy to the walava-outline bucket so that browsers on
# https://{{ domain_wiki }} can upload through presigned URLs. Runs the AWS CLI
# in a throwaway container against the Timeweb S3 endpoint.
#
# The credentials are supplied through the task environment and forwarded with
# `-e VAR` (docker copies the value from the calling environment). This keeps
# them out of the rendered command line, so they do not show up in the task's
# `cmd` output or in the process list, and a key containing shell
# metacharacters can no longer break the command.
- name: Configure CORS on walava-outline S3 bucket (required for browser uploads)
  ansible.builtin.shell: |
    docker run --rm \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e AWS_DEFAULT_REGION=ru-1 \
      amazon/aws-cli:latest \
      --endpoint-url https://s3.timeweb.cloud \
      s3api put-bucket-cors \
      --bucket walava-outline \
      --cors-configuration '{"CORSRules":[{"AllowedOrigins":["https://{{ domain_wiki }}"],"AllowedMethods":["GET","PUT","POST","DELETE","HEAD"],"AllowedHeaders":["*"],"ExposeHeaders":["ETag"],"MaxAgeSeconds":3000}]}'
  environment:
    AWS_ACCESS_KEY_ID: "{{ s3_access_key }}"
    AWS_SECRET_ACCESS_KEY: "{{ s3_secret_key }}"
  # put-bucket-cors overwrites the policy on every run, so never report "changed".
  changed_when: false
  # Best-effort: a failure here is reported but must not abort the deploy.
  ignore_errors: true
|
||||
|
||||
- name: Deploy Promtail config
|
||||
ansible.builtin.template:
|
||||
|
|
@ -134,15 +76,6 @@
|
|||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Grafana Loki datasource
|
||||
ansible.builtin.template:
|
||||
src: grafana/provisioning/datasources/loki.yml.j2
|
||||
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy CrowdSec acquisition config
|
||||
ansible.builtin.template:
|
||||
src: crowdsec/acquis.yaml.j2
|
||||
|
|
|
|||
|
|
@ -22,12 +22,6 @@
|
|||
- plane/pgdata
|
||||
- plane/media
|
||||
- act_runner
|
||||
- prometheus
|
||||
- grafana/provisioning/datasources
|
||||
- grafana/provisioning/dashboards
|
||||
- grafana/provisioning/dashboards/json
|
||||
- prometheus/rules
|
||||
- loki
|
||||
- traefik/logs
|
||||
- crowdsec
|
||||
- authelia
|
||||
|
|
|
|||
|
|
@ -16,15 +16,10 @@
|
|||
- "{{ plane_redis_image }}"
|
||||
- "{{ plane_minio_image }}"
|
||||
- "{{ act_runner_image }}"
|
||||
- "{{ prometheus_image }}"
|
||||
- "{{ node_exporter_image }}"
|
||||
- "{{ cadvisor_image }}"
|
||||
- "{{ grafana_image }}"
|
||||
- "{{ alertmanager_image }}"
|
||||
- "{{ loki_image }}"
|
||||
- "{{ promtail_image }}"
|
||||
- "{{ crowdsec_image }}"
|
||||
- "{{ uptime_kuma_image }}"
|
||||
- "{{ outline_image }}"
|
||||
- "{{ outline_db_image }}"
|
||||
- "{{ outline_redis_image }}"
|
||||
|
|
@ -35,6 +30,21 @@
|
|||
delay: 30
|
||||
until: pull_result.rc == 0
|
||||
|
||||
# ── UFW: allow tools Prometheus to scrape exporters on main ──────────────────
|
||||
- name: Allow tools server to scrape node-exporter
|
||||
community.general.ufw:
|
||||
rule: allow
|
||||
port: "9100"
|
||||
proto: tcp
|
||||
src: "{{ ip_tools }}"
|
||||
|
||||
- name: Allow tools server to scrape cAdvisor
|
||||
community.general.ufw:
|
||||
rule: allow
|
||||
port: "8080"
|
||||
proto: tcp
|
||||
src: "{{ ip_tools }}"
|
||||
|
||||
- name: Remove legacy SMTP relay UFW rule (port 1025)
|
||||
community.general.ufw:
|
||||
rule: allow
|
||||
|
|
|
|||
|
|
@ -40,11 +40,7 @@ volumes:
|
|||
plane_minio_data:
|
||||
plane_media:
|
||||
act_runner_data:
|
||||
prometheus_data:
|
||||
grafana_data:
|
||||
loki_data:
|
||||
crowdsec_data:
|
||||
uptime_kuma_data:
|
||||
outline_db_data:
|
||||
outline_redis_data:
|
||||
n8n_data:
|
||||
|
|
@ -381,52 +377,16 @@ services:
|
|||
- backend
|
||||
- runner-jobs
|
||||
|
||||
# ── Monitoring Stack ───────────────────────────────────────────────────────
|
||||
prometheus:
|
||||
image: {{ prometheus_image }}
|
||||
container_name: prometheus
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- prometheus_data:/prometheus
|
||||
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time=30d"
|
||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
alertmanager:
|
||||
image: {{ alertmanager_image }}
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
command:
|
||||
- "--config.file=/etc/alertmanager/alertmanager.yml"
|
||||
- "--storage.path=/alertmanager"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Monitoring exporters (metrics scraped by tools Prometheus over network) ──
|
||||
# Ports exposed: main server must have UFW rules allowing ip_tools to reach ports 9100/8080
|
||||
node-exporter:
|
||||
image: {{ node_exporter_image }}
|
||||
container_name: node-exporter
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
ports:
|
||||
- "9100:9100"
|
||||
pid: host
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
|
|
@ -443,6 +403,8 @@ services:
|
|||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
ports:
|
||||
- "8080:8080"
|
||||
privileged: true
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
|
|
@ -453,50 +415,7 @@ services:
|
|||
- /var/lib/docker:/var/lib/docker:ro
|
||||
- /dev/disk:/dev/disk:ro
|
||||
|
||||
grafana:
|
||||
image: {{ grafana_image }}
|
||||
container_name: grafana
|
||||
restart: unless-stopped
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
depends_on:
|
||||
- prometheus
|
||||
networks:
|
||||
- backend
|
||||
- monitoring
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- {{ services_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
|
||||
- GF_USERS_ALLOW_SIGN_UP=false
|
||||
- GF_SERVER_DOMAIN={{ domain_dashboard }}
|
||||
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=false
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Logging Stack ──────────────────────────────────────────────────────────
|
||||
loki:
|
||||
image: {{ loki_image }}
|
||||
container_name: loki
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- loki_data:/loki
|
||||
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Logging (Promtail pushes to Loki on tools server) ─────────────────────
|
||||
promtail:
|
||||
image: {{ promtail_image }}
|
||||
container_name: promtail
|
||||
|
|
@ -544,12 +463,11 @@ services:
|
|||
FORGEJO_TOKEN: "${FORGEJO_RUNNER_TOKEN}"
|
||||
FORGEJO_URL: "https://{{ domain_git }}"
|
||||
FORGEJO_REPO: "jack/infra"
|
||||
PROMETHEUS_URL: "http://prometheus:9090"
|
||||
PROMETHEUS_URL: "http://{{ ip_tools }}:9090"
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
networks:
|
||||
- proxy # Discord API (internet)
|
||||
- monitoring # Prometheus metrics
|
||||
- proxy # Discord API (internet) + reach tools server over public IP
|
||||
|
||||
# ── Walava Landing ─────────────────────────────────────────────────────────
|
||||
# Landing page for walava.io — image built by walava-web repo CI/CD
|
||||
|
|
@ -560,26 +478,6 @@ services:
|
|||
networks:
|
||||
- proxy
|
||||
|
||||
# ── Uptime Kuma ────────────────────────────────────────────────────────────
|
||||
# Мониторинг доступности сервисов + публичная статус-страница
|
||||
# Доступен по адресу: https://{{ domain_status }}
|
||||
uptime-kuma:
|
||||
image: {{ uptime_kuma_image }}
|
||||
container_name: uptime-kuma
|
||||
restart: unless-stopped
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
networks:
|
||||
- backend
|
||||
- proxy # needs internet access for Discord/Telegram notifications
|
||||
volumes:
|
||||
- uptime_kuma_data:/app/data
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
|
||||
# ── Outline wiki ────────────────────────────────────────────────────────────
|
||||
outline:
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ AWS_REGION=ru-1
|
|||
AWS_S3_UPLOAD_BUCKET_NAME=walava-outline
|
||||
AWS_S3_UPLOAD_BUCKET_URL=https://s3.timeweb.cloud
|
||||
AWS_S3_FORCE_PATH_STYLE=true
|
||||
AWS_S3_ACL=private
|
||||
FILE_STORAGE=s3
|
||||
|
||||
# Auth
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ positions:
|
|||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
- url: http://{{ ip_tools }}:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: docker
|
||||
|
|
|
|||
|
|
@ -134,12 +134,12 @@ http:
|
|||
grafana:
|
||||
loadBalancer:
|
||||
servers:
|
||||
- url: "http://grafana:3000"
|
||||
- url: "http://{{ ip_tools }}:3000"
|
||||
|
||||
uptime-kuma:
|
||||
loadBalancer:
|
||||
servers:
|
||||
- url: "http://uptime-kuma:3001"
|
||||
- url: "http://{{ ip_tools }}:3001"
|
||||
|
||||
walava-landing:
|
||||
loadBalancer:
|
||||
|
|
|
|||
|
|
@ -1,2 +1,11 @@
|
|||
---
|
||||
tools_root: /opt/tools
|
||||
|
||||
# Image versions (mirrors services role — keep in sync)
|
||||
prometheus_image: "prom/prometheus:v3.4.0"
|
||||
node_exporter_image: "prom/node-exporter:v1.9.1"
|
||||
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1"
|
||||
grafana_image: "grafana/grafana:11.6.1"
|
||||
alertmanager_image: "prom/alertmanager:v0.28.1"
|
||||
loki_image: "grafana/loki:3.4.3"
|
||||
uptime_kuma_image: "louislam/uptime-kuma:1"
|
||||
|
|
|
|||
817
roles/tools/files/grafana/dashboards/cadvisor.json
Normal file
817
roles/tools/files/grafana/dashboards/cadvisor.json
Normal file
|
|
@ -0,0 +1,817 @@
|
|||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "Prometheus as the datasource is obligatory",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "7.4.5"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "graph",
|
||||
"name": "Graph",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": 14282,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"iteration": 1617715580880,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 8,
|
||||
"panels": [],
|
||||
"title": "CPU",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 15,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null as zero",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.5",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "CPU Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:606",
|
||||
"format": "percent",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:607",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 8
|
||||
},
|
||||
"id": 11,
|
||||
"panels": [],
|
||||
"title": "Memory",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 9
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 9,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null as zero",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.5",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Memory Usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:606",
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:607",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 9
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 14,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null as zero",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.5",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": true,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Memory Cached",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:606",
|
||||
"format": "bytes",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:607",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 17
|
||||
},
|
||||
"id": 2,
|
||||
"panels": [],
|
||||
"title": "Network",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 18
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 4,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"sideWidth": null,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.5",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Received Network Traffic",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:674",
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:675",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"fill": 1,
|
||||
"fillGradient": 0,
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 18
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 6,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": false,
|
||||
"max": true,
|
||||
"min": false,
|
||||
"rightSide": true,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"nullPointMode": "null",
|
||||
"options": {
|
||||
"alertThreshold": true
|
||||
},
|
||||
"percentage": false,
|
||||
"pluginVersion": "7.4.5",
|
||||
"pointradius": 2,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeRegions": [],
|
||||
"timeShift": null,
|
||||
"title": "Sent Network Traffic",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"$$hashKey": "object:832",
|
||||
"format": "Bps",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"$$hashKey": "object:833",
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 26
|
||||
},
|
||||
"id": 19,
|
||||
"panels": [],
|
||||
"title": "Misc",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": null,
|
||||
"filterable": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "id"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.width",
|
||||
"value": 260
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "Running"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "d"
|
||||
},
|
||||
{
|
||||
"id": "decimals",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-text"
|
||||
},
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"fixedColor": "dark-green",
|
||||
"mode": "fixed"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
},
|
||||
"id": 17,
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": []
|
||||
},
|
||||
"pluginVersion": "7.4.5",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"legendFormat": "{{name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Containers Info",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": [
|
||||
"container_label_com_docker_compose_project",
|
||||
"container_label_com_docker_compose_project_working_dir",
|
||||
"image",
|
||||
"instance",
|
||||
"name",
|
||||
"Value",
|
||||
"container_label_com_docker_compose_service"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {},
|
||||
"indexByName": {},
|
||||
"renameByName": {
|
||||
"Value": "Running",
|
||||
"container_label_com_docker_compose_project": "Label",
|
||||
"container_label_com_docker_compose_project_working_dir": "Working dir",
|
||||
"container_label_com_docker_compose_service": "Service",
|
||||
"image": "Registry Image",
|
||||
"instance": "Instance",
|
||||
"name": "Name"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 27,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"cadvisor",
|
||||
"docker"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "label_values({__name__=~\"container.*\"},instance)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Host",
|
||||
"multi": false,
|
||||
"name": "host",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values({__name__=~\"container.*\"},instance)",
|
||||
"refId": "Prometheus-host-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 5,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Container",
|
||||
"multi": false,
|
||||
"name": "container",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
|
||||
"refId": "Prometheus-container-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Cadvisor exporter",
|
||||
"uid": "pMEd7m0Mz",
|
||||
"version": 1,
|
||||
"description": "Simple exporter for cadvisor only"
|
||||
}
|
||||
15766
roles/tools/files/grafana/dashboards/node-exporter-full.json
Normal file
15766
roles/tools/files/grafana/dashboards/node-exporter-full.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -7,6 +7,29 @@
|
|||
group: "{{ deploy_group }}"
|
||||
mode: "0750"
|
||||
|
||||
# Pre-create every directory under tools_root that later tasks render files into.
- name: Create tools subdirectories
  ansible.builtin.file:
    state: directory
    path: "{{ tools_root }}/{{ item }}"
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
  loop:
    - prometheus
    - prometheus/rules
    - grafana/provisioning/datasources
    - grafana/provisioning/dashboards
    - grafana/provisioning/dashboards/json
    - loki
|
||||
|
||||
# Render the environment file for the tools compose project.
# env.j2 contains the Grafana admin password, so the file is owner-only (0600)
# and the task output is suppressed to keep the secret out of logs and --diff.
- name: Deploy .env file
  ansible.builtin.template:
    src: env.j2
    dest: "{{ tools_root }}/.env"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
    mode: "0600"
  no_log: true
|
||||
|
||||
- name: Deploy docker-compose.yml
|
||||
ansible.builtin.template:
|
||||
src: docker-compose.yml.j2
|
||||
|
|
@ -15,8 +38,130 @@
|
|||
group: "{{ deploy_group }}"
|
||||
mode: "0640"
|
||||
|
||||
# Render every monitoring config template into tools_root.
# Each item is the path relative to tools_root; the template source is the
# same relative path with a ".j2" suffix under the role's templates directory.
- name: Deploy monitoring configuration templates
  ansible.builtin.template:
    src: "{{ item }}.j2"
    dest: "{{ tools_root }}/{{ item }}"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
    mode: "0644"
  loop:
    # Prometheus + AlertManager
    - prometheus/prometheus.yml
    - prometheus/rules/alerts.yml
    - prometheus/alertmanager.yml
    # Loki
    - loki/loki.yml
    # Grafana provisioning (datasources + dashboard provider)
    - grafana/provisioning/datasources/prometheus.yml
    - grafana/provisioning/datasources/loki.yml
    - grafana/provisioning/dashboards/dashboards.yml
|
||||
|
||||
# Copy the static dashboard definitions into the directory that the Grafana
# dashboard provider reads (provisioning/dashboards/json).
- name: Deploy Grafana dashboard JSON files
  ansible.builtin.copy:
    src: "grafana/dashboards/{{ item }}"
    dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/{{ item }}"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
    mode: "0644"
  loop:
    - node-exporter-full.json
    - cadvisor.json
|
||||
|
||||
# Pre-pull all images so the compose task below can run with pull disabled.
# Each pull is retried (5 attempts, 30 s apart) until docker exits with rc 0;
# the task only reports "changed" when docker says it downloaded a newer image.
- name: Pull monitoring images
  ansible.builtin.command:
    cmd: docker pull {{ item }}
  register: pull_result
  until: pull_result.rc == 0
  retries: 5
  delay: 30
  changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
  loop:
    - "{{ prometheus_image }}"
    - "{{ alertmanager_image }}"
    - "{{ node_exporter_image }}"
    - "{{ cadvisor_image }}"
    - "{{ grafana_image }}"
    - "{{ loki_image }}"
    - "{{ uptime_kuma_image }}"
|
||||
|
||||
# ── UFW: allow main server to reach monitoring services ───────────────────────
|
||||
# Open the monitoring ports to the main server only (TCP, source = ip_main).
# NOTE(review): ports published by Docker are usually filtered in Docker's own
# iptables chains rather than by UFW, so these rules may not be what actually
# restricts access to the containers — verify from an external host.
- name: Allow main server to reach monitoring services
  community.general.ufw:
    rule: allow
    port: "{{ item.port }}"
    proto: tcp
    src: "{{ ip_main }}"
  loop:
    - { port: "3100", purpose: "Loki (Promtail log push)" }
    - { port: "9090", purpose: "Prometheus (discord-bot metrics)" }
    - { port: "3000", purpose: "Grafana (main Traefik)" }
    - { port: "3001", purpose: "Uptime Kuma (main Traefik)" }
  loop_control:
    label: "{{ item.port }} — {{ item.purpose }}"
|
||||
|
||||
# Bring the compose project in tools_root up. Images are never pulled here
# (they are fetched by the dedicated pull task), and containers that are no
# longer defined in the compose file are removed. Retried up to 3 times.
- name: Start tools stack
  community.docker.docker_compose_v2:
    project_src: "{{ tools_root }}"
    state: present
    remove_orphans: true
    pull: never
  register: compose_result
  until: compose_result is succeeded
  retries: 3
  delay: 15
|
||||
|
|
|
|||
|
|
@ -1,10 +1,157 @@
|
|||
# Tools stack — generated by Ansible
|
||||
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
|
||||
# All app services (Outline, n8n) have been migrated to main server.
|
||||
# Monitoring stack (Grafana, Prometheus, Loki, Alertmanager) will be added here.
|
||||
# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor
|
||||
|
||||
networks:
|
||||
front:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
|
||||
services: {}
|
||||
volumes:
|
||||
prometheus_data:
|
||||
grafana_data:
|
||||
loki_data:
|
||||
uptime_kuma_data:
|
||||
|
||||
services:
|
||||
|
||||
# ── Prometheus ─────────────────────────────────────────────────────────────
|
||||
prometheus:
  image: {{ prometheus_image }}
  container_name: prometheus
  restart: unless-stopped
  networks:
    - monitoring
  ports:
    # Must be reachable from the main server (discord-bot queries it over the
    # network), so it cannot be bound to 127.0.0.1 — a loopback-only publish
    # makes the UFW allow rule for ip_main on 9090 pointless.
    # NOTE(review): Docker-published ports may bypass UFW filtering; confirm
    # that 9090 is not reachable from hosts other than ip_main.
    - "9090:9090"
  volumes:
    - prometheus_data:/prometheus
    - {{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - {{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    # Keep 30 days of metrics on the prometheus_data volume.
    - "--storage.tsdb.retention.time=30d"
    - "--web.console.libraries=/usr/share/prometheus/console_libraries"
    - "--web.console.templates=/usr/share/prometheus/consoles"
  healthcheck:
    test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
    interval: 30s
    timeout: 5s
    retries: 3
|
||||
|
||||
# AlertManager: receives alerts from Prometheus over the monitoring network.
# No host port is published; the config file is bind-mounted read-only.
alertmanager:
  container_name: alertmanager
  image: {{ alertmanager_image }}
  restart: unless-stopped
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
  volumes:
    - {{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  networks:
    - monitoring
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-qO-"
      - "http://localhost:9093/-/healthy"
    interval: 30s
    timeout: 5s
    retries: 3
|
||||
|
||||
# ── Exporters (monitor the tools host itself) ───────────────────────────────
|
||||
# node-exporter for the tools host itself. Host /proc, /sys and / are mounted
# read-only and the exporter is pointed at them via the --path.* flags.
node-exporter:
  image: {{ node_exporter_image }}
  container_name: node-exporter
  restart: unless-stopped
  networks:
    - monitoring
  pid: host
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - "--path.procfs=/host/proc"
    - "--path.sysfs=/host/sys"
    # Without this flag the /rootfs mount is unused and filesystem stats are
    # taken from paths inside the container instead of the host's mounts.
    - "--path.rootfs=/rootfs"
    # "$$" is compose escaping for a literal "$" in the regex.
    - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
|
||||
# cAdvisor for the tools host: runs privileged with read-only views of the
# host filesystem, Docker state and /sys; reachable only on the monitoring network.
cadvisor:
  container_name: cadvisor
  image: {{ cadvisor_image }}
  restart: unless-stopped
  privileged: true
  networks:
    - monitoring
  devices:
    - /dev/kmsg
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker:/var/lib/docker:ro
    - /dev/disk:/dev/disk:ro
|
||||
|
||||
# ── Grafana ─────────────────────────────────────────────────────────────────
|
||||
# Grafana: published on host port 3000, provisioned from the read-only
# provisioning directory. The admin password comes from .env (env_file);
# the remaining settings are set inline below.
grafana:
  container_name: grafana
  image: {{ grafana_image }}
  restart: unless-stopped
  depends_on:
    - prometheus
  security_opt:
    - no-new-privileges:true
  networks:
    - monitoring
  ports:
    - "3000:3000"
  env_file: .env
  environment:
    - GF_SECURITY_ADMIN_USER=admin
    - GF_USERS_ALLOW_SIGN_UP=false
    - GF_SERVER_DOMAIN={{ domain_dashboard }}
    - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
    - GF_AUTH_ANONYMOUS_ENABLED=false
  volumes:
    - grafana_data:/var/lib/grafana
    - {{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-qO-"
      - "http://localhost:3000/api/health"
    interval: 30s
    timeout: 5s
    retries: 3
|
||||
|
||||
# ── Loki ────────────────────────────────────────────────────────────────────
|
||||
# Loki: log store. Port 3100 is published on the host so Promtail on the main
# server can push logs; data lives on the loki_data volume.
loki:
  container_name: loki
  image: {{ loki_image }}
  restart: unless-stopped
  command: -config.file=/etc/loki/local-config.yaml
  networks:
    - monitoring
  ports:
    - "3100:3100"
  volumes:
    - loki_data:/loki
    - {{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
  healthcheck:
    test:
      - "CMD"
      - "wget"
      - "-qO-"
      - "http://localhost:3100/ready"
    interval: 30s
    timeout: 5s
    retries: 3
|
||||
|
||||
# ── Uptime Kuma ─────────────────────────────────────────────────────────────
|
||||
# Uptime Kuma: status page, published on host port 3001 with its state kept
# on the uptime_kuma_data volume.
uptime-kuma:
  container_name: uptime-kuma
  image: {{ uptime_kuma_image }}
  restart: unless-stopped
  security_opt:
    - no-new-privileges:true
  networks:
    - monitoring
  ports:
    - "3001:3001"
  volumes:
    - uptime_kuma_data:/app/data
  healthcheck:
    test:
      - "CMD"
      - "curl"
      - "-sf"
      - "http://localhost:3001/"
    interval: 30s
    timeout: 5s
    retries: 3
|
||||
|
|
|
|||
2
roles/tools/templates/env.j2
Normal file
2
roles/tools/templates/env.j2
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
# Generated by Ansible — do not edit manually
# Rendered to <tools_root>/.env with mode 0600 and loaded by the grafana
# service through env_file; holds the Grafana admin password.
GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
# Generated by Ansible — do not edit manually
# Grafana dashboard provider: loads dashboard JSON files from disk.
apiVersion: 1

providers:
  - name: default
    type: file
    orgId: 1
    folder: ""
    # Dashboards are managed as files: UI edits are not allowed.
    allowUiUpdates: false
    disableDeletion: false
    updateIntervalSeconds: 30
    options:
      path: /etc/grafana/provisioning/dashboards/json
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
# Generated by Ansible — do not edit manually
# Grafana datasource: Loki, addressed by its compose service name.
apiVersion: 1

datasources:
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy
    editable: false
    isDefault: false
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
# Generated by Ansible — do not edit manually
# Grafana datasource: Prometheus (the default), addressed by its compose service name.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    editable: false
    isDefault: true
|
||||
36
roles/tools/templates/loki/loki.yml.j2
Normal file
36
roles/tools/templates/loki/loki.yml.j2
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Generated by Ansible — do not edit manually
# Loki configuration: filesystem storage under /loki, in-memory ring,
# replication factor 1, authentication disabled.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules

schema_config:
  configs:
    - from: 2020-10-24
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        prefix: index_
        period: 24h

# Retention: 30 days, with retention enabled in the compactor below.
limits_config:
  retention_period: 30d

compactor:
  retention_enabled: true
  working_directory: /loki/retention
  delete_request_store: filesystem
|
||||
38
roles/tools/templates/prometheus/alertmanager.yml.j2
Normal file
38
roles/tools/templates/prometheus/alertmanager.yml.j2
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# Generated by Ansible — do not edit manually
# AlertManager configuration: every alert goes to the single "all" receiver,
# which notifies both Telegram and Discord.
# Alertmanager template delimiters are emitted through Jinja string literals
# ({{ '{{' }} / {{ '}}' }}) so Ansible does not try to evaluate them.
global:
  resolve_timeout: 5m

route:
  group_by: [alertname, severity]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: all

receivers:
  - name: all
    telegram_configs:
      - bot_token: "{{ alertmanager_telegram_token }}"
        chat_id: {{ alertmanager_telegram_chat_id }}
        # HTML instead of Markdown: annotation text is free-form (e.g. it can
        # contain a lone "_" as in "mem_limit"), which Telegram's Markdown
        # parser rejects as an unterminated entity, dropping the notification.
        parse_mode: HTML
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} <b>{{ '{{' }} .Labels.alertname {{ '}}' }}</b>
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}
    discord_configs:
      - webhook_url: "{{ discord_webhook_alerts }}"
        title: >-
          {{ '{{' }} if eq (index .Alerts 0).Status "firing" {{ '}}' }}🔴 Alert{{ '{{' }} else {{ '}}' }}🟢 Resolved{{ '{{' }} end {{ '}}' }}
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          **{{ '{{' }} .Labels.alertname {{ '}}' }}**
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}

# A firing critical alert mutes the warning alert with the same alertname.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    equal: [alertname]
|
||||
49
roles/tools/templates/prometheus/prometheus.yml.j2
Normal file
49
roles/tools/templates/prometheus/prometheus.yml.j2
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# Generated by Ansible — do not edit manually
# Prometheus configuration for the tools server. Local exporters are scraped
# by compose service name; the main server's exporters are scraped by IP.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    instance: "{{ domain_base }}"

rule_files:
  - /etc/prometheus/rules/*.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - "localhost:9090"

  - job_name: alertmanager
    static_configs:
      - targets:
          - "alertmanager:9093"

  # tools server metrics
  - job_name: node-exporter-tools
    static_configs:
      - targets:
          - "node-exporter:9100"
        labels:
          host: tools

  - job_name: cadvisor-tools
    static_configs:
      - targets:
          - "cadvisor:8080"
        labels:
          host: tools

  # main server metrics (scraped over network)
  - job_name: node-exporter-main
    static_configs:
      - targets:
          - "{{ ip_main }}:9100"
        labels:
          host: main

  - job_name: cadvisor-main
    static_configs:
      - targets:
          - "{{ ip_main }}:8080"
        labels:
          host: main
|
||||
86
roles/tools/templates/prometheus/rules/alerts.yml.j2
Normal file
86
roles/tools/templates/prometheus/rules/alerts.yml.j2
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
# Generated by Ansible — do not edit manually
# Prometheus alert rules. Prometheus template delimiters are emitted through
# Jinja string literals ({{ '{{' }} / {{ '}}' }}) so Ansible leaves them intact.
groups:
  # Host-level alerts based on node-exporter metrics.
  - name: host
    rules:
      - alert: HighCPULoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "CPU загружен более 85% на протяжении 5 минут."

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Использование RAM превысило 85%."

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "RAM заполнена на 95%+. Возможны OOM kills."

      - alert: DiskSpaceWarning
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

      - alert: DiskSpaceCritical
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

      - alert: SwapUsageHigh
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Swap используется более чем на 50% — RAM под давлением."

  # Container-level alerts based on cAdvisor metrics (named containers only).
  - name: containers
    rules:
      - alert: ContainerDown
        expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
          description: "Контейнер не отвечает более 2 минут."

      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
          description: "Контейнер близок к mem_limit — возможен OOM kill."

      - alert: ContainerRestarting
        # "and" matches on identical label sets; the CPU counter carries a
        # "cpu" label that container_last_seen lacks, so it must be ignored
        # or the two sides can never match and the rule never fires.
        # NOTE(review): confirm the label sets against the deployed cAdvisor.
        expr: (increase(container_last_seen{name=~".+"}[5m]) == 0) and ignoring(cpu) (rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
          description: "Контейнер не активен — проверьте docker ps."
|
||||
Loading…
Reference in a new issue