feat: migrate monitoring to tools server, fix Outline S3 uploads
Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma) moved from the main server to the tools server.
Prometheus now scrapes the main server's exporters over the network (ip_main:9100/8080).
Promtail pushes logs to ip_tools:3100.
Traefik routes for dash/status.walava.io updated to point at ip_tools.
discord-bot PROMETHEUS_URL updated to http://ip_tools:9090.

Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support per-object ACLs, which caused upload failures).
Add a CORS configuration task for browser-side presigned uploads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d6015b76a3
commit
fde51352d7
20 changed files with 17172 additions and 210 deletions
|
|
@ -53,77 +53,19 @@
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
notify: Restart stack
|
notify: Restart stack
|
||||||
|
|
||||||
- name: Deploy Prometheus config
|
- name: Configure CORS on walava-outline S3 bucket (required for browser uploads)
|
||||||
ansible.builtin.template:
|
ansible.builtin.shell: |
|
||||||
src: prometheus/prometheus.yml.j2
|
docker run --rm \
|
||||||
dest: "{{ services_root }}/prometheus/prometheus.yml"
|
-e AWS_ACCESS_KEY_ID={{ s3_access_key }} \
|
||||||
owner: "{{ deploy_user }}"
|
-e AWS_SECRET_ACCESS_KEY={{ s3_secret_key }} \
|
||||||
group: "{{ deploy_group }}"
|
-e AWS_DEFAULT_REGION=ru-1 \
|
||||||
mode: "0644"
|
amazon/aws-cli:latest \
|
||||||
notify: Restart stack
|
--endpoint-url https://s3.timeweb.cloud \
|
||||||
|
s3api put-bucket-cors \
|
||||||
- name: Deploy Grafana datasource provisioning
|
--bucket walava-outline \
|
||||||
ansible.builtin.template:
|
--cors-configuration '{"CORSRules":[{"AllowedOrigins":["https://{{ domain_wiki }}"],"AllowedMethods":["GET","PUT","POST","DELETE","HEAD"],"AllowedHeaders":["*"],"ExposeHeaders":["ETag"],"MaxAgeSeconds":3000}]}'
|
||||||
src: grafana/provisioning/datasources/prometheus.yml.j2
|
changed_when: false
|
||||||
dest: "{{ services_root }}/grafana/provisioning/datasources/prometheus.yml"
|
ignore_errors: true
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy Grafana dashboard provisioning config
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: grafana/provisioning/dashboards/dashboards.yml.j2
|
|
||||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/dashboards.yml"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy Node Exporter Full dashboard JSON
|
|
||||||
ansible.builtin.copy:
|
|
||||||
src: grafana/dashboards/node-exporter-full.json
|
|
||||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy cAdvisor dashboard JSON
|
|
||||||
ansible.builtin.copy:
|
|
||||||
src: grafana/dashboards/cadvisor.json
|
|
||||||
dest: "{{ services_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy Prometheus alert rules
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: prometheus/rules/alerts.yml.j2
|
|
||||||
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy AlertManager config
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: prometheus/alertmanager.yml.j2
|
|
||||||
dest: "{{ services_root }}/prometheus/alertmanager.yml"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy Loki config
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: loki/loki.yml.j2
|
|
||||||
dest: "{{ services_root }}/loki/loki.yml"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy Promtail config
|
- name: Deploy Promtail config
|
||||||
ansible.builtin.template:
|
ansible.builtin.template:
|
||||||
|
|
@ -134,15 +76,6 @@
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
notify: Restart stack
|
notify: Restart stack
|
||||||
|
|
||||||
- name: Deploy Grafana Loki datasource
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: grafana/provisioning/datasources/loki.yml.j2
|
|
||||||
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
|
|
||||||
owner: "{{ deploy_user }}"
|
|
||||||
group: "{{ deploy_group }}"
|
|
||||||
mode: "0644"
|
|
||||||
notify: Restart stack
|
|
||||||
|
|
||||||
- name: Deploy CrowdSec acquisition config
|
- name: Deploy CrowdSec acquisition config
|
||||||
ansible.builtin.template:
|
ansible.builtin.template:
|
||||||
src: crowdsec/acquis.yaml.j2
|
src: crowdsec/acquis.yaml.j2
|
||||||
|
|
|
||||||
|
|
@ -22,12 +22,6 @@
|
||||||
- plane/pgdata
|
- plane/pgdata
|
||||||
- plane/media
|
- plane/media
|
||||||
- act_runner
|
- act_runner
|
||||||
- prometheus
|
|
||||||
- grafana/provisioning/datasources
|
|
||||||
- grafana/provisioning/dashboards
|
|
||||||
- grafana/provisioning/dashboards/json
|
|
||||||
- prometheus/rules
|
|
||||||
- loki
|
- loki
|
||||||
- traefik/logs
|
- traefik/logs
|
||||||
- crowdsec
|
- crowdsec
|
||||||
- authelia
|
|
||||||
|
|
|
||||||
|
|
@ -16,15 +16,10 @@
|
||||||
- "{{ plane_redis_image }}"
|
- "{{ plane_redis_image }}"
|
||||||
- "{{ plane_minio_image }}"
|
- "{{ plane_minio_image }}"
|
||||||
- "{{ act_runner_image }}"
|
- "{{ act_runner_image }}"
|
||||||
- "{{ prometheus_image }}"
|
|
||||||
- "{{ node_exporter_image }}"
|
- "{{ node_exporter_image }}"
|
||||||
- "{{ cadvisor_image }}"
|
- "{{ cadvisor_image }}"
|
||||||
- "{{ grafana_image }}"
|
|
||||||
- "{{ alertmanager_image }}"
|
|
||||||
- "{{ loki_image }}"
|
|
||||||
- "{{ promtail_image }}"
|
- "{{ promtail_image }}"
|
||||||
- "{{ crowdsec_image }}"
|
- "{{ crowdsec_image }}"
|
||||||
- "{{ uptime_kuma_image }}"
|
|
||||||
- "{{ outline_image }}"
|
- "{{ outline_image }}"
|
||||||
- "{{ outline_db_image }}"
|
- "{{ outline_db_image }}"
|
||||||
- "{{ outline_redis_image }}"
|
- "{{ outline_redis_image }}"
|
||||||
|
|
@ -35,6 +30,21 @@
|
||||||
delay: 30
|
delay: 30
|
||||||
until: pull_result.rc == 0
|
until: pull_result.rc == 0
|
||||||
|
|
||||||
|
# ── UFW: allow tools Prometheus to scrape exporters on main ──────────────────
|
||||||
|
- name: Allow tools server to scrape node-exporter
|
||||||
|
community.general.ufw:
|
||||||
|
rule: allow
|
||||||
|
port: "9100"
|
||||||
|
proto: tcp
|
||||||
|
src: "{{ ip_tools }}"
|
||||||
|
|
||||||
|
- name: Allow tools server to scrape cAdvisor
|
||||||
|
community.general.ufw:
|
||||||
|
rule: allow
|
||||||
|
port: "8080"
|
||||||
|
proto: tcp
|
||||||
|
src: "{{ ip_tools }}"
|
||||||
|
|
||||||
- name: Remove legacy SMTP relay UFW rule (port 1025)
|
- name: Remove legacy SMTP relay UFW rule (port 1025)
|
||||||
community.general.ufw:
|
community.general.ufw:
|
||||||
rule: allow
|
rule: allow
|
||||||
|
|
|
||||||
|
|
@ -40,11 +40,7 @@ volumes:
|
||||||
plane_minio_data:
|
plane_minio_data:
|
||||||
plane_media:
|
plane_media:
|
||||||
act_runner_data:
|
act_runner_data:
|
||||||
prometheus_data:
|
|
||||||
grafana_data:
|
|
||||||
loki_data:
|
|
||||||
crowdsec_data:
|
crowdsec_data:
|
||||||
uptime_kuma_data:
|
|
||||||
outline_db_data:
|
outline_db_data:
|
||||||
outline_redis_data:
|
outline_redis_data:
|
||||||
n8n_data:
|
n8n_data:
|
||||||
|
|
@ -381,52 +377,16 @@ services:
|
||||||
- backend
|
- backend
|
||||||
- runner-jobs
|
- runner-jobs
|
||||||
|
|
||||||
# ── Monitoring Stack ───────────────────────────────────────────────────────
|
# ── Monitoring exporters (metrics scraped by tools Prometheus over network) ──
|
||||||
prometheus:
|
# Ports exposed: UFW on this (main) server must allow ip_tools to reach ports 9100/8080
|
||||||
image: {{ prometheus_image }}
|
|
||||||
container_name: prometheus
|
|
||||||
restart: unless-stopped
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
volumes:
|
|
||||||
- prometheus_data:/prometheus
|
|
||||||
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
|
||||||
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
|
|
||||||
command:
|
|
||||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
|
||||||
- "--storage.tsdb.path=/prometheus"
|
|
||||||
- "--storage.tsdb.retention.time=30d"
|
|
||||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
|
||||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
alertmanager:
|
|
||||||
image: {{ alertmanager_image }}
|
|
||||||
container_name: alertmanager
|
|
||||||
restart: unless-stopped
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
volumes:
|
|
||||||
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
|
||||||
command:
|
|
||||||
- "--config.file=/etc/alertmanager/alertmanager.yml"
|
|
||||||
- "--storage.path=/alertmanager"
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
node-exporter:
|
node-exporter:
|
||||||
image: {{ node_exporter_image }}
|
image: {{ node_exporter_image }}
|
||||||
container_name: node-exporter
|
container_name: node-exporter
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
networks:
|
||||||
- monitoring
|
- monitoring
|
||||||
|
ports:
|
||||||
|
- "9100:9100"
|
||||||
pid: host
|
pid: host
|
||||||
volumes:
|
volumes:
|
||||||
- /proc:/host/proc:ro
|
- /proc:/host/proc:ro
|
||||||
|
|
@ -443,6 +403,8 @@ services:
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
networks:
|
networks:
|
||||||
- monitoring
|
- monitoring
|
||||||
|
ports:
|
||||||
|
- "8080:8080"
|
||||||
privileged: true
|
privileged: true
|
||||||
devices:
|
devices:
|
||||||
- /dev/kmsg
|
- /dev/kmsg
|
||||||
|
|
@ -453,50 +415,7 @@ services:
|
||||||
- /var/lib/docker:/var/lib/docker:ro
|
- /var/lib/docker:/var/lib/docker:ro
|
||||||
- /dev/disk:/dev/disk:ro
|
- /dev/disk:/dev/disk:ro
|
||||||
|
|
||||||
grafana:
|
# ── Logging (Promtail pushes to Loki on tools server) ─────────────────────
|
||||||
image: {{ grafana_image }}
|
|
||||||
container_name: grafana
|
|
||||||
restart: unless-stopped
|
|
||||||
security_opt:
|
|
||||||
- no-new-privileges:true
|
|
||||||
depends_on:
|
|
||||||
- prometheus
|
|
||||||
networks:
|
|
||||||
- backend
|
|
||||||
- monitoring
|
|
||||||
volumes:
|
|
||||||
- grafana_data:/var/lib/grafana
|
|
||||||
- {{ services_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
|
|
||||||
environment:
|
|
||||||
- GF_SECURITY_ADMIN_USER=admin
|
|
||||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
|
|
||||||
- GF_USERS_ALLOW_SIGN_UP=false
|
|
||||||
- GF_SERVER_DOMAIN={{ domain_dashboard }}
|
|
||||||
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
|
|
||||||
- GF_AUTH_ANONYMOUS_ENABLED=false
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
# ── Logging Stack ──────────────────────────────────────────────────────────
|
|
||||||
loki:
|
|
||||||
image: {{ loki_image }}
|
|
||||||
container_name: loki
|
|
||||||
restart: unless-stopped
|
|
||||||
networks:
|
|
||||||
- monitoring
|
|
||||||
volumes:
|
|
||||||
- loki_data:/loki
|
|
||||||
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
|
|
||||||
command: -config.file=/etc/loki/local-config.yaml
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
promtail:
|
promtail:
|
||||||
image: {{ promtail_image }}
|
image: {{ promtail_image }}
|
||||||
container_name: promtail
|
container_name: promtail
|
||||||
|
|
@ -544,12 +463,11 @@ services:
|
||||||
FORGEJO_TOKEN: "${FORGEJO_RUNNER_TOKEN}"
|
FORGEJO_TOKEN: "${FORGEJO_RUNNER_TOKEN}"
|
||||||
FORGEJO_URL: "https://{{ domain_git }}"
|
FORGEJO_URL: "https://{{ domain_git }}"
|
||||||
FORGEJO_REPO: "jack/infra"
|
FORGEJO_REPO: "jack/infra"
|
||||||
PROMETHEUS_URL: "http://prometheus:9090"
|
PROMETHEUS_URL: "http://{{ ip_tools }}:9090"
|
||||||
volumes:
|
volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
networks:
|
networks:
|
||||||
- proxy # Discord API (internet)
|
- proxy # Discord API (internet) + reach tools server over public IP
|
||||||
- monitoring # Prometheus metrics
|
|
||||||
|
|
||||||
# ── Walava Landing ─────────────────────────────────────────────────────────
|
# ── Walava Landing ─────────────────────────────────────────────────────────
|
||||||
# Landing page for walava.io — image built by walava-web repo CI/CD
|
# Landing page for walava.io — image built by walava-web repo CI/CD
|
||||||
|
|
@ -560,26 +478,6 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- proxy
|
- proxy
|
||||||
|
|
||||||
# ── Uptime Kuma ────────────────────────────────────────────────────────────
|
|
||||||
# Мониторинг доступности сервисов + публичная статус-страница
|
|
||||||
# Доступен по адресу: https://{{ domain_status }}
|
|
||||||
uptime-kuma:
|
|
||||||
image: {{ uptime_kuma_image }}
|
|
||||||
container_name: uptime-kuma
|
|
||||||
restart: unless-stopped
|
|
||||||
security_opt:
|
|
||||||
- no-new-privileges:true
|
|
||||||
networks:
|
|
||||||
- backend
|
|
||||||
- proxy # needs internet access for Discord/Telegram notifications
|
|
||||||
volumes:
|
|
||||||
- uptime_kuma_data:/app/data
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 5s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
|
|
||||||
# ── Outline wiki ────────────────────────────────────────────────────────────
|
# ── Outline wiki ────────────────────────────────────────────────────────────
|
||||||
outline:
|
outline:
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,6 @@ AWS_REGION=ru-1
|
||||||
AWS_S3_UPLOAD_BUCKET_NAME=walava-outline
|
AWS_S3_UPLOAD_BUCKET_NAME=walava-outline
|
||||||
AWS_S3_UPLOAD_BUCKET_URL=https://s3.timeweb.cloud
|
AWS_S3_UPLOAD_BUCKET_URL=https://s3.timeweb.cloud
|
||||||
AWS_S3_FORCE_PATH_STYLE=true
|
AWS_S3_FORCE_PATH_STYLE=true
|
||||||
AWS_S3_ACL=private
|
|
||||||
FILE_STORAGE=s3
|
FILE_STORAGE=s3
|
||||||
|
|
||||||
# Auth
|
# Auth
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ positions:
|
||||||
filename: /tmp/positions.yaml
|
filename: /tmp/positions.yaml
|
||||||
|
|
||||||
clients:
|
clients:
|
||||||
- url: http://loki:3100/loki/api/v1/push
|
- url: http://{{ ip_tools }}:3100/loki/api/v1/push
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: docker
|
- job_name: docker
|
||||||
|
|
|
||||||
|
|
@ -134,12 +134,12 @@ http:
|
||||||
grafana:
|
grafana:
|
||||||
loadBalancer:
|
loadBalancer:
|
||||||
servers:
|
servers:
|
||||||
- url: "http://grafana:3000"
|
- url: "http://{{ ip_tools }}:3000"
|
||||||
|
|
||||||
uptime-kuma:
|
uptime-kuma:
|
||||||
loadBalancer:
|
loadBalancer:
|
||||||
servers:
|
servers:
|
||||||
- url: "http://uptime-kuma:3001"
|
- url: "http://{{ ip_tools }}:3001"
|
||||||
|
|
||||||
walava-landing:
|
walava-landing:
|
||||||
loadBalancer:
|
loadBalancer:
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,11 @@
|
||||||
---
|
---
|
||||||
tools_root: /opt/tools
|
tools_root: /opt/tools
|
||||||
|
|
||||||
|
# Image versions (mirrors services role — keep in sync)
|
||||||
|
prometheus_image: "prom/prometheus:v3.4.0"
|
||||||
|
node_exporter_image: "prom/node-exporter:v1.9.1"
|
||||||
|
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1"
|
||||||
|
grafana_image: "grafana/grafana:11.6.1"
|
||||||
|
alertmanager_image: "prom/alertmanager:v0.28.1"
|
||||||
|
loki_image: "grafana/loki:3.4.3"
|
||||||
|
uptime_kuma_image: "louislam/uptime-kuma:1"
|
||||||
|
|
|
||||||
817
roles/tools/files/grafana/dashboards/cadvisor.json
Normal file
817
roles/tools/files/grafana/dashboards/cadvisor.json
Normal file
|
|
@ -0,0 +1,817 @@
|
||||||
|
{
|
||||||
|
"__inputs": [
|
||||||
|
{
|
||||||
|
"name": "DS_PROMETHEUS",
|
||||||
|
"label": "Prometheus",
|
||||||
|
"description": "Prometheus as the datasource is obligatory",
|
||||||
|
"type": "datasource",
|
||||||
|
"pluginId": "prometheus",
|
||||||
|
"pluginName": "Prometheus"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
{
|
||||||
|
"type": "grafana",
|
||||||
|
"id": "grafana",
|
||||||
|
"name": "Grafana",
|
||||||
|
"version": "7.4.5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "graph",
|
||||||
|
"name": "Graph",
|
||||||
|
"version": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "datasource",
|
||||||
|
"id": "prometheus",
|
||||||
|
"name": "Prometheus",
|
||||||
|
"version": "1.0.0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "table",
|
||||||
|
"name": "Table",
|
||||||
|
"version": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"gnetId": 14282,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": null,
|
||||||
|
"iteration": 1617715580880,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"collapsed": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"gridPos": {
|
||||||
|
"h": 1,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"id": 8,
|
||||||
|
"panels": [],
|
||||||
|
"title": "CPU",
|
||||||
|
"type": "row"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 7,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 1
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 15,
|
||||||
|
"legend": {
|
||||||
|
"alignAsTable": true,
|
||||||
|
"avg": true,
|
||||||
|
"current": false,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"rightSide": true,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"options": {
|
||||||
|
"alertThreshold": true
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name) *100",
|
||||||
|
"hide": false,
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "CPU Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:606",
|
||||||
|
"format": "percent",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:607",
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapsed": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"gridPos": {
|
||||||
|
"h": 1,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 8
|
||||||
|
},
|
||||||
|
"id": 11,
|
||||||
|
"panels": [],
|
||||||
|
"title": "Memory",
|
||||||
|
"type": "row"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 9
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 9,
|
||||||
|
"legend": {
|
||||||
|
"alignAsTable": true,
|
||||||
|
"avg": true,
|
||||||
|
"current": false,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"rightSide": true,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"options": {
|
||||||
|
"alertThreshold": true
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(container_memory_rss{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||||
|
"hide": false,
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:606",
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:607",
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 9
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 14,
|
||||||
|
"legend": {
|
||||||
|
"alignAsTable": true,
|
||||||
|
"avg": true,
|
||||||
|
"current": false,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"rightSide": true,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null as zero",
|
||||||
|
"options": {
|
||||||
|
"alertThreshold": true
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": true,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(container_memory_cache{instance=~\"$host\",name=~\"$container\",name=~\".+\"}) by (name)",
|
||||||
|
"hide": false,
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Memory Cached",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:606",
|
||||||
|
"format": "bytes",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:607",
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapsed": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"gridPos": {
|
||||||
|
"h": 1,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 17
|
||||||
|
},
|
||||||
|
"id": 2,
|
||||||
|
"panels": [],
|
||||||
|
"title": "Network",
|
||||||
|
"type": "row"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 18
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 4,
|
||||||
|
"legend": {
|
||||||
|
"alignAsTable": true,
|
||||||
|
"avg": true,
|
||||||
|
"current": false,
|
||||||
|
"hideEmpty": false,
|
||||||
|
"hideZero": false,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"rightSide": true,
|
||||||
|
"show": true,
|
||||||
|
"sideWidth": null,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"alertThreshold": true
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||||
|
"hide": false,
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Received Network Traffic",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:674",
|
||||||
|
"format": "Bps",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:675",
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"aliasColors": {},
|
||||||
|
"bars": false,
|
||||||
|
"dashLength": 10,
|
||||||
|
"dashes": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
},
|
||||||
|
"fill": 1,
|
||||||
|
"fillGradient": 0,
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 18
|
||||||
|
},
|
||||||
|
"hiddenSeries": false,
|
||||||
|
"id": 6,
|
||||||
|
"legend": {
|
||||||
|
"alignAsTable": true,
|
||||||
|
"avg": true,
|
||||||
|
"current": false,
|
||||||
|
"max": true,
|
||||||
|
"min": false,
|
||||||
|
"rightSide": true,
|
||||||
|
"show": true,
|
||||||
|
"total": false,
|
||||||
|
"values": true
|
||||||
|
},
|
||||||
|
"lines": true,
|
||||||
|
"linewidth": 1,
|
||||||
|
"nullPointMode": "null",
|
||||||
|
"options": {
|
||||||
|
"alertThreshold": true
|
||||||
|
},
|
||||||
|
"percentage": false,
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"pointradius": 2,
|
||||||
|
"points": false,
|
||||||
|
"renderer": "flot",
|
||||||
|
"seriesOverrides": [],
|
||||||
|
"spaceLength": 10,
|
||||||
|
"stack": false,
|
||||||
|
"steppedLine": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$host\",name=~\"$container\",name=~\".+\"}[5m])) by (name)",
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": [],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeRegions": [],
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Sent Network Traffic",
|
||||||
|
"tooltip": {
|
||||||
|
"shared": true,
|
||||||
|
"sort": 0,
|
||||||
|
"value_type": "individual"
|
||||||
|
},
|
||||||
|
"type": "graph",
|
||||||
|
"xaxis": {
|
||||||
|
"buckets": null,
|
||||||
|
"mode": "time",
|
||||||
|
"name": null,
|
||||||
|
"show": true,
|
||||||
|
"values": []
|
||||||
|
},
|
||||||
|
"yaxes": [
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:832",
|
||||||
|
"format": "Bps",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$$hashKey": "object:833",
|
||||||
|
"format": "short",
|
||||||
|
"label": null,
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": null,
|
||||||
|
"show": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"yaxis": {
|
||||||
|
"align": false,
|
||||||
|
"alignLevel": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"collapsed": false,
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"gridPos": {
|
||||||
|
"h": 1,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 26
|
||||||
|
},
|
||||||
|
"id": 19,
|
||||||
|
"panels": [],
|
||||||
|
"title": "Misc",
|
||||||
|
"type": "row"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"custom": {
|
||||||
|
"align": null,
|
||||||
|
"filterable": false
|
||||||
|
},
|
||||||
|
"mappings": [],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 80
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {
|
||||||
|
"id": "byName",
|
||||||
|
"options": "id"
|
||||||
|
},
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "custom.width",
|
||||||
|
"value": 260
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"matcher": {
|
||||||
|
"id": "byName",
|
||||||
|
"options": "Running"
|
||||||
|
},
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "unit",
|
||||||
|
"value": "d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "decimals",
|
||||||
|
"value": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "custom.displayMode",
|
||||||
|
"value": "color-text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "color",
|
||||||
|
"value": {
|
||||||
|
"fixedColor": "dark-green",
|
||||||
|
"mode": "fixed"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 10,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 27
|
||||||
|
},
|
||||||
|
"id": 17,
|
||||||
|
"options": {
|
||||||
|
"showHeader": true,
|
||||||
|
"sortBy": []
|
||||||
|
},
|
||||||
|
"pluginVersion": "7.4.5",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "(time() - container_start_time_seconds{instance=~\"$host\",name=~\"$container\",name=~\".+\"})/86400",
|
||||||
|
"format": "table",
|
||||||
|
"instant": true,
|
||||||
|
"interval": "",
|
||||||
|
"legendFormat": "{{name}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"timeFrom": null,
|
||||||
|
"timeShift": null,
|
||||||
|
"title": "Containers Info",
|
||||||
|
"transformations": [
|
||||||
|
{
|
||||||
|
"id": "filterFieldsByName",
|
||||||
|
"options": {
|
||||||
|
"include": {
|
||||||
|
"names": [
|
||||||
|
"container_label_com_docker_compose_project",
|
||||||
|
"container_label_com_docker_compose_project_working_dir",
|
||||||
|
"image",
|
||||||
|
"instance",
|
||||||
|
"name",
|
||||||
|
"Value",
|
||||||
|
"container_label_com_docker_compose_service"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "organize",
|
||||||
|
"options": {
|
||||||
|
"excludeByName": {},
|
||||||
|
"indexByName": {},
|
||||||
|
"renameByName": {
|
||||||
|
"Value": "Running",
|
||||||
|
"container_label_com_docker_compose_project": "Label",
|
||||||
|
"container_label_com_docker_compose_project_working_dir": "Working dir",
|
||||||
|
"container_label_com_docker_compose_service": "Service",
|
||||||
|
"image": "Registry Image",
|
||||||
|
"instance": "Instance",
|
||||||
|
"name": "Name"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"type": "table"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 27,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": [
|
||||||
|
"cadvisor",
|
||||||
|
"docker"
|
||||||
|
],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"allValue": ".*",
|
||||||
|
"current": {},
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"definition": "label_values({__name__=~\"container.*\"},instance)",
|
||||||
|
"description": null,
|
||||||
|
"error": null,
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "Host",
|
||||||
|
"multi": false,
|
||||||
|
"name": "host",
|
||||||
|
"options": [],
|
||||||
|
"query": {
|
||||||
|
"query": "label_values({__name__=~\"container.*\"},instance)",
|
||||||
|
"refId": "Prometheus-host-Variable-Query"
|
||||||
|
},
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 5,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": ".*",
|
||||||
|
"current": {},
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"definition": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
|
||||||
|
"description": null,
|
||||||
|
"error": null,
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "Container",
|
||||||
|
"multi": false,
|
||||||
|
"name": "container",
|
||||||
|
"options": [],
|
||||||
|
"query": {
|
||||||
|
"query": "label_values({__name__=~\"container.*\", instance=~\"$host\"},name)",
|
||||||
|
"refId": "Prometheus-container-Variable-Query"
|
||||||
|
},
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 0,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-6h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "",
|
||||||
|
"title": "Cadvisor exporter",
|
||||||
|
"uid": "pMEd7m0Mz",
|
||||||
|
"version": 1,
|
||||||
|
"description": "Simple exporter for cadvisor only"
|
||||||
|
}
|
||||||
15766
roles/tools/files/grafana/dashboards/node-exporter-full.json
Normal file
15766
roles/tools/files/grafana/dashboards/node-exporter-full.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -7,6 +7,29 @@
|
||||||
group: "{{ deploy_group }}"
|
group: "{{ deploy_group }}"
|
||||||
mode: "0750"
|
mode: "0750"
|
||||||
|
|
||||||
|
# Pre-create every host directory that the monitoring stack's config files
# are later written into, relative to tools_root.
- name: Create tools subdirectories
  ansible.builtin.file:
    state: directory
    path: "{{ tools_root }}/{{ item }}"
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
  loop:
    - prometheus
    - prometheus/rules
    - grafana/provisioning/datasources
    - grafana/provisioning/dashboards
    - grafana/provisioning/dashboards/json
    - loki
|
||||||
|
|
||||||
|
# Render the compose .env file; 0600 keeps its contents readable by the
# deploy user only.
- name: Deploy .env file
  ansible.builtin.template:
    dest: "{{ tools_root }}/.env"
    src: env.j2
    mode: "0600"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
- name: Deploy docker-compose.yml
|
- name: Deploy docker-compose.yml
|
||||||
ansible.builtin.template:
|
ansible.builtin.template:
|
||||||
src: docker-compose.yml.j2
|
src: docker-compose.yml.j2
|
||||||
|
|
@ -15,8 +38,130 @@
|
||||||
group: "{{ deploy_group }}"
|
group: "{{ deploy_group }}"
|
||||||
mode: "0640"
|
mode: "0640"
|
||||||
|
|
||||||
|
# Render the main Prometheus configuration into the tools tree.
# NOTE(review): no handler is notified from this task, so an already-running
# Prometheus presumably keeps the old config until it is restarted or
# reloaded some other way — confirm.
- name: Deploy Prometheus config
  ansible.builtin.template:
    dest: "{{ tools_root }}/prometheus/prometheus.yml"
    src: prometheus/prometheus.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Render the alerting rules into the prometheus/rules directory.
- name: Deploy Prometheus alert rules
  ansible.builtin.template:
    dest: "{{ tools_root }}/prometheus/rules/alerts.yml"
    src: prometheus/rules/alerts.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Render the Alertmanager configuration (routing + Telegram/Discord receivers).
# NOTE(review): the template embeds the Telegram bot token and the Discord
# webhook URL, and 0644 leaves the rendered file world-readable. Tightening
# the mode only works if the container user can still read it — confirm.
- name: Deploy AlertManager config
  ansible.builtin.template:
    dest: "{{ tools_root }}/prometheus/alertmanager.yml"
    src: prometheus/alertmanager.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Render the Loki server configuration.
- name: Deploy Loki config
  ansible.builtin.template:
    dest: "{{ tools_root }}/loki/loki.yml"
    src: loki/loki.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Provision the Prometheus datasource definition for Grafana.
- name: Deploy Grafana Prometheus datasource
  ansible.builtin.template:
    dest: "{{ tools_root }}/grafana/provisioning/datasources/prometheus.yml"
    src: grafana/provisioning/datasources/prometheus.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Provision the Loki datasource definition for Grafana.
- name: Deploy Grafana Loki datasource
  ansible.builtin.template:
    dest: "{{ tools_root }}/grafana/provisioning/datasources/loki.yml"
    src: grafana/provisioning/datasources/loki.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Provision the dashboard provider that tells Grafana where the dashboard
# JSON files live.
- name: Deploy Grafana dashboard provisioning config
  ansible.builtin.template:
    dest: "{{ tools_root }}/grafana/provisioning/dashboards/dashboards.yml"
    src: grafana/provisioning/dashboards/dashboards.yml.j2
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Copy the static "Node Exporter Full" dashboard into the provisioned
# dashboards directory (plain copy, no templating).
- name: Deploy Node Exporter Full dashboard JSON
  ansible.builtin.copy:
    dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/node-exporter-full.json"
    src: grafana/dashboards/node-exporter-full.json
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Copy the static cAdvisor dashboard into the provisioned dashboards
# directory (plain copy, no templating).
# NOTE(review): the dashboard JSON references its datasource as the import
# placeholder "${DS_PROMETHEUS}"; file-based provisioning presumably does not
# substitute it, which would leave the panels without a datasource — confirm
# and, if so, replace the placeholder with the provisioned name "Prometheus".
- name: Deploy cAdvisor dashboard JSON
  ansible.builtin.copy:
    dest: "{{ tools_root }}/grafana/provisioning/dashboards/json/cadvisor.json"
    src: grafana/dashboards/cadvisor.json
    mode: "0644"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_group }}"
|
||||||
|
|
||||||
|
# Pre-pull every image used by the stack, one `docker pull` per loop item.
# Each pull is retried up to 5 times, 30 s apart, until it exits 0; the task
# reports "changed" only when docker says a newer image was downloaded.
- name: Pull monitoring images
  ansible.builtin.command: docker pull {{ item }}
  register: pull_result
  until: pull_result.rc == 0
  retries: 5
  delay: 30
  changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
  loop:
    - "{{ prometheus_image }}"
    - "{{ alertmanager_image }}"
    - "{{ node_exporter_image }}"
    - "{{ cadvisor_image }}"
    - "{{ grafana_image }}"
    - "{{ loki_image }}"
    - "{{ uptime_kuma_image }}"
|
||||||
|
|
||||||
|
# ── UFW: allow main server to reach monitoring services ───────────────────────
|
||||||
|
# Open TCP 3100 (Loki push API) to the main server's address only.
- name: Allow main server to reach Loki (Promtail log push)
  community.general.ufw:
    rule: allow
    src: "{{ ip_main }}"
    proto: tcp
    port: "3100"
|
||||||
|
|
||||||
|
# Open TCP 9090 (Prometheus HTTP API) to the main server's address only.
- name: Allow main server to reach Prometheus (discord-bot metrics)
  community.general.ufw:
    rule: allow
    src: "{{ ip_main }}"
    proto: tcp
    port: "9090"
|
||||||
|
|
||||||
|
# Open TCP 3000 (Grafana) to the main server's address only.
- name: Allow main Traefik to reach Grafana
  community.general.ufw:
    rule: allow
    src: "{{ ip_main }}"
    proto: tcp
    port: "3000"
|
||||||
|
|
||||||
|
# Open TCP 3001 (Uptime Kuma) to the main server's address only.
- name: Allow main Traefik to reach Uptime Kuma
  community.general.ufw:
    rule: allow
    src: "{{ ip_main }}"
    proto: tcp
    port: "3001"
|
||||||
|
|
||||||
# Bring the compose project up from tools_root. Images are never pulled here
# (`pull: never`), orphaned containers are removed, and the whole step is
# retried up to 3 times, 15 s apart, until it succeeds.
- name: Start tools stack
  community.docker.docker_compose_v2:
    project_src: "{{ tools_root }}"
    state: present
    pull: never
    remove_orphans: true
  register: compose_result
  until: compose_result is succeeded
  retries: 3
  delay: 15
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,157 @@
|
||||||
# Tools stack — generated by Ansible
|
# Tools stack — generated by Ansible
|
||||||
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
|
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
|
||||||
# All app services (Outline, n8n) have been migrated to main server.
|
# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor
|
||||||
# Monitoring stack (Grafana, Prometheus, Loki, Alertmanager) will be added here.
|
|
||||||
|
|
||||||
# Single bridge network shared by every monitoring service.
networks:
  monitoring:
    driver: bridge

# Named volumes holding each service's persistent state.
volumes:
  prometheus_data:
  grafana_data:
  loki_data:
  uptime_kuma_data:
|
||||||
|
|
||||||
|
services:

  # NOTE(review): ports published by Docker are normally filtered in Docker's
  # own iptables chains rather than by ufw's INPUT rules, so the ufw
  # "allow from ip_main" rules may not restrict 9090/3000/3100/3001 by
  # themselves — confirm, and add DOCKER-USER rules or bind to a private
  # address if these must not be reachable from other hosts.

  # ── Prometheus ─────────────────────────────────────────────────────────────
  prometheus:
    image: "{{ prometheus_image }}"
    container_name: prometheus
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      # Published on all interfaces: discord-bot on the main server queries
      # this port over the network, which a 127.0.0.1-only bind cannot serve.
      - "9090:9090"
    volumes:
      - prometheus_data:/prometheus
      - "{{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
      - "{{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3

  alertmanager:
    image: "{{ alertmanager_image }}"
    container_name: alertmanager
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      - "{{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Exporters (monitor the tools host itself) ───────────────────────────────
  node-exporter:
    image: "{{ node_exporter_image }}"
    container_name: node-exporter
    restart: unless-stopped
    networks:
      - monitoring
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      # Point the filesystem collector at the host root mounted above;
      # without it disk metrics describe the container's own filesystem.
      - "--path.rootfs=/rootfs"
      # "$$" is compose escaping for a literal "$" in the regex.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"

  cadvisor:
    image: "{{ cadvisor_image }}"
    container_name: cadvisor
    restart: unless-stopped
    networks:
      - monitoring
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro

  # ── Grafana ─────────────────────────────────────────────────────────────────
  grafana:
    image: "{{ grafana_image }}"
    container_name: grafana
    restart: unless-stopped
    security_opt:
      - no-new-privileges:true
    depends_on:
      - prometheus
    networks:
      - monitoring
    ports:
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - "{{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro"
    # .env supplies GF_SECURITY_ADMIN_PASSWORD.
    env_file: .env
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN={{ domain_dashboard }}
      - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
      - GF_AUTH_ANONYMOUS_ENABLED=false
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Loki ────────────────────────────────────────────────────────────────────
  loki:
    image: "{{ loki_image }}"
    container_name: loki
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - "3100:3100"  # exposed to main for Promtail log ingestion
    volumes:
      - loki_data:/loki
      - "{{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro"
    command: "-config.file=/etc/loki/local-config.yaml"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Uptime Kuma ─────────────────────────────────────────────────────────────
  uptime-kuma:
    image: "{{ uptime_kuma_image }}"
    container_name: uptime-kuma
    restart: unless-stopped
    security_opt:
      - no-new-privileges:true
    networks:
      - monitoring
    ports:
      - "3001:3001"
    volumes:
      - uptime_kuma_data:/app/data
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
      interval: 30s
      timeout: 5s
      retries: 3
|
||||||
|
|
|
||||||
2
roles/tools/templates/env.j2
Normal file
2
roles/tools/templates/env.j2
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
GF_SECURITY_ADMIN_PASSWORD={{ grafana_admin_password }}
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# Grafana dashboard provider: loads dashboard JSON files from disk.
apiVersion: 1

providers:
  - name: default
    type: file
    orgId: 1
    folder: ""
    allowUiUpdates: false
    disableDeletion: false
    updateIntervalSeconds: 30
    options:
      path: /etc/grafana/provisioning/dashboards/json
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# Grafana datasource: Loki, addressed by its compose service name.
apiVersion: 1

datasources:
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy
    isDefault: false
    editable: false
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# Grafana datasource: Prometheus (default), addressed by its compose service name.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true
    editable: false
|
||||||
36
roles/tools/templates/loki/loki.yml.j2
Normal file
36
roles/tools/templates/loki/loki.yml.j2
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# Single-instance Loki with filesystem storage under /loki.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules

schema_config:
  configs:
    - from: 2020-10-24
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        prefix: index_
        period: 24h

# Retention: the period is set in limits_config and enabled in the compactor.
limits_config:
  retention_period: 30d

compactor:
  retention_enabled: true
  working_directory: /loki/retention
  delete_request_store: filesystem
|
||||||
38
roles/tools/templates/prometheus/alertmanager.yml.j2
Normal file
38
roles/tools/templates/prometheus/alertmanager.yml.j2
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# The {{ '{{' }} … {{ '}}' }} pairs are Jinja escapes: they render to literal Go-template
# braces that Alertmanager evaluates at notification time.
global:
  resolve_timeout: 5m

# Every alert goes to the single "all" receiver, grouped by name and severity.
route:
  receiver: all
  group_by: [alertname, severity]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

receivers:
  - name: all
    telegram_configs:
      - bot_token: "{{ alertmanager_telegram_token }}"
        # Left unquoted so the rendered value stays a YAML number.
        chat_id: {{ alertmanager_telegram_chat_id }}
        parse_mode: Markdown
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}
    discord_configs:
      - webhook_url: "{{ discord_webhook_alerts }}"
        title: >-
          {{ '{{' }} if eq (index .Alerts 0).Status "firing" {{ '}}' }}🔴 Alert{{ '{{' }} else {{ '}}' }}🟢 Resolved{{ '{{' }} end {{ '}}' }}
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          **{{ '{{' }} .Labels.alertname {{ '}}' }}**
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}

# A firing critical alert mutes warning alerts that share its alertname.
# NOTE(review): the shipped rules use different alert names per severity
# (e.g. DiskSpaceWarning vs DiskSpaceCritical), so this rule presumably never
# matches — confirm whether `equal` should compare another label instead.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    equal: [alertname]
|
||||||
49
roles/tools/templates/prometheus/prometheus.yml.j2
Normal file
49
roles/tools/templates/prometheus/prometheus.yml.j2
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    instance: "{{ domain_base }}"

rule_files:
  - /etc/prometheus/rules/*.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

scrape_configs:
  # Services on the tools host, addressed by compose service name.
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: alertmanager
    static_configs:
      - targets: ["alertmanager:9093"]

  - job_name: node-exporter-tools
    static_configs:
      - targets: ["node-exporter:9100"]
        labels:
          host: tools

  - job_name: cadvisor-tools
    static_configs:
      - targets: ["cadvisor:8080"]
        labels:
          host: tools

  # Exporters on the main server, scraped over the network via ip_main.
  - job_name: node-exporter-main
    static_configs:
      - targets: ["{{ ip_main }}:9100"]
        labels:
          host: main

  - job_name: cadvisor-main
    static_configs:
      - targets: ["{{ ip_main }}:8080"]
        labels:
          host: main
|
||||||
86
roles/tools/templates/prometheus/rules/alerts.yml.j2
Normal file
86
roles/tools/templates/prometheus/rules/alerts.yml.j2
Normal file
|
|
@ -0,0 +1,86 @@
|
||||||
|
# Rendered by Ansible; manual changes are overwritten on the next run.
# The {{ '{{' }} … {{ '}}' }} pairs are Jinja escapes: they render to literal Go-template
# braces that Prometheus expands when an alert fires.
groups:
  # Host-level alerts built on node-exporter metrics.
  - name: host
    rules:
      - alert: HighCPULoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "CPU загружен более 85% на протяжении 5 минут."

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Использование RAM превысило 85%."

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "RAM заполнена на 95%+. Возможны OOM kills."

      - alert: DiskSpaceWarning
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

      - alert: DiskSpaceCritical
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."

      - alert: SwapUsageHigh
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Swap используется более чем на 50% — RAM под давлением."

  # Container-level alerts built on cAdvisor metrics; name=~".+" keeps only
  # series that carry a container name.
  - name: containers
    rules:
      - alert: ContainerDown
        expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
          description: "Контейнер не отвечает более 2 минут."

      - alert: ContainerHighMemory
        # The "> 0" filter drops containers without a memory limit, which
        # would otherwise divide by zero.
        expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
          description: "Контейнер близок к mem_limit — возможен OOM kill."

      - alert: ContainerRestarting
        # Match the two sides on (instance, name) only: the CPU metric carries
        # an extra `cpu` label that container_last_seen lacks, so default
        # all-label matching would never pair them and the alert could not fire.
        expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and on (instance, name) rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
          description: "Контейнер не активен — проверьте docker ps."
|
||||||
Loading…
Reference in a new issue