infra/roles/tools/templates/docker-compose.yml.j2
jack fde51352d7 feat: migrate monitoring to tools server, fix Outline S3 uploads
Monitoring stack (Prometheus, AlertManager, Grafana, Loki, Uptime Kuma)
moved from main to tools server. Prometheus now scrapes main exporters
over network (ip_main:9100/8080). Promtail pushes logs to ip_tools:3100.
Traefik routes for dash/status.walava.io updated to ip_tools. discord-bot
PROMETHEUS_URL updated to http://ip_tools:9090.

Outline S3 fix: remove AWS_S3_ACL=private (Timeweb doesn't support
per-object ACLs — caused upload failures). Add CORS configuration task
for browser-side presigned uploads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 04:10:28 +07:00

157 lines
5 KiB
Django/Jinja

# Tools stack — generated by Ansible
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor
networks:
monitoring:
driver: bridge
volumes:
prometheus_data:
grafana_data:
loki_data:
uptime_kuma_data:
services:
# ── Prometheus ─────────────────────────────────────────────────────────────
prometheus:
image: {{ prometheus_image }}
container_name: prometheus
restart: unless-stopped
networks:
- monitoring
ports:
- "127.0.0.1:9090:9090" # exposed to main via UFW rule for discord-bot
volumes:
- prometheus_data:/prometheus
- {{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- {{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
alertmanager:
image: {{ alertmanager_image }}
container_name: alertmanager
restart: unless-stopped
networks:
- monitoring
volumes:
- {{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
# ── Exporters (monitor the tools host itself) ───────────────────────────────
node-exporter:
image: {{ node_exporter_image }}
container_name: node-exporter
restart: unless-stopped
networks:
- monitoring
pid: host
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
cadvisor:
image: {{ cadvisor_image }}
container_name: cadvisor
restart: unless-stopped
networks:
- monitoring
privileged: true
devices:
- /dev/kmsg
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
- /dev/disk:/dev/disk:ro
# ── Grafana ─────────────────────────────────────────────────────────────────
grafana:
image: {{ grafana_image }}
container_name: grafana
restart: unless-stopped
security_opt:
- no-new-privileges:true
depends_on:
- prometheus
networks:
- monitoring
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- {{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
env_file: .env
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_DOMAIN={{ domain_dashboard }}
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
- GF_AUTH_ANONYMOUS_ENABLED=false
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
interval: 30s
timeout: 5s
retries: 3
# ── Loki ────────────────────────────────────────────────────────────────────
loki:
image: {{ loki_image }}
container_name: loki
restart: unless-stopped
networks:
- monitoring
ports:
- "3100:3100" # exposed to main for Promtail log ingestion
volumes:
- loki_data:/loki
- {{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
command: -config.file=/etc/loki/local-config.yaml
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
# ── Uptime Kuma ─────────────────────────────────────────────────────────────
uptime-kuma:
image: {{ uptime_kuma_image }}
container_name: uptime-kuma
restart: unless-stopped
security_opt:
- no-new-privileges:true
networks:
- monitoring
ports:
- "3001:3001"
volumes:
- uptime_kuma_data:/app/data
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
interval: 30s
timeout: 5s
retries: 3