---
# Tools stack — generated by Ansible.
# Do not edit manually; re-run: ansible-playbook playbooks/tools.yml
#
# Monitoring services defined here: Prometheus, Alertmanager, node-exporter,
# cAdvisor, Grafana, Loki and Uptime Kuma.
#
# This file is a Jinja2 template. Every scalar that contains a `{{ ... }}`
# expression is quoted so that the template stays parseable as YAML and an
# empty or odd-looking expansion still renders as a string.

networks:
  # Single bridge network shared by every service in this stack.
  monitoring:
    driver: bridge

volumes:
  # Named volumes for service state that must outlive the containers.
  prometheus_data: {}
  alertmanager_data: {}
  grafana_data: {}
  loki_data: {}
  uptime_kuma_data: {}

services:
  # ── Prometheus ────────────────────────────────────────────────────────────
  prometheus:
    image: "{{ prometheus_image }}"
    container_name: prometheus
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      # NOTE(review): this binds to the host loopback only, so other hosts
      # cannot reach it as written. The original comment said "exposed to main
      # via UFW rule for discord-bot" — confirm which is intended. Also note
      # that Docker-published ports are not filtered by UFW by default.
      - "127.0.0.1:9090:9090"
    volumes:
      - prometheus_data:/prometheus
      - "{{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
      - "{{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
    healthcheck:
      # Runs wget inside the container; assumes the image ships wget.
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Alertmanager ──────────────────────────────────────────────────────────
  # No published ports: only reachable over the `monitoring` network.
  alertmanager:
    image: "{{ alertmanager_image }}"
    container_name: alertmanager
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      # Backs --storage.path below so Alertmanager's on-disk state is kept
      # when the container is removed and recreated.
      - alertmanager_data:/alertmanager
      - "{{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
    healthcheck:
      # Runs wget inside the container; assumes the image ships wget.
      test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Exporters (monitor the tools host itself) ─────────────────────────────
  node-exporter:
    image: "{{ node_exporter_image }}"
    container_name: node-exporter
    restart: unless-stopped
    networks:
      - monitoring
    # Share the host PID namespace with the exporter.
    pid: host
    volumes:
      # Host filesystems mounted read-only; the --path.* flags below point the
      # exporter at the procfs and sysfs mounts.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      # `$$` is Compose's escape for a literal `$`, so the exporter receives
      # the regex `^/(sys|proc|dev|host|etc)($|/)`.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"

  cadvisor:
    image: "{{ cadvisor_image }}"
    container_name: cadvisor
    restart: unless-stopped
    networks:
      - monitoring
    # Runs privileged with read-only views of the host root, /sys, Docker
    # state and disk devices, plus the kernel message device.
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro

  # ── Grafana ───────────────────────────────────────────────────────────────
  grafana:
    image: "{{ grafana_image }}"
    container_name: grafana
    restart: unless-stopped
    security_opt:
      - no-new-privileges:true
    depends_on:
      - prometheus
    networks:
      - monitoring
    ports:
      # No host IP given, so this is published on all host interfaces.
      - "3000:3000"
    volumes:
      - grafana_data:/var/lib/grafana
      - "{{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro"
    # Further variables come from .env (contents not visible here; presumably
    # includes the admin password — TODO confirm).
    env_file: .env
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - "GF_SERVER_DOMAIN={{ domain_dashboard }}"
      - "GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}"
      - GF_AUTH_ANONYMOUS_ENABLED=false
    healthcheck:
      # Runs wget inside the container; assumes the image ships wget.
      test: ["CMD", "wget", "-qO-", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Loki ──────────────────────────────────────────────────────────────────
  loki:
    image: "{{ loki_image }}"
    container_name: loki
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      # Exposed to main for Promtail log ingestion (all host interfaces).
      - "3100:3100"
    volumes:
      - loki_data:/loki
      - "{{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro"
    # List form, quoted because the flag starts with `-`; same single
    # argument as the original string form.
    command:
      - "-config.file=/etc/loki/local-config.yaml"
    healthcheck:
      # Runs wget inside the container; assumes the image ships wget.
      test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Uptime Kuma ───────────────────────────────────────────────────────────
  uptime-kuma:
    image: "{{ uptime_kuma_image }}"
    container_name: uptime-kuma
    restart: unless-stopped
    security_opt:
      - no-new-privileges:true
    networks:
      - monitoring
    ports:
      # No host IP given, so this is published on all host interfaces.
      - "3001:3001"
    volumes:
      - uptime_kuma_data:/app/data
    healthcheck:
      # Runs curl inside the container; assumes the image ships curl.
      test: ["CMD", "curl", "-sf", "http://localhost:3001/"]
      interval: 30s
      timeout: 5s
      retries: 3