# infra/roles/tools/templates/docker-compose.yml.j2

# Tools stack — generated by Ansible
# Do not edit manually; re-run ansible-playbook playbooks/tools.yml
# Monitoring: Prometheus, Grafana, Loki, AlertManager, Uptime Kuma, node-exporter, cAdvisor

# User-defined bridge network that the services in this template attach to.
networks:
  monitoring:
    driver: bridge

# Named volumes for service state. Each one uses the default configuration
# (written as an explicit empty mapping rather than a bare null value).
volumes:
  prometheus_data: {}
  grafana_data: {}
  loki_data: {}
  uptime_kuma_data: {}

services:

  # ── Prometheus ─────────────────────────────────────────────────────────────
  prometheus:
    container_name: prometheus
    image: "{{ prometheus_image }}"
    restart: unless-stopped
    # The TSDB is kept in the prometheus_data named volume; the main config
    # file and the rules directory are bind-mounted read-only from tools_root.
    volumes:
      - "prometheus_data:/prometheus"
      - "{{ tools_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
      - "{{ tools_root }}/prometheus/rules:/etc/prometheus/rules:ro"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Samples older than 30 days are dropped.
      - '--storage.tsdb.retention.time=30d'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    networks:
      - monitoring
    # Host port 9090 is published on the loopback address only.
    # NOTE(review): the previous comment here said "exposed to main via UFW
    # rule for discord-bot", but a 127.0.0.1 bind is not reachable from another
    # host on its own. Confirm how main is expected to reach Prometheus.
    ports:
      - "127.0.0.1:9090:9090"
    # Probe Prometheus' own health endpoint from inside the container.
    healthcheck:
      test:
        - 'CMD'
        - 'wget'
        - '-qO-'
        - 'http://localhost:9090/-/healthy'
      interval: 30s
      timeout: 5s
      retries: 3

  # ── AlertManager ───────────────────────────────────────────────────────────
  alertmanager:
    container_name: alertmanager
    image: "{{ alertmanager_image }}"
    restart: unless-stopped
    # The only mount is the read-only config file (kept under the prometheus/
    # directory of tools_root). No host port is published; the service is
    # reachable only on the monitoring network.
    volumes:
      - "{{ tools_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro"
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      # NOTE(review): this template mounts nothing at /alertmanager, so the
      # storage path is not backed by a named volume declared in this file.
      # Confirm that losing this state on container re-creation is acceptable.
      - '--storage.path=/alertmanager'
    networks:
      - monitoring
    # Probe AlertManager's health endpoint from inside the container.
    healthcheck:
      test:
        - 'CMD'
        - 'wget'
        - '-qO-'
        - 'http://localhost:9093/-/healthy'
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Exporters (monitor the tools host itself) ───────────────────────────────
  node-exporter:
    image: "{{ node_exporter_image }}"
    container_name: node-exporter
    restart: unless-stopped
    networks:
      - monitoring
    # Share the host PID namespace so process-level metrics describe the host,
    # not just this container.
    pid: host
    # Host /proc, /sys and / are mounted read-only at alternate locations; the
    # --path.* flags below must point the exporter at each of them.
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      # Without this flag the filesystem collector resolves host mount points
      # against the container's own root instead of the /rootfs bind mount,
      # so disk usage metrics would not describe the host filesystems.
      - "--path.rootfs=/rootfs"
      # "$$" is Compose escaping for a literal "$" in the regex.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
  cadvisor:
    container_name: cadvisor
    image: "{{ cadvisor_image }}"
    restart: unless-stopped
    # Runs privileged with access to the host kernel log device.
    privileged: true
    devices:
      - '/dev/kmsg'
    # Read-only views of the host root, runtime sockets, sysfs, Docker's data
    # directory and block devices.
    volumes:
      - '/:/rootfs:ro'
      - '/var/run:/var/run:ro'
      - '/sys:/sys:ro'
      - '/var/lib/docker:/var/lib/docker:ro'
      - '/dev/disk:/dev/disk:ro'
    # No host port is published; reachable only on the monitoring network.
    networks:
      - monitoring
  # ── Grafana ─────────────────────────────────────────────────────────────────
  grafana:
    container_name: grafana
    image: "{{ grafana_image }}"
    restart: unless-stopped
    depends_on:
      - prometheus
    security_opt:
      - 'no-new-privileges:true'
    networks:
      - monitoring
    # NOTE(review): port 3000 is published on all host interfaces while the
    # root URL below is https://<domain_dashboard>. Confirm where TLS is
    # terminated and whether direct access to :3000 should be restricted.
    ports:
      - "3000:3000"
    # Dashboards/users persist in grafana_data; provisioning files are
    # bind-mounted read-only from tools_root.
    volumes:
      - "grafana_data:/var/lib/grafana"
      - "{{ tools_root }}/grafana/provisioning:/etc/grafana/provisioning:ro"
    # Additional variables are loaded from the .env file (contents not part of
    # this template).
    env_file: '.env'
    environment:
      GF_SECURITY_ADMIN_USER: 'admin'
      GF_USERS_ALLOW_SIGN_UP: 'false'
      GF_SERVER_DOMAIN: "{{ domain_dashboard }}"
      GF_SERVER_ROOT_URL: "https://{{ domain_dashboard }}"
      GF_AUTH_ANONYMOUS_ENABLED: 'false'
    # Probe Grafana's health API from inside the container.
    healthcheck:
      test:
        - 'CMD'
        - 'wget'
        - '-qO-'
        - 'http://localhost:3000/api/health'
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Loki ────────────────────────────────────────────────────────────────────
  loki:
    container_name: loki
    image: "{{ loki_image }}"
    restart: unless-stopped
    networks:
      - monitoring
    # Exposed to main for Promtail log ingestion.
    # NOTE(review): this publishes 3100 on all host interfaces; confirm access
    # is restricted to the intended senders.
    ports:
      - "3100:3100"
    # Chunk/index data lives in loki_data; the rendered loki.yml is mounted
    # read-only at the path passed to -config.file below.
    volumes:
      - "loki_data:/loki"
      - "{{ tools_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro"
    command:
      - '-config.file=/etc/loki/local-config.yaml'
    # Probe Loki's readiness endpoint from inside the container.
    # NOTE(review): relies on wget being present in the configured Loki image
    # — verify for the image version in use.
    healthcheck:
      test:
        - 'CMD'
        - 'wget'
        - '-qO-'
        - 'http://localhost:3100/ready'
      interval: 30s
      timeout: 5s
      retries: 3

  # ── Uptime Kuma ─────────────────────────────────────────────────────────────
  uptime-kuma:
    container_name: uptime-kuma
    image: "{{ uptime_kuma_image }}"
    restart: unless-stopped
    security_opt:
      - 'no-new-privileges:true'
    networks:
      - monitoring
    # Web UI published on host port 3001 (all interfaces).
    ports:
      - "3001:3001"
    # Application data persists in the uptime_kuma_data named volume.
    volumes:
      - "uptime_kuma_data:/app/data"
    # Unlike the other services, this check uses curl; -f makes HTTP error
    # statuses count as failures.
    healthcheck:
      test:
        - 'CMD'
        - 'curl'
        - '-sf'
        - 'http://localhost:3001/'
      interval: 30s
      timeout: 5s
      retries: 3