infra/roles/services/templates/docker-compose.yml.j2
feat: major infrastructure improvements
Reliability:
- Add swap role (2GB, swappiness=10, idempotent via /etc/fstab)
- Add mem_limit to plane-worker (512m) and plane-beat (256m)
- Add health checks to all services (traefik, vaultwarden, forgejo,
  plane-*, syncthing, prometheus, grafana, loki)

Code quality:
- Remove Traefik Docker labels (file provider used, labels were dead code)
- Add comment explaining file provider architecture

Observability:
- Add AlertManager with Telegram notifications
- Add Prometheus alert rules: CPU, RAM, disk, swap, container health
- Add Loki + Promtail for centralized log aggregation
- Add Loki datasource to Grafana
- Enable Traefik /ping endpoint for health checks

Backups:
- Add backup role: pg_dump for forgejo + plane DBs, tar for
  vaultwarden and forgejo data
- 7-day retention, daily cron at 03:00
- Backup script at /usr/local/bin/backup-services

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

# Docker Compose stack — generated by Ansible
# Do not edit manually; re-run ansible-playbook deploy.yml
#
# NOTE: Traefik uses the file provider (routes.yml.j2) — Docker labels on
# containers are intentionally absent. Adding labels here has no effect.
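#
# For orientation, a router/service pair for a container lives in the dynamic
# file provider instead. A minimal sketch of the assumed shape of
# traefik/dynamic/routes.yml (entrypoint and resolver names are illustrative,
# not copied from the real template):
#
#   http:
#     routers:
#       vaultwarden:
#         rule: "Host(`{{ domain_vault }}`)"
#         entryPoints: ["websecure"]
#         tls:
#           certResolver: letsencrypt
#         service: vaultwarden
#     services:
#       vaultwarden:
#         loadBalancer:
#           servers:
#             - url: "http://vaultwarden:80"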
networks:
  # proxy is the public network for Traefik only: it provides outbound internet
  # access (ACME / Let's Encrypt, external services). backend is internal: true,
  # so services have no direct outbound internet access.
  proxy:
    driver: bridge
  backend:
    driver: bridge
    internal: true
  forgejo-db:
    driver: bridge
    internal: true
  forgejo-ssh:
    driver: bridge
  plane-internal:
    driver: bridge
    internal: true
  runner-jobs:
    driver: bridge
  monitoring:
    driver: bridge
    internal: true
volumes:
  vaultwarden_data:
  forgejo_data:
  forgejo_db_data:
  plane_pgdata:
  plane_redis_data:
  plane_minio_data:
  plane_media:
  syncthing_config:
  syncthing_data:
  act_runner_data:
  prometheus_data:
  grafana_data:
  loki_data:
services:
  # ── Traefik ────────────────────────────────────────────────────────────────
  # proxy for ACME (outbound internet), backend for routing to the services
  traefik:
    image: {{ traefik_image }}
    container_name: traefik
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    networks:
      - proxy
      - backend
    volumes:
      - {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro
      - {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro
      - {{ services_root }}/traefik/acme.json:/acme/acme.json
    healthcheck:
      test: ["CMD", "traefik", "healthcheck", "--ping"]
      interval: 30s
      timeout: 5s
      retries: 3
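  # The "traefik healthcheck --ping" probe only passes when the ping endpoint is
  # enabled in the static config; per the commit message the /ping endpoint is
  # enabled in traefik.yml.j2, presumably with the minimal stanza:
  #
  #   ping: {}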
  # ── Vaultwarden ────────────────────────────────────────────────────────────
  vaultwarden:
    image: {{ vaultwarden_image }}
    container_name: vaultwarden
    restart: unless-stopped
    networks:
      - backend
    volumes:
      - vaultwarden_data:/data
    environment:
      - ADMIN_TOKEN=${VAULTWARDEN_ADMIN_TOKEN}
      - DOMAIN=https://{{ domain_vault }}
      - SIGNUPS_ALLOWED=false
      - INVITATIONS_ALLOWED=true
      - LOG_LEVEL=warn
      - EXTENDED_LOGGING=true
      - TZ=UTC
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:80/"]
      interval: 30s
      timeout: 5s
      retries: 3
  # ── Forgejo ────────────────────────────────────────────────────────────────
  forgejo:
    image: {{ forgejo_image }}
    container_name: forgejo
    restart: unless-stopped
    depends_on:
      forgejo-db:
        condition: service_healthy
    networks:
      - backend
      - forgejo-db
      - forgejo-ssh
    volumes:
      - forgejo_data:/data
      - /etc/timezone:/etc/timezone:ro
      - /etc/localtime:/etc/localtime:ro
    environment:
      - USER_UID=1000
      - USER_GID=1000
      - FORGEJO__database__DB_TYPE=postgres
      - FORGEJO__database__HOST=forgejo-db:5432
      - FORGEJO__database__NAME=forgejo
      - FORGEJO__database__USER=forgejo
      - FORGEJO__database__PASSWD=${FORGEJO_DB_PASSWORD}
      - FORGEJO__server__DOMAIN={{ domain_git }}
      - FORGEJO__server__ROOT_URL=https://{{ domain_git }}
      - FORGEJO__server__SSH_DOMAIN={{ domain_git }}
      - FORGEJO__server__SSH_PORT=2222
      - FORGEJO__service__DISABLE_REGISTRATION=true
    ports:
      - "2222:22"
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
  forgejo-db:
    image: {{ forgejo_db_image }}
    container_name: forgejo-db
    restart: unless-stopped
    networks:
      - forgejo-db
    volumes:
      - forgejo_db_data:/var/lib/postgresql/data
    environment:
      - POSTGRES_USER=forgejo
      - POSTGRES_PASSWORD=${FORGEJO_DB_PASSWORD}
      - POSTGRES_DB=forgejo
      - PGDATA=/var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U forgejo"]
      interval: 10s
      timeout: 5s
      retries: 5
    mem_limit: 512m
  # ── Plane ──────────────────────────────────────────────────────────────────
  # Routing via Traefik:
  #   /api/* and /auth/*  → plane-api:8000 (Django, on backend + plane-internal)
  #   everything else     → plane-web:3000 (Next.js, on backend + plane-internal)
  # The rule with the longer PathPrefix automatically gets higher priority in Traefik.
  #
  # NOTE: Plane does not publish specific version tags, so :stable is used.
  # Watch for updates: https://github.com/makeplane/plane/releases
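  #
  # For illustration, the two file-provider routers are assumed to look roughly
  # like this (rule length drives Traefik's default priority, so the API router
  # beats the catch-all without an explicit priority):
  #
  #   plane-api:
  #     rule: "Host(`{{ domain_plane }}`) && (PathPrefix(`/api`) || PathPrefix(`/auth`))"
  #     service: plane-api
  #   plane-web:
  #     rule: "Host(`{{ domain_plane }}`)"
  #     service: plane-web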
  plane-web:
    image: {{ plane_frontend_image }}
    container_name: plane-web
    restart: unless-stopped
    depends_on:
      - plane-api
    networks:
      - backend
      - plane-internal
    healthcheck:
      # Next.js listens on 3000 inside the container (see the routing note above)
      test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
  plane-admin:
    image: {{ plane_admin_image }}
    container_name: plane-admin
    restart: unless-stopped
    depends_on:
      - plane-api
      - plane-web
    networks:
      - backend
      - plane-internal
    healthcheck:
      # Next.js listens on 3000 inside the container, not 80
      test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
  plane-space:
    image: {{ plane_space_image }}
    container_name: plane-space
    restart: unless-stopped
    depends_on:
      - plane-api
      - plane-web
    networks:
      - backend
      - plane-internal
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
  plane-api:
    image: {{ plane_backend_image }}
    container_name: plane-api
    restart: unless-stopped
    mem_limit: 512m
    command: ./bin/docker-entrypoint-api.sh
    depends_on:
      plane-db:
        condition: service_healthy
      plane-redis:
        condition: service_started
      plane-minio:
        condition: service_healthy
    networks:
      - backend
      - plane-internal
    volumes:
      - plane_media:/app/media
    environment:
      - DATABASE_URL=postgresql://plane:${PLANE_DB_PASSWORD}@plane-db:5432/plane
      - REDIS_URL=redis://plane-redis:6379/
      - AMQP_URL=redis://plane-redis:6379/
      - SECRET_KEY=${PLANE_SECRET_KEY}
      - DEBUG=0
      - DJANGO_SETTINGS_MODULE=plane.settings.production
      - WEB_URL=https://{{ domain_plane }}
      - FILE_SIZE_LIMIT=5242880
      - USE_MINIO=1
      - AWS_REGION=us-east-1
      - AWS_ACCESS_KEY_ID=plane-minio
      - AWS_SECRET_ACCESS_KEY=${PLANE_MINIO_PASSWORD}
      - AWS_S3_ENDPOINT_URL=http://plane-minio:9000
      - AWS_S3_BUCKET_NAME=uploads
      - MINIO_ROOT_USER=plane-minio
      - MINIO_ROOT_PASSWORD=${PLANE_MINIO_PASSWORD}
      - GUNICORN_WORKERS=2
      - APP_BASE_URL=https://{{ domain_plane }}
      - ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode
      - SPACE_BASE_URL=https://{{ domain_plane }}/spaces
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8000/api/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
  plane-worker:
    image: {{ plane_backend_image }}
    container_name: plane-worker
    restart: unless-stopped
    mem_limit: 512m
    command: ./bin/docker-entrypoint-worker.sh
    depends_on:
      - plane-api
    networks:
      - plane-internal
    volumes:
      - plane_media:/app/media
    environment:
      - DATABASE_URL=postgresql://plane:${PLANE_DB_PASSWORD}@plane-db:5432/plane
      - REDIS_URL=redis://plane-redis:6379/
      - AMQP_URL=redis://plane-redis:6379/
      - SECRET_KEY=${PLANE_SECRET_KEY}
      - DEBUG=0
      - DJANGO_SETTINGS_MODULE=plane.settings.production
      - USE_MINIO=1
      - AWS_REGION=us-east-1
      - AWS_ACCESS_KEY_ID=plane-minio
      - AWS_SECRET_ACCESS_KEY=${PLANE_MINIO_PASSWORD}
      - AWS_S3_ENDPOINT_URL=http://plane-minio:9000
      - AWS_S3_BUCKET_NAME=uploads
      - MINIO_ROOT_USER=plane-minio
      - MINIO_ROOT_PASSWORD=${PLANE_MINIO_PASSWORD}
  plane-beat:
    image: {{ plane_backend_image }}
    container_name: plane-beat
    restart: unless-stopped
    mem_limit: 256m
    command: ./bin/docker-entrypoint-beat.sh
    depends_on:
      - plane-api
    networks:
      - plane-internal
    environment:
      - DATABASE_URL=postgresql://plane:${PLANE_DB_PASSWORD}@plane-db:5432/plane
      - REDIS_URL=redis://plane-redis:6379/
      - AMQP_URL=redis://plane-redis:6379/
      - SECRET_KEY=${PLANE_SECRET_KEY}
      - DEBUG=0
      - DJANGO_SETTINGS_MODULE=plane.settings.production
      - USE_MINIO=1
      - AWS_REGION=us-east-1
      - AWS_ACCESS_KEY_ID=plane-minio
      - AWS_SECRET_ACCESS_KEY=${PLANE_MINIO_PASSWORD}
      - AWS_S3_ENDPOINT_URL=http://plane-minio:9000
      - AWS_S3_BUCKET_NAME=uploads
      - MINIO_ROOT_USER=plane-minio
      - MINIO_ROOT_PASSWORD=${PLANE_MINIO_PASSWORD}
  plane-db:
    image: {{ plane_db_image }}
    container_name: plane-db
    restart: unless-stopped
    mem_limit: 512m
    networks:
      - plane-internal
    volumes:
      - plane_pgdata:/var/lib/postgresql/data
    environment:
      - POSTGRES_USER=plane
      - POSTGRES_PASSWORD=${PLANE_DB_PASSWORD}
      - POSTGRES_DB=plane
      - PGDATA=/var/lib/postgresql/data/pgdata
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U plane"]
      interval: 10s
      timeout: 5s
      retries: 5
  plane-redis:
    image: {{ plane_redis_image }}
    container_name: plane-redis
    restart: unless-stopped
    networks:
      - plane-internal
    volumes:
      - plane_redis_data:/data
    command: redis-server --appendonly yes
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 3s
      retries: 5
  plane-minio:
    image: {{ plane_minio_image }}
    container_name: plane-minio
    restart: unless-stopped
    mem_limit: 512m
    networks:
      - plane-internal
    volumes:
      - plane_minio_data:/data
    environment:
      - MINIO_ROOT_USER=plane-minio
      - MINIO_ROOT_PASSWORD=${PLANE_MINIO_PASSWORD}
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
  # ── Syncthing ──────────────────────────────────────────────────────────────
  # Ports 22000 and 21027 are needed for device-to-device sync (not just the UI).
  # backend is internal: true, but Syncthing reaches the outside world through
  # the published ports on the host.
  syncthing:
    image: {{ syncthing_image }}
    container_name: syncthing
    restart: unless-stopped
    networks:
      - backend
    ports:
      - "22000:22000/tcp"
      - "22000:22000/udp"
      - "21027:21027/udp"
    volumes:
      - syncthing_config:/var/syncthing/config
      - syncthing_data:/var/syncthing/data
    environment:
      - PUID=1000
      - PGID=1000
      - TZ=UTC
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8384/rest/noauth/health"]
      interval: 30s
      timeout: 5s
      retries: 3
  # ── Forgejo Actions Runner ─────────────────────────────────────────────────
  # backend: reaches Forgejo over the internal network (http://forgejo:3000)
  # runner-jobs: network with internet access for the job containers
  act_runner:
    image: {{ act_runner_image }}
    container_name: act_runner
    restart: unless-stopped
    depends_on:
      - forgejo
    environment:
      - GITEA_INSTANCE_URL=https://{{ domain_git }}
      - GITEA_RUNNER_REGISTRATION_TOKEN=${FORGEJO_RUNNER_TOKEN}
      - GITEA_RUNNER_NAME=vps-runner
      - CONFIG_FILE=/data/config.yaml
    volumes:
      - act_runner_data:/data
      - /var/run/docker.sock:/var/run/docker.sock
      - {{ services_root }}/act_runner/config.yaml:/data/config.yaml:ro
    networks:
      - backend
      - runner-jobs
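  # The mounted config.yaml is what pins job containers to runner-jobs; a
  # minimal sketch of the assumed relevant section (values are illustrative):
  #
  #   container:
  #     network: runner-jobs
  #     privileged: false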
  # ── Monitoring Stack ───────────────────────────────────────────────────────
  prometheus:
    image: {{ prometheus_image }}
    container_name: prometheus
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      - prometheus_data:/prometheus
      - {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3
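  # The rules mount above holds the CPU/RAM/disk/swap/container alerts from the
  # commit message. A hypothetical example of one entry in prometheus/rules/
  # (threshold and names are illustrative, built on standard node_exporter
  # metrics):
  #
  #   groups:
  #     - name: host
  #       rules:
  #         - alert: HighMemoryUsage
  #           expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.9
  #           for: 5m
  #           labels:
  #             severity: warning
  #           annotations:
  #             summary: "RAM usage above 90% for 5 minutes"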
  alertmanager:
    image: {{ alertmanager_image }}
    container_name: alertmanager
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      - {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3
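  # Telegram notifications (per the commit message) live in alertmanager.yml; a
  # minimal sketch of the assumed receiver, using Alertmanager's built-in
  # telegram_configs (placeholder values, not the real ones):
  #
  #   route:
  #     receiver: telegram
  #   receivers:
  #     - name: telegram
  #       telegram_configs:
  #         - bot_token: "<bot token from vault>"
  #           chat_id: -100123456789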
  node-exporter:
    image: {{ node_exporter_image }}
    container_name: node-exporter
    restart: unless-stopped
    networks:
      - monitoring
    pid: host
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
  cadvisor:
    image: {{ cadvisor_image }}
    container_name: cadvisor
    restart: unless-stopped
    networks:
      - monitoring
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
  grafana:
    image: {{ grafana_image }}
    container_name: grafana
    restart: unless-stopped
    depends_on:
      - prometheus
    networks:
      - backend
      - monitoring
    volumes:
      - grafana_data:/var/lib/grafana
      - {{ services_root }}/grafana/provisioning:/etc/grafana/provisioning:ro
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN={{ domain_dashboard }}
      - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
      - GF_AUTH_ANONYMOUS_ENABLED=false
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3000/api/health"]
      interval: 30s
      timeout: 5s
      retries: 3
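  # The provisioning mount above is where the Loki datasource from the commit
  # message is declared; a minimal sketch of the assumed
  # grafana/provisioning/datasources/loki.yml:
  #
  #   apiVersion: 1
  #   datasources:
  #     - name: Loki
  #       type: loki
  #       access: proxy
  #       url: http://loki:3100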
  # ── Logging Stack ──────────────────────────────────────────────────────────
  loki:
    image: {{ loki_image }}
    container_name: loki
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      - loki_data:/loki
      - {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
    command: -config.file=/etc/loki/local-config.yaml
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 5s
      retries: 3
  promtail:
    image: {{ promtail_image }}
    container_name: promtail
    restart: unless-stopped
    networks:
      - monitoring
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - {{ services_root }}/loki/promtail.yml:/etc/promtail/config.yml:ro
    command: -config.file=/etc/promtail/config.yml
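  # promtail.yml (mounted above) turns the docker.sock and container-log mounts
  # into streams for Loki; a minimal sketch of the assumed scrape config
  # (label mapping is illustrative):
  #
  #   clients:
  #     - url: http://loki:3100/loki/api/v1/push
  #   scrape_configs:
  #     - job_name: docker
  #       docker_sd_configs:
  #         - host: unix:///var/run/docker.sock
  #       relabel_configs:
  #         - source_labels: ["__meta_docker_container_name"]
  #           regex: "/(.*)"
  #           target_label: container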