From 6ebd237894e6a7c31d6c94e7782149df6e47a2fb Mon Sep 17 00:00:00 2001 From: jack Date: Sun, 22 Mar 2026 03:28:16 +0700 Subject: [PATCH] feat: major infrastructure improvements Reliability: - Add swap role (2GB, swappiness=10, idempotent via /etc/fstab) - Add mem_limit to plane-worker (512m) and plane-beat (256m) - Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki) Code quality: - Remove Traefik Docker labels (file provider used, labels were dead code) - Add comment explaining file provider architecture Observability: - Add AlertManager with Telegram notifications - Add Prometheus alert rules: CPU, RAM, disk, swap, container health - Add Loki + Promtail for centralized log aggregation - Add Loki datasource to Grafana - Enable Traefik /ping endpoint for health checks Backups: - Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data - 7-day retention, daily cron at 03:00 - Backup script at /usr/local/bin/backup-services Co-Authored-By: Claude Sonnet 4.6 --- inventory/group_vars/all/main.yml | 4 +- inventory/group_vars/all/vault.yml | 91 ++++----- playbooks/deploy.yml | 2 + roles/backup/defaults/main.yml | 4 + roles/backup/tasks/main.yml | 25 +++ roles/backup/templates/backup.sh.j2 | 51 ++++++ roles/base/tasks/main.yml | 1 + roles/base/tasks/swap.yml | 42 +++++ roles/services/defaults/main.yml | 3 + roles/services/tasks/configs.yml | 45 +++++ roles/services/tasks/directories.yml | 2 + roles/services/tasks/main.yml | 3 + .../services/templates/docker-compose.yml.j2 | 172 ++++++++++++------ .../provisioning/datasources/loki.yml.j2 | 10 + roles/services/templates/loki/loki.yml.j2 | 36 ++++ roles/services/templates/loki/promtail.yml.j2 | 38 ++++ .../templates/prometheus/alertmanager.yml.j2 | 28 +++ .../templates/prometheus/prometheus.yml.j2 | 12 ++ .../templates/prometheus/rules/alerts.yml.j2 | 86 +++++++++ .../services/templates/traefik/traefik.yml.j2 | 2 + 20 files changed, 558 insertions(+), 99 deletions(-) create mode 100644 roles/backup/defaults/main.yml create mode 100644 roles/backup/tasks/main.yml create mode 100644 roles/backup/templates/backup.sh.j2 create mode 100644 roles/base/tasks/swap.yml create mode 100644 roles/services/templates/grafana/provisioning/datasources/loki.yml.j2 create mode 100644 roles/services/templates/loki/loki.yml.j2 create mode 100644 roles/services/templates/loki/promtail.yml.j2 create mode 100644 roles/services/templates/prometheus/alertmanager.yml.j2 create mode 100644 roles/services/templates/prometheus/rules/alerts.yml.j2 diff --git a/inventory/group_vars/all/main.yml b/inventory/group_vars/all/main.yml index 07689af..fa81942 100644 --- a/inventory/group_vars/all/main.yml +++ b/inventory/group_vars/all/main.yml @@ -25,7 +25,9 @@ plane_minio_password: "{{ vault_plane_minio_password }}" traefik_dashboard_htpasswd: "{{ vault_traefik_dashboard_htpasswd }}" syncthing_basic_auth_htpasswd: "{{ vault_syncthing_basic_auth_htpasswd }}" forgejo_runner_token: "{{ vault_forgejo_runner_token }}" -grafana_admin_password: "{{ vault_grafana_admin_password }}" +grafana_admin_password: "{{ vault_grafana_admin_password }}" +alertmanager_telegram_token: "{{ vault_alertmanager_telegram_token }}" +alertmanager_telegram_chat_id: "{{ vault_alertmanager_telegram_chat_id }}" # CI/CD deploy key (public key — not a secret) ci_deploy_pubkey: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHdr9mRSSUqt7Ym4wA5RpVyz76wEXSOtVfh2/yCSMIbg ci-deploy@forgejo-runner" diff --git a/inventory/group_vars/all/vault.yml b/inventory/group_vars/all/vault.yml index ee3426b..ef872f5 100644 --- a/inventory/group_vars/all/vault.yml +++ b/inventory/group_vars/all/vault.yml @@ -1,44 +1,49 @@ $ANSIBLE_VAULT;1.1;AES256 -31613031396131376665643639663630323363366332396162356132316233323063356465643762 -3637303062366530313132303165333161323737643030650a653032646534303463616562633234 -65373032373464303839346430626665316437656363666164636532613837613637323064326630 -3736343663396461350a396434613034393131623037333436306532373038623534363334313066 -38333535376230653538633261366633366238636530656464343939623834646139396637343461 -62396564616566373231343138653061366663366461343239636463633261643464633361373834 -30386237383561613232626361323536636363373831356635653535656633323332666264383061 -63633331343162396431653237333664663439383738333932373866643030303735643534386330 -32366438623966303131613666613265313235323530626132316661383462313033653038363933 -34646463666632396231313563363064396539663236356565306564653433353735333335656534 -65396235656365376236613832626366666666653834626633373937386366656232633766636634 -63346535643936646333663735333630623538393234383336613461343863623935393865343532 -37306161646434656439323832383238643963316463643033306262613231646334303361653833 -63646363653235326261333538666236353231376437653636316131623135323138636661386665 -66316663393934663031623135633137366131633030626664306564396635663235383636343431 -65343330373036316161626165343738323730646130663839616363326232653039303932363765 -61633432386361353861393263306236343162313066653962363761373161656365353538656335 -36643163383435326230326135616164333134363666616339653330656531326539653764633632 -33396431346530633933626533313939316337363035333763613237356133396162316163646632 -39633165653565333237376137303737383831373838656461346434663331313965323235626330 -37303665313564346233666632656531323932316533613463633636633435646137653064653137 -63356439336664383133376530393036653061613466383961613939633866306337653865366330 -66366132383237336534363563643135313138376437336563643130383534373263373364653462 -61396664373363373737633339333335653164323662623239313666666431353631306438303533 -65636135343039323734393637396163366138636439656633616130643636363831643532623337 -35303934656230623563366664396632356638636630613433626334303235343961616439636165 -37313264643432363532373464633633343033303133346435663062323838383931393061396531 -34373034313038663033383333333430326136346433646536626565663436323764316361356531 -64343839373138346636336637343438616639353236336633666234643365303030656132346232 -65346636626338363762653939333639313462393231616636663935333232636231326231373833 -32366535353361303532633462303763386238663432333465653361373064656162343465396631 -61363330666666653533653365653232313836373230336537363337666536313737353731363165 -63333837356135646564663536666264346630356163663666323432393865393836326338306266 -33383139383033643937383865393236396337666139633032323162303665633230346365663730 -63643661323332616163326636646634616165633538383038653766303066366335393065373236 -39613562363634316564346162333030393430303335323733306163396137373037346237656231 -39346635333138656336346230313635353233363334633037633961306663356364383962643361 -38366531396431653330393239663337626564616265636362313537373239663736636535303332 -39626565306632336330386434386636656430363738383431306637666334653136633762323434 -34356661633837626231346134303131353264643532613739333234346634346565376333343563 -37343662376533383335356331616435393764663530616335386435653538646362613364303437 -63356166353062626163313735646365643635393663316365626431383062663331366439613164 -323565613761663833636330633533376131 +32336633366435326662623163613564336332393334626662383862346337626361316330313237 +3063366162376465393134353633616139343430316533330a643361626263366538353538663062 +39306231336331636165336563336662666564393235336435393534663439336162316363376437 +3735333361343130620a613163623238356538333764333830613963333466333832353262333432 +61356434346135616566613064643331633933356332363631623664366166663034373435326262 +32623662386636353761396239343534343562616237616131646536323334303736623935326638 +38323034346266666163626536333038393935396330663065383237333065613362333736666466 +30643761366364353431306538306263653338636664393035373132616239346631343034376336 +30643562363338383239306434393535363763626136376463616432333431316433336638663061 +61633762383063383934386532363633303661323334393439373936313564636363393535646361 +37666561323736303033353930616362336134383165326463613261323665326666383337646465 +35326532353836343363616566363965663237653433646134663363643337363964643762366438 +30353535633338303133373035646230653933346130306631393233633964353865386137373262 +34303161353963613538663366633531633264663231306134313862306561613164346430393462 +39336430653864633530353931653931303266613264643462313832313432366662366566353233 +35376466356537313131313136353334386539393638663738653366373032323966346666613336 +66363234376163326562656232326432636331356238326337313538663563643939323265633238 +35316430326661356633386130613238623730313530636136346139326235333838336561376565 +62356166333936313565343764336230663332653765353531653930373265383862643337333136 +31616336363863666465336561346265613637653132343836653962313439313465313033646564 +63353162663333383637336266376535643566343637303139333838373536366264376632393938 +37376239353239356166303533393339343131336138343438366463666332636562343261366663 +34313561376665373563613636366366633034353232396133313431626663316431336330656433 +36363536663662653434353161383238346230636433366138633765376635376136636638613638 +31653137353036393364336139323366636464613133313138346433663664386465313764656431 +64633761613630393465303564656333333864333961393262303730313765383735323534326331 +38643033376136383939373565323162663139633337653836363532666538356534343365353064 +32373565623066656663346132373831343738643830633935313831343162633966363363396636 +61323237653731353438643431346539613533323637633936336531666634623330356563636630 +64633232343163353830633830646632623961646230313037366230646365633438353761336437 +38366362323964613361376236323661373736393733393938343538383563303861343130333965 +34663738373966363465363166393937633738653836643632376233353961656665366632623166 +65343037346163613664623361313534666563363537383732333739633437336635376634643339 +38303166353865656133326631323136633435623231303464663236373766326666306263663961 +33643465303138373065666433373866343730653533383366323664383235633832663536646536 +36383861363639646166626661626264353865303936333663643432613163626334356564646364 +63373936613930313935333963633765303961323531336630323034326438363464653834323563 +38323038376332306137383438336637343633396131343234326635363736393363373130616232 +32386465376338376338633931663461376530393533336530376332653630393630333330383663 +38626238663637653633653962393133313637376137663765633134306666613339306235396632 +30356331303766323732633530323162393530613634366138313637306133653436303239383738 +34356363336333313833623862356139376334356664303430306562386235396533326162383736 +30313465393936346162316330616333353934633032333265306533653264653931653430393065 +32626331363030363635393064653564613761336465633739323566323336623864323433356134 +63306364336264383836323763353233643463636131383332316362613337363039363636663030 +32316231303462666333353265613135613830333861333131656439326236333634316462646431 +63336433343937636136646434326239313064373863393461623832373262633462633338356430 +65306462666636303633 diff --git a/playbooks/deploy.yml b/playbooks/deploy.yml index dfc9434..bd7ce59 100644 --- a/playbooks/deploy.yml +++ b/playbooks/deploy.yml @@ -10,3 +10,5 @@ tags: docker - role: services tags: services + - role: backup + tags: backup diff --git a/roles/backup/defaults/main.yml b/roles/backup/defaults/main.yml new file mode 100644 index 0000000..4fa179d --- /dev/null +++ b/roles/backup/defaults/main.yml @@ -0,0 +1,4 @@ +--- +backup_dir: /opt/backups +backup_retention_days: 7 +backup_user: deploy diff --git a/roles/backup/tasks/main.yml b/roles/backup/tasks/main.yml new file mode 100644 index 0000000..2e4ed90 --- /dev/null +++ b/roles/backup/tasks/main.yml @@ -0,0 +1,25 @@ +--- +- name: Create backup directory + ansible.builtin.file: + path: "{{ backup_dir }}" + state: directory + owner: "{{ backup_user }}" + group: "{{ backup_user }}" + mode: "0750" + +- name: Deploy backup script + ansible.builtin.template: + src: backup.sh.j2 + dest: /usr/local/bin/backup-services + owner: root + group: root + mode: "0750" + +- name: Schedule daily backup at 03:00 + ansible.builtin.cron: + name: "Daily services backup" + minute: "0" + hour: "3" + job: "/usr/local/bin/backup-services >> /var/log/backup-services.log 2>&1" + user: root + state: present diff --git a/roles/backup/templates/backup.sh.j2 b/roles/backup/templates/backup.sh.j2 new file mode 100644 index 0000000..80bcae6 --- /dev/null +++ b/roles/backup/templates/backup.sh.j2 @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Generated by Ansible — do not edit manually +# Backs up PostgreSQL databases and Vaultwarden data. +# Runs daily at 03:00, keeps {{ backup_retention_days }} days of backups. +set -euo pipefail + +BACKUP_DIR="{{ backup_dir }}" +DATE=$(date +%Y-%m-%d_%H-%M-%S) +KEEP_DAYS="{{ backup_retention_days }}" + +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + +log "=== Backup started ===" + +# ── Forgejo PostgreSQL ────────────────────────────────────────────────────── +log "Backing up forgejo-db..." +docker exec forgejo-db pg_dump -U forgejo forgejo \ + | gzip > "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz" +log " → ${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz" | cut -f1))" + +# ── Plane PostgreSQL ──────────────────────────────────────────────────────── +log "Backing up plane-db..." +docker exec plane-db pg_dump -U plane plane \ + | gzip > "${BACKUP_DIR}/plane-db_${DATE}.sql.gz" +log " → ${BACKUP_DIR}/plane-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/plane-db_${DATE}.sql.gz" | cut -f1))" + +# ── Vaultwarden data ──────────────────────────────────────────────────────── +log "Backing up Vaultwarden..." +docker run --rm \ + --volumes-from vaultwarden \ + -v "${BACKUP_DIR}:/backup" \ + alpine:3 \ + tar czf "/backup/vaultwarden_${DATE}.tar.gz" /data +log " → ${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz" | cut -f1))" + +# ── Forgejo repositories ──────────────────────────────────────────────────── +log "Backing up Forgejo data..." +docker run --rm \ + --volumes-from forgejo \ + -v "${BACKUP_DIR}:/backup" \ + alpine:3 \ + tar czf "/backup/forgejo-data_${DATE}.tar.gz" /data +log " → ${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz" | cut -f1))" + +# ── Cleanup old backups ───────────────────────────────────────────────────── +log "Removing backups older than ${KEEP_DAYS} days..." +find "${BACKUP_DIR}" -name "*.gz" -mtime +${KEEP_DAYS} -delete +log " → Done. Current backups:" +du -sh "${BACKUP_DIR}"/*.gz 2>/dev/null | sort -k2 || true + +log "=== Backup completed ===" diff --git a/roles/base/tasks/main.yml b/roles/base/tasks/main.yml index 5136526..1e3db8c 100644 --- a/roles/base/tasks/main.yml +++ b/roles/base/tasks/main.yml @@ -1,5 +1,6 @@ --- - import_tasks: packages.yml +- import_tasks: swap.yml - import_tasks: users.yml - import_tasks: sshd.yml - import_tasks: firewall.yml diff --git a/roles/base/tasks/swap.yml b/roles/base/tasks/swap.yml new file mode 100644 index 0000000..b073b14 --- /dev/null +++ b/roles/base/tasks/swap.yml @@ -0,0 +1,42 @@ +--- +- name: Check if swap file exists + ansible.builtin.stat: + path: /swapfile + register: swapfile_stat + +- name: Create swap file (2 GiB) + ansible.builtin.command: fallocate -l 2G /swapfile + when: not swapfile_stat.stat.exists + changed_when: true + +- name: Set swap file permissions + ansible.builtin.file: + path: /swapfile + mode: "0600" + owner: root + group: root + when: not swapfile_stat.stat.exists + +- name: Format swap file + ansible.builtin.command: mkswap /swapfile + when: not swapfile_stat.stat.exists + changed_when: true + +- name: Enable swap + ansible.builtin.command: swapon /swapfile + when: not swapfile_stat.stat.exists + changed_when: true + +- name: Persist swap in /etc/fstab + ansible.builtin.lineinfile: + path: /etc/fstab + line: "/swapfile none swap sw 0 0" + state: present + +- name: Set swappiness to 10 (prefer RAM over swap) + ansible.posix.sysctl: + name: vm.swappiness + value: "10" + state: present + sysctl_set: true + reload: true diff --git a/roles/services/defaults/main.yml b/roles/services/defaults/main.yml index 2cafe85..5eb250c 100644 --- a/roles/services/defaults/main.yml +++ b/roles/services/defaults/main.yml @@ -24,3 +24,6 @@ prometheus_image: "prom/prometheus:v3.4.0" # https://hub node_exporter_image: "prom/node-exporter:v1.9.1" # https://hub.docker.com/r/prom/node-exporter/tags cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1" # https://github.com/google/cadvisor/releases grafana_image: "grafana/grafana:11.6.1" # https://hub.docker.com/r/grafana/grafana/tags +alertmanager_image: "prom/alertmanager:v0.28.1" # https://hub.docker.com/r/prom/alertmanager/tags +loki_image: "grafana/loki:3.4.3" # https://hub.docker.com/r/grafana/loki/tags +promtail_image: "grafana/promtail:3.4.3" # https://hub.docker.com/r/grafana/promtail/tags diff --git a/roles/services/tasks/configs.yml b/roles/services/tasks/configs.yml index 5f4bf65..2607ec5 100644 --- a/roles/services/tasks/configs.yml +++ b/roles/services/tasks/configs.yml @@ -89,6 +89,51 @@ mode: "0644" notify: Restart stack +- name: Deploy Prometheus alert rules + ansible.builtin.template: + src: prometheus/rules/alerts.yml.j2 + dest: "{{ services_root }}/prometheus/rules/alerts.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_group }}" + mode: "0644" + notify: Restart stack + +- name: Deploy AlertManager config + ansible.builtin.template: + src: prometheus/alertmanager.yml.j2 + dest: "{{ services_root }}/prometheus/alertmanager.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_group }}" + mode: "0640" + notify: Restart stack + +- name: Deploy Loki config + ansible.builtin.template: + src: loki/loki.yml.j2 + dest: "{{ services_root }}/loki/loki.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_group }}" + mode: "0644" + notify: Restart stack + +- name: Deploy Promtail config + ansible.builtin.template: + src: loki/promtail.yml.j2 + dest: "{{ services_root }}/loki/promtail.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_group }}" + mode: "0644" + notify: Restart stack + +- name: Deploy Grafana Loki datasource + ansible.builtin.template: + src: grafana/provisioning/datasources/loki.yml.j2 + dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml" + owner: "{{ deploy_user }}" + group: "{{ deploy_group }}" + mode: "0644" + notify: Restart stack + - name: Create acme.json for Let's Encrypt certificates ansible.builtin.file: path: "{{ services_root }}/traefik/acme.json" diff --git a/roles/services/tasks/directories.yml b/roles/services/tasks/directories.yml index 97ded96..72b7069 100644 --- a/roles/services/tasks/directories.yml +++ b/roles/services/tasks/directories.yml @@ -29,3 +29,5 @@ - grafana/provisioning/datasources - grafana/provisioning/dashboards - grafana/provisioning/dashboards/json + - prometheus/rules + - loki diff --git a/roles/services/tasks/main.yml b/roles/services/tasks/main.yml index 39c087d..dae97e9 100644 --- a/roles/services/tasks/main.yml +++ b/roles/services/tasks/main.yml @@ -22,6 +22,9 @@ - "{{ node_exporter_image }}" - "{{ cadvisor_image }}" - "{{ grafana_image }}" + - "{{ alertmanager_image }}" + - "{{ loki_image }}" + - "{{ promtail_image }}" register: pull_result changed_when: "'Status: Downloaded newer image' in pull_result.stdout" retries: 5 diff --git a/roles/services/templates/docker-compose.yml.j2 b/roles/services/templates/docker-compose.yml.j2 index da63513..610ad57 100644 --- a/roles/services/templates/docker-compose.yml.j2 +++ b/roles/services/templates/docker-compose.yml.j2 @@ -1,5 +1,8 @@ # Docker Compose stack — generated by Ansible # Do not edit manually; re-run ansible-playbook deploy.yml +# +# NOTE: Traefik uses the file provider (routes.yml.j2) — Docker labels on +# containers are intentionally absent. Adding labels here has no effect. networks: # proxy — публичная сеть только для Traefik: нужна для исходящего интернет-доступа @@ -37,6 +40,7 @@ volumes: act_runner_data: prometheus_data: grafana_data: + loki_data: services: @@ -56,14 +60,11 @@ services: - {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro - {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro - {{ services_root }}/traefik/acme.json:/acme/acme.json - labels: - - "traefik.enable=true" - - "traefik.http.routers.traefik-dashboard.rule=Host(`{{ domain_traefik }}`)" - - "traefik.http.routers.traefik-dashboard.entrypoints=websecure" - - "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt" - - "traefik.http.routers.traefik-dashboard.service=api@internal" - - "traefik.http.routers.traefik-dashboard.middlewares=traefik-auth" - - "traefik.http.middlewares.traefik-auth.basicauth.users={{ traefik_dashboard_htpasswd }}" + healthcheck: + test: ["CMD", "traefik", "healthcheck", "--ping"] + interval: 30s + timeout: 5s + retries: 3 # ── Vaultwarden ──────────────────────────────────────────────────────────── vaultwarden: @@ -82,12 +83,11 @@ services: - LOG_LEVEL=warn - EXTENDED_LOGGING=true - TZ=UTC - labels: - - "traefik.enable=true" - - "traefik.http.routers.vaultwarden.rule=Host(`{{ domain_vault }}`)" - - "traefik.http.routers.vaultwarden.entrypoints=websecure" - - "traefik.http.routers.vaultwarden.tls.certresolver=letsencrypt" - - "traefik.http.services.vaultwarden.loadbalancer.server.port=80" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:80/"] + interval: 30s + timeout: 5s + retries: 3 # ── Forgejo ──────────────────────────────────────────────────────────────── forgejo: @@ -120,12 +120,12 @@ services: - FORGEJO__service__DISABLE_REGISTRATION=true ports: - "2222:22" - labels: - - "traefik.enable=true" - - "traefik.http.routers.forgejo.rule=Host(`{{ domain_git }}`)" - - "traefik.http.routers.forgejo.entrypoints=websecure" - - "traefik.http.routers.forgejo.tls.certresolver=letsencrypt" - - "traefik.http.services.forgejo.loadbalancer.server.port=3000" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:3000/"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s forgejo-db: image: {{ forgejo_db_image }} @@ -152,6 +152,9 @@ services: # /api/* и /auth/* → plane-api:8000 (Django, на backend + plane-internal) # остальное → plane-web:3000 (Next.js, на backend + plane-internal) # Правило с PathPrefix длиннее → более высокий приоритет у Traefik автоматически. + # + # NOTE: Plane не публикует конкретные version tags — используем :stable. + # Следить за обновлениями: https://github.com/makeplane/plane/releases plane-web: image: {{ plane_frontend_image }} @@ -162,13 +165,12 @@ services: networks: - backend - plane-internal - labels: - - "traefik.enable=true" - - "traefik.http.routers.plane.rule=Host(`{{ domain_plane }}`)" - - "traefik.http.routers.plane.entrypoints=websecure" - - "traefik.http.routers.plane.tls.certresolver=letsencrypt" - - "traefik.http.services.plane.loadbalancer.server.port=80" - - "traefik.http.routers.plane.priority=1" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:80/"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s plane-admin: image: {{ plane_admin_image }} @@ -180,13 +182,12 @@ services: networks: - backend - plane-internal - labels: - - "traefik.enable=true" - - "traefik.http.routers.plane-admin.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/god-mode/`)" - - "traefik.http.routers.plane-admin.entrypoints=websecure" - - "traefik.http.routers.plane-admin.tls.certresolver=letsencrypt" - - "traefik.http.services.plane-admin.loadbalancer.server.port=80" - - "traefik.http.routers.plane-admin.priority=10" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:80/"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s plane-space: image: {{ plane_space_image }} @@ -198,13 +199,12 @@ services: networks: - backend - plane-internal - labels: - - "traefik.enable=true" - - "traefik.http.routers.plane-space.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/spaces/`)" - - "traefik.http.routers.plane-space.entrypoints=websecure" - - "traefik.http.routers.plane-space.tls.certresolver=letsencrypt" - - "traefik.http.services.plane-space.loadbalancer.server.port=3000" - - "traefik.http.routers.plane-space.priority=10" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:3000/"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 30s plane-api: image: {{ plane_backend_image }} @@ -245,17 +245,18 @@ services: - APP_BASE_URL=https://{{ domain_plane }} - ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode - SPACE_BASE_URL=https://{{ domain_plane }}/spaces - labels: - - "traefik.enable=true" - - "traefik.http.routers.plane-api.rule=Host(`{{ domain_plane }}`) && (PathPrefix(`/api/`) || PathPrefix(`/auth/`))" - - "traefik.http.routers.plane-api.entrypoints=websecure" - - "traefik.http.routers.plane-api.tls.certresolver=letsencrypt" - - "traefik.http.services.plane-api.loadbalancer.server.port=8000" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8000/api/"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s plane-worker: image: {{ plane_backend_image }} container_name: plane-worker restart: unless-stopped + mem_limit: 512m command: ./bin/docker-entrypoint-worker.sh depends_on: - plane-api @@ -283,6 +284,7 @@ services: image: {{ plane_backend_image }} container_name: plane-beat restart: unless-stopped + mem_limit: 256m command: ./bin/docker-entrypoint-beat.sh depends_on: - plane-api @@ -333,6 +335,11 @@ services: volumes: - plane_redis_data:/data command: redis-server --appendonly yes + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 3s + retries: 5 plane-minio: image: {{ plane_minio_image }} @@ -373,14 +380,11 @@ services: - PUID=1000 - PGID=1000 - TZ=UTC - labels: - - "traefik.enable=true" - - "traefik.http.routers.syncthing.rule=Host(`{{ domain_sync }}`)" - - "traefik.http.routers.syncthing.entrypoints=websecure" - - "traefik.http.routers.syncthing.tls.certresolver=letsencrypt" - - "traefik.http.routers.syncthing.middlewares=syncthing-auth" - - "traefik.http.middlewares.syncthing-auth.basicauth.users={{ syncthing_basic_auth_htpasswd }}" - - "traefik.http.services.syncthing.loadbalancer.server.port=8384" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8384/rest/noauth/health"] + interval: 30s + timeout: 5s + retries: 3 # ── Forgejo Actions Runner ───────────────────────────────────────────────── # backend — для связи с Forgejo по внутренней сети (http://forgejo:3000) @@ -414,12 +418,35 @@ services: volumes: - prometheus_data:/prometheus - {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - "--storage.tsdb.retention.time=30d" - "--web.console.libraries=/usr/share/prometheus/console_libraries" - "--web.console.templates=/usr/share/prometheus/consoles" + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 5s + retries: 3 + + alertmanager: + image: {{ alertmanager_image }} + container_name: alertmanager + restart: unless-stopped + networks: + - monitoring + volumes: + - {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + command: + - "--config.file=/etc/alertmanager/alertmanager.yml" + - "--storage.path=/alertmanager" + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"] + interval: 30s + timeout: 5s + retries: 3 node-exporter: image: {{ node_exporter_image }} @@ -472,3 +499,38 @@ services: - GF_SERVER_DOMAIN={{ domain_dashboard }} - GF_SERVER_ROOT_URL=https://{{ domain_dashboard }} - GF_AUTH_ANONYMOUS_ENABLED=false + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:3000/api/health"] + interval: 30s + timeout: 5s + retries: 3 + + # ── Logging Stack ────────────────────────────────────────────────────────── + loki: + image: {{ loki_image }} + container_name: loki + restart: unless-stopped + networks: + - monitoring + volumes: + - loki_data:/loki + - {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro + command: -config.file=/etc/loki/local-config.yaml + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"] + interval: 30s + timeout: 5s + retries: 3 + + promtail: + image: {{ promtail_image }} + container_name: promtail + restart: unless-stopped + networks: + - monitoring + volumes: + - /var/log:/var/log:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - {{ services_root }}/loki/promtail.yml:/etc/promtail/config.yml:ro + command: -config.file=/etc/promtail/config.yml diff --git a/roles/services/templates/grafana/provisioning/datasources/loki.yml.j2 b/roles/services/templates/grafana/provisioning/datasources/loki.yml.j2 new file mode 100644 index 0000000..4de25a0 --- /dev/null +++ b/roles/services/templates/grafana/provisioning/datasources/loki.yml.j2 @@ -0,0 +1,10 @@ +# Generated by Ansible — do not edit manually +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: false + editable: false diff --git a/roles/services/templates/loki/loki.yml.j2 b/roles/services/templates/loki/loki.yml.j2 new file mode 100644 index 0000000..0d801a7 --- /dev/null +++ b/roles/services/templates/loki/loki.yml.j2 @@ -0,0 +1,36 @@ +# Generated by Ansible — do not edit manually +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 30d + +compactor: + working_directory: /loki/retention + delete_request_store: filesystem + retention_enabled: true diff --git a/roles/services/templates/loki/promtail.yml.j2 b/roles/services/templates/loki/promtail.yml.j2 new file mode 100644 index 0000000..3f3c6bb --- /dev/null +++ b/roles/services/templates/loki/promtail.yml.j2 @@ -0,0 +1,38 @@ +# Generated by Ansible — do not edit manually +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: /(.*) + target_label: container + - source_labels: [__meta_docker_container_log_stream] + target_label: stream + - source_labels: [__meta_docker_container_label_com_docker_compose_service] + target_label: service + + - job_name: syslog + static_configs: + - targets: [localhost] + labels: + job: syslog + __path__: /var/log/syslog + + - job_name: auth + static_configs: + - targets: [localhost] + labels: + job: auth + __path__: /var/log/auth.log diff --git a/roles/services/templates/prometheus/alertmanager.yml.j2 b/roles/services/templates/prometheus/alertmanager.yml.j2 new file mode 100644 index 0000000..2001ff7 --- /dev/null +++ b/roles/services/templates/prometheus/alertmanager.yml.j2 @@ -0,0 +1,28 @@ +# Generated by Ansible — do not edit manually +global: + resolve_timeout: 5m + +route: + group_by: [alertname, severity] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receiver: telegram + +receivers: + - name: telegram + telegram_configs: + - bot_token: "{{ alertmanager_telegram_token }}" + chat_id: {{ alertmanager_telegram_chat_id }} + message: | + {{ '{{' }} range .Alerts {{ '}}' }} + {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}* + {{ '{{' }} .Annotations.summary {{ '}}' }} + {{ '{{' }} .Annotations.description {{ '}}' }} + {{ '{{' }} end {{ '}}' }} + parse_mode: Markdown + +inhibit_rules: + - source_matchers: [severity="critical"] + target_matchers: [severity="warning"] + equal: [alertname] diff --git a/roles/services/templates/prometheus/prometheus.yml.j2 b/roles/services/templates/prometheus/prometheus.yml.j2 index 76b761b..b7d3e91 100644 --- a/roles/services/templates/prometheus/prometheus.yml.j2 +++ b/roles/services/templates/prometheus/prometheus.yml.j2 @@ -5,6 +5,14 @@ global: external_labels: instance: "{{ domain_base }}" +alerting: + alertmanagers: + - static_configs: + - targets: ["alertmanager:9093"] + +rule_files: + - /etc/prometheus/rules/*.yml + scrape_configs: - job_name: prometheus static_configs: @@ -17,3 +25,7 @@ scrape_configs: - job_name: cadvisor static_configs: - targets: ["cadvisor:8080"] + + - job_name: alertmanager + static_configs: + - targets: ["alertmanager:9093"] diff --git a/roles/services/templates/prometheus/rules/alerts.yml.j2 b/roles/services/templates/prometheus/rules/alerts.yml.j2 new file mode 100644 index 0000000..b2726c9 --- /dev/null +++ b/roles/services/templates/prometheus/rules/alerts.yml.j2 @@ -0,0 +1,86 @@ +# Generated by Ansible — do not edit manually +groups: + - name: host + rules: + - alert: HighCPULoad + expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "CPU загружен более 85% на протяжении 5 минут." + + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "Использование RAM превысило 85%." + + - alert: CriticalMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 2m + labels: + severity: critical + annotations: + summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "RAM заполнена на 95%+. Возможны OOM kills." + + - alert: DiskSpaceWarning + expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75 + for: 5m + labels: + severity: warning + annotations: + summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%." + + - alert: DiskSpaceCritical + expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90 + for: 2m + labels: + severity: critical + annotations: + summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%." + + - alert: SwapUsageHigh + expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)" + description: "Swap используется более чем на 50% — RAM под давлением." + + - name: containers + rules: + - alert: ContainerDown + expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60 + for: 2m + labels: + severity: critical + annotations: + summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен" + description: "Контейнер не отвечает более 2 минут." + + - alert: ContainerHighMemory + expr: (container_memory_usage_bytes{name=~".+"} / container_spec_memory_limit_bytes{name=~".+", container_spec_memory_limit_bytes > 0}) * 100 > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти" + description: "Контейнер близок к mem_limit — возможен OOM kill." + + - alert: ContainerRestarting + expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0 + for: 0m + labels: + severity: warning + annotations: + summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается" + description: "Контейнер не активен — проверьте docker ps." diff --git a/roles/services/templates/traefik/traefik.yml.j2 b/roles/services/templates/traefik/traefik.yml.j2 index ea71d08..c24052f 100644 --- a/roles/services/templates/traefik/traefik.yml.j2 +++ b/roles/services/templates/traefik/traefik.yml.j2 @@ -14,6 +14,8 @@ api: dashboard: true insecure: false +ping: {} + entryPoints: web: address: ":80"