feat: major infrastructure improvements
Some checks failed
CI/CD / deploy (push) Has been cancelled
CI/CD / syntax-check (push) Successful in 1m7s

Reliability:
- Add swap role (2GB, swappiness=10, idempotent via /etc/fstab)
- Add mem_limit to plane-worker (512m) and plane-beat (256m)
- Add health checks to all services (traefik, vaultwarden, forgejo,
  plane-*, syncthing, prometheus, grafana, loki)

Code quality:
- Remove Traefik Docker labels (file provider used, labels were dead code)
- Add comment explaining file provider architecture

Observability:
- Add AlertManager with Telegram notifications
- Add Prometheus alert rules: CPU, RAM, disk, swap, container health
- Add Loki + Promtail for centralized log aggregation
- Add Loki datasource to Grafana
- Enable Traefik /ping endpoint for health checks

Backups:
- Add backup role: pg_dump for forgejo + plane DBs, tar for
  vaultwarden and forgejo data
- 7-day retention, daily cron at 03:00
- Backup script at /usr/local/bin/backup-services

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
jack 2026-03-22 03:28:16 +07:00
parent 972a76db4c
commit 6ebd237894
20 changed files with 558 additions and 99 deletions

View file

@ -26,6 +26,8 @@ traefik_dashboard_htpasswd: "{{ vault_traefik_dashboard_htpasswd }}"
syncthing_basic_auth_htpasswd: "{{ vault_syncthing_basic_auth_htpasswd }}"
forgejo_runner_token: "{{ vault_forgejo_runner_token }}"
grafana_admin_password: "{{ vault_grafana_admin_password }}"
alertmanager_telegram_token: "{{ vault_alertmanager_telegram_token }}"
alertmanager_telegram_chat_id: "{{ vault_alertmanager_telegram_chat_id }}"
# CI/CD deploy key (public key — not a secret)
ci_deploy_pubkey: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHdr9mRSSUqt7Ym4wA5RpVyz76wEXSOtVfh2/yCSMIbg ci-deploy@forgejo-runner"

View file

@ -1,44 +1,49 @@
$ANSIBLE_VAULT;1.1;AES256
31613031396131376665643639663630323363366332396162356132316233323063356465643762
3637303062366530313132303165333161323737643030650a653032646534303463616562633234
65373032373464303839346430626665316437656363666164636532613837613637323064326630
3736343663396461350a396434613034393131623037333436306532373038623534363334313066
38333535376230653538633261366633366238636530656464343939623834646139396637343461
62396564616566373231343138653061366663366461343239636463633261643464633361373834
30386237383561613232626361323536636363373831356635653535656633323332666264383061
63633331343162396431653237333664663439383738333932373866643030303735643534386330
32366438623966303131613666613265313235323530626132316661383462313033653038363933
34646463666632396231313563363064396539663236356565306564653433353735333335656534
65396235656365376236613832626366666666653834626633373937386366656232633766636634
63346535643936646333663735333630623538393234383336613461343863623935393865343532
37306161646434656439323832383238643963316463643033306262613231646334303361653833
63646363653235326261333538666236353231376437653636316131623135323138636661386665
66316663393934663031623135633137366131633030626664306564396635663235383636343431
65343330373036316161626165343738323730646130663839616363326232653039303932363765
61633432386361353861393263306236343162313066653962363761373161656365353538656335
36643163383435326230326135616164333134363666616339653330656531326539653764633632
33396431346530633933626533313939316337363035333763613237356133396162316163646632
39633165653565333237376137303737383831373838656461346434663331313965323235626330
37303665313564346233666632656531323932316533613463633636633435646137653064653137
63356439336664383133376530393036653061613466383961613939633866306337653865366330
66366132383237336534363563643135313138376437336563643130383534373263373364653462
61396664373363373737633339333335653164323662623239313666666431353631306438303533
65636135343039323734393637396163366138636439656633616130643636363831643532623337
35303934656230623563366664396632356638636630613433626334303235343961616439636165
37313264643432363532373464633633343033303133346435663062323838383931393061396531
34373034313038663033383333333430326136346433646536626565663436323764316361356531
64343839373138346636336637343438616639353236336633666234643365303030656132346232
65346636626338363762653939333639313462393231616636663935333232636231326231373833
32366535353361303532633462303763386238663432333465653361373064656162343465396631
61363330666666653533653365653232313836373230336537363337666536313737353731363165
63333837356135646564663536666264346630356163663666323432393865393836326338306266
33383139383033643937383865393236396337666139633032323162303665633230346365663730
63643661323332616163326636646634616165633538383038653766303066366335393065373236
39613562363634316564346162333030393430303335323733306163396137373037346237656231
39346635333138656336346230313635353233363334633037633961306663356364383962643361
38366531396431653330393239663337626564616265636362313537373239663736636535303332
39626565306632336330386434386636656430363738383431306637666334653136633762323434
34356661633837626231346134303131353264643532613739333234346634346565376333343563
37343662376533383335356331616435393764663530616335386435653538646362613364303437
63356166353062626163313735646365643635393663316365626431383062663331366439613164
323565613761663833636330633533376131
32336633366435326662623163613564336332393334626662383862346337626361316330313237
3063366162376465393134353633616139343430316533330a643361626263366538353538663062
39306231336331636165336563336662666564393235336435393534663439336162316363376437
3735333361343130620a613163623238356538333764333830613963333466333832353262333432
61356434346135616566613064643331633933356332363631623664366166663034373435326262
32623662386636353761396239343534343562616237616131646536323334303736623935326638
38323034346266666163626536333038393935396330663065383237333065613362333736666466
30643761366364353431306538306263653338636664393035373132616239346631343034376336
30643562363338383239306434393535363763626136376463616432333431316433336638663061
61633762383063383934386532363633303661323334393439373936313564636363393535646361
37666561323736303033353930616362336134383165326463613261323665326666383337646465
35326532353836343363616566363965663237653433646134663363643337363964643762366438
30353535633338303133373035646230653933346130306631393233633964353865386137373262
34303161353963613538663366633531633264663231306134313862306561613164346430393462
39336430653864633530353931653931303266613264643462313832313432366662366566353233
35376466356537313131313136353334386539393638663738653366373032323966346666613336
66363234376163326562656232326432636331356238326337313538663563643939323265633238
35316430326661356633386130613238623730313530636136346139326235333838336561376565
62356166333936313565343764336230663332653765353531653930373265383862643337333136
31616336363863666465336561346265613637653132343836653962313439313465313033646564
63353162663333383637336266376535643566343637303139333838373536366264376632393938
37376239353239356166303533393339343131336138343438366463666332636562343261366663
34313561376665373563613636366366633034353232396133313431626663316431336330656433
36363536663662653434353161383238346230636433366138633765376635376136636638613638
31653137353036393364336139323366636464613133313138346433663664386465313764656431
64633761613630393465303564656333333864333961393262303730313765383735323534326331
38643033376136383939373565323162663139633337653836363532666538356534343365353064
32373565623066656663346132373831343738643830633935313831343162633966363363396636
61323237653731353438643431346539613533323637633936336531666634623330356563636630
64633232343163353830633830646632623961646230313037366230646365633438353761336437
38366362323964613361376236323661373736393733393938343538383563303861343130333965
34663738373966363465363166393937633738653836643632376233353961656665366632623166
65343037346163613664623361313534666563363537383732333739633437336635376634643339
38303166353865656133326631323136633435623231303464663236373766326666306263663961
33643465303138373065666433373866343730653533383366323664383235633832663536646536
36383861363639646166626661626264353865303936333663643432613163626334356564646364
63373936613930313935333963633765303961323531336630323034326438363464653834323563
38323038376332306137383438336637343633396131343234326635363736393363373130616232
32386465376338376338633931663461376530393533336530376332653630393630333330383663
38626238663637653633653962393133313637376137663765633134306666613339306235396632
30356331303766323732633530323162393530613634366138313637306133653436303239383738
34356363336333313833623862356139376334356664303430306562386235396533326162383736
30313465393936346162316330616333353934633032333265306533653264653931653430393065
32626331363030363635393064653564613761336465633739323566323336623864323433356134
63306364336264383836323763353233643463636131383332316362613337363039363636663030
32316231303462666333353265613135613830333861333131656439326236333634316462646431
63336433343937636136646434326239313064373863393461623832373262633462633338356430
65306462666636303633

View file

@ -10,3 +10,5 @@
tags: docker
- role: services
tags: services
- role: backup
tags: backup

View file

@ -0,0 +1,4 @@
---
# Defaults for the backup role — override per host/group as needed.
# backup_dir: where dumps/tarballs are written on the host.
# backup_retention_days: files older than this are deleted by the script.
# backup_user: owner of the backup directory (script itself runs as root).
backup_dir: /opt/backups
backup_retention_days: 7
backup_user: deploy

View file

@ -0,0 +1,25 @@
---
# Backup role: creates the backup directory, installs the backup script
# rendered from backup.sh.j2, and schedules it via root's crontab.
- name: Create backup directory
  ansible.builtin.file:
    path: "{{ backup_dir }}"
    state: directory
    owner: "{{ backup_user }}"
    group: "{{ backup_user }}"
    # 0750: owner rwx, group rx — backups may contain credentials, keep
    # them unreadable to others.
    mode: "0750"

- name: Deploy backup script
  ansible.builtin.template:
    src: backup.sh.j2
    dest: /usr/local/bin/backup-services
    owner: root
    group: root
    mode: "0750"

- name: Schedule daily backup at 03:00
  ansible.builtin.cron:
    name: "Daily services backup"
    minute: "0"
    hour: "3"
    # Append stdout+stderr to a log file so failures are inspectable.
    job: "/usr/local/bin/backup-services >> /var/log/backup-services.log 2>&1"
    user: root
    state: present

View file

@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Generated by Ansible — do not edit manually
# Backs up PostgreSQL databases (forgejo, plane) plus Vaultwarden and
# Forgejo data volumes.
# Runs daily at 03:00, keeps {{ backup_retention_days }} days of backups.
set -euo pipefail

BACKUP_DIR="{{ backup_dir }}"
DATE=$(date +%Y-%m-%d_%H-%M-%S)
KEEP_DAYS="{{ backup_retention_days }}"

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }

log "=== Backup started ==="

# ── Forgejo PostgreSQL ──────────────────────────────────────────────────────
log "Backing up forgejo-db..."
docker exec forgejo-db pg_dump -U forgejo forgejo \
  | gzip > "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz"
log "  → ${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz" | cut -f1))"

# ── Plane PostgreSQL ────────────────────────────────────────────────────────
log "Backing up plane-db..."
docker exec plane-db pg_dump -U plane plane \
  | gzip > "${BACKUP_DIR}/plane-db_${DATE}.sql.gz"
log "  → ${BACKUP_DIR}/plane-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/plane-db_${DATE}.sql.gz" | cut -f1))"

# ── Vaultwarden data ────────────────────────────────────────────────────────
log "Backing up Vaultwarden..."
docker run --rm \
  --volumes-from vaultwarden \
  -v "${BACKUP_DIR}:/backup" \
  alpine:3 \
  tar czf "/backup/vaultwarden_${DATE}.tar.gz" /data
log "  → ${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz" | cut -f1))"

# ── Forgejo repositories ────────────────────────────────────────────────────
log "Backing up Forgejo data..."
docker run --rm \
  --volumes-from forgejo \
  -v "${BACKUP_DIR}:/backup" \
  alpine:3 \
  tar czf "/backup/forgejo-data_${DATE}.tar.gz" /data
log "  → ${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz" | cut -f1))"

# ── Cleanup old backups ─────────────────────────────────────────────────────
# -type f: only delete regular files, never directories or anything else
# that happens to match "*.gz". KEEP_DAYS quoted to survive set -u/word-split.
log "Removing backups older than ${KEEP_DAYS} days..."
find "${BACKUP_DIR}" -type f -name "*.gz" -mtime "+${KEEP_DAYS}" -delete

log "  → Done. Current backups:"
du -sh "${BACKUP_DIR}"/*.gz 2>/dev/null | sort -k2 || true
log "=== Backup completed ==="

View file

@ -1,5 +1,6 @@
---
- import_tasks: packages.yml
- import_tasks: swap.yml
- import_tasks: users.yml
- import_tasks: sshd.yml
- import_tasks: firewall.yml

42
roles/base/tasks/swap.yml Normal file
View file

@ -0,0 +1,42 @@
---
# Provision a 2 GiB swap file at /swapfile, enable it, persist it in
# /etc/fstab, and lower swappiness so RAM is preferred over swap.
- name: Check if swap file exists
  ansible.builtin.stat:
    path: /swapfile
  register: swapfile_stat

# Detect whether /swapfile is already an ACTIVE swap device. Gating the
# mkswap/swapon steps on this (rather than on mere file existence) keeps
# the role recoverable if a previous run was interrupted after fallocate
# but before mkswap — otherwise those steps would be skipped forever.
- name: Check if swap is already active
  ansible.builtin.command: grep -qE '^/swapfile\s' /proc/swaps
  register: swap_active
  changed_when: false
  # rc 1 simply means "not active"; anything else is a real error.
  failed_when: swap_active.rc not in [0, 1]

- name: Create swap file (2 GiB)
  ansible.builtin.command: fallocate -l 2G /swapfile
  when: not swapfile_stat.stat.exists
  changed_when: true

# Always enforce permissions (the file module is idempotent) — swap files
# must be root-only or swapon refuses/warns.
- name: Set swap file permissions
  ansible.builtin.file:
    path: /swapfile
    mode: "0600"
    owner: root
    group: root

- name: Format swap file
  ansible.builtin.command: mkswap /swapfile
  when: swap_active.rc != 0
  changed_when: true

- name: Enable swap
  ansible.builtin.command: swapon /swapfile
  when: swap_active.rc != 0
  changed_when: true

- name: Persist swap in /etc/fstab
  ansible.builtin.lineinfile:
    path: /etc/fstab
    line: "/swapfile none swap sw 0 0"
    state: present

- name: Set swappiness to 10 (prefer RAM over swap)
  ansible.posix.sysctl:
    name: vm.swappiness
    value: "10"
    state: present
    sysctl_set: true
    reload: true

View file

@ -24,3 +24,6 @@ prometheus_image: "prom/prometheus:v3.4.0" # https://hub
node_exporter_image: "prom/node-exporter:v1.9.1" # https://hub.docker.com/r/prom/node-exporter/tags
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1" # https://github.com/google/cadvisor/releases
grafana_image: "grafana/grafana:11.6.1" # https://hub.docker.com/r/grafana/grafana/tags
alertmanager_image: "prom/alertmanager:v0.28.1" # https://hub.docker.com/r/prom/alertmanager/tags
loki_image: "grafana/loki:3.4.3" # https://hub.docker.com/r/grafana/loki/tags
promtail_image: "grafana/promtail:3.4.3" # https://hub.docker.com/r/grafana/promtail/tags

View file

@ -89,6 +89,51 @@
mode: "0644"
notify: Restart stack
- name: Deploy Prometheus alert rules
ansible.builtin.template:
src: prometheus/rules/alerts.yml.j2
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy AlertManager config
ansible.builtin.template:
src: prometheus/alertmanager.yml.j2
dest: "{{ services_root }}/prometheus/alertmanager.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0640"
notify: Restart stack
- name: Deploy Loki config
ansible.builtin.template:
src: loki/loki.yml.j2
dest: "{{ services_root }}/loki/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Promtail config
ansible.builtin.template:
src: loki/promtail.yml.j2
dest: "{{ services_root }}/loki/promtail.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Deploy Grafana Loki datasource
ansible.builtin.template:
src: grafana/provisioning/datasources/loki.yml.j2
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
owner: "{{ deploy_user }}"
group: "{{ deploy_group }}"
mode: "0644"
notify: Restart stack
- name: Create acme.json for Let's Encrypt certificates
ansible.builtin.file:
path: "{{ services_root }}/traefik/acme.json"

View file

@ -29,3 +29,5 @@
- grafana/provisioning/datasources
- grafana/provisioning/dashboards
- grafana/provisioning/dashboards/json
- prometheus/rules
- loki

View file

@ -22,6 +22,9 @@
- "{{ node_exporter_image }}"
- "{{ cadvisor_image }}"
- "{{ grafana_image }}"
- "{{ alertmanager_image }}"
- "{{ loki_image }}"
- "{{ promtail_image }}"
register: pull_result
changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
retries: 5

View file

@ -1,5 +1,8 @@
# Docker Compose stack — generated by Ansible
# Do not edit manually; re-run ansible-playbook deploy.yml
#
# NOTE: Traefik uses the file provider (routes.yml.j2) — Docker labels on
# containers are intentionally absent. Adding labels here has no effect.
networks:
# proxy — публичная сеть только для Traefik: нужна для исходящего интернет-доступа
@ -37,6 +40,7 @@ volumes:
act_runner_data:
prometheus_data:
grafana_data:
loki_data:
services:
@ -56,14 +60,11 @@ services:
- {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro
- {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro
- {{ services_root }}/traefik/acme.json:/acme/acme.json
labels:
- "traefik.enable=true"
- "traefik.http.routers.traefik-dashboard.rule=Host(`{{ domain_traefik }}`)"
- "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
- "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
- "traefik.http.routers.traefik-dashboard.service=api@internal"
- "traefik.http.routers.traefik-dashboard.middlewares=traefik-auth"
- "traefik.http.middlewares.traefik-auth.basicauth.users={{ traefik_dashboard_htpasswd }}"
healthcheck:
test: ["CMD", "traefik", "healthcheck", "--ping"]
interval: 30s
timeout: 5s
retries: 3
# ── Vaultwarden ────────────────────────────────────────────────────────────
vaultwarden:
@ -82,12 +83,11 @@ services:
- LOG_LEVEL=warn
- EXTENDED_LOGGING=true
- TZ=UTC
labels:
- "traefik.enable=true"
- "traefik.http.routers.vaultwarden.rule=Host(`{{ domain_vault }}`)"
- "traefik.http.routers.vaultwarden.entrypoints=websecure"
- "traefik.http.routers.vaultwarden.tls.certresolver=letsencrypt"
- "traefik.http.services.vaultwarden.loadbalancer.server.port=80"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
interval: 30s
timeout: 5s
retries: 3
# ── Forgejo ────────────────────────────────────────────────────────────────
forgejo:
@ -120,12 +120,12 @@ services:
- FORGEJO__service__DISABLE_REGISTRATION=true
ports:
- "2222:22"
labels:
- "traefik.enable=true"
- "traefik.http.routers.forgejo.rule=Host(`{{ domain_git }}`)"
- "traefik.http.routers.forgejo.entrypoints=websecure"
- "traefik.http.routers.forgejo.tls.certresolver=letsencrypt"
- "traefik.http.services.forgejo.loadbalancer.server.port=3000"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
forgejo-db:
image: {{ forgejo_db_image }}
@ -152,6 +152,9 @@ services:
# /api/* и /auth/* → plane-api:8000 (Django, на backend + plane-internal)
# остальное → plane-web:3000 (Next.js, на backend + plane-internal)
# Правило с PathPrefix длиннее → более высокий приоритет у Traefik автоматически.
#
# NOTE: Plane не публикует конкретные version tags — используем :stable.
# Следить за обновлениями: https://github.com/makeplane/plane/releases
plane-web:
image: {{ plane_frontend_image }}
@ -162,13 +165,12 @@ services:
networks:
- backend
- plane-internal
labels:
- "traefik.enable=true"
- "traefik.http.routers.plane.rule=Host(`{{ domain_plane }}`)"
- "traefik.http.routers.plane.entrypoints=websecure"
- "traefik.http.routers.plane.tls.certresolver=letsencrypt"
- "traefik.http.services.plane.loadbalancer.server.port=80"
- "traefik.http.routers.plane.priority=1"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
plane-admin:
image: {{ plane_admin_image }}
@ -180,13 +182,12 @@ services:
networks:
- backend
- plane-internal
labels:
- "traefik.enable=true"
- "traefik.http.routers.plane-admin.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/god-mode/`)"
- "traefik.http.routers.plane-admin.entrypoints=websecure"
- "traefik.http.routers.plane-admin.tls.certresolver=letsencrypt"
- "traefik.http.services.plane-admin.loadbalancer.server.port=80"
- "traefik.http.routers.plane-admin.priority=10"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
plane-space:
image: {{ plane_space_image }}
@ -198,13 +199,12 @@ services:
networks:
- backend
- plane-internal
labels:
- "traefik.enable=true"
- "traefik.http.routers.plane-space.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/spaces/`)"
- "traefik.http.routers.plane-space.entrypoints=websecure"
- "traefik.http.routers.plane-space.tls.certresolver=letsencrypt"
- "traefik.http.services.plane-space.loadbalancer.server.port=3000"
- "traefik.http.routers.plane-space.priority=10"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
plane-api:
image: {{ plane_backend_image }}
@ -245,17 +245,18 @@ services:
- APP_BASE_URL=https://{{ domain_plane }}
- ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode
- SPACE_BASE_URL=https://{{ domain_plane }}/spaces
labels:
- "traefik.enable=true"
- "traefik.http.routers.plane-api.rule=Host(`{{ domain_plane }}`) && (PathPrefix(`/api/`) || PathPrefix(`/auth/`))"
- "traefik.http.routers.plane-api.entrypoints=websecure"
- "traefik.http.routers.plane-api.tls.certresolver=letsencrypt"
- "traefik.http.services.plane-api.loadbalancer.server.port=8000"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8000/api/"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
plane-worker:
image: {{ plane_backend_image }}
container_name: plane-worker
restart: unless-stopped
mem_limit: 512m
command: ./bin/docker-entrypoint-worker.sh
depends_on:
- plane-api
@ -283,6 +284,7 @@ services:
image: {{ plane_backend_image }}
container_name: plane-beat
restart: unless-stopped
mem_limit: 256m
command: ./bin/docker-entrypoint-beat.sh
depends_on:
- plane-api
@ -333,6 +335,11 @@ services:
volumes:
- plane_redis_data:/data
command: redis-server --appendonly yes
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 3s
retries: 5
plane-minio:
image: {{ plane_minio_image }}
@ -373,14 +380,11 @@ services:
- PUID=1000
- PGID=1000
- TZ=UTC
labels:
- "traefik.enable=true"
- "traefik.http.routers.syncthing.rule=Host(`{{ domain_sync }}`)"
- "traefik.http.routers.syncthing.entrypoints=websecure"
- "traefik.http.routers.syncthing.tls.certresolver=letsencrypt"
- "traefik.http.routers.syncthing.middlewares=syncthing-auth"
- "traefik.http.middlewares.syncthing-auth.basicauth.users={{ syncthing_basic_auth_htpasswd }}"
- "traefik.http.services.syncthing.loadbalancer.server.port=8384"
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8384/rest/noauth/health"]
interval: 30s
timeout: 5s
retries: 3
# ── Forgejo Actions Runner ─────────────────────────────────────────────────
# backend — для связи с Forgejo по внутренней сети (http://forgejo:3000)
@ -414,12 +418,35 @@ services:
volumes:
- prometheus_data:/prometheus
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention.time=30d"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
alertmanager:
image: {{ alertmanager_image }}
container_name: alertmanager
restart: unless-stopped
networks:
- monitoring
volumes:
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 5s
retries: 3
node-exporter:
image: {{ node_exporter_image }}
@ -472,3 +499,38 @@ services:
- GF_SERVER_DOMAIN={{ domain_dashboard }}
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
- GF_AUTH_ANONYMOUS_ENABLED=false
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:3000/api/health"]
interval: 30s
timeout: 5s
retries: 3
# ── Logging Stack ──────────────────────────────────────────────────────────
loki:
image: {{ loki_image }}
container_name: loki
restart: unless-stopped
networks:
- monitoring
volumes:
- loki_data:/loki
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
command: -config.file=/etc/loki/local-config.yaml
healthcheck:
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
interval: 30s
timeout: 5s
retries: 3
promtail:
image: {{ promtail_image }}
container_name: promtail
restart: unless-stopped
networks:
- monitoring
volumes:
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
- {{ services_root }}/loki/promtail.yml:/etc/promtail/config.yml:ro
command: -config.file=/etc/promtail/config.yml

View file

@ -0,0 +1,10 @@
# Generated by Ansible — do not edit manually
# Grafana provisioning: registers Loki as an additional (non-default)
# datasource, proxied through the Grafana backend.
apiVersion: 1

datasources:
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    isDefault: false
    # Locked in the UI — this file is the source of truth.
    editable: false

View file

@ -0,0 +1,36 @@
# Generated by Ansible — do not edit manually
# Single-node Loki: filesystem storage, in-memory ring, 30-day retention
# enforced by the compactor.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  retention_period: 30d

compactor:
  working_directory: /loki/retention
  delete_request_store: filesystem
  retention_enabled: true

View file

@ -0,0 +1,38 @@
# Generated by Ansible — do not edit manually
# Promtail: ships Docker container logs (discovered via the Docker socket)
# plus host syslog and auth logs to Loki.
server:
  http_listen_port: 9080
  # 0 disables the gRPC listener — only HTTP is needed.
  grpc_listen_port: 0

# NOTE(review): /tmp is ephemeral inside the container — positions are
# lost on restart, causing old log lines to be re-shipped. Consider
# persisting this path with a volume; confirm against the compose file.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Docker reports names as "/name" — strip the leading slash.
      - source_labels: [__meta_docker_container_name]
        regex: /(.*)
        target_label: container
      - source_labels: [__meta_docker_container_log_stream]
        target_label: stream
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service

  - job_name: syslog
    static_configs:
      - targets: [localhost]
        labels:
          job: syslog
          __path__: /var/log/syslog

  - job_name: auth
    static_configs:
      - targets: [localhost]
        labels:
          job: auth
          __path__: /var/log/auth.log

View file

@ -0,0 +1,28 @@
# Generated by Ansible — do not edit manually
# AlertManager: single Telegram receiver; criticals suppress matching
# warnings via an inhibit rule.
global:
  resolve_timeout: 5m

route:
  group_by: [alertname, severity]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: telegram

receivers:
  - name: telegram
    telegram_configs:
      - bot_token: "{{ alertmanager_telegram_token }}"
        # chat_id must be an integer in AlertManager config — keep unquoted.
        chat_id: {{ alertmanager_telegram_chat_id }}
        # The {{ '{{' }} … {{ '}}' }} escapes render literal Go-template
        # braces for AlertManager after Jinja templating.
        message: |
          {{ '{{' }} range .Alerts {{ '}}' }}
          {{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
          {{ '{{' }} .Annotations.summary {{ '}}' }}
          {{ '{{' }} .Annotations.description {{ '}}' }}
          {{ '{{' }} end {{ '}}' }}
        parse_mode: Markdown

# When a critical alert fires, mute the warning-level alert of the same
# name to avoid duplicate notifications.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    equal: [alertname]

View file

@ -5,6 +5,14 @@ global:
external_labels:
instance: "{{ domain_base }}"
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager:9093"]
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
- job_name: prometheus
static_configs:
@ -17,3 +25,7 @@ scrape_configs:
- job_name: cadvisor
static_configs:
- targets: ["cadvisor:8080"]
- job_name: alertmanager
static_configs:
- targets: ["alertmanager:9093"]

View file

@ -0,0 +1,86 @@
# Generated by Ansible — do not edit manually
# Prometheus alert rules: host-level (CPU/RAM/disk/swap) and container-level
# (availability/memory/restarts). Annotation text uses Jinja-escaped
# {{ '{{' }} … {{ '}}' }} so Prometheus receives literal Go-template braces.
groups:
  - name: host
    rules:
      - alert: HighCPULoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "CPU загружен более 85% на протяжении 5 минут."
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Использование RAM превысило 85%."
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "RAM заполнена на 95%+. Возможны OOM kills."
      - alert: DiskSpaceWarning
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
      - alert: DiskSpaceCritical
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
      - alert: SwapUsageHigh
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
          description: "Swap используется более чем на 50% — RAM под давлением."

  - name: containers
    rules:
      - alert: ContainerDown
        expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
          description: "Контейнер не отвечает более 2 минут."
      - alert: ContainerHighMemory
        # FIX: a value comparison is not allowed inside a label matcher
        # ({…, metric > 0} is a PromQL parse error that makes Prometheus
        # reject the entire rules file). Filter out series without a
        # memory limit via a vector comparison in the denominator instead.
        expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
          description: "Контейнер близок к mem_limit — возможен OOM kill."
      - alert: ContainerRestarting
        # NOTE(review): this heuristic (no new samples + zero CPU) is a
        # weak restart signal — consider cadvisor's restart metrics.
        expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
          description: "Контейнер не активен — проверьте docker ps."

View file

@ -14,6 +14,8 @@ api:
dashboard: true
insecure: false
ping: {}
entryPoints:
web:
address: ":80"