feat: major infrastructure improvements
Reliability: - Add swap role (2GB, swappiness=10, idempotent via /etc/fstab) - Add mem_limit to plane-worker (512m) and plane-beat (256m) - Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki) Code quality: - Remove Traefik Docker labels (file provider used, labels were dead code) - Add comment explaining file provider architecture Observability: - Add AlertManager with Telegram notifications - Add Prometheus alert rules: CPU, RAM, disk, swap, container health - Add Loki + Promtail for centralized log aggregation - Add Loki datasource to Grafana - Enable Traefik /ping endpoint for health checks Backups: - Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data - 7-day retention, daily cron at 03:00 - Backup script at /usr/local/bin/backup-services Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
972a76db4c
commit
6ebd237894
20 changed files with 558 additions and 99 deletions
|
|
@ -25,7 +25,9 @@ plane_minio_password: "{{ vault_plane_minio_password }}"
|
|||
traefik_dashboard_htpasswd: "{{ vault_traefik_dashboard_htpasswd }}"
|
||||
syncthing_basic_auth_htpasswd: "{{ vault_syncthing_basic_auth_htpasswd }}"
|
||||
forgejo_runner_token: "{{ vault_forgejo_runner_token }}"
|
||||
grafana_admin_password: "{{ vault_grafana_admin_password }}"
|
||||
grafana_admin_password: "{{ vault_grafana_admin_password }}"
|
||||
alertmanager_telegram_token: "{{ vault_alertmanager_telegram_token }}"
|
||||
alertmanager_telegram_chat_id: "{{ vault_alertmanager_telegram_chat_id }}"
|
||||
|
||||
# CI/CD deploy key (public key — not a secret)
|
||||
ci_deploy_pubkey: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHdr9mRSSUqt7Ym4wA5RpVyz76wEXSOtVfh2/yCSMIbg ci-deploy@forgejo-runner"
|
||||
|
|
|
|||
|
|
@ -1,44 +1,49 @@
|
|||
$ANSIBLE_VAULT;1.1;AES256
|
||||
31613031396131376665643639663630323363366332396162356132316233323063356465643762
|
||||
3637303062366530313132303165333161323737643030650a653032646534303463616562633234
|
||||
65373032373464303839346430626665316437656363666164636532613837613637323064326630
|
||||
3736343663396461350a396434613034393131623037333436306532373038623534363334313066
|
||||
38333535376230653538633261366633366238636530656464343939623834646139396637343461
|
||||
62396564616566373231343138653061366663366461343239636463633261643464633361373834
|
||||
30386237383561613232626361323536636363373831356635653535656633323332666264383061
|
||||
63633331343162396431653237333664663439383738333932373866643030303735643534386330
|
||||
32366438623966303131613666613265313235323530626132316661383462313033653038363933
|
||||
34646463666632396231313563363064396539663236356565306564653433353735333335656534
|
||||
65396235656365376236613832626366666666653834626633373937386366656232633766636634
|
||||
63346535643936646333663735333630623538393234383336613461343863623935393865343532
|
||||
37306161646434656439323832383238643963316463643033306262613231646334303361653833
|
||||
63646363653235326261333538666236353231376437653636316131623135323138636661386665
|
||||
66316663393934663031623135633137366131633030626664306564396635663235383636343431
|
||||
65343330373036316161626165343738323730646130663839616363326232653039303932363765
|
||||
61633432386361353861393263306236343162313066653962363761373161656365353538656335
|
||||
36643163383435326230326135616164333134363666616339653330656531326539653764633632
|
||||
33396431346530633933626533313939316337363035333763613237356133396162316163646632
|
||||
39633165653565333237376137303737383831373838656461346434663331313965323235626330
|
||||
37303665313564346233666632656531323932316533613463633636633435646137653064653137
|
||||
63356439336664383133376530393036653061613466383961613939633866306337653865366330
|
||||
66366132383237336534363563643135313138376437336563643130383534373263373364653462
|
||||
61396664373363373737633339333335653164323662623239313666666431353631306438303533
|
||||
65636135343039323734393637396163366138636439656633616130643636363831643532623337
|
||||
35303934656230623563366664396632356638636630613433626334303235343961616439636165
|
||||
37313264643432363532373464633633343033303133346435663062323838383931393061396531
|
||||
34373034313038663033383333333430326136346433646536626565663436323764316361356531
|
||||
64343839373138346636336637343438616639353236336633666234643365303030656132346232
|
||||
65346636626338363762653939333639313462393231616636663935333232636231326231373833
|
||||
32366535353361303532633462303763386238663432333465653361373064656162343465396631
|
||||
61363330666666653533653365653232313836373230336537363337666536313737353731363165
|
||||
63333837356135646564663536666264346630356163663666323432393865393836326338306266
|
||||
33383139383033643937383865393236396337666139633032323162303665633230346365663730
|
||||
63643661323332616163326636646634616165633538383038653766303066366335393065373236
|
||||
39613562363634316564346162333030393430303335323733306163396137373037346237656231
|
||||
39346635333138656336346230313635353233363334633037633961306663356364383962643361
|
||||
38366531396431653330393239663337626564616265636362313537373239663736636535303332
|
||||
39626565306632336330386434386636656430363738383431306637666334653136633762323434
|
||||
34356661633837626231346134303131353264643532613739333234346634346565376333343563
|
||||
37343662376533383335356331616435393764663530616335386435653538646362613364303437
|
||||
63356166353062626163313735646365643635393663316365626431383062663331366439613164
|
||||
323565613761663833636330633533376131
|
||||
32336633366435326662623163613564336332393334626662383862346337626361316330313237
|
||||
3063366162376465393134353633616139343430316533330a643361626263366538353538663062
|
||||
39306231336331636165336563336662666564393235336435393534663439336162316363376437
|
||||
3735333361343130620a613163623238356538333764333830613963333466333832353262333432
|
||||
61356434346135616566613064643331633933356332363631623664366166663034373435326262
|
||||
32623662386636353761396239343534343562616237616131646536323334303736623935326638
|
||||
38323034346266666163626536333038393935396330663065383237333065613362333736666466
|
||||
30643761366364353431306538306263653338636664393035373132616239346631343034376336
|
||||
30643562363338383239306434393535363763626136376463616432333431316433336638663061
|
||||
61633762383063383934386532363633303661323334393439373936313564636363393535646361
|
||||
37666561323736303033353930616362336134383165326463613261323665326666383337646465
|
||||
35326532353836343363616566363965663237653433646134663363643337363964643762366438
|
||||
30353535633338303133373035646230653933346130306631393233633964353865386137373262
|
||||
34303161353963613538663366633531633264663231306134313862306561613164346430393462
|
||||
39336430653864633530353931653931303266613264643462313832313432366662366566353233
|
||||
35376466356537313131313136353334386539393638663738653366373032323966346666613336
|
||||
66363234376163326562656232326432636331356238326337313538663563643939323265633238
|
||||
35316430326661356633386130613238623730313530636136346139326235333838336561376565
|
||||
62356166333936313565343764336230663332653765353531653930373265383862643337333136
|
||||
31616336363863666465336561346265613637653132343836653962313439313465313033646564
|
||||
63353162663333383637336266376535643566343637303139333838373536366264376632393938
|
||||
37376239353239356166303533393339343131336138343438366463666332636562343261366663
|
||||
34313561376665373563613636366366633034353232396133313431626663316431336330656433
|
||||
36363536663662653434353161383238346230636433366138633765376635376136636638613638
|
||||
31653137353036393364336139323366636464613133313138346433663664386465313764656431
|
||||
64633761613630393465303564656333333864333961393262303730313765383735323534326331
|
||||
38643033376136383939373565323162663139633337653836363532666538356534343365353064
|
||||
32373565623066656663346132373831343738643830633935313831343162633966363363396636
|
||||
61323237653731353438643431346539613533323637633936336531666634623330356563636630
|
||||
64633232343163353830633830646632623961646230313037366230646365633438353761336437
|
||||
38366362323964613361376236323661373736393733393938343538383563303861343130333965
|
||||
34663738373966363465363166393937633738653836643632376233353961656665366632623166
|
||||
65343037346163613664623361313534666563363537383732333739633437336635376634643339
|
||||
38303166353865656133326631323136633435623231303464663236373766326666306263663961
|
||||
33643465303138373065666433373866343730653533383366323664383235633832663536646536
|
||||
36383861363639646166626661626264353865303936333663643432613163626334356564646364
|
||||
63373936613930313935333963633765303961323531336630323034326438363464653834323563
|
||||
38323038376332306137383438336637343633396131343234326635363736393363373130616232
|
||||
32386465376338376338633931663461376530393533336530376332653630393630333330383663
|
||||
38626238663637653633653962393133313637376137663765633134306666613339306235396632
|
||||
30356331303766323732633530323162393530613634366138313637306133653436303239383738
|
||||
34356363336333313833623862356139376334356664303430306562386235396533326162383736
|
||||
30313465393936346162316330616333353934633032333265306533653264653931653430393065
|
||||
32626331363030363635393064653564613761336465633739323566323336623864323433356134
|
||||
63306364336264383836323763353233643463636131383332316362613337363039363636663030
|
||||
32316231303462666333353265613135613830333861333131656439326236333634316462646431
|
||||
63336433343937636136646434326239313064373863393461623832373262633462633338356430
|
||||
65306462666636303633
|
||||
|
|
|
|||
|
|
@ -10,3 +10,5 @@
|
|||
tags: docker
|
||||
- role: services
|
||||
tags: services
|
||||
- role: backup
|
||||
tags: backup
|
||||
|
|
|
|||
4
roles/backup/defaults/main.yml
Normal file
4
roles/backup/defaults/main.yml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
backup_dir: /opt/backups
|
||||
backup_retention_days: 7
|
||||
backup_user: deploy
|
||||
25
roles/backup/tasks/main.yml
Normal file
25
roles/backup/tasks/main.yml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
- name: Create backup directory
|
||||
ansible.builtin.file:
|
||||
path: "{{ backup_dir }}"
|
||||
state: directory
|
||||
owner: "{{ backup_user }}"
|
||||
group: "{{ backup_user }}"
|
||||
mode: "0750"
|
||||
|
||||
- name: Deploy backup script
|
||||
ansible.builtin.template:
|
||||
src: backup.sh.j2
|
||||
dest: /usr/local/bin/backup-services
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0750"
|
||||
|
||||
- name: Schedule daily backup at 03:00
|
||||
ansible.builtin.cron:
|
||||
name: "Daily services backup"
|
||||
minute: "0"
|
||||
hour: "3"
|
||||
job: "/usr/local/bin/backup-services >> /var/log/backup-services.log 2>&1"
|
||||
user: root
|
||||
state: present
|
||||
51
roles/backup/templates/backup.sh.j2
Normal file
51
roles/backup/templates/backup.sh.j2
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/env bash
|
||||
# Generated by Ansible — do not edit manually
|
||||
# Backs up PostgreSQL databases and Vaultwarden data.
|
||||
# Runs daily at 03:00, keeps {{ backup_retention_days }} days of backups.
|
||||
set -euo pipefail
|
||||
|
||||
BACKUP_DIR="{{ backup_dir }}"
|
||||
DATE=$(date +%Y-%m-%d_%H-%M-%S)
|
||||
KEEP_DAYS="{{ backup_retention_days }}"
|
||||
|
||||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
|
||||
|
||||
log "=== Backup started ==="
|
||||
|
||||
# ── Forgejo PostgreSQL ──────────────────────────────────────────────────────
|
||||
log "Backing up forgejo-db..."
|
||||
docker exec forgejo-db pg_dump -U forgejo forgejo \
|
||||
| gzip > "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz"
|
||||
log " → ${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz" | cut -f1))"
|
||||
|
||||
# ── Plane PostgreSQL ────────────────────────────────────────────────────────
|
||||
log "Backing up plane-db..."
|
||||
docker exec plane-db pg_dump -U plane plane \
|
||||
| gzip > "${BACKUP_DIR}/plane-db_${DATE}.sql.gz"
|
||||
log " → ${BACKUP_DIR}/plane-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/plane-db_${DATE}.sql.gz" | cut -f1))"
|
||||
|
||||
# ── Vaultwarden data ────────────────────────────────────────────────────────
|
||||
log "Backing up Vaultwarden..."
|
||||
docker run --rm \
|
||||
--volumes-from vaultwarden \
|
||||
-v "${BACKUP_DIR}:/backup" \
|
||||
alpine:3 \
|
||||
tar czf "/backup/vaultwarden_${DATE}.tar.gz" /data
|
||||
log " → ${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz" | cut -f1))"
|
||||
|
||||
# ── Forgejo repositories ────────────────────────────────────────────────────
|
||||
log "Backing up Forgejo data..."
|
||||
docker run --rm \
|
||||
--volumes-from forgejo \
|
||||
-v "${BACKUP_DIR}:/backup" \
|
||||
alpine:3 \
|
||||
tar czf "/backup/forgejo-data_${DATE}.tar.gz" /data
|
||||
log " → ${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz" | cut -f1))"
|
||||
|
||||
# ── Cleanup old backups ─────────────────────────────────────────────────────
|
||||
log "Removing backups older than ${KEEP_DAYS} days..."
|
||||
find "${BACKUP_DIR}" -name "*.gz" -mtime +${KEEP_DAYS} -delete
|
||||
log " → Done. Current backups:"
|
||||
du -sh "${BACKUP_DIR}"/*.gz 2>/dev/null | sort -k2 || true
|
||||
|
||||
log "=== Backup completed ==="
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
---
|
||||
- import_tasks: packages.yml
|
||||
- import_tasks: swap.yml
|
||||
- import_tasks: users.yml
|
||||
- import_tasks: sshd.yml
|
||||
- import_tasks: firewall.yml
|
||||
|
|
|
|||
42
roles/base/tasks/swap.yml
Normal file
42
roles/base/tasks/swap.yml
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
---
|
||||
- name: Check if swap file exists
|
||||
ansible.builtin.stat:
|
||||
path: /swapfile
|
||||
register: swapfile_stat
|
||||
|
||||
- name: Create swap file (2 GiB)
|
||||
ansible.builtin.command: fallocate -l 2G /swapfile
|
||||
when: not swapfile_stat.stat.exists
|
||||
changed_when: true
|
||||
|
||||
- name: Set swap file permissions
|
||||
ansible.builtin.file:
|
||||
path: /swapfile
|
||||
mode: "0600"
|
||||
owner: root
|
||||
group: root
|
||||
when: not swapfile_stat.stat.exists
|
||||
|
||||
- name: Format swap file
|
||||
ansible.builtin.command: mkswap /swapfile
|
||||
when: not swapfile_stat.stat.exists
|
||||
changed_when: true
|
||||
|
||||
- name: Enable swap
|
||||
ansible.builtin.command: swapon /swapfile
|
||||
when: not swapfile_stat.stat.exists
|
||||
changed_when: true
|
||||
|
||||
- name: Persist swap in /etc/fstab
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/fstab
|
||||
line: "/swapfile none swap sw 0 0"
|
||||
state: present
|
||||
|
||||
- name: Set swappiness to 10 (prefer RAM over swap)
|
||||
ansible.posix.sysctl:
|
||||
name: vm.swappiness
|
||||
value: "10"
|
||||
state: present
|
||||
sysctl_set: true
|
||||
reload: true
|
||||
|
|
@ -24,3 +24,6 @@ prometheus_image: "prom/prometheus:v3.4.0" # https://hub
|
|||
node_exporter_image: "prom/node-exporter:v1.9.1" # https://hub.docker.com/r/prom/node-exporter/tags
|
||||
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1" # https://github.com/google/cadvisor/releases
|
||||
grafana_image: "grafana/grafana:11.6.1" # https://hub.docker.com/r/grafana/grafana/tags
|
||||
alertmanager_image: "prom/alertmanager:v0.28.1" # https://hub.docker.com/r/prom/alertmanager/tags
|
||||
loki_image: "grafana/loki:3.4.3" # https://hub.docker.com/r/grafana/loki/tags
|
||||
promtail_image: "grafana/promtail:3.4.3" # https://hub.docker.com/r/grafana/promtail/tags
|
||||
|
|
|
|||
|
|
@ -89,6 +89,51 @@
|
|||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Prometheus alert rules
|
||||
ansible.builtin.template:
|
||||
src: prometheus/rules/alerts.yml.j2
|
||||
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy AlertManager config
|
||||
ansible.builtin.template:
|
||||
src: prometheus/alertmanager.yml.j2
|
||||
dest: "{{ services_root }}/prometheus/alertmanager.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0640"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Loki config
|
||||
ansible.builtin.template:
|
||||
src: loki/loki.yml.j2
|
||||
dest: "{{ services_root }}/loki/loki.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Promtail config
|
||||
ansible.builtin.template:
|
||||
src: loki/promtail.yml.j2
|
||||
dest: "{{ services_root }}/loki/promtail.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Deploy Grafana Loki datasource
|
||||
ansible.builtin.template:
|
||||
src: grafana/provisioning/datasources/loki.yml.j2
|
||||
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
|
||||
owner: "{{ deploy_user }}"
|
||||
group: "{{ deploy_group }}"
|
||||
mode: "0644"
|
||||
notify: Restart stack
|
||||
|
||||
- name: Create acme.json for Let's Encrypt certificates
|
||||
ansible.builtin.file:
|
||||
path: "{{ services_root }}/traefik/acme.json"
|
||||
|
|
|
|||
|
|
@ -29,3 +29,5 @@
|
|||
- grafana/provisioning/datasources
|
||||
- grafana/provisioning/dashboards
|
||||
- grafana/provisioning/dashboards/json
|
||||
- prometheus/rules
|
||||
- loki
|
||||
|
|
|
|||
|
|
@ -22,6 +22,9 @@
|
|||
- "{{ node_exporter_image }}"
|
||||
- "{{ cadvisor_image }}"
|
||||
- "{{ grafana_image }}"
|
||||
- "{{ alertmanager_image }}"
|
||||
- "{{ loki_image }}"
|
||||
- "{{ promtail_image }}"
|
||||
register: pull_result
|
||||
changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
|
||||
retries: 5
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
# Docker Compose stack — generated by Ansible
|
||||
# Do not edit manually; re-run ansible-playbook deploy.yml
|
||||
#
|
||||
# NOTE: Traefik uses the file provider (routes.yml.j2) — Docker labels on
|
||||
# containers are intentionally absent. Adding labels here has no effect.
|
||||
|
||||
networks:
|
||||
# proxy — публичная сеть только для Traefik: нужна для исходящего интернет-доступа
|
||||
|
|
@ -37,6 +40,7 @@ volumes:
|
|||
act_runner_data:
|
||||
prometheus_data:
|
||||
grafana_data:
|
||||
loki_data:
|
||||
|
||||
services:
|
||||
|
||||
|
|
@ -56,14 +60,11 @@ services:
|
|||
- {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro
|
||||
- {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro
|
||||
- {{ services_root }}/traefik/acme.json:/acme/acme.json
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.traefik-dashboard.rule=Host(`{{ domain_traefik }}`)"
|
||||
- "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
|
||||
- "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.routers.traefik-dashboard.service=api@internal"
|
||||
- "traefik.http.routers.traefik-dashboard.middlewares=traefik-auth"
|
||||
- "traefik.http.middlewares.traefik-auth.basicauth.users={{ traefik_dashboard_htpasswd }}"
|
||||
healthcheck:
|
||||
test: ["CMD", "traefik", "healthcheck", "--ping"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Vaultwarden ────────────────────────────────────────────────────────────
|
||||
vaultwarden:
|
||||
|
|
@ -82,12 +83,11 @@ services:
|
|||
- LOG_LEVEL=warn
|
||||
- EXTENDED_LOGGING=true
|
||||
- TZ=UTC
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.vaultwarden.rule=Host(`{{ domain_vault }}`)"
|
||||
- "traefik.http.routers.vaultwarden.entrypoints=websecure"
|
||||
- "traefik.http.routers.vaultwarden.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.vaultwarden.loadbalancer.server.port=80"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Forgejo ────────────────────────────────────────────────────────────────
|
||||
forgejo:
|
||||
|
|
@ -120,12 +120,12 @@ services:
|
|||
- FORGEJO__service__DISABLE_REGISTRATION=true
|
||||
ports:
|
||||
- "2222:22"
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.forgejo.rule=Host(`{{ domain_git }}`)"
|
||||
- "traefik.http.routers.forgejo.entrypoints=websecure"
|
||||
- "traefik.http.routers.forgejo.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.forgejo.loadbalancer.server.port=3000"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 60s
|
||||
|
||||
forgejo-db:
|
||||
image: {{ forgejo_db_image }}
|
||||
|
|
@ -152,6 +152,9 @@ services:
|
|||
# /api/* и /auth/* → plane-api:8000 (Django, на backend + plane-internal)
|
||||
# остальное → plane-web:3000 (Next.js, на backend + plane-internal)
|
||||
# Правило с PathPrefix длиннее → более высокий приоритет у Traefik автоматически.
|
||||
#
|
||||
# NOTE: Plane не публикует конкретные version tags — используем :stable.
|
||||
# Следить за обновлениями: https://github.com/makeplane/plane/releases
|
||||
|
||||
plane-web:
|
||||
image: {{ plane_frontend_image }}
|
||||
|
|
@ -162,13 +165,12 @@ services:
|
|||
networks:
|
||||
- backend
|
||||
- plane-internal
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.plane.rule=Host(`{{ domain_plane }}`)"
|
||||
- "traefik.http.routers.plane.entrypoints=websecure"
|
||||
- "traefik.http.routers.plane.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.plane.loadbalancer.server.port=80"
|
||||
- "traefik.http.routers.plane.priority=1"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
plane-admin:
|
||||
image: {{ plane_admin_image }}
|
||||
|
|
@ -180,13 +182,12 @@ services:
|
|||
networks:
|
||||
- backend
|
||||
- plane-internal
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.plane-admin.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/god-mode/`)"
|
||||
- "traefik.http.routers.plane-admin.entrypoints=websecure"
|
||||
- "traefik.http.routers.plane-admin.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.plane-admin.loadbalancer.server.port=80"
|
||||
- "traefik.http.routers.plane-admin.priority=10"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
plane-space:
|
||||
image: {{ plane_space_image }}
|
||||
|
|
@ -198,13 +199,12 @@ services:
|
|||
networks:
|
||||
- backend
|
||||
- plane-internal
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.plane-space.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/spaces/`)"
|
||||
- "traefik.http.routers.plane-space.entrypoints=websecure"
|
||||
- "traefik.http.routers.plane-space.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.plane-space.loadbalancer.server.port=3000"
|
||||
- "traefik.http.routers.plane-space.priority=10"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
plane-api:
|
||||
image: {{ plane_backend_image }}
|
||||
|
|
@ -245,17 +245,18 @@ services:
|
|||
- APP_BASE_URL=https://{{ domain_plane }}
|
||||
- ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode
|
||||
- SPACE_BASE_URL=https://{{ domain_plane }}/spaces
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.plane-api.rule=Host(`{{ domain_plane }}`) && (PathPrefix(`/api/`) || PathPrefix(`/auth/`))"
|
||||
- "traefik.http.routers.plane-api.entrypoints=websecure"
|
||||
- "traefik.http.routers.plane-api.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.plane-api.loadbalancer.server.port=8000"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:8000/api/"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
start_period: 60s
|
||||
|
||||
plane-worker:
|
||||
image: {{ plane_backend_image }}
|
||||
container_name: plane-worker
|
||||
restart: unless-stopped
|
||||
mem_limit: 512m
|
||||
command: ./bin/docker-entrypoint-worker.sh
|
||||
depends_on:
|
||||
- plane-api
|
||||
|
|
@ -283,6 +284,7 @@ services:
|
|||
image: {{ plane_backend_image }}
|
||||
container_name: plane-beat
|
||||
restart: unless-stopped
|
||||
mem_limit: 256m
|
||||
command: ./bin/docker-entrypoint-beat.sh
|
||||
depends_on:
|
||||
- plane-api
|
||||
|
|
@ -333,6 +335,11 @@ services:
|
|||
volumes:
|
||||
- plane_redis_data:/data
|
||||
command: redis-server --appendonly yes
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 3s
|
||||
retries: 5
|
||||
|
||||
plane-minio:
|
||||
image: {{ plane_minio_image }}
|
||||
|
|
@ -373,14 +380,11 @@ services:
|
|||
- PUID=1000
|
||||
- PGID=1000
|
||||
- TZ=UTC
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.syncthing.rule=Host(`{{ domain_sync }}`)"
|
||||
- "traefik.http.routers.syncthing.entrypoints=websecure"
|
||||
- "traefik.http.routers.syncthing.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.routers.syncthing.middlewares=syncthing-auth"
|
||||
- "traefik.http.middlewares.syncthing-auth.basicauth.users={{ syncthing_basic_auth_htpasswd }}"
|
||||
- "traefik.http.services.syncthing.loadbalancer.server.port=8384"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:8384/rest/noauth/health"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Forgejo Actions Runner ─────────────────────────────────────────────────
|
||||
# backend — для связи с Forgejo по внутренней сети (http://forgejo:3000)
|
||||
|
|
@ -414,12 +418,35 @@ services:
|
|||
volumes:
|
||||
- prometheus_data:/prometheus
|
||||
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time=30d"
|
||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
alertmanager:
|
||||
image: {{ alertmanager_image }}
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
command:
|
||||
- "--config.file=/etc/alertmanager/alertmanager.yml"
|
||||
- "--storage.path=/alertmanager"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
node-exporter:
|
||||
image: {{ node_exporter_image }}
|
||||
|
|
@ -472,3 +499,38 @@ services:
|
|||
- GF_SERVER_DOMAIN={{ domain_dashboard }}
|
||||
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=false
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-sf", "http://localhost:3000/api/health"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
# ── Logging Stack ──────────────────────────────────────────────────────────
|
||||
loki:
|
||||
image: {{ loki_image }}
|
||||
container_name: loki
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- loki_data:/loki
|
||||
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
promtail:
|
||||
image: {{ promtail_image }}
|
||||
container_name: promtail
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
volumes:
|
||||
- /var/log:/var/log:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- {{ services_root }}/loki/promtail.yml:/etc/promtail/config.yml:ro
|
||||
command: -config.file=/etc/promtail/config.yml
|
||||
|
|
|
|||
|
|
@ -0,0 +1,10 @@
|
|||
# Generated by Ansible — do not edit manually
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Loki
|
||||
type: loki
|
||||
access: proxy
|
||||
url: http://loki:3100
|
||||
isDefault: false
|
||||
editable: false
|
||||
36
roles/services/templates/loki/loki.yml.j2
Normal file
36
roles/services/templates/loki/loki.yml.j2
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Generated by Ansible — do not edit manually
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
limits_config:
|
||||
retention_period: 30d
|
||||
|
||||
compactor:
|
||||
working_directory: /loki/retention
|
||||
delete_request_store: filesystem
|
||||
retention_enabled: true
|
||||
38
roles/services/templates/loki/promtail.yml.j2
Normal file
38
roles/services/templates/loki/promtail.yml.j2
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# Generated by Ansible — do not edit manually
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: docker
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 5s
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_name]
|
||||
regex: /(.*)
|
||||
target_label: container
|
||||
- source_labels: [__meta_docker_container_log_stream]
|
||||
target_label: stream
|
||||
- source_labels: [__meta_docker_container_label_com_docker_compose_service]
|
||||
target_label: service
|
||||
|
||||
- job_name: syslog
|
||||
static_configs:
|
||||
- targets: [localhost]
|
||||
labels:
|
||||
job: syslog
|
||||
__path__: /var/log/syslog
|
||||
|
||||
- job_name: auth
|
||||
static_configs:
|
||||
- targets: [localhost]
|
||||
labels:
|
||||
job: auth
|
||||
__path__: /var/log/auth.log
|
||||
28
roles/services/templates/prometheus/alertmanager.yml.j2
Normal file
28
roles/services/templates/prometheus/alertmanager.yml.j2
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# Generated by Ansible — do not edit manually
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: [alertname, severity]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
receiver: telegram
|
||||
|
||||
receivers:
|
||||
- name: telegram
|
||||
telegram_configs:
|
||||
- bot_token: "{{ alertmanager_telegram_token }}"
|
||||
chat_id: {{ alertmanager_telegram_chat_id }}
|
||||
message: |
|
||||
{{ '{{' }} range .Alerts {{ '}}' }}
|
||||
{{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
|
||||
{{ '{{' }} .Annotations.summary {{ '}}' }}
|
||||
{{ '{{' }} .Annotations.description {{ '}}' }}
|
||||
{{ '{{' }} end {{ '}}' }}
|
||||
parse_mode: Markdown
|
||||
|
||||
inhibit_rules:
|
||||
- source_matchers: [severity="critical"]
|
||||
target_matchers: [severity="warning"]
|
||||
equal: [alertname]
|
||||
|
|
@ -5,6 +5,14 @@ global:
|
|||
external_labels:
|
||||
instance: "{{ domain_base }}"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ["alertmanager:9093"]
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
|
|
@ -17,3 +25,7 @@ scrape_configs:
|
|||
- job_name: cadvisor
|
||||
static_configs:
|
||||
- targets: ["cadvisor:8080"]
|
||||
|
||||
- job_name: alertmanager
|
||||
static_configs:
|
||||
- targets: ["alertmanager:9093"]
|
||||
|
|
|
|||
86
roles/services/templates/prometheus/rules/alerts.yml.j2
Normal file
86
roles/services/templates/prometheus/rules/alerts.yml.j2
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
# Generated by Ansible — do not edit manually
|
||||
groups:
|
||||
- name: host
|
||||
rules:
|
||||
- alert: HighCPULoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "CPU загружен более 85% на протяжении 5 минут."
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "Использование RAM превысило 85%."
|
||||
|
||||
- alert: CriticalMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "RAM заполнена на 95%+. Возможны OOM kills."
|
||||
|
||||
- alert: DiskSpaceWarning
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
||||
|
||||
- alert: DiskSpaceCritical
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
||||
|
||||
- alert: SwapUsageHigh
|
||||
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||
description: "Swap используется более чем на 50% — RAM под давлением."
|
||||
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: ContainerDown
|
||||
expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
|
||||
description: "Контейнер не отвечает более 2 минут."
|
||||
|
||||
- alert: ContainerHighMemory
|
||||
expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
|
||||
description: "Контейнер близок к mem_limit — возможен OOM kill."
|
||||
|
||||
- alert: ContainerRestarting
|
||||
expr: increase(container_last_seen{name=~".+"}[5m]) == 0 and rate(container_cpu_usage_seconds_total{name=~".+"}[5m]) == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
|
||||
description: "Контейнер не активен — проверьте docker ps."
|
||||
|
|
@ -14,6 +14,8 @@ api:
|
|||
dashboard: true
|
||||
insecure: false
|
||||
|
||||
ping: {}
|
||||
|
||||
entryPoints:
|
||||
web:
|
||||
address: ":80"
|
||||
|
|
|
|||
Loading…
Reference in a new issue