feat: major infrastructure improvements
Reliability:
- Add swap role (2GB, swappiness=10, idempotent via /etc/fstab)
- Add mem_limit to plane-worker (512m) and plane-beat (256m)
- Add health checks to all services (traefik, vaultwarden, forgejo, plane-*, syncthing, prometheus, grafana, loki)

Code quality:
- Remove Traefik Docker labels (file provider used, labels were dead code)
- Add comment explaining file provider architecture

Observability:
- Add AlertManager with Telegram notifications
- Add Prometheus alert rules: CPU, RAM, disk, swap, container health
- Add Loki + Promtail for centralized log aggregation
- Add Loki datasource to Grafana
- Enable Traefik /ping endpoint for health checks

Backups:
- Add backup role: pg_dump for forgejo + plane DBs, tar for vaultwarden and forgejo data
- 7-day retention, daily cron at 03:00
- Backup script at /usr/local/bin/backup-services

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
972a76db4c
commit
6ebd237894
20 changed files with 558 additions and 99 deletions
|
|
@ -25,7 +25,9 @@ plane_minio_password: "{{ vault_plane_minio_password }}"
|
||||||
traefik_dashboard_htpasswd: "{{ vault_traefik_dashboard_htpasswd }}"
|
traefik_dashboard_htpasswd: "{{ vault_traefik_dashboard_htpasswd }}"
|
||||||
syncthing_basic_auth_htpasswd: "{{ vault_syncthing_basic_auth_htpasswd }}"
|
syncthing_basic_auth_htpasswd: "{{ vault_syncthing_basic_auth_htpasswd }}"
|
||||||
forgejo_runner_token: "{{ vault_forgejo_runner_token }}"
|
forgejo_runner_token: "{{ vault_forgejo_runner_token }}"
|
||||||
grafana_admin_password: "{{ vault_grafana_admin_password }}"
|
grafana_admin_password: "{{ vault_grafana_admin_password }}"
|
||||||
|
alertmanager_telegram_token: "{{ vault_alertmanager_telegram_token }}"
|
||||||
|
alertmanager_telegram_chat_id: "{{ vault_alertmanager_telegram_chat_id }}"
|
||||||
|
|
||||||
# CI/CD deploy key (public key — not a secret)
|
# CI/CD deploy key (public key — not a secret)
|
||||||
ci_deploy_pubkey: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHdr9mRSSUqt7Ym4wA5RpVyz76wEXSOtVfh2/yCSMIbg ci-deploy@forgejo-runner"
|
ci_deploy_pubkey: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHdr9mRSSUqt7Ym4wA5RpVyz76wEXSOtVfh2/yCSMIbg ci-deploy@forgejo-runner"
|
||||||
|
|
|
||||||
|
|
@ -1,44 +1,49 @@
|
||||||
$ANSIBLE_VAULT;1.1;AES256
|
$ANSIBLE_VAULT;1.1;AES256
|
||||||
31613031396131376665643639663630323363366332396162356132316233323063356465643762
|
32336633366435326662623163613564336332393334626662383862346337626361316330313237
|
||||||
3637303062366530313132303165333161323737643030650a653032646534303463616562633234
|
3063366162376465393134353633616139343430316533330a643361626263366538353538663062
|
||||||
65373032373464303839346430626665316437656363666164636532613837613637323064326630
|
39306231336331636165336563336662666564393235336435393534663439336162316363376437
|
||||||
3736343663396461350a396434613034393131623037333436306532373038623534363334313066
|
3735333361343130620a613163623238356538333764333830613963333466333832353262333432
|
||||||
38333535376230653538633261366633366238636530656464343939623834646139396637343461
|
61356434346135616566613064643331633933356332363631623664366166663034373435326262
|
||||||
62396564616566373231343138653061366663366461343239636463633261643464633361373834
|
32623662386636353761396239343534343562616237616131646536323334303736623935326638
|
||||||
30386237383561613232626361323536636363373831356635653535656633323332666264383061
|
38323034346266666163626536333038393935396330663065383237333065613362333736666466
|
||||||
63633331343162396431653237333664663439383738333932373866643030303735643534386330
|
30643761366364353431306538306263653338636664393035373132616239346631343034376336
|
||||||
32366438623966303131613666613265313235323530626132316661383462313033653038363933
|
30643562363338383239306434393535363763626136376463616432333431316433336638663061
|
||||||
34646463666632396231313563363064396539663236356565306564653433353735333335656534
|
61633762383063383934386532363633303661323334393439373936313564636363393535646361
|
||||||
65396235656365376236613832626366666666653834626633373937386366656232633766636634
|
37666561323736303033353930616362336134383165326463613261323665326666383337646465
|
||||||
63346535643936646333663735333630623538393234383336613461343863623935393865343532
|
35326532353836343363616566363965663237653433646134663363643337363964643762366438
|
||||||
37306161646434656439323832383238643963316463643033306262613231646334303361653833
|
30353535633338303133373035646230653933346130306631393233633964353865386137373262
|
||||||
63646363653235326261333538666236353231376437653636316131623135323138636661386665
|
34303161353963613538663366633531633264663231306134313862306561613164346430393462
|
||||||
66316663393934663031623135633137366131633030626664306564396635663235383636343431
|
39336430653864633530353931653931303266613264643462313832313432366662366566353233
|
||||||
65343330373036316161626165343738323730646130663839616363326232653039303932363765
|
35376466356537313131313136353334386539393638663738653366373032323966346666613336
|
||||||
61633432386361353861393263306236343162313066653962363761373161656365353538656335
|
66363234376163326562656232326432636331356238326337313538663563643939323265633238
|
||||||
36643163383435326230326135616164333134363666616339653330656531326539653764633632
|
35316430326661356633386130613238623730313530636136346139326235333838336561376565
|
||||||
33396431346530633933626533313939316337363035333763613237356133396162316163646632
|
62356166333936313565343764336230663332653765353531653930373265383862643337333136
|
||||||
39633165653565333237376137303737383831373838656461346434663331313965323235626330
|
31616336363863666465336561346265613637653132343836653962313439313465313033646564
|
||||||
37303665313564346233666632656531323932316533613463633636633435646137653064653137
|
63353162663333383637336266376535643566343637303139333838373536366264376632393938
|
||||||
63356439336664383133376530393036653061613466383961613939633866306337653865366330
|
37376239353239356166303533393339343131336138343438366463666332636562343261366663
|
||||||
66366132383237336534363563643135313138376437336563643130383534373263373364653462
|
34313561376665373563613636366366633034353232396133313431626663316431336330656433
|
||||||
61396664373363373737633339333335653164323662623239313666666431353631306438303533
|
36363536663662653434353161383238346230636433366138633765376635376136636638613638
|
||||||
65636135343039323734393637396163366138636439656633616130643636363831643532623337
|
31653137353036393364336139323366636464613133313138346433663664386465313764656431
|
||||||
35303934656230623563366664396632356638636630613433626334303235343961616439636165
|
64633761613630393465303564656333333864333961393262303730313765383735323534326331
|
||||||
37313264643432363532373464633633343033303133346435663062323838383931393061396531
|
38643033376136383939373565323162663139633337653836363532666538356534343365353064
|
||||||
34373034313038663033383333333430326136346433646536626565663436323764316361356531
|
32373565623066656663346132373831343738643830633935313831343162633966363363396636
|
||||||
64343839373138346636336637343438616639353236336633666234643365303030656132346232
|
61323237653731353438643431346539613533323637633936336531666634623330356563636630
|
||||||
65346636626338363762653939333639313462393231616636663935333232636231326231373833
|
64633232343163353830633830646632623961646230313037366230646365633438353761336437
|
||||||
32366535353361303532633462303763386238663432333465653361373064656162343465396631
|
38366362323964613361376236323661373736393733393938343538383563303861343130333965
|
||||||
61363330666666653533653365653232313836373230336537363337666536313737353731363165
|
34663738373966363465363166393937633738653836643632376233353961656665366632623166
|
||||||
63333837356135646564663536666264346630356163663666323432393865393836326338306266
|
65343037346163613664623361313534666563363537383732333739633437336635376634643339
|
||||||
33383139383033643937383865393236396337666139633032323162303665633230346365663730
|
38303166353865656133326631323136633435623231303464663236373766326666306263663961
|
||||||
63643661323332616163326636646634616165633538383038653766303066366335393065373236
|
33643465303138373065666433373866343730653533383366323664383235633832663536646536
|
||||||
39613562363634316564346162333030393430303335323733306163396137373037346237656231
|
36383861363639646166626661626264353865303936333663643432613163626334356564646364
|
||||||
39346635333138656336346230313635353233363334633037633961306663356364383962643361
|
63373936613930313935333963633765303961323531336630323034326438363464653834323563
|
||||||
38366531396431653330393239663337626564616265636362313537373239663736636535303332
|
38323038376332306137383438336637343633396131343234326635363736393363373130616232
|
||||||
39626565306632336330386434386636656430363738383431306637666334653136633762323434
|
32386465376338376338633931663461376530393533336530376332653630393630333330383663
|
||||||
34356661633837626231346134303131353264643532613739333234346634346565376333343563
|
38626238663637653633653962393133313637376137663765633134306666613339306235396632
|
||||||
37343662376533383335356331616435393764663530616335386435653538646362613364303437
|
30356331303766323732633530323162393530613634366138313637306133653436303239383738
|
||||||
63356166353062626163313735646365643635393663316365626431383062663331366439613164
|
34356363336333313833623862356139376334356664303430306562386235396533326162383736
|
||||||
323565613761663833636330633533376131
|
30313465393936346162316330616333353934633032333265306533653264653931653430393065
|
||||||
|
32626331363030363635393064653564613761336465633739323566323336623864323433356134
|
||||||
|
63306364336264383836323763353233643463636131383332316362613337363039363636663030
|
||||||
|
32316231303462666333353265613135613830333861333131656439326236333634316462646431
|
||||||
|
63336433343937636136646434326239313064373863393461623832373262633462633338356430
|
||||||
|
65306462666636303633
|
||||||
|
|
|
||||||
|
|
@ -10,3 +10,5 @@
|
||||||
tags: docker
|
tags: docker
|
||||||
- role: services
|
- role: services
|
||||||
tags: services
|
tags: services
|
||||||
|
- role: backup
|
||||||
|
tags: backup
|
||||||
|
|
|
||||||
4
roles/backup/defaults/main.yml
Normal file
4
roles/backup/defaults/main.yml
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
backup_dir: /opt/backups
|
||||||
|
backup_retention_days: 7
|
||||||
|
backup_user: deploy
|
||||||
25
roles/backup/tasks/main.yml
Normal file
25
roles/backup/tasks/main.yml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
---
|
||||||
|
- name: Create backup directory
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ backup_dir }}"
|
||||||
|
state: directory
|
||||||
|
owner: "{{ backup_user }}"
|
||||||
|
group: "{{ backup_user }}"
|
||||||
|
mode: "0750"
|
||||||
|
|
||||||
|
- name: Deploy backup script
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: backup.sh.j2
|
||||||
|
dest: /usr/local/bin/backup-services
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0750"
|
||||||
|
|
||||||
|
- name: Schedule daily backup at 03:00
|
||||||
|
ansible.builtin.cron:
|
||||||
|
name: "Daily services backup"
|
||||||
|
minute: "0"
|
||||||
|
hour: "3"
|
||||||
|
job: "/usr/local/bin/backup-services >> /var/log/backup-services.log 2>&1"
|
||||||
|
user: root
|
||||||
|
state: present
|
||||||
51
roles/backup/templates/backup.sh.j2
Normal file
51
roles/backup/templates/backup.sh.j2
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
# Backs up the Forgejo and Plane PostgreSQL databases and the /data directories of the Vaultwarden and Forgejo containers.
|
||||||
|
# Runs daily at 03:00, keeps {{ backup_retention_days }} days of backups.
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
BACKUP_DIR="{{ backup_dir }}"
|
||||||
|
DATE=$(date +%Y-%m-%d_%H-%M-%S)
|
||||||
|
KEEP_DAYS="{{ backup_retention_days }}"
|
||||||
|
|
||||||
|
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
|
||||||
|
|
||||||
|
log "=== Backup started ==="
|
||||||
|
|
||||||
|
# ── Forgejo PostgreSQL ──────────────────────────────────────────────────────
|
||||||
|
log "Backing up forgejo-db..."
|
||||||
|
docker exec forgejo-db pg_dump -U forgejo forgejo \
|
||||||
|
| gzip > "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz"
|
||||||
|
log " → ${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/forgejo-db_${DATE}.sql.gz" | cut -f1))"
|
||||||
|
|
||||||
|
# ── Plane PostgreSQL ────────────────────────────────────────────────────────
|
||||||
|
log "Backing up plane-db..."
|
||||||
|
docker exec plane-db pg_dump -U plane plane \
|
||||||
|
| gzip > "${BACKUP_DIR}/plane-db_${DATE}.sql.gz"
|
||||||
|
log " → ${BACKUP_DIR}/plane-db_${DATE}.sql.gz ($(du -sh "${BACKUP_DIR}/plane-db_${DATE}.sql.gz" | cut -f1))"
|
||||||
|
|
||||||
|
# ── Vaultwarden data ────────────────────────────────────────────────────────
|
||||||
|
log "Backing up Vaultwarden..."
|
||||||
|
docker run --rm \
|
||||||
|
--volumes-from vaultwarden:ro \
|
||||||
|
-v "${BACKUP_DIR}:/backup" \
|
||||||
|
alpine:3 \
|
||||||
|
tar czf "/backup/vaultwarden_${DATE}.tar.gz" /data
|
||||||
|
log " → ${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/vaultwarden_${DATE}.tar.gz" | cut -f1))"
|
||||||
|
|
||||||
|
# ── Forgejo repositories ────────────────────────────────────────────────────
|
||||||
|
log "Backing up Forgejo data..."
|
||||||
|
docker run --rm \
|
||||||
|
--volumes-from forgejo:ro \
|
||||||
|
-v "${BACKUP_DIR}:/backup" \
|
||||||
|
alpine:3 \
|
||||||
|
tar czf "/backup/forgejo-data_${DATE}.tar.gz" /data
|
||||||
|
log " → ${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz ($(du -sh "${BACKUP_DIR}/forgejo-data_${DATE}.tar.gz" | cut -f1))"
|
||||||
|
|
||||||
|
# ── Cleanup old backups ─────────────────────────────────────────────────────
|
||||||
|
log "Removing backups older than ${KEEP_DAYS} days..."
|
||||||
|
# Prune only regular *.gz files directly inside BACKUP_DIR; the age test is quoted so KEEP_DAYS can never word-split into extra find arguments.
find "${BACKUP_DIR}" -maxdepth 1 -type f -name "*.gz" -mtime "+${KEEP_DAYS}" -delete
|
||||||
|
log " → Done. Current backups:"
|
||||||
|
du -sh "${BACKUP_DIR}"/*.gz 2>/dev/null | sort -k2 || true
|
||||||
|
|
||||||
|
log "=== Backup completed ==="
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
---
|
---
|
||||||
- import_tasks: packages.yml
|
- import_tasks: packages.yml
|
||||||
|
- import_tasks: swap.yml
|
||||||
- import_tasks: users.yml
|
- import_tasks: users.yml
|
||||||
- import_tasks: sshd.yml
|
- import_tasks: sshd.yml
|
||||||
- import_tasks: firewall.yml
|
- import_tasks: firewall.yml
|
||||||
|
|
|
||||||
42
roles/base/tasks/swap.yml
Normal file
42
roles/base/tasks/swap.yml
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
---
|
||||||
|
- name: Check if swap file exists
|
||||||
|
ansible.builtin.stat:
|
||||||
|
path: /swapfile
|
||||||
|
register: swapfile_stat
|
||||||
|
|
||||||
|
- name: Create swap file (2 GiB)
|
||||||
|
ansible.builtin.command: fallocate -l 2G /swapfile
|
||||||
|
when: not swapfile_stat.stat.exists
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
|
- name: Set swap file permissions
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: /swapfile
|
||||||
|
mode: "0600"
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
when: not swapfile_stat.stat.exists
|
||||||
|
|
||||||
|
- name: Format swap file
|
||||||
|
ansible.builtin.command: mkswap /swapfile
|
||||||
|
when: not swapfile_stat.stat.exists
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
|
- name: Enable swap
|
||||||
|
ansible.builtin.command: swapon /swapfile
|
||||||
|
when: not swapfile_stat.stat.exists
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
|
- name: Persist swap in /etc/fstab
|
||||||
|
ansible.builtin.lineinfile:
|
||||||
|
path: /etc/fstab
|
||||||
|
line: "/swapfile none swap sw 0 0"
|
||||||
|
state: present
|
||||||
|
|
||||||
|
- name: Set swappiness to 10 (prefer RAM over swap)
|
||||||
|
ansible.posix.sysctl:
|
||||||
|
name: vm.swappiness
|
||||||
|
value: "10"
|
||||||
|
state: present
|
||||||
|
sysctl_set: true
|
||||||
|
reload: true
|
||||||
|
|
@ -24,3 +24,6 @@ prometheus_image: "prom/prometheus:v3.4.0" # https://hub
|
||||||
node_exporter_image: "prom/node-exporter:v1.9.1" # https://hub.docker.com/r/prom/node-exporter/tags
|
node_exporter_image: "prom/node-exporter:v1.9.1" # https://hub.docker.com/r/prom/node-exporter/tags
|
||||||
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1" # https://github.com/google/cadvisor/releases
|
cadvisor_image: "gcr.io/cadvisor/cadvisor:v0.52.1" # https://github.com/google/cadvisor/releases
|
||||||
grafana_image: "grafana/grafana:11.6.1" # https://hub.docker.com/r/grafana/grafana/tags
|
grafana_image: "grafana/grafana:11.6.1" # https://hub.docker.com/r/grafana/grafana/tags
|
||||||
|
alertmanager_image: "prom/alertmanager:v0.28.1" # https://hub.docker.com/r/prom/alertmanager/tags
|
||||||
|
loki_image: "grafana/loki:3.4.3" # https://hub.docker.com/r/grafana/loki/tags
|
||||||
|
promtail_image: "grafana/promtail:3.4.3" # https://hub.docker.com/r/grafana/promtail/tags
|
||||||
|
|
|
||||||
|
|
@ -89,6 +89,51 @@
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
notify: Restart stack
|
notify: Restart stack
|
||||||
|
|
||||||
|
- name: Deploy Prometheus alert rules
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: prometheus/rules/alerts.yml.j2
|
||||||
|
dest: "{{ services_root }}/prometheus/rules/alerts.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart stack
|
||||||
|
|
||||||
|
- name: Deploy AlertManager config
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: prometheus/alertmanager.yml.j2
|
||||||
|
dest: "{{ services_root }}/prometheus/alertmanager.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: "0640"
|
||||||
|
notify: Restart stack
|
||||||
|
|
||||||
|
- name: Deploy Loki config
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: loki/loki.yml.j2
|
||||||
|
dest: "{{ services_root }}/loki/loki.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart stack
|
||||||
|
|
||||||
|
- name: Deploy Promtail config
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: loki/promtail.yml.j2
|
||||||
|
dest: "{{ services_root }}/loki/promtail.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart stack
|
||||||
|
|
||||||
|
- name: Deploy Grafana Loki datasource
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: grafana/provisioning/datasources/loki.yml.j2
|
||||||
|
dest: "{{ services_root }}/grafana/provisioning/datasources/loki.yml"
|
||||||
|
owner: "{{ deploy_user }}"
|
||||||
|
group: "{{ deploy_group }}"
|
||||||
|
mode: "0644"
|
||||||
|
notify: Restart stack
|
||||||
|
|
||||||
- name: Create acme.json for Let's Encrypt certificates
|
- name: Create acme.json for Let's Encrypt certificates
|
||||||
ansible.builtin.file:
|
ansible.builtin.file:
|
||||||
path: "{{ services_root }}/traefik/acme.json"
|
path: "{{ services_root }}/traefik/acme.json"
|
||||||
|
|
|
||||||
|
|
@ -29,3 +29,5 @@
|
||||||
- grafana/provisioning/datasources
|
- grafana/provisioning/datasources
|
||||||
- grafana/provisioning/dashboards
|
- grafana/provisioning/dashboards
|
||||||
- grafana/provisioning/dashboards/json
|
- grafana/provisioning/dashboards/json
|
||||||
|
- prometheus/rules
|
||||||
|
- loki
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,9 @@
|
||||||
- "{{ node_exporter_image }}"
|
- "{{ node_exporter_image }}"
|
||||||
- "{{ cadvisor_image }}"
|
- "{{ cadvisor_image }}"
|
||||||
- "{{ grafana_image }}"
|
- "{{ grafana_image }}"
|
||||||
|
- "{{ alertmanager_image }}"
|
||||||
|
- "{{ loki_image }}"
|
||||||
|
- "{{ promtail_image }}"
|
||||||
register: pull_result
|
register: pull_result
|
||||||
changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
|
changed_when: "'Status: Downloaded newer image' in pull_result.stdout"
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
# Docker Compose stack — generated by Ansible
|
# Docker Compose stack — generated by Ansible
|
||||||
# Do not edit manually; re-run ansible-playbook deploy.yml
|
# Do not edit manually; re-run ansible-playbook deploy.yml
|
||||||
|
#
|
||||||
|
# NOTE: Traefik uses the file provider (routes.yml.j2) — Docker labels on
|
||||||
|
# containers are intentionally absent. Adding labels here has no effect.
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
# proxy — публичная сеть только для Traefik: нужна для исходящего интернет-доступа
|
# proxy — публичная сеть только для Traefik: нужна для исходящего интернет-доступа
|
||||||
|
|
@ -37,6 +40,7 @@ volumes:
|
||||||
act_runner_data:
|
act_runner_data:
|
||||||
prometheus_data:
|
prometheus_data:
|
||||||
grafana_data:
|
grafana_data:
|
||||||
|
loki_data:
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
|
|
@ -56,14 +60,11 @@ services:
|
||||||
- {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro
|
- {{ services_root }}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro
|
||||||
- {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro
|
- {{ services_root }}/traefik/dynamic:/etc/traefik/dynamic:ro
|
||||||
- {{ services_root }}/traefik/acme.json:/acme/acme.json
|
- {{ services_root }}/traefik/acme.json:/acme/acme.json
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "traefik", "healthcheck", "--ping"]
|
||||||
- "traefik.http.routers.traefik-dashboard.rule=Host(`{{ domain_traefik }}`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.routers.traefik-dashboard.service=api@internal"
|
|
||||||
- "traefik.http.routers.traefik-dashboard.middlewares=traefik-auth"
|
|
||||||
- "traefik.http.middlewares.traefik-auth.basicauth.users={{ traefik_dashboard_htpasswd }}"
|
|
||||||
|
|
||||||
# ── Vaultwarden ────────────────────────────────────────────────────────────
|
# ── Vaultwarden ────────────────────────────────────────────────────────────
|
||||||
vaultwarden:
|
vaultwarden:
|
||||||
|
|
@ -82,12 +83,11 @@ services:
|
||||||
- LOG_LEVEL=warn
|
- LOG_LEVEL=warn
|
||||||
- EXTENDED_LOGGING=true
|
- EXTENDED_LOGGING=true
|
||||||
- TZ=UTC
|
- TZ=UTC
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||||
- "traefik.http.routers.vaultwarden.rule=Host(`{{ domain_vault }}`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.vaultwarden.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.vaultwarden.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.services.vaultwarden.loadbalancer.server.port=80"
|
|
||||||
|
|
||||||
# ── Forgejo ────────────────────────────────────────────────────────────────
|
# ── Forgejo ────────────────────────────────────────────────────────────────
|
||||||
forgejo:
|
forgejo:
|
||||||
|
|
@ -120,12 +120,12 @@ services:
|
||||||
- FORGEJO__service__DISABLE_REGISTRATION=true
|
- FORGEJO__service__DISABLE_REGISTRATION=true
|
||||||
ports:
|
ports:
|
||||||
- "2222:22"
|
- "2222:22"
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
|
||||||
- "traefik.http.routers.forgejo.rule=Host(`{{ domain_git }}`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.forgejo.entrypoints=websecure"
|
timeout: 10s
|
||||||
- "traefik.http.routers.forgejo.tls.certresolver=letsencrypt"
|
retries: 5
|
||||||
- "traefik.http.services.forgejo.loadbalancer.server.port=3000"
|
start_period: 60s
|
||||||
|
|
||||||
forgejo-db:
|
forgejo-db:
|
||||||
image: {{ forgejo_db_image }}
|
image: {{ forgejo_db_image }}
|
||||||
|
|
@ -152,6 +152,9 @@ services:
|
||||||
# /api/* и /auth/* → plane-api:8000 (Django, на backend + plane-internal)
|
# /api/* и /auth/* → plane-api:8000 (Django, на backend + plane-internal)
|
||||||
# остальное → plane-web:3000 (Next.js, на backend + plane-internal)
|
# остальное → plane-web:3000 (Next.js, на backend + plane-internal)
|
||||||
# Правило с PathPrefix длиннее → более высокий приоритет у Traefik автоматически.
|
# Правило с PathPrefix длиннее → более высокий приоритет у Traefik автоматически.
|
||||||
|
#
|
||||||
|
# NOTE: Plane does not publish specific version tags — we use :stable.
|
||||||
|
# Watch for updates: https://github.com/makeplane/plane/releases
|
||||||
|
|
||||||
plane-web:
|
plane-web:
|
||||||
image: {{ plane_frontend_image }}
|
image: {{ plane_frontend_image }}
|
||||||
|
|
@ -162,13 +165,12 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
- plane-internal
|
- plane-internal
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||||
- "traefik.http.routers.plane.rule=Host(`{{ domain_plane }}`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.plane.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.plane.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.services.plane.loadbalancer.server.port=80"
|
start_period: 30s
|
||||||
- "traefik.http.routers.plane.priority=1"
|
|
||||||
|
|
||||||
plane-admin:
|
plane-admin:
|
||||||
image: {{ plane_admin_image }}
|
image: {{ plane_admin_image }}
|
||||||
|
|
@ -180,13 +182,12 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
- plane-internal
|
- plane-internal
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:80/"]
|
||||||
- "traefik.http.routers.plane-admin.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/god-mode/`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.plane-admin.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.plane-admin.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.services.plane-admin.loadbalancer.server.port=80"
|
start_period: 30s
|
||||||
- "traefik.http.routers.plane-admin.priority=10"
|
|
||||||
|
|
||||||
plane-space:
|
plane-space:
|
||||||
image: {{ plane_space_image }}
|
image: {{ plane_space_image }}
|
||||||
|
|
@ -198,13 +199,12 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- backend
|
- backend
|
||||||
- plane-internal
|
- plane-internal
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:3000/"]
|
||||||
- "traefik.http.routers.plane-space.rule=Host(`{{ domain_plane }}`) && PathPrefix(`/spaces/`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.plane-space.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.plane-space.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.services.plane-space.loadbalancer.server.port=3000"
|
start_period: 30s
|
||||||
- "traefik.http.routers.plane-space.priority=10"
|
|
||||||
|
|
||||||
plane-api:
|
plane-api:
|
||||||
image: {{ plane_backend_image }}
|
image: {{ plane_backend_image }}
|
||||||
|
|
@ -245,17 +245,18 @@ services:
|
||||||
- APP_BASE_URL=https://{{ domain_plane }}
|
- APP_BASE_URL=https://{{ domain_plane }}
|
||||||
- ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode
|
- ADMIN_BASE_URL=https://{{ domain_plane }}/god-mode
|
||||||
- SPACE_BASE_URL=https://{{ domain_plane }}/spaces
|
- SPACE_BASE_URL=https://{{ domain_plane }}/spaces
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:8000/api/"]
|
||||||
- "traefik.http.routers.plane-api.rule=Host(`{{ domain_plane }}`) && (PathPrefix(`/api/`) || PathPrefix(`/auth/`))"
|
interval: 30s
|
||||||
- "traefik.http.routers.plane-api.entrypoints=websecure"
|
timeout: 10s
|
||||||
- "traefik.http.routers.plane-api.tls.certresolver=letsencrypt"
|
retries: 5
|
||||||
- "traefik.http.services.plane-api.loadbalancer.server.port=8000"
|
start_period: 60s
|
||||||
|
|
||||||
plane-worker:
|
plane-worker:
|
||||||
image: {{ plane_backend_image }}
|
image: {{ plane_backend_image }}
|
||||||
container_name: plane-worker
|
container_name: plane-worker
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
mem_limit: 512m
|
||||||
command: ./bin/docker-entrypoint-worker.sh
|
command: ./bin/docker-entrypoint-worker.sh
|
||||||
depends_on:
|
depends_on:
|
||||||
- plane-api
|
- plane-api
|
||||||
|
|
@ -283,6 +284,7 @@ services:
|
||||||
image: {{ plane_backend_image }}
|
image: {{ plane_backend_image }}
|
||||||
container_name: plane-beat
|
container_name: plane-beat
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
mem_limit: 256m
|
||||||
command: ./bin/docker-entrypoint-beat.sh
|
command: ./bin/docker-entrypoint-beat.sh
|
||||||
depends_on:
|
depends_on:
|
||||||
- plane-api
|
- plane-api
|
||||||
|
|
@ -333,6 +335,11 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- plane_redis_data:/data
|
- plane_redis_data:/data
|
||||||
command: redis-server --appendonly yes
|
command: redis-server --appendonly yes
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "redis-cli", "ping"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
plane-minio:
|
plane-minio:
|
||||||
image: {{ plane_minio_image }}
|
image: {{ plane_minio_image }}
|
||||||
|
|
@ -373,14 +380,11 @@ services:
|
||||||
- PUID=1000
|
- PUID=1000
|
||||||
- PGID=1000
|
- PGID=1000
|
||||||
- TZ=UTC
|
- TZ=UTC
|
||||||
labels:
|
healthcheck:
|
||||||
- "traefik.enable=true"
|
test: ["CMD", "curl", "-sf", "http://localhost:8384/rest/noauth/health"]
|
||||||
- "traefik.http.routers.syncthing.rule=Host(`{{ domain_sync }}`)"
|
interval: 30s
|
||||||
- "traefik.http.routers.syncthing.entrypoints=websecure"
|
timeout: 5s
|
||||||
- "traefik.http.routers.syncthing.tls.certresolver=letsencrypt"
|
retries: 3
|
||||||
- "traefik.http.routers.syncthing.middlewares=syncthing-auth"
|
|
||||||
- "traefik.http.middlewares.syncthing-auth.basicauth.users={{ syncthing_basic_auth_htpasswd }}"
|
|
||||||
- "traefik.http.services.syncthing.loadbalancer.server.port=8384"
|
|
||||||
|
|
||||||
# ── Forgejo Actions Runner ─────────────────────────────────────────────────
|
# ── Forgejo Actions Runner ─────────────────────────────────────────────────
|
||||||
# backend — для связи с Forgejo по внутренней сети (http://forgejo:3000)
|
# backend — для связи с Forgejo по внутренней сети (http://forgejo:3000)
|
||||||
|
|
@ -414,12 +418,35 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- prometheus_data:/prometheus
|
- prometheus_data:/prometheus
|
||||||
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
- {{ services_root }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
- {{ services_root }}/prometheus/rules:/etc/prometheus/rules:ro
|
||||||
command:
|
command:
|
||||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||||
- "--storage.tsdb.path=/prometheus"
|
- "--storage.tsdb.path=/prometheus"
|
||||||
- "--storage.tsdb.retention.time=30d"
|
- "--storage.tsdb.retention.time=30d"
|
||||||
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
|
||||||
- "--web.console.templates=/usr/share/prometheus/consoles"
|
- "--web.console.templates=/usr/share/prometheus/consoles"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
alertmanager:
|
||||||
|
image: {{ alertmanager_image }}
|
||||||
|
container_name: alertmanager
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- monitoring
|
||||||
|
volumes:
|
||||||
|
- {{ services_root }}/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||||
|
command:
|
||||||
|
- "--config.file=/etc/alertmanager/alertmanager.yml"
|
||||||
|
- "--storage.path=/alertmanager"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "-qO-", "http://localhost:9093/-/healthy"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
node-exporter:
|
node-exporter:
|
||||||
image: {{ node_exporter_image }}
|
image: {{ node_exporter_image }}
|
||||||
|
|
@ -472,3 +499,38 @@ services:
|
||||||
- GF_SERVER_DOMAIN={{ domain_dashboard }}
|
- GF_SERVER_DOMAIN={{ domain_dashboard }}
|
||||||
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
|
- GF_SERVER_ROOT_URL=https://{{ domain_dashboard }}
|
||||||
- GF_AUTH_ANONYMOUS_ENABLED=false
|
- GF_AUTH_ANONYMOUS_ENABLED=false
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-sf", "http://localhost:3000/api/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
# ── Logging Stack ──────────────────────────────────────────────────────────
|
||||||
|
loki:
|
||||||
|
image: {{ loki_image }}
|
||||||
|
container_name: loki
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- monitoring
|
||||||
|
volumes:
|
||||||
|
- loki_data:/loki
|
||||||
|
- {{ services_root }}/loki/loki.yml:/etc/loki/local-config.yaml:ro
|
||||||
|
command: -config.file=/etc/loki/local-config.yaml
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "-qO-", "http://localhost:3100/ready"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
promtail:
|
||||||
|
image: {{ promtail_image }}
|
||||||
|
container_name: promtail
|
||||||
|
restart: unless-stopped
|
||||||
|
networks:
|
||||||
|
- monitoring
|
||||||
|
volumes:
|
||||||
|
- /var/log:/var/log:ro
|
||||||
|
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
|
- {{ services_root }}/loki/promtail.yml:/etc/promtail/config.yml:ro
|
||||||
|
command: -config.file=/etc/promtail/config.yml
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
datasources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
access: proxy
|
||||||
|
url: http://loki:3100
|
||||||
|
isDefault: false
|
||||||
|
editable: false
|
||||||
36
roles/services/templates/loki/loki.yml.j2
Normal file
36
roles/services/templates/loki/loki.yml.j2
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
auth_enabled: false
|
||||||
|
|
||||||
|
server:
|
||||||
|
http_listen_port: 3100
|
||||||
|
grpc_listen_port: 9096
|
||||||
|
|
||||||
|
common:
|
||||||
|
instance_addr: 127.0.0.1
|
||||||
|
path_prefix: /loki
|
||||||
|
storage:
|
||||||
|
filesystem:
|
||||||
|
chunks_directory: /loki/chunks
|
||||||
|
rules_directory: /loki/rules
|
||||||
|
replication_factor: 1
|
||||||
|
ring:
|
||||||
|
kvstore:
|
||||||
|
store: inmemory
|
||||||
|
|
||||||
|
schema_config:
|
||||||
|
configs:
|
||||||
|
- from: 2020-10-24
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: index_
|
||||||
|
period: 24h
|
||||||
|
|
||||||
|
limits_config:
|
||||||
|
retention_period: 30d
|
||||||
|
|
||||||
|
compactor:
|
||||||
|
working_directory: /loki/retention
|
||||||
|
delete_request_store: filesystem
|
||||||
|
retention_enabled: true
|
||||||
38
roles/services/templates/loki/promtail.yml.j2
Normal file
38
roles/services/templates/loki/promtail.yml.j2
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
server:
|
||||||
|
http_listen_port: 9080
|
||||||
|
grpc_listen_port: 0
|
||||||
|
|
||||||
|
positions:
|
||||||
|
filename: /tmp/positions.yaml
|
||||||
|
|
||||||
|
clients:
|
||||||
|
- url: http://loki:3100/loki/api/v1/push
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: docker
|
||||||
|
docker_sd_configs:
|
||||||
|
- host: unix:///var/run/docker.sock
|
||||||
|
refresh_interval: 5s
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__meta_docker_container_name]
|
||||||
|
regex: /(.*)
|
||||||
|
target_label: container
|
||||||
|
- source_labels: [__meta_docker_container_log_stream]
|
||||||
|
target_label: stream
|
||||||
|
- source_labels: [__meta_docker_container_label_com_docker_compose_service]
|
||||||
|
target_label: service
|
||||||
|
|
||||||
|
- job_name: syslog
|
||||||
|
static_configs:
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job: syslog
|
||||||
|
__path__: /var/log/syslog
|
||||||
|
|
||||||
|
- job_name: auth
|
||||||
|
static_configs:
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job: auth
|
||||||
|
__path__: /var/log/auth.log
|
||||||
28
roles/services/templates/prometheus/alertmanager.yml.j2
Normal file
28
roles/services/templates/prometheus/alertmanager.yml.j2
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
global:
|
||||||
|
resolve_timeout: 5m
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_by: [alertname, severity]
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 4h
|
||||||
|
receiver: telegram
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
- name: telegram
|
||||||
|
telegram_configs:
|
||||||
|
- bot_token: "{{ alertmanager_telegram_token }}"
|
||||||
|
chat_id: {{ alertmanager_telegram_chat_id }}
|
||||||
|
message: |
|
||||||
|
{{ '{{' }} range .Alerts {{ '}}' }}
|
||||||
|
{{ '{{' }} if eq .Status "firing" {{ '}}' }}🔴{{ '{{' }} else {{ '}}' }}🟢{{ '{{' }} end {{ '}}' }} *{{ '{{' }} .Labels.alertname {{ '}}' }}*
|
||||||
|
{{ '{{' }} .Annotations.summary {{ '}}' }}
|
||||||
|
{{ '{{' }} .Annotations.description {{ '}}' }}
|
||||||
|
{{ '{{' }} end {{ '}}' }}
|
||||||
|
parse_mode: Markdown
|
||||||
|
|
||||||
|
inhibit_rules:
|
||||||
|
- source_matchers: [severity="critical"]
|
||||||
|
target_matchers: [severity="warning"]
|
||||||
|
equal: [alertname]
|
||||||
|
|
@ -5,6 +5,14 @@ global:
|
||||||
external_labels:
|
external_labels:
|
||||||
instance: "{{ domain_base }}"
|
instance: "{{ domain_base }}"
|
||||||
|
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- static_configs:
|
||||||
|
- targets: ["alertmanager:9093"]
|
||||||
|
|
||||||
|
rule_files:
|
||||||
|
- /etc/prometheus/rules/*.yml
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: prometheus
|
- job_name: prometheus
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|
@ -17,3 +25,7 @@ scrape_configs:
|
||||||
- job_name: cadvisor
|
- job_name: cadvisor
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["cadvisor:8080"]
|
- targets: ["cadvisor:8080"]
|
||||||
|
|
||||||
|
- job_name: alertmanager
|
||||||
|
static_configs:
|
||||||
|
- targets: ["alertmanager:9093"]
|
||||||
|
|
|
||||||
86
roles/services/templates/prometheus/rules/alerts.yml.j2
Normal file
86
roles/services/templates/prometheus/rules/alerts.yml.j2
Normal file
|
|
@ -0,0 +1,86 @@
|
||||||
|
# Generated by Ansible — do not edit manually
|
||||||
|
groups:
|
||||||
|
- name: host
|
||||||
|
rules:
|
||||||
|
- alert: HighCPULoad
|
||||||
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Высокая нагрузка CPU ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "CPU загружен более 85% на протяжении 5 минут."
|
||||||
|
|
||||||
|
- alert: HighMemoryUsage
|
||||||
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Высокое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "Использование RAM превысило 85%."
|
||||||
|
|
||||||
|
- alert: CriticalMemoryUsage
|
||||||
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Критическое использование RAM ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "RAM заполнена на 95%+. Возможны OOM kills."
|
||||||
|
|
||||||
|
- alert: DiskSpaceWarning
|
||||||
|
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 75
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Заканчивается место на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
||||||
|
|
||||||
|
- alert: DiskSpaceCritical
|
||||||
|
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|aufs"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|aufs"})) * 100 > 90
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Критически мало места на диске ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "Диск {{ '{{' }} $labels.mountpoint {{ '}}' }} занят на {{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%."
|
||||||
|
|
||||||
|
- alert: SwapUsageHigh
|
||||||
|
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 50
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Высокое использование swap ({{ '{{' }} $value | printf \"%.0f\" {{ '}}' }}%)"
|
||||||
|
description: "Swap используется более чем на 50% — RAM под давлением."
|
||||||
|
|
||||||
|
- name: containers
|
||||||
|
rules:
|
||||||
|
- alert: ContainerDown
|
||||||
|
expr: absent(container_last_seen{name=~".+"}) or time() - container_last_seen{name=~".+"} > 60
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} недоступен"
|
||||||
|
description: "Контейнер не отвечает более 2 минут."
|
||||||
|
|
||||||
|
# Warns when a named container has used more than 90% of its configured
# memory limit for 5 minutes.
#
# The `> 0` filter is applied to the limit vector itself: a comparison is not
# allowed inside a `{...}` label selector, so the previous form failed to parse.
# Filtering out zero limits (presumably how cAdvisor reports "no limit" — verify
# against the deployed cAdvisor version) also avoids a division by zero, which
# would yield +Inf and make the alert fire for every unlimited container.
- alert: ContainerHighMemory
  expr: (container_memory_usage_bytes{name=~".+"} / (container_spec_memory_limit_bytes{name=~".+"} > 0)) * 100 > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} использует 90%+ памяти"
    description: "Контейнер близок к mem_limit — возможен OOM kill."
|
||||||
|
|
||||||
|
# Warns when a named container has been restarted more than once within the
# last 10 minutes (restart loop).
#
# The previous expression joined `container_last_seen` with
# `container_cpu_usage_seconds_total` via `and` without an `on(...)` clause.
# If the CPU counter carries an extra `cpu` label, as cAdvisor normally exports
# it, the label sets never match and the alert can never fire. It also applied
# `increase()` to a timestamp gauge.
# `changes()` over the container start timestamp counts restarts directly.
# NOTE(review): assumes cAdvisor exports `container_start_time_seconds` and that
# a restart keeps the same series (same container id) — confirm on the target host.
- alert: ContainerRestarting
  expr: changes(container_start_time_seconds{name=~".+"}[10m]) > 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Контейнер {{ '{{' }} $labels.name {{ '}}' }} возможно перезапускается"
    description: "Контейнер перезапускался несколько раз за последние 10 минут — проверьте docker ps и логи."
|
||||||
|
|
@ -14,6 +14,8 @@ api:
|
||||||
dashboard: true
|
dashboard: true
|
||||||
insecure: false
|
insecure: false
|
||||||
|
|
||||||
|
ping: {}
|
||||||
|
|
||||||
entryPoints:
|
entryPoints:
|
||||||
web:
|
web:
|
||||||
address: ":80"
|
address: ":80"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue