version: "3.0" # # updated: 2023-05-27 # stack: monitoring # x-logging: &x-logging logging: driver: loki options: loki-url: "http://loki:3100/loki/api/v1/push" loki-retries: "5" loki-batch-size: "400" x-common: &x-common <<: *x-logging restart: "no" stop_grace_period: 5s stdin_open: true tty: true privileged: false security_opt: - no-new-privileges=true cap_drop: - ALL cap_add: - KILL dns: - 1.1.1.1 - 8.8.8.8 - 1.0.0.1 - 8.8.4.4 ipc: "shareable" extra_hosts: - "template.home:192.168.0.0" environment: TZ: "Europe/Paris" PUID: 1000 PGID: 1000 user: 1000:1000 labels: com.centurylinklabs.watchtower.enable: true logging: "promtail" com.stack.name: "common" com.stack.service.name: "common" devices: - /dev/kmsg:/dev/kmsg deploy: resources: limits: cpus: "0.50" memory: 256M ulimits: nproc: 65535 nofile: soft: 20000 hard: 40000 tmpfs: - /tmp:rw,noexec,nosuid,size=64k sysctls: net.core.somaxconn: 1024 net.ipv4.tcp_syncookies: 0 x-volume-timezone: &x-volume-timezone "/etc/timezone:/etc/timezone:ro" x-volume-localtime: &x-volume-localtime "/etc/localtime:/etc/localtime:ro" x-volume-docker-socket: &x-volume-docker-socket "/var/run/docker.sock:/var/run/docker.sock:rw" x-volume-cgroups: &x-volume-cgroups "/proc/cgroups:/cgroup:rw" x-volume-ssl: &x-volume-ssl "/opt/docker/ssl:/ssl:ro" services: notifier: <<: *x-common container_name: notifier hostname: notifier image: academo/grafana-ntfy:latest restart: always ports: - "8088:8080" expose: - "8080" labels: com.stack.name: "monitoring" com.stack.service.name: "notifier" command: - "-ntfy-url=http://ntfy:8050/grafana" volumes: - *x-volume-timezone - *x-volume-localtime cadvisor: <<: *x-common user: 0:0 container_name: cadvisor hostname: cadvisor image: gcr.io/cadvisor/cadvisor:v0.47.1 restart: always privileged: true cap_add: - SYS_PTRACE ports: - "8080:8080" expose: - "8080" command: - "--storage_duration=1m0s" - "--allow_dynamic_housekeeping=true" - "--housekeeping_interval=30s" - "--global_housekeeping_interval=30s" - "--event_storage_age_limit=default=0" - "--event_storage_event_limit=default=0" labels: com.stack.name: "monitoring" com.stack.service.name: "cadvisor" volumes: - *x-volume-timezone - *x-volume-localtime - *x-volume-docker-socket - *x-volume-cgroups - /var/lib/docker/:/var/lib/docker:ro - /etc/machine-id:/etc/machine-id:ro - /:/rootfs:ro - /sys:/sys:ro - /dev/disk/:/dev/disk:ro - /var/run:/var/run:ro node-exporter: <<: *x-common user: 0:0 container_name: node-exporter hostname: node-exporter image: prom/node-exporter:latest restart: always privileged: true cap_add: - SYS_ADMIN ports: - "9100:9100" expose: - "9100" healthcheck: test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1 command: - "--collector.arp" - "--collector.bcache" - "--collector.bonding" - "--collector.btrfs" - "--collector.conntrack" - "--collector.cpu" - "--collector.cpufreq" - "--collector.diskstats" - "--collector.dmi" - "--collector.edac" - "--collector.entropy" - "--collector.fibrechannel" - "--collector.filefd" - "--collector.filesystem" - "--collector.hwmon" - "--collector.infiniband" - "--collector.ipvs" - "--collector.loadavg" - "--collector.mdadm" - "--collector.meminfo" - "--collector.netclass" - "--collector.netdev" - "--collector.netstat" - "--collector.nfs" - "--collector.nfsd" - "--collector.nvme" - "--collector.os" - "--collector.powersupplyclass" - "--collector.pressure" - "--collector.rapl" - "--collector.schedstat" - "--collector.selinux" - "--collector.sockstat" - "--collector.softnet" - "--collector.stat" - "--collector.tapestats" 
- "--collector.textfile" - "--collector.thermal_zone" - "--collector.time" - "--collector.timex" - "--collector.udp_queues" - "--collector.uname" - "--collector.vmstat" - "--collector.xfs" - "--collector.zfs" - "--collector.buddyinfo" - "--collector.cgroups" - "--collector.drbd" - "--collector.ethtool" - "--collector.interrupts" - "--collector.ksmd" - "--collector.lnstat" - "--collector.logind" - "--collector.meminfo_numa" - "--collector.mountstats" - "--collector.network_route" - "--collector.ntp" - "--collector.perf" - "--collector.processes" - "--collector.qdisc" - "--collector.runit" #- "--collector.slabinfo" - "--collector.supervisord" - "--collector.sysctl" - "--collector.systemd" - "--collector.tcpstat" - "--collector.wifi" - "--collector.zoneinfo" - "--path.rootfs=/host" - "--path.procfs=/host/proc" - "--path.sysfs=/host/sys" - "--collector.filesystem.ignored-mount-points='^(/rootfs|/host|)/(sys|proc|dev|host|etc)($$|/)'" - "--collector.filesystem.ignored-fs-types='^(sys|proc|auto|cgroup|devpts|ns|au|fuse\\.lxc|mqueue)(fs|)$$'" - "--collector.netdev.device-exclude='^(lo|veth.*)$'" - "--collector.ethtool.device-exclude='^(lo|br-.*|docker0|flannel.*|veth.*)$'" labels: com.stack.name: "monitoring" com.stack.service.name: "node-exporter" deploy: resources: limits: memory: 512M volumes: - *x-volume-timezone - *x-volume-localtime - *x-volume-docker-socket - *x-volume-cgroups - /etc/service:/etc/service:ro - /:/host:ro,rslave - /sys:/host/sys:ro - /proc:/host/proc:ro - /run/udev/data:/run/udev/data:ro - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro ntfy-alertmanager: <<: *x-common user: 0:0 cap_add: - DAC_OVERRIDE container_name: ntfy-alertmanager hostname: ntfy-alertmanager image: xenrox/ntfy-alertmanager:latest restart: always ports: - "6080:8080" expose: - "8080" environment: NTFY_TOPIC: "alertmanager" labels: com.stack.name: "monitoring" com.stack.service.name: "ntfy" deploy: resources: limits: memory: 512M volumes: - *x-volume-timezone - *x-volume-localtime - /opt/docker/monitoring/conf/ntfy-alertmanager.cfg:/etc/ntfy-alertmanager/config alertmanager: <<: *x-common cap_add: - DAC_OVERRIDE container_name: alertmanager hostname: alertmanager image: prom/alertmanager:latest restart: always ports: - "9093:9093" expose: - "9093" command: - "--config.file=/etc/alertmanager/alertmanager.yml" healthcheck: test: wget --no-verbose --tries=1 --spider http://localhost:9093/ || exit 1 labels: com.stack.name: "monitoring" com.stack.service.name: "alertmanager" deploy: resources: limits: memory: 512M volumes: - *x-volume-timezone - *x-volume-localtime - /opt/docker/monitoring/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml - /opt/docker/monitoring/datas/alertmanager:/alertmanager mimir: <<: *x-common stop_grace_period: 60s user: 0:0 cap_add: - DAC_OVERRIDE - SETUID - SETGID - CHOWN - SYS_ADMIN container_name: mimir hostname: mimir image: grafana/mimir:latest restart: always ports: - "9009:9009" - "9095:9095" expose: - "9009" - "9095" command: - "--config.file=/etc/mimir/mimir.yaml" healthcheck: test: wget --no-verbose --tries=1 --spider http://localhost:9009/ || exit 1 labels: com.stack.name: "monitoring" com.stack.service.name: "mimir" deploy: resources: limits: memory: 4G volumes: - *x-volume-timezone - *x-volume-localtime - /opt/docker/monitoring/conf/mimir.yml:/etc/mimir/mimir.yaml - /opt/docker/monitoring/datas/mimir:/mimir prometheus: <<: *x-common stop_grace_period: 60s cap_add: - DAC_OVERRIDE container_name: prometheus hostname: prometheus image: 
  prometheus:
    <<: *x-common
    stop_grace_period: 60s
    cap_add:
      - DAC_OVERRIDE
    container_name: prometheus
    hostname: prometheus
    image: prom/prometheus:latest
    restart: always
    depends_on:
      - mimir
      - alertmanager
      - cadvisor
      - node-exporter
    links:
      - mimir
      - alertmanager
      - cadvisor
      - node-exporter
    ports:
      - "9090:9090"
    expose:
      - "9090"
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1
    command:
      - "--storage.tsdb.retention.time=365d"
      - "--config.file=/etc/prometheus/prometheus.yml"
    labels:
      com.stack.name: "monitoring"
      com.stack.service.name: "prometheus"
    deploy:
      resources:
        limits:
          memory: 2G
    volumes:
      - *x-volume-timezone
      - *x-volume-localtime
      - /opt/docker/monitoring/conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - /opt/docker/monitoring/conf/alertmanager/rules.yml:/alertmanager/rules.yml:ro
      - /opt/docker/monitoring/datas/prometheus:/prometheus

  grafana:
    <<: *x-common
    logging:
      driver: json-file
      options:
        max-size: 32m
        max-file: "7"
        compress: "true"
    user: "0:0"
    cap_add:
      - DAC_OVERRIDE
      - SETUID
      - SETGID
      - CHOWN
      - SYS_ADMIN
    container_name: grafana
    hostname: grafana
    image: grafana/grafana:latest
    restart: always
    ports:
      - "3000:3000"
    expose:
      - "3000"
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1
    depends_on:
      - notifier
      - prometheus
    links:
      - notifier
      - prometheus
    environment:
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_SERVE_FROM_SUB_PATH: "true"
      GF_INSTALL_PLUGINS: "grafana-clock-panel,grafana-piechart-panel,grafana-simple-json-datasource,grafana-worldmap-panel,camptocamp-prometheus-alertmanager-datasource"
      GF_SECURITY_ADMIN_USER: "[admin username]"
      GF_SECURITY_ADMIN_PASSWORD: "[admin password]"
    labels:
      com.stack.name: "monitoring"
      com.stack.service.name: "grafana"
    tmpfs:
      - /tmp:rw,noexec,nosuid,size=5120k
    deploy:
      resources:
        limits:
          cpus: "4.0"
          memory: 1G
    volumes:
      - *x-volume-timezone
      - *x-volume-localtime
      - /opt/docker/monitoring/conf/grafana.ini:/etc/grafana/grafana.ini
      - /opt/docker/monitoring/conf/provisioning:/etc/grafana/provisioning
      - /opt/docker/monitoring/conf/provisioning/provisioned:/etc/grafana/dashboards/provisioned
      - /opt/docker/monitoring/datas/grafana/dashboards:/etc/grafana/dashboards
      - /opt/docker/monitoring/datas/grafana/datas:/var/lib/grafana
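# Grafana reads provisioning files from /etc/grafana/provisioning, mounted
# above from /opt/docker/monitoring/conf/provisioning. A minimal sketch of a
# datasource provisioning file for this stack (the file name, datasource names
# and the Mimir query path are assumptions, not taken from this repository):
#
#   # provisioning/datasources/datasources.yml
#   apiVersion: 1
#   datasources:
#     - name: Prometheus
#       type: prometheus
#       access: proxy
#       url: http://prometheus:9090
#       isDefault: true
#     - name: Mimir
#       type: prometheus
#       access: proxy
#       url: http://mimir:9009/prometheus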