# 2023-05-28
groups:
  - name: Alerts
    rules:
      # Host out of memory
      - alert: HostOutOfMemory
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host memory under memory pressure
      - alert: HostMemoryUnderMemoryPressure
        expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host memory is under-utilized
      # You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly
      # (see the commented Alertmanager route sketch further below).
      - alert: HostMemoryIsUnderUtilized
        expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host memory is under-utilized (instance {{ $labels.instance }})
          description: "Node memory usage is < 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual network throughput in
      - alert: HostUnusualNetworkThroughputIn
        expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual network throughput out
      - alert: HostUnusualNetworkThroughputOut
        expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual disk read rate
      - alert: HostUnusualDiskReadRate
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual disk write rate
      - alert: HostUnusualDiskWriteRate
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
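
      # Note on the join used throughout this group: multiplying by
      # 'on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      # multiplies each result by the matching node_uname_info series (an info
      # metric whose value is always 1), so the alert value is unchanged and
      # the node's "nodename" label is copied onto the alert. A minimal sketch
      # of the same pattern (node_load1 is only an illustration):
      #   (node_load1 > 4) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}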

      # Host out of disk space
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)"
      # (newer node_exporter releases rename this flag to "--collector.filesystem.mount-points-exclude").
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host disk will fill in 24 hours
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostDiskWillFillIn24Hours
        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host out of inodes
      - alert: HostOutOfInodes
        expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host inodes will fill in 24 hours
      - alert: HostInodesWillFillIn24Hours
        expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual disk read latency
      - alert: HostUnusualDiskReadLatency
        expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
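
      # How the 24-hour predictions above work: predict_linear() fits a linear
      # regression over the range vector and extrapolates it forward, so a
      # negative value 24 hours out means the series is on track to hit zero.
      # A minimal sketch (the mountpoint label is only an illustration):
      #   predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[1h], 24 * 3600) < 0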

      # Host unusual disk write latency
      - alert: HostUnusualDiskWriteLatency
        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host high CPU load
      - alert: HostHighCpuLoad
        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host CPU is under-utilized
      # You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly
      # (see the commented Alertmanager route sketch below).
      - alert: HostCpuIsUnderUtilized
        expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host CPU is under-utilized (instance {{ $labels.instance }})
          description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host CPU steal noisy neighbor
      - alert: HostCpuStealNoisyNeighbor
        expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performance, or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host CPU high iowait
      - alert: HostCpuHighIowait
        expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait (instance {{ $labels.instance }})
          description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host unusual disk IO
      - alert: HostUnusualDiskIo
        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
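
      # A minimal Alertmanager route sketch for the 'repeat_interval' suggestion
      # above; this belongs in alertmanager.yml, not in this rules file, and the
      # receiver name is a placeholder:
      #   route:
      #     receiver: default
      #     routes:
      #       - matchers:
      #           - severity = "info"
      #         repeat_interval: 7d
      #         receiver: default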

      ## Host context switching
      # # 1000 context switches is an arbitrary number.
      # # Alert threshold depends on nature of application.
      # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      # - alert: HostContextSwitching
      #   expr: ((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
      #   for: 0m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Host context switching (instance {{ $labels.instance }})
      #     description: "Context switching is growing on the node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host swap is filling up
      - alert: HostSwapIsFillingUp
        expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host systemd service crashed
      - alert: HostSystemdServiceCrashed
        expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "A systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host kernel version deviations
      - alert: HostKernelVersionDeviations
        expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 6h
        labels:
          severity: warning
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host OOM kill detected
      - alert: HostOomKillDetected
        expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host Network Receive Errors
      - alert: HostNetworkReceiveErrors
        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered receive errors on more than 1% of packets in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host Network Transmit Errors
      - alert: HostNetworkTransmitErrors
        expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered transmit errors on more than 1% of packets in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
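
      # Note: HostSystemdServiceCrashed above needs node_exporter's systemd
      # collector, which is disabled by default. A minimal launch sketch
      # (the service setup around it is illustrative):
      #   node_exporter --collector.systemd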

      # Host Network Interface Saturated
      - alert: HostNetworkInterfaceSaturated
        expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host Network Interface Saturated (instance {{ $labels.instance }})
          description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host Network Bond Degraded
      - alert: HostNetworkBondDegraded
        expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host conntrack limit
      - alert: HostConntrackLimit
        expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host clock skew
      - alert: HostClockSkew
        expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host clock not synchronising
      - alert: HostClockNotSynchronising
        expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Host requires reboot
      - alert: HostRequiresReboot
        expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 4h
        labels:
          severity: info
        annotations:
          summary: Host requires reboot (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
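
      # The container_* metrics used by the rules below are exposed by cAdvisor.
      # A minimal Prometheus scrape sketch (job name and target are placeholders,
      # and this belongs in prometheus.yml, not in this rules file):
      #   scrape_configs:
      #     - job_name: cadvisor
      #       static_configs:
      #         - targets: ["cadvisor:8080"]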

      ## Container killed
      # # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
      # - alert: ContainerKilled
      #   expr: time() - container_last_seen > 60
      #   for: 0m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Container killed (instance {{ $labels.instance }})
      #     description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container CPU usage
      - alert: ContainerCpuUsage
        expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container CPU usage (instance {{ $labels.instance }})
          description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container Memory usage
      # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
      - alert: ContainerMemoryUsage
        expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Memory usage (instance {{ $labels.instance }})
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container Volume usage
      - alert: ContainerVolumeUsage
        expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Volume usage (instance {{ $labels.instance }})
          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container high throttle rate
      - alert: ContainerHighThrottleRate
        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container high throttle rate (instance {{ $labels.instance }})
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container Low CPU utilization
      - alert: ContainerLowCpuUtilization
        expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20
        for: 7d
        labels:
          severity: info
        annotations:
          summary: Container Low CPU utilization (instance {{ $labels.instance }})
          description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Container Low Memory usage
      - alert: ContainerLowMemoryUsage
        expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20
        for: 7d
        labels:
          severity: info
        annotations:
          summary: Container Low Memory usage (instance {{ $labels.instance }})
          description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Traefik service down
      - alert: TraefikServiceDown
        expr: count(traefik_service_server_up) by (service) == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Traefik service down (instance {{ $labels.instance }})
          description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Traefik high HTTP 4xx error rate service
      - alert: TraefikHighHttp4xxErrorRateService
        expr: sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 4xx error rate is above 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Traefik high HTTP 5xx error rate service
      - alert: TraefikHighHttp5xxErrorRateService
        expr: sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Traefik service 5xx error rate is above 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # MySQL down
      - alert: MysqlDown
        expr: mysql_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: MySQL down (instance {{ $labels.instance }})
          description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # MySQL too many connections (> 80%)
      - alert: MysqlTooManyConnections
        expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
          description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # MySQL high threads running
      - alert: MysqlHighThreadsRunning
        expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: MySQL high threads running (instance {{ $labels.instance }})
          description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # MySQL slow queries
      - alert: MysqlSlowQueries
        expr: increase(mysql_global_status_slow_queries[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: MySQL slow queries (instance {{ $labels.instance }})
          description: "MySQL server has new slow queries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
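
      # MysqlSlowQueries above counts MySQL's Slow_queries status variable, which
      # increments for statements slower than the long_query_time server setting
      # (default 10s). A my.cnf sketch with an illustrative threshold:
      #   [mysqld]
      #   long_query_time = 2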

      # Postgresql down
      - alert: PostgresqlDown
        expr: pg_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql down (instance {{ $labels.instance }})
          description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql too many connections
      - alert: PostgresqlTooManyConnections
        expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many connections (instance {{ $labels.instance }})
          description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      ## Postgresql not enough connections
      # - alert: PostgresqlNotEnoughConnections
      #   expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
      #   for: 2m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Postgresql not enough connections (instance {{ $labels.instance }})
      #     description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql dead locks
      - alert: PostgresqlDeadLocks
        expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql dead locks (instance {{ $labels.instance }})
          description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql high rollback rate
      - alert: PostgresqlHighRollbackRate
        expr: sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Postgresql high rollback rate (instance {{ $labels.instance }})
          description: "Ratio of transactions being aborted compared to committed is > 2%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      ## Postgresql commit rate low
      # - alert: PostgresqlCommitRateLow
      #   expr: rate(pg_stat_database_xact_commit[1m]) < 10
      #   for: 2m
      #   labels:
      #     severity: critical
      #   annotations:
      #     summary: Postgresql commit rate low (instance {{ $labels.instance }})
      #     description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql high rate statement timeout
      - alert: PostgresqlHighRateStatementTimeout
        expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
          description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql high rate deadlock
      - alert: PostgresqlHighRateDeadlock
        expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
          description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql too many dead tuples
      - alert: PostgresqlTooManyDeadTuples
        expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
          description: "PostgreSQL dead tuple ratio is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql too many locks acquired
      - alert: PostgresqlTooManyLocksAcquired
        expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
          description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql bloat index high (> 80%)
      # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
      - alert: PostgresqlBloatIndexHigh
        expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
          description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Postgresql bloat table high (> 80%)
      # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
      - alert: PostgresqlBloatTableHigh
        expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
          description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Redis down
      - alert: RedisDown
        expr: redis_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Redis down (instance {{ $labels.instance }})
          description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Redis out of system memory
      # The exporter must be started with the --include-system-metrics flag or the
      # REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable (see the launch sketch below).
      - alert: RedisOutOfSystemMemory
        expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Redis out of system memory (instance {{ $labels.instance }})
          description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Redis out of configured maxmemory
      - alert: RedisOutOfConfiguredMaxmemory
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
          description: "Redis is running out of configured maxmemory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Redis too many connections
      - alert: RedisTooManyConnections
        expr: redis_connected_clients > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Redis too many connections (instance {{ $labels.instance }})
          description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      ## Redis not enough connections
      # - alert: RedisNotEnoughConnections
      #   expr: redis_connected_clients < 5
      #   for: 2m
      #   labels:
      #     severity: warning
      #   annotations:
      #     summary: Redis not enough connections (instance {{ $labels.instance }})
      #     description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Redis rejected connections
      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Redis rejected connections (instance {{ $labels.instance }})
          description: "Some connections to Redis have been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
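
      # A launch sketch for the redis_exporter requirement mentioned above
      # (image and deployment details are illustrative):
      #   docker run -d -e REDIS_EXPORTER_INCL_SYSTEM_METRICS=true oliver006/redis_exporter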

      # JVM memory filling up
      - alert: JvmMemoryFillingUp
        expr: (sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: JVM memory filling up (instance {{ $labels.instance }})
          description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # SSL certificate revoked
      - alert: SslCertificateRevoked
        expr: ssl_ocsp_response_status == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: SSL certificate revoked (instance {{ $labels.instance }})
          description: "SSL certificate revoked on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # SSL certificate expiry (< 7 days)
      - alert: SslCertificateExpiry
        expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: SSL certificate expiry (< 7 days) (instance {{ $labels.instance }})
          description: "The certificate on {{ $labels.instance }} expires within 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Loki process too many restarts
      - alert: LokiProcessTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A Loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Loki request errors
      - alert: LokiRequestErrors
        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Loki request panic
      - alert: LokiRequestPanic
        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Loki request latency
      - alert: LokiRequestLatency
        expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Promtail request errors
      - alert: PromtailRequestErrors
        expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
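
      # A promtool unit-test sketch for one of the rules above; file and series
      # names are illustrative. Run with: promtool test rules alerts_test.yml
      #   rule_files:
      #     - alerts.yml
      #   evaluation_interval: 1m
      #   tests:
      #     - interval: 1m
      #       input_series:
      #         - series: 'redis_up{instance="redis:6379"}'
      #           values: "0x5"
      #       alert_rule_test:
      #         - eval_time: 5m
      #           alertname: RedisDown
      #           exp_alerts:
      #             - exp_labels:
      #                 severity: critical
      #                 instance: redis:6379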

      # Promtail request latency
      - alert: PromtailRequestLatency
        expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Promtail request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Cloudflare HTTP 4xx error rate
      - alert: CloudflareHttp4xxErrorRate
        expr: (sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Cloudflare HTTP 4xx error rate (instance {{ $labels.instance }})
          description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Cloudflare HTTP 5xx error rate
      - alert: CloudflareHttp5xxErrorRate
        expr: (sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Cloudflare HTTP 5xx error rate (instance {{ $labels.instance }})
          description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
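
      # To validate this file before loading it (path is illustrative):
      #   promtool check rules alerts.yml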