Alerts


/etc/prometheus/alerts.yaml > blackbox_exporter_alerts
HTTPSDown (0 active)
# Critical alert: the blackbox probe (probe="http_2xx") for an https:// target
# has reported probe_success == 0 for 5 minutes.
alert: HTTPSDown
expr: >-
  probe_success{instance=~"https://.*",job="blackbox",probe="http_2xx"}
  == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: HTTPS Endpoint Down
  description: >-
    The HTTPS endpoint {{ $labels.instance }} is not responding or
    returning non-2xx status codes.
SSLCertificateExpiry (0 active)
# Warning alert: the earliest certificate expiry timestamp reported for an
# https:// target is less than 604800 seconds (7 days) after the current time,
# and that condition has held for 5 minutes.
alert: SSLCertificateExpiry
expr: >-
  probe_ssl_earliest_cert_expiry{instance=~"https://.*",job="blackbox"}
  < time() + 604800
for: 5m
labels:
  severity: warning
annotations:
  summary: SSL Certificate Expiry
  description: >-
    The SSL certificate for the HTTPS endpoint {{ $labels.instance }}
    will expire within one week.
/etc/prometheus/alerts.yaml > node_exporter_alerts
HighCPUUsage (1 active)
# Warning alert: the per-instance average of non-idle CPU time (100 * (1 - idle
# fraction), averaged over every idle-mode series of the instance) has been
# above 80% for 5 minutes.
# rate() is used rather than irate(): irate() is computed from only the last
# two samples in the range, so a momentary dip can reset the `for` timer and a
# momentary spike can dominate the value. rate() averages over the whole 5m
# window, which is what an alert with a `for` clause needs.
alert: HighCPUUsage
expr: >-
  100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])))
  > 80
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High CPU usage on {{ $labels.instance }}'
  description: '{{ $labels.instance }} has CPU usage over 80% for the last 5 minutes.'
Labels State Active Since Value
alertname="HighCPUUsage" instance="localhost:9100" severity="warning" firing 2024-09-17 08:37:31.687198332 +0000 UTC 100
HighDiskUsage (2 active)
# Warning alert: a filesystem's used share, (size - free) / size, has been
# above 80% for 5 minutes.
# The expression produces one series per filesystem (each carries device and
# mountpoint labels), not one per instance, so the annotations name the
# mountpoint and device and report the measured percentage; without them,
# several alerts from the same host would be indistinguishable.
# NOTE(review): node_filesystem_free_bytes may include space reserved for
# root; node_filesystem_avail_bytes might be the better signal - confirm
# before changing the expression.
alert: HighDiskUsage
expr: >-
  100 * (node_filesystem_size_bytes - node_filesystem_free_bytes)
  / node_filesystem_size_bytes
  > 80
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High Disk usage on {{ $labels.instance }} ({{ $labels.mountpoint }})'
  description: >-
    {{ $labels.instance }} has disk usage over 80% on
    {{ $labels.mountpoint }} ({{ $labels.device }}) for the last 5 minutes
    (current value: {{ printf "%.1f" $value }}%).
Labels State Active Since Value
alertname="HighDiskUsage" device="/dev/root" environment="ulca-production" fstype="ext4" instance="172.17.0.10:9100" job="node" mountpoint="/" severity="warning" firing 2024-05-21 08:31:16.687198332 +0000 UTC 82.49134305161685
alertname="HighDiskUsage" device="/dev/root" environment="ulca-production" fstype="ext4" instance="172.17.0.4:9100" job="node" mountpoint="/" severity="warning" firing 2024-08-18 19:23:46.687198332 +0000 UTC 84.32123438717895
HighMemoryUsage (0 active)
# Warning alert: more than 80% of an instance's memory has been unavailable
# (100 * (1 - MemAvailable / MemTotal)) for 5 minutes.
# The kernel's MemAvailable estimate replaces the hand-rolled
# MemTotal - MemFree - Buffers - Cached calculation, which counts reclaimable
# slab as used and counts shared memory (included in Cached) as free.
# NOTE(review): requires node_memory_MemAvailable_bytes to be exported by the
# targets (the kernel provides MemAvailable from Linux 3.14) - confirm for
# all monitored hosts.
alert: HighMemoryUsage
expr: >-
  (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
  > 80
for: 5m
labels:
  severity: warning
annotations:
  summary: 'High Memory usage on {{ $labels.instance }}'
  description: '{{ $labels.instance }} has memory usage over 80% for the last 5 minutes.'