Skip to main content

Alerts

Configure alerts to get notified when infrastructure metrics exceed thresholds.

Creating Alerts

BrainzLab::Sentinel.create_alert(
  name: "High CPU on web servers",
  hosts: ["web-1", "web-2"],  # or host_group: "web-servers"
  metric: :cpu,
  condition: :above,
  warning_threshold: 80,
  critical_threshold: 95,
  duration: 5.minutes,  # Must exceed for this duration
  channels: [:slack, :pagerduty]
)

Alert Conditions

| Condition | Description |
| --- | --- |
| above | Metric exceeds threshold |
| below | Metric falls below threshold |
| equals | Metric equals value |
| change | Metric changes by percentage |

Metric Types

Host Metrics

# CPU alerts
BrainzLab::Sentinel.alert_on(:cpu, threshold: 90)
BrainzLab::Sentinel.alert_on(:load_average, threshold: 4.0)

# Memory alerts
BrainzLab::Sentinel.alert_on(:memory, threshold: 90)
BrainzLab::Sentinel.alert_on(:swap, threshold: 50)

# Disk alerts
BrainzLab::Sentinel.alert_on(:disk, mount: "/", threshold: 85)
BrainzLab::Sentinel.alert_on(:disk_iops, threshold: 1000)

# Network alerts
BrainzLab::Sentinel.alert_on(:network_bandwidth, threshold: 100)  # MB/s

Container Metrics

# Container-specific alerts
BrainzLab::Sentinel.alert_on_container(
  name: "api-server",
  metric: :memory_percent,
  threshold: 90
)

# Container event alerts
BrainzLab::Sentinel.alert_on_container_event(
  event: :stopped,
  containers: ["nginx", "api-server"]
)

Alert Channels

Alerts integrate with Signal for notifications:
# Slack
BrainzLab::Sentinel.alert_on(:cpu, threshold: 90, channel: :slack)

# PagerDuty
BrainzLab::Sentinel.alert_on(:disk, threshold: 95, channel: :pagerduty)

# Email
BrainzLab::Sentinel.alert_on(:memory, threshold: 85, channel: :email)

# Webhook
BrainzLab::Sentinel.alert_on(:load, threshold: 5.0, channel: :webhook)

Alert Examples

Web Server Alerts

# High CPU
BrainzLab::Sentinel.create_alert(
  name: "Web server high CPU",
  host_group: "web-servers",
  metric: :cpu,
  warning_threshold: 75,
  critical_threshold: 90,
  duration: 5.minutes
)

# Low disk space
BrainzLab::Sentinel.create_alert(
  name: "Web server disk space",
  host_group: "web-servers",
  metric: :disk,
  mount: "/",
  warning_threshold: 80,
  critical_threshold: 90
)

Database Server Alerts

# Memory usage (DBs often use high memory)
BrainzLab::Sentinel.create_alert(
  name: "DB memory critical",
  hosts: ["db-1", "db-2"],
  metric: :memory,
  critical_threshold: 98,  # Higher threshold for DBs
  duration: 10.minutes
)

# Disk I/O
BrainzLab::Sentinel.create_alert(
  name: "DB high disk I/O",
  hosts: ["db-1"],
  metric: :disk_iops,
  warning_threshold: 5000,
  critical_threshold: 10000
)

Container Alerts

# Container OOM risk
BrainzLab::Sentinel.create_alert(
  name: "Container approaching memory limit",
  containers: ["api-*"],
  metric: :memory_percent,
  warning_threshold: 85,
  critical_threshold: 95
)

# Container restart loop
BrainzLab::Sentinel.create_alert(
  name: "Container restart loop",
  condition: :restart_count,
  threshold: 3,
  window: 10.minutes
)

Managing Alerts

# List all alerts
alerts = BrainzLab::Sentinel.alerts

# Disable an alert
BrainzLab::Sentinel.disable_alert("alert_abc123")

# Delete an alert
BrainzLab::Sentinel.delete_alert("alert_abc123")

# View alert history
history = BrainzLab::Sentinel.alert_history(
  alert_id: "alert_abc123",
  period: :week
)