Back to Documentation

Monitoring & Alerts

Comprehensive monitoring and alerting for entropyDB using industry-standard tools

Overview

entropyDB provides rich metrics and monitoring capabilities:

  • Prometheus: Native metrics export at /metrics
  • Grafana: Pre-built dashboards for visualization
  • OpenTelemetry: Distributed tracing support
  • Health Checks: Built-in health and readiness endpoints
  • Alerting: Automated alerts for critical conditions

Prometheus Setup

Configuration

Configure Prometheus to scrape entropyDB metrics:

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'prod'

scrape_configs:
  - job_name: 'entropydb'
    static_configs:
      - targets:
          - 'entropydb-node-1:8080'
          - 'entropydb-node-2:8080'
          - 'entropydb-node-3:8080'
    metrics_path: '/metrics'
    scheme: 'https'
    tls_config:
      ca_file: /etc/prometheus/ca.crt
      cert_file: /etc/prometheus/client.crt
      key_file: /etc/prometheus/client.key
    
  - job_name: 'entropydb-kubernetes'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - entropydb
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: entropydb
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - 'alerts/*.yml'

Key Metrics

# System Metrics
entropydb_up                          # Database is running (1/0)
entropydb_node_count                  # Number of nodes in cluster
entropydb_node_status                 # Node status (0=down, 1=up, 2=degraded)

# Performance Metrics
entropydb_queries_total               # Total queries executed
entropydb_query_duration_seconds      # Query execution time histogram
entropydb_transactions_total          # Total transactions
entropydb_transaction_conflicts_total # Transaction conflicts

# Connection Metrics
entropydb_connections_active          # Active connections
entropydb_connections_idle            # Idle connections
entropydb_connections_waiting         # Connections waiting for resources

# Storage Metrics
entropydb_storage_bytes_used          # Storage used in bytes
entropydb_storage_bytes_available     # Available storage in bytes
entropydb_lsm_level_count            # Number of LSM levels
entropydb_compaction_bytes_total     # Bytes compacted

# Replication Metrics
entropydb_replication_lag_seconds    # Replication lag
entropydb_replication_bytes_total    # Bytes replicated
entropydb_replica_sync_status        # Replica sync status

# Cache Metrics
entropydb_cache_hits_total           # Cache hits
entropydb_cache_misses_total         # Cache misses
entropydb_cache_size_bytes           # Cache size in bytes

# Error Metrics
entropydb_errors_total               # Total errors
entropydb_slow_queries_total         # Slow queries (>1s)
entropydb_deadlocks_total            # Deadlock occurrences

PromQL Queries

# Query rate (queries per second)
rate(entropydb_queries_total[5m])

# Average query latency
rate(entropydb_query_duration_seconds_sum[5m]) / 
  rate(entropydb_query_duration_seconds_count[5m])

# 95th percentile query latency
histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m]))

# Cache hit ratio
rate(entropydb_cache_hits_total[5m]) / 
  (rate(entropydb_cache_hits_total[5m]) + rate(entropydb_cache_misses_total[5m]))

# Replication lag by node
max(entropydb_replication_lag_seconds) by (node)

# Error rate
rate(entropydb_errors_total[5m])

# CPU usage by node
100 - (avg by (node) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Memory usage percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# Disk I/O operations
rate(node_disk_io_time_seconds_total[5m])

Grafana Dashboards

Import Pre-built Dashboard

# Download official dashboard
curl -o entropydb-dashboard.json \
  https://grafana.com/api/dashboards/12345/revisions/1/download

# Or use Grafana UI:
# 1. Go to Dashboards → Import
# 2. Enter dashboard ID: 12345
# 3. Select Prometheus data source
# 4. Click Import

# Or via API
curl -X POST http://grafana:3000/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_API_KEY" \
  -H "Content-Type: application/json" \
  -d @entropydb-dashboard.json

Custom Dashboard Configuration

{
  "dashboard": {
    "title": "entropyDB Overview",
    "panels": [
      {
        "title": "Query Rate",
        "targets": [{
          "expr": "rate(entropydb_queries_total[5m])",
          "legendFormat": "{{node}}"
        }],
        "type": "graph"
      },
      {
        "title": "Query Latency (p95)",
        "targets": [{
          "expr": "histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m]))",
          "legendFormat": "{{node}}"
        }],
        "type": "graph"
      },
      {
        "title": "Active Connections",
        "targets": [{
          "expr": "entropydb_connections_active",
          "legendFormat": "{{node}}"
        }],
        "type": "graph"
      },
      {
        "title": "Replication Lag",
        "targets": [{
          "expr": "entropydb_replication_lag_seconds",
          "legendFormat": "{{node}}"
        }],
        "type": "graph",
        "alert": {
          "conditions": [{
            "evaluator": {
              "params": [10],
              "type": "gt"
            }
          }],
          "name": "High Replication Lag"
        }
      },
      {
        "title": "Storage Usage",
        "targets": [{
          "expr": "(entropydb_storage_bytes_used / (entropydb_storage_bytes_used + entropydb_storage_bytes_available)) * 100",
          "legendFormat": "{{node}}"
        }],
        "type": "gauge"
      },
      {
        "title": "Error Rate",
        "targets": [{
          "expr": "rate(entropydb_errors_total[5m])",
          "legendFormat": "{{type}}"
        }],
        "type": "graph"
      }
    ],
    "templating": {
      "list": [
        {
          "name": "cluster",
          "query": "label_values(entropydb_up, cluster)",
          "type": "query"
        },
        {
          "name": "node",
          "query": "label_values(entropydb_up{cluster=\"$cluster\"}, node)",
          "type": "query"
        }
      ]
    },
    "refresh": "30s"
  }
}

Alerting Rules

Alert Configuration

# alerts/entropydb.yml
groups:
  - name: entropydb_alerts
    interval: 30s
    rules:
      - alert: entropyDBDown
        expr: entropydb_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "entropyDB instance {{ $labels.node }} is down"
          description: "entropyDB on {{ $labels.node }} has been down for more than 1 minute"

      - alert: HighQueryLatency
        expr: histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High query latency on {{ $labels.node }}"
          description: "95th percentile query latency is {{ $value }}s"

      - alert: HighReplicationLag
        expr: entropydb_replication_lag_seconds > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High replication lag on {{ $labels.node }}"
          description: "Replication lag is {{ $value }} seconds"

      - alert: LowStorageSpace
        expr: (entropydb_storage_bytes_available / (entropydb_storage_bytes_used + entropydb_storage_bytes_available)) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low storage space on {{ $labels.node }}"
          description: "Only {{ $value | humanizePercentage }} storage remaining"

      - alert: HighConnectionCount
        expr: entropydb_connections_active > 900
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High connection count on {{ $labels.node }}"
          description: "{{ $value }} active connections (max: 1000)"

      - alert: HighErrorRate
        expr: rate(entropydb_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.node }}"
          description: "Error rate is {{ $value }} errors/sec"

      - alert: DeadlockDetected
        expr: rate(entropydb_deadlocks_total[5m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Deadlocks detected on {{ $labels.node }}"
          description: "{{ $value }} deadlocks/sec detected"

      - alert: CacheMissRateHigh
        expr: |
          rate(entropydb_cache_misses_total[5m]) /
          (rate(entropydb_cache_hits_total[5m]) + rate(entropydb_cache_misses_total[5m])) > 0.5
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "High cache miss rate on {{ $labels.node }}"
          description: "Cache miss rate is {{ $value | humanizePercentage }}"

      - alert: NodeNotInCluster
        expr: entropydb_node_count < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Cluster has fewer than 3 nodes"
          description: "Only {{ $value }} nodes in cluster (expected: 3)"

AlertManager Configuration

# alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'

route:
  group_by: ['alertname', 'cluster', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'
    - match:
        severity: info
      receiver: 'email'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
        description: '{{ .CommonAnnotations.summary }}'

  - name: 'slack'
    slack_configs:
      - channel: '#entropydb-alerts'
        title: '{{ .CommonAnnotations.summary }}'
        text: '{{ .CommonAnnotations.description }}'
        color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'

  - name: 'email'
    email_configs:
      - to: 'ops-team@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager@example.com'
        auth_password: 'password'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'node']

Health Checks

Built-in Endpoints

# Health check - returns 200 if database is running
GET http://entropydb:8080/health

Response:
{
  "status": "healthy",
  "timestamp": "2024-01-15T10:30:00Z",
  "uptime": "72h15m30s",
  "version": "1.0.0"
}

# Readiness check - returns 200 if ready to accept connections
GET http://entropydb:8080/ready

Response:
{
  "ready": true,
  "connections": {
    "active": 45,
    "max": 1000
  },
  "replication": {
    "status": "synced",
    "lag_seconds": 0.5
  }
}

# Detailed status
GET http://entropydb:8080/status

Response:
{
  "node_id": "node-1",
  "cluster": "production",
  "role": "leader",
  "connections": {
    "active": 45,
    "idle": 12,
    "waiting": 0
  },
  "storage": {
    "used_bytes": 10737418240,
    "available_bytes": 42949672960,
    "usage_percent": 20.0
  },
  "replication": {
    "replicas": ["node-2", "node-3"],
    "lag_seconds": 0.5,
    "status": "synced"
  },
  "performance": {
    "queries_per_second": 1250,
    "avg_latency_ms": 12.5,
    "cache_hit_ratio": 0.95
  }
}

Kubernetes Probes

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: entropydb
spec:
  template:
    spec:
      containers:
      - name: entropydb
        image: entropydb/entropydb:latest
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 2
        startupProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 0
          periodSeconds: 10
          timeoutSeconds: 3
          failureThreshold: 30

Logging

Configuration

# entropydb.conf
logging:
  level: info                  # debug, info, warn, error
  format: json                 # json or text
  output: /var/log/entropydb/entropydb.log
  max_size: 100MB
  max_files: 10
  compress: true
  
  # Component-specific logging
  components:
    query_executor: debug
    replication: info
    storage: warn
    
  # Slow query logging
  slow_query:
    enabled: true
    threshold: 1s              # Log queries taking >1s
    
  # Audit logging
  audit:
    enabled: true
    events:
      - authentication
      - authorization
      - ddl_statements
      - privilege_changes

Centralized Logging with ELK

# filebeat.yml
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/entropydb/*.log
    fields:
      service: entropydb
      environment: production
    json.keys_under_root: true
    json.add_error_key: true

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "entropydb-logs-%{+yyyy.MM.dd}"

setup.kibana:
  host: "kibana:5601"

# Query logs in Kibana
GET /entropydb-logs-*/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "level": "error" }},
        { "range": { "@timestamp": { "gte": "now-1h" }}}
      ]
    }
  }
}

Best Practices

Monitoring

  • Monitor all nodes in cluster
  • Set appropriate alert thresholds
  • Track long-term trends
  • Create custom dashboards

Alerting

  • Configure multiple channels
  • Use severity levels appropriately
  • Avoid alert fatigue
  • Test alert routing regularly

Logging

  • Use structured JSON logging
  • Implement log rotation
  • Enable audit logging
  • Centralize logs for analysis

Performance

  • Monitor query performance
  • Track cache hit ratios
  • Watch replication lag
  • Analyze slow queries

Next Steps