←Back to Documentation
Monitoring & Alerts
Comprehensive monitoring and alerting for entropyDB using industry-standard tools
Overview
entropyDB provides rich metrics and monitoring capabilities:
- • Prometheus: Native metrics export at /metrics
- • Grafana: Pre-built dashboards for visualization
- • OpenTelemetry: Distributed tracing support
- • Health Checks: Built-in health and readiness endpoints
- • Alerting: Automated alerts for critical conditions
Prometheus Setup
Configuration
Configure Prometheus to scrape entropyDB metrics:
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    environment: 'prod'

scrape_configs:
  - job_name: 'entropydb'
    static_configs:
      - targets:
          - 'entropydb-node-1:8080'
          - 'entropydb-node-2:8080'
          - 'entropydb-node-3:8080'
    metrics_path: '/metrics'
    scheme: 'https'
    tls_config:
      ca_file: /etc/prometheus/ca.crt
      cert_file: /etc/prometheus/client.crt
      key_file: /etc/prometheus/client.key

  - job_name: 'entropydb-kubernetes'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - entropydb
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: entropydb
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - 'alerts/*.yml'
Key Metrics
# System Metrics
entropydb_up                            # Database is running (1/0)
entropydb_node_count                    # Number of nodes in cluster
entropydb_node_status                   # Node status (0=down, 1=up, 2=degraded)

# Performance Metrics
entropydb_queries_total                 # Total queries executed
entropydb_query_duration_seconds        # Query execution time histogram
entropydb_transactions_total            # Total transactions
entropydb_transaction_conflicts_total   # Transaction conflicts

# Connection Metrics
entropydb_connections_active            # Active connections
entropydb_connections_idle              # Idle connections
entropydb_connections_waiting           # Connections waiting for resources

# Storage Metrics
entropydb_storage_bytes_used            # Storage used in bytes
entropydb_storage_bytes_available       # Available storage in bytes
entropydb_lsm_level_count               # Number of LSM levels
entropydb_compaction_bytes_total        # Bytes compacted

# Replication Metrics
entropydb_replication_lag_seconds       # Replication lag
entropydb_replication_bytes_total       # Bytes replicated
entropydb_replica_sync_status           # Replica sync status

# Cache Metrics
entropydb_cache_hits_total              # Cache hits
entropydb_cache_misses_total            # Cache misses
entropydb_cache_size_bytes              # Cache size in bytes

# Error Metrics
entropydb_errors_total                  # Total errors
entropydb_slow_queries_total            # Slow queries (>1s)
entropydb_deadlocks_total               # Deadlock occurrences
PromQL Queries
# Query rate (queries per second)
rate(entropydb_queries_total[5m])
# Average query latency
rate(entropydb_query_duration_seconds_sum[5m]) /
rate(entropydb_query_duration_seconds_count[5m])
# 95th percentile query latency
histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m]))
# Cache hit ratio
rate(entropydb_cache_hits_total[5m]) /
(rate(entropydb_cache_hits_total[5m]) + rate(entropydb_cache_misses_total[5m]))
# Replication lag by node
max(entropydb_replication_lag_seconds) by (node)
# Error rate
rate(entropydb_errors_total[5m])
# CPU usage by node
100 - (avg by (node) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Disk I/O operations
rate(node_disk_io_time_seconds_total[5m])
Grafana Dashboards
Import Pre-built Dashboard
# Download official dashboard
curl -o entropydb-dashboard.json \
  https://grafana.com/api/dashboards/12345/revisions/1/download

# Or use Grafana UI:
# 1. Go to Dashboards → Import
# 2. Enter dashboard ID: 12345
# 3. Select Prometheus data source
# 4. Click Import

# Or via API
curl -X POST http://grafana:3000/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_API_KEY" \
  -H "Content-Type: application/json" \
  -d @entropydb-dashboard.json
Custom Dashboard Configuration
{
"dashboard": {
"title": "entropyDB Overview",
"panels": [
{
"title": "Query Rate",
"targets": [{
"expr": "rate(entropydb_queries_total[5m])",
"legendFormat": "{{node}}"
}],
"type": "graph"
},
{
"title": "Query Latency (p95)",
"targets": [{
"expr": "histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m]))",
"legendFormat": "{{node}}"
}],
"type": "graph"
},
{
"title": "Active Connections",
"targets": [{
"expr": "entropydb_connections_active",
"legendFormat": "{{node}}"
}],
"type": "graph"
},
{
"title": "Replication Lag",
"targets": [{
"expr": "entropydb_replication_lag_seconds",
"legendFormat": "{{node}}"
}],
"type": "graph",
"alert": {
"conditions": [{
"evaluator": {
"params": [10],
"type": "gt"
}
}],
"name": "High Replication Lag"
}
},
{
"title": "Storage Usage",
"targets": [{
"expr": "(entropydb_storage_bytes_used / entropydb_storage_bytes_available) * 100",
"legendFormat": "{{node}}"
}],
"type": "gauge"
},
{
"title": "Error Rate",
"targets": [{
"expr": "rate(entropydb_errors_total[5m])",
"legendFormat": "{{type}}"
}],
"type": "graph"
}
],
"templating": {
"list": [
{
"name": "cluster",
"query": "label_values(entropydb_up, cluster)",
"type": "query"
},
{
"name": "node",
"query": "label_values(entropydb_up{cluster=\"$cluster\"}, node)",
"type": "query"
}
]
},
"refresh": "30s"
}
}
Alerting Rules
Alert Configuration
# alerts/entropydb.yml
groups:
  - name: entropydb_alerts
    interval: 30s
    rules:
      - alert: entropyDBDown
        expr: entropydb_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "entropyDB instance {{ $labels.node }} is down"
          description: "entropyDB on {{ $labels.node }} has been down for more than 1 minute"

      - alert: HighQueryLatency
        expr: histogram_quantile(0.95, rate(entropydb_query_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High query latency on {{ $labels.node }}"
          description: "95th percentile query latency is {{ $value }}s"

      - alert: HighReplicationLag
        expr: entropydb_replication_lag_seconds > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High replication lag on {{ $labels.node }}"
          description: "Replication lag is {{ $value }} seconds"

      - alert: LowStorageSpace
        expr: |
          entropydb_storage_bytes_available /
          (entropydb_storage_bytes_used + entropydb_storage_bytes_available) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low storage space on {{ $labels.node }}"
          description: "Only {{ $value | humanizePercentage }} storage remaining"

      - alert: HighConnectionCount
        expr: entropydb_connections_active > 900
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High connection count on {{ $labels.node }}"
          description: "{{ $value }} active connections (max: 1000)"

      - alert: HighErrorRate
        expr: rate(entropydb_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.node }}"
          description: "Error rate is {{ $value }} errors/sec"

      - alert: DeadlockDetected
        expr: rate(entropydb_deadlocks_total[5m]) > 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Deadlocks detected on {{ $labels.node }}"
          description: "{{ $value }} deadlocks/sec detected"

      - alert: CacheMissRateHigh
        expr: |
          rate(entropydb_cache_misses_total[5m]) /
          (rate(entropydb_cache_hits_total[5m]) + rate(entropydb_cache_misses_total[5m])) > 0.5
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "High cache miss rate on {{ $labels.node }}"
          description: "Cache miss rate is {{ $value | humanizePercentage }}"

      - alert: NodeNotInCluster
        expr: entropydb_node_count < 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Cluster has fewer than 3 nodes"
          description: "Only {{ $value }} nodes in cluster (expected: 3)"
AlertManager Configuration
# alertmanager.yml
global:
  resolve_timeout: 5m
  slack_api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'

route:
  group_by: ['alertname', 'cluster', 'severity']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      continue: true
    - match:
        severity: warning
      receiver: 'slack'
    - match:
        severity: info
      receiver: 'email'

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
        description: '{{ .CommonAnnotations.summary }}'
  - name: 'slack'
    slack_configs:
      - channel: '#entropydb-alerts'
        title: '{{ .CommonAnnotations.summary }}'
        text: '{{ .CommonAnnotations.description }}'
        color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
  - name: 'email'
    email_configs:
      - to: 'ops-team@example.com'
        from: 'alertmanager@example.com'
        smarthost: 'smtp.example.com:587'
        auth_username: 'alertmanager@example.com'
        auth_password: 'password'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'node']
Health Checks
Built-in Endpoints
# Health check - returns 200 if database is running
GET http://entropydb:8080/health
Response:
{
"status": "healthy",
"timestamp": "2024-01-15T10:30:00Z",
"uptime": "72h15m30s",
"version": "1.0.0"
}
# Readiness check - returns 200 if ready to accept connections
GET http://entropydb:8080/ready
Response:
{
"ready": true,
"connections": {
"active": 45,
"max": 1000
},
"replication": {
"status": "synced",
"lag_seconds": 0.5
}
}
# Detailed status
GET http://entropydb:8080/status
Response:
{
"node_id": "node-1",
"cluster": "production",
"role": "leader",
"connections": {
"active": 45,
"idle": 12,
"waiting": 0
},
"storage": {
"used_bytes": 10737418240,
"available_bytes": 42949672960,
"usage_percent": 20.0
},
"replication": {
"replicas": ["node-2", "node-3"],
"lag_seconds": 0.5,
"status": "synced"
},
"performance": {
"queries_per_second": 1250,
"avg_latency_ms": 12.5,
"cache_hit_ratio": 0.95
}
}
Kubernetes Probes
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: entropydb
spec:
  template:
    spec:
      containers:
        - name: entropydb
          image: entropydb/entropydb:latest
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 2
          startupProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 0
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 30
Logging
Configuration
# entropydb.conf
logging:
  level: info      # debug, info, warn, error
  format: json     # json or text
  output: /var/log/entropydb/entropydb.log
  max_size: 100MB
  max_files: 10
  compress: true

  # Component-specific logging
  components:
    query_executor: debug
    replication: info
    storage: warn

  # Slow query logging
  slow_query:
    enabled: true
    threshold: 1s  # Log queries taking >1s

  # Audit logging
  audit:
    enabled: true
    events:
      - authentication
      - authorization
      - ddl_statements
      - privilege_changes
Centralized Logging with ELK
# filebeat.yml
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/entropydb/*.log
    fields:
      service: entropydb
      environment: production
    json.keys_under_root: true
    json.add_error_key: true

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "entropydb-logs-%{+yyyy.MM.dd}"

setup.kibana:
  host: "kibana:5601"
# Query logs in Kibana
GET /entropydb-logs-*/_search
{
"query": {
"bool": {
"must": [
{ "match": { "level": "error" }},
{ "range": { "@timestamp": { "gte": "now-1h" }}}
]
}
}
}
Best Practices
Monitoring
- • Monitor all nodes in cluster
- • Set appropriate alert thresholds
- • Track long-term trends
- • Create custom dashboards
Alerting
- • Configure multiple channels
- • Use severity levels appropriately
- • Avoid alert fatigue
- • Test alert routing regularly
Logging
- • Use structured JSON logging
- • Implement log rotation
- • Enable audit logging
- • Centralize logs for analysis
Performance
- • Monitor query performance
- • Track cache hit ratios
- • Watch replication lag
- • Analyze slow queries