Backup & Recovery

Enterprise-grade backup and disaster recovery strategies for entropyDB

Overview

entropyDB provides multiple backup and recovery options:

• Point-in-Time Recovery (PITR): Restore to any moment in time
• Full Backups: Complete database snapshots
• Incremental Backups: Only changed data since last backup
• Continuous Archiving: Real-time WAL archiving
• Cross-Region Replication: Geographic disaster recovery

Backup Strategies

Full Backup

Create a complete snapshot of your database:

# Full backup of "mydb" from localhost:5432 into a gzip-compressed archive
backup_file=/backups/full-backup-2024-01-15.tar.gz
entropydb-backup full \
  --host localhost --port 5432 \
  --database mydb \
  --output "$backup_file" \
  --compress gzip --parallel 4

# Check the archive that was just written
entropydb-backup verify "$backup_file"

# Show what the archive contains
entropydb-backup list "$backup_file"

# Encrypted variant: AES-256-CBC, key taken from a key file
entropydb-backup full \
  --database mydb \
  --output /backups/encrypted-backup.tar.gz.enc \
  --encrypt aes-256-cbc --key-file /etc/entropydb/backup.key

Incremental Backup

# Start the chain with a labelled full backup
base_backup=/backups/base-backup.tar.gz
entropydb-backup full \
  --database mydb \
  --output "$base_backup" \
  --label "base-backup-20240115"

# Later runs write incrementals that point at the base backup
entropydb-backup incremental \
  --database mydb \
  --base-backup "$base_backup" \
  --output /backups/incremental-20240116.tar.gz

entropydb-backup incremental \
  --database mydb \
  --base-backup "$base_backup" \
  --output /backups/incremental-20240117.tar.gz

# Show every backup in the chain that starts at the base backup
entropydb-backup chain-list "$base_backup"

Automated Backup Schedule

# Kubernetes CronJob for automated backups
# Each run takes a full backup, uploads it to S3 and then prunes old uploads.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: entropydb-backup
  namespace: entropydb
spec:
  schedule: "0 2 * * *"  # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: backup
            image: entropydb/backup-tools:latest
            env:
            - name: ENTROPY_HOST
              value: "entropydb-service"
            - name: ENTROPY_DATABASE
              value: "mydb"
            # Used below as the NUMBER of newest backups to keep; this equals
            # a number of days only while exactly one backup is taken per day.
            - name: BACKUP_RETENTION_DAYS
              value: "30"
            - name: S3_BUCKET
              value: "s3://my-backups/entropydb"
            command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failure (or unset variable) so that a failed
              # backup or upload never reaches the step that deletes old backups.
              set -eu

              DATE=$(date +%Y%m%d-%H%M%S)
              BACKUP_FILE="/tmp/backup-$DATE.tar.gz"
              # Remove the local archive on every exit path
              trap 'rm -f "$BACKUP_FILE"' EXIT

              entropydb-backup full \
                --host "$ENTROPY_HOST" \
                --database "$ENTROPY_DATABASE" \
                --output "$BACKUP_FILE" \
                --compress gzip

              # Upload to S3
              aws s3 cp "$BACKUP_FILE" "$S3_BUCKET/"

              # Cleanup old backups
              # The listing is ordered by key, and the timestamped names make
              # that oldest-first, so dropping the last N lines leaves exactly
              # the objects to delete. Only backup-* keys written by this job
              # are considered. Negative counts need GNU head.
              aws s3 ls "$S3_BUCKET/" | \
                awk '$4 ~ /^backup-/ {print $4}' | \
                head -n -"$BACKUP_RETENTION_DAYS" | \
                xargs -I {} aws s3 rm "$S3_BUCKET/{}"
          restartPolicy: OnFailure

Point-in-Time Recovery (PITR)

Configure WAL Archiving

Enable continuous WAL archiving for PITR:

# entropydb.conf
# Settings that turn on continuous WAL archiving, which PITR relies on.
wal_level = replica
archive_mode = on
# Copies each WAL file to the S3 archive bucket.
# %p and %f are presumably the file's path and its file name
# (PostgreSQL-style placeholders) — TODO confirm for entropyDB.
archive_command = 'aws s3 cp %p s3://my-wal-archive/%f'
archive_timeout = 300  # 300 seconds = 5 minutes

# Alternative: Copy to local storage
# Use this INSTEAD of the S3 archive_command; keep only one archive_command
# line in the real file. The `test ! -f` guard makes the command fail rather
# than overwrite a file that already exists in the archive directory.
archive_command = 'test ! -f /mnt/wal_archive/%f && cp %p /mnt/wal_archive/%f'

# Alternative: Use pg_receivewal for streaming
# Run on backup server:

# 1. Create the replication slot (one-time setup).
#    When --create-slot is given, pg_receivewal creates the slot and then
#    exits without streaming, so slot creation is a separate step.
#    --if-not-exists makes the step safe to repeat.
pg_receivewal \
  -h entropydb-primary \
  -p 5432 \
  -U replication \
  --slot=wal_archive_slot \
  --create-slot \
  --if-not-exists

# 2. Stream WAL into the archive directory through that slot.
#    This command keeps running; start it under a service manager.
pg_receivewal \
  -h entropydb-primary \
  -p 5432 \
  -D /mnt/wal_archive \
  -U replication \
  --slot=wal_archive_slot

# Verify archiving is working
SELECT * FROM entropy_stat_archiver;

Perform PITR

# Step 1: Shut the server down before touching its data directory
systemctl stop entropydb

# Step 2: Empty the data directory, then unpack the base backup into it
rm -rf /var/lib/entropydb/data/*
tar -xzf /backups/base-backup.tar.gz -C /var/lib/entropydb/data/

# Step 3: Write the recovery configuration (recover up to a timestamp)
# The quoted delimiter keeps the text literal; nothing in it is expanded.
cat << 'CONF' > /var/lib/entropydb/data/recovery.conf
restore_command = 'aws s3 cp s3://my-wal-archive/%f %p'
recovery_target_time = '2024-01-15 14:30:00 UTC'
recovery_target_action = promote
CONF

# Step 4: Start the server; it picks up recovery.conf written in step 3
systemctl start entropydb

# Step 5: Follow the recovery log
tail -f /var/log/entropydb/recovery.log

# Query the recovery status view
SELECT * FROM entropy_recovery_status;

# Alternative for step 3: recover up to a transaction ID
cat << 'CONF' > /var/lib/entropydb/data/recovery.conf
restore_command = 'aws s3 cp s3://my-wal-archive/%f %p'
recovery_target_xid = '12345678'
recovery_target_action = promote
CONF

# Alternative for step 3: recover up to an LSN
cat << 'CONF' > /var/lib/entropydb/data/recovery.conf
restore_command = 'aws s3 cp s3://my-wal-archive/%f %p'
recovery_target_lsn = '0/15000A8'
recovery_target_action = promote
CONF

Disaster Recovery

Cross-Region Replication

# Primary region configuration (US-West)
# entropydb.conf
cluster_name = 'production'
region = 'us-west-2'
wal_level = logical
max_wal_senders = 10
max_replication_slots = 10

# Create replication user
# 'secure_password' is a placeholder; whatever value is chosen here must also
# be used in the standby's primary_conninfo.
CREATE USER replication_user WITH REPLICATION PASSWORD 'secure_password';

# Create replication slot
# The slot name must match primary_slot_name in the standby configuration.
SELECT * FROM pg_create_physical_replication_slot('dr_slot');

# Secondary region (EU-Central) - Standby configuration
# standby.conf
# Connects to the primary as replication_user and uses the 'dr_slot' slot.
primary_conninfo = 'host=us-west-primary.example.com port=5432 user=replication_user password=secure_password'
primary_slot_name = 'dr_slot'
# NOTE(review): this reads from s3://wal-archive, while the WAL archiving
# examples in this document write to s3://my-wal-archive — confirm which
# bucket the standby should restore from.
restore_command = 'aws s3 cp s3://wal-archive/%f %p'
recovery_target_timeline = 'latest'
hot_standby = on
hot_standby_feedback = on

# Monitor replication lag (run on the primary)
# pg_stat_replication has rows on the sending server, where
# pg_last_xact_replay_timestamp() is NULL, so the lag is taken from the
# view's own replay_lag interval instead.
# Assumes PostgreSQL-compatible (10+) semantics for this view — confirm for entropyDB.
SELECT 
  client_addr,
  state,
  sent_lsn,
  write_lsn,
  flush_lsn,
  replay_lsn,
  sync_state,
  EXTRACT(EPOCH FROM replay_lag) AS lag_seconds
FROM pg_stat_replication;

# Replay delay as seen from the standby (run on the standby)
SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) AS lag_seconds;

# Promote standby to primary (failover)
entropy-admin promote --immediate

Disaster Recovery Testing

#!/bin/bash
# DR Testing Checklist Script
#
# Walks through the disaster-recovery checklist in order:
#   1. a backup exists in the backup bucket
#   2. the WAL archive bucket is not empty
#   3. the latest backup can be unpacked on the DR cluster
#   4. a checksum over users.id matches between production and DR
#   5. how long promoting the DR cluster takes
# Stops with a non-zero exit status at the first check that fails.

set -euo pipefail

readonly BACKUP_BUCKET="s3://my-backups/entropydb"
readonly WAL_BUCKET="s3://my-wal-archive"
readonly DR_HOST="dr-cluster"
readonly RESTORE_DIR="/var/lib/entropydb/test-restore"
# ORDER BY sits inside string_agg(): that makes the concatenation order
# deterministic, and a trailing ORDER BY on an aggregate-only query is an error.
readonly CHECKSUM_SQL="SELECT md5(string_agg(id::text, '' ORDER BY id)) FROM users;"

echo "=== entropyDB Disaster Recovery Test ==="
echo "Test Date: $(date)"
echo ""

# 1. Verify backup exists
echo "1. Checking latest backup..."
# `|| true`: an empty or failed listing is reported by the check below
# instead of aborting silently under `set -e`.
LATEST_BACKUP=$(aws s3 ls "$BACKUP_BUCKET/" | tail -n 1 | awk '{print $4}') || true
if [[ -n "$LATEST_BACKUP" ]]; then
  echo "✓ Latest backup: $LATEST_BACKUP"
else
  echo "✗ No backup found!"
  exit 1
fi

# 2. Verify WAL archiving
echo "2. Checking WAL archiving..."
# `|| true`: same reasoning as for the backup listing above
WAL_COUNT=$(aws s3 ls "$WAL_BUCKET/" | wc -l) || true
if (( WAL_COUNT > 0 )); then
  echo "✓ WAL archive has $WAL_COUNT files"
else
  echo "✗ WAL archive is empty!"
  exit 1
fi

# 3. Test restore on DR cluster
echo "3. Testing restore on DR cluster..."
# NOTE(review): this stops the `entropydb` unit but starts `entropydb-test`
# after the restore — confirm that is the intended pair of services.
ssh "$DR_HOST" "systemctl stop entropydb"
ssh "$DR_HOST" "rm -rf ${RESTORE_DIR:?}/*"
# Stream the archive from S3 straight into tar on the DR host. Downloading it
# to the local /tmp would leave nothing for the remote tar to read.
aws s3 cp "$BACKUP_BUCKET/$LATEST_BACKUP" - \
  | ssh "$DR_HOST" "tar -xzf - -C ${RESTORE_DIR}/"
ssh "$DR_HOST" "systemctl start entropydb-test"

# 4. Verify data integrity
echo "4. Verifying data integrity..."
CHECKSUM_PROD=$(psql -h production -At -c "$CHECKSUM_SQL")
sleep 10  # Wait for recovery
CHECKSUM_DR=$(psql -h "$DR_HOST" -At -c "$CHECKSUM_SQL")

# An empty checksum means there was nothing to compare, so it is a failure
if [[ -n "$CHECKSUM_PROD" && "$CHECKSUM_PROD" == "$CHECKSUM_DR" ]]; then
  echo "✓ Data integrity verified"
else
  echo "✗ Data mismatch detected!"
  exit 1
fi

# 5. Test failover time
echo "5. Measuring failover time..."
START_TIME=$(date +%s)
ssh "$DR_HOST" "entropy-admin promote --immediate"
END_TIME=$(date +%s)
FAILOVER_TIME=$((END_TIME - START_TIME))
echo "✓ Failover completed in $FAILOVER_TIME seconds"

echo ""
echo "=== DR Test Complete ==="
echo "All checks passed! RTO: $FAILOVER_TIME seconds"

Cloud Backup Integration

AWS S3

# Store the access key pair and default region in the AWS CLI configuration
aws configure set aws_access_key_id YOUR_ACCESS_KEY
aws configure set aws_secret_access_key YOUR_SECRET_KEY
aws configure set default.region us-west-2

# Create the backup bucket in us-west-2, then enable versioning on it
bucket=entropydb-backups
region=us-west-2
aws s3api create-bucket --bucket "$bucket" --region "$region" \
  --create-bucket-configuration "LocationConstraint=$region"

aws s3api put-bucket-versioning --bucket "$bucket" \
  --versioning-configuration Status=Enabled

# Configure lifecycle policy
# Objects move to STANDARD_IA after 30 days, to GLACIER after 90 days and
# expire after 365 days.
# The AWS CLI expects the rule key "ID" (upper case) and a Filter on each
# rule; an empty prefix applies the rule to every object in the bucket.
# The quoted heredoc delimiter keeps the JSON literal (no shell expansion).
cat > lifecycle.json << 'EOF'
{
  "Rules": [{
    "ID": "BackupRetention",
    "Status": "Enabled",
    "Filter": { "Prefix": "" },
    "Transitions": [
      {
        "Days": 30,
        "StorageClass": "STANDARD_IA"
      },
      {
        "Days": 90,
        "StorageClass": "GLACIER"
      }
    ],
    "Expiration": {
      "Days": 365
    }
  }]
}
EOF

aws s3api put-bucket-lifecycle-configuration \
  --bucket entropydb-backups \
  --lifecycle-configuration file://lifecycle.json

# Stream a full backup straight into S3; the object name carries today's date
backup_key="backup-$(date +%Y%m%d).tar.gz"
entropydb-backup full --database mydb --output - |
  aws s3 cp - "s3://entropydb-backups/$backup_key"

# Stream a stored backup out of S3 and into entropydb-restore
aws s3 cp s3://entropydb-backups/backup-20240115.tar.gz - |
  entropydb-restore --input - --database mydb

Azure Blob Storage

# Install the Azure CLI with the script served at aka.ms/InstallAzureCLIDeb
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Sign in and pick the subscription to work in
az login
az account set --subscription "Your-Subscription-ID"

# Create a Standard_GRS storage account in westus2
storage_account=entropydbbackups
az storage account create --name "$storage_account" \
  --resource-group entropydb-rg \
  --location westus2 --sku Standard_GRS

# Create the "backups" container inside that account
az storage container create --name backups \
  --account-name "$storage_account"

# Configure lifecycle management
# Blobs move to the cool tier 30 days after modification, to the archive
# tier after 90 days and are deleted after 365 days.
# The quoted heredoc delimiter keeps the JSON literal (no shell expansion).
cat > lifecycle-policy.json << 'EOF'
{
  "rules": [{
    "name": "BackupRetention",
    "enabled": true,
    "type": "Lifecycle",
    "definition": {
      "actions": {
        "baseBlob": {
          "tierToCool": { "daysAfterModificationGreaterThan": 30 },
          "tierToArchive": { "daysAfterModificationGreaterThan": 90 },
          "delete": { "daysAfterModificationGreaterThan": 365 }
        }
      }
    }
  }]
}
EOF

# --resource-group is a required argument of this command; it is the group
# the storage account was created in.
az storage account management-policy create \
  --account-name entropydbbackups \
  --resource-group entropydb-rg \
  --policy @lifecycle-policy.json

# Pipe a full backup into a blob named after today's date
blob_name="backup-$(date +%Y%m%d).tar.gz"
entropydb-backup full --database mydb --output - |
  az storage blob upload \
    --account-name entropydbbackups \
    --container-name backups \
    --name "$blob_name" \
    --file /dev/stdin

Common Recovery Scenarios

Accidental Data Deletion

Scenario: User accidentally deleted critical data at 2:30 PM

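Solution: Use PITR to restore a copy of the database to 2:29 PM, just before the deletion, then copy the lost rows back into the live database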

# Create a clone for recovery
# recovery_db starts as a copy of mydb, so the live database is left untouched
# while the clone is rewound.
CREATE DATABASE recovery_db TEMPLATE mydb;

# Restore to point before deletion
# NOTE(review): --target-time carries no time zone — confirm which zone
# entropy-admin assumes for it.
entropy-admin pitr-restore \
  --database recovery_db \
  --target-time "2024-01-15 14:29:00" \
  --wal-source s3://my-wal-archive

# Extract deleted data
# Streams rows from recovery_db.users into mydb.users through a pipe.
# NOTE(review): the filter selects every row with deleted_at IS NULL, not only
# the rows that were lost; rows still present in mydb would presumably collide
# on primary/unique keys — confirm the WHERE clause before running.
psql recovery_db -c "COPY (SELECT * FROM users WHERE deleted_at IS NULL) TO STDOUT" | \
  psql mydb -c "COPY users FROM STDIN"

# Verify and drop recovery database
# Check the restored rows in mydb first; the DROP discards the recovered copy.
DROP DATABASE recovery_db;

Corruption Detection

# Run corruption check
entropydb-admin check --full

# If corruption detected, restore from backup
systemctl stop entropydb
# Keep the damaged files so they can be analysed afterwards
mv /var/lib/entropydb/data /var/lib/entropydb/data.corrupted
# The mv above removed the data directory and tar -C needs it to exist,
# so recreate it before extracting.
# NOTE(review): set the owner and mode the service expects on the new
# directory before starting it — confirm the service account.
mkdir -p /var/lib/entropydb/data
tar -xzf /backups/latest-backup.tar.gz -C /var/lib/entropydb/data
systemctl start entropydb

# Analyze corrupted data
entropydb-admin analyze-corruption /var/lib/entropydb/data.corrupted

Complete Cluster Failure

# Rebuild the whole cluster from backup
# 1. Bring up replacement infrastructure from the DR variable file
terraform apply -var-file=disaster-recovery.tfvars

# 2. Restore the primary from the latest backup plus the WAL archive
entropy-admin restore-cluster \
  --backup s3://entropydb-backups/latest.tar.gz \
  --wal-archive s3://my-wal-archive \
  --target-time latest

# 3. Attach the remaining nodes to the restored primary (node1)
for node in node2 node3; do
  ssh "$node" "entropy-admin join-cluster --primary node1"
done

# 4. Check cluster status, then data integrity
entropy-admin cluster-status
entropy-admin verify-data-integrity

Best Practices

Backup Strategy

• Follow 3-2-1 rule: 3 copies, 2 media types, 1 offsite
• Automate backups with schedules
• Encrypt backups at rest
• Test backups regularly

Recovery Planning

• Define RTO and RPO requirements
• Document recovery procedures
• Practice disaster recovery drills
• Maintain runbooks

Monitoring

• Monitor backup success/failure
• Alert on backup delays
• Track backup size trends
• Verify backup integrity

Security

• Encrypt backups in transit and at rest
• Restrict backup access with IAM
• Audit backup operations
• Rotate encryption keys

Next Steps

Deployment

Deploy entropyDB clusters

Monitoring

Set up monitoring and alerts

Scaling

Scale your deployment