←Back to Documentation
Feature Store
Centralized feature management for ML model training and serving
Overview
entropyDB Feature Store provides:
- Offline Features: Historical features for model training
- Online Features: Low-latency features for inference
- Point-in-Time Correctness: Prevent data leakage
- Feature Versioning: Track feature evolution
- Feature Monitoring: Detect drift and anomalies
Define Feature Groups
-- Create feature group
-- A feature group is a timestamped entity table: one row per user per
-- snapshot. feature_timestamp is what the point-in-time joins below key on.
CREATE FEATURE GROUP user_features (
user_id BIGINT PRIMARY KEY,
age INTEGER,
account_age_days INTEGER,
total_purchases DECIMAL,
avg_purchase_value DECIMAL,
purchase_frequency DECIMAL,
last_purchase_days_ago INTEGER,
favorite_category TEXT,
-- Snapshot time of this feature row; defaults to insertion time
feature_timestamp TIMESTAMP DEFAULT NOW()
);
-- Register feature transformations
-- Derived feature: coarse age bucket computed from user_features.age.
-- NOTE(review): a NULL age falls through to 'senior' via ELSE — confirm intended.
CREATE FEATURE user_age_group AS
CASE
WHEN age < 25 THEN 'young'
WHEN age < 45 THEN 'middle'
ELSE 'senior'
END
FROM user_features;
-- Derived feature: spending tier bucketed from lifetime total_purchases.
-- Thresholds (100 / 1000) are business-defined cut points.
CREATE FEATURE user_spending_tier AS
CASE
WHEN total_purchases < 100 THEN 'low'
WHEN total_purchases < 1000 THEN 'medium'
ELSE 'high'
END
FROM user_features;
-- Time-windowed features
-- Rolling 30-day purchase count per user, recomputed hourly.
-- NOTE(review): users with zero orders in the window get no row at all —
-- consumers should treat a missing row as count = 0.
CREATE FEATURE user_30d_purchase_count AS
SELECT
user_id,
COUNT(*) as purchase_count,
NOW() as feature_timestamp
FROM orders
WHERE created_at > NOW() - INTERVAL '30 days'
GROUP BY user_id
REFRESH EVERY '1 hour';
-- Aggregate features
-- 7-day rolling transaction aggregates, materialized from the transactions
-- table and refreshed hourly by the feature store runtime.
CREATE FEATURE GROUP transaction_features (
user_id BIGINT,
transaction_count_7d INTEGER,
avg_transaction_amount_7d DECIMAL,
max_transaction_amount_7d DECIMAL,
unique_merchants_7d INTEGER,
-- Time at which this aggregate snapshot was computed
feature_timestamp TIMESTAMP
) WITH (
source_table = 'transactions',
window = '7 days',
refresh_interval = '1 hour'
);
Offline Feature Retrieval (Training)
-- Get historical features with point-in-time correctness
-- For each training transaction, fetch the most recent feature row whose
-- feature_timestamp is at or before the transaction time, so the model never
-- trains on information from the future (no feature leakage).
SELECT
t.user_id,
t.transaction_id,
t.amount,
t.fraud_label, -- Target variable
-- Point-in-time join: get features as they were at transaction time
uf.age,
uf.total_purchases,
uf.purchase_frequency,
tf.transaction_count_7d,
tf.avg_transaction_amount_7d
FROM transactions t
LEFT JOIN LATERAL (
-- Latest user_features snapshot at or before the transaction
SELECT age, total_purchases, purchase_frequency
FROM user_features
WHERE user_id = t.user_id
AND feature_timestamp <= t.created_at
ORDER BY feature_timestamp DESC
LIMIT 1
) uf ON true
LEFT JOIN LATERAL (
-- Latest transaction_features snapshot at or before the transaction
SELECT transaction_count_7d, avg_transaction_amount_7d
FROM transaction_features
WHERE user_id = t.user_id
AND feature_timestamp <= t.created_at
ORDER BY feature_timestamp DESC
LIMIT 1
) tf ON true
-- Half-open range: BETWEEN is closed on both ends, so the original
-- '2024-01-01' AND '2024-12-31' included only the first instant of Dec 31
-- and silently dropped the rest of that day.
WHERE t.created_at >= '2024-01-01' AND t.created_at < '2025-01-01';
-- Feature engineering for training dataset
WITH training_data AS (
SELECT
t.*,
-- User features
-- Days since signup at transaction time (epoch seconds / 86400)
EXTRACT(EPOCH FROM (t.created_at - u.signup_date)) / 86400 as account_age_days,
-- Transaction velocity
-- NOTE(review): the RANGE frame includes the current transaction itself,
-- so the minimum value is 1, never 0 — confirm that is intended.
COUNT(*) OVER (
PARTITION BY t.user_id
ORDER BY t.created_at
RANGE BETWEEN INTERVAL '24 hours' PRECEDING AND CURRENT ROW
) as transactions_last_24h,
-- Amount statistics
-- Mean of the previous 10 transactions; frame ends at 1 PRECEDING so the
-- current row is excluded (leakage-free). NULL on a user's first transaction.
AVG(amount) OVER (
PARTITION BY t.user_id
ORDER BY t.created_at
ROWS BETWEEN 10 PRECEDING AND 1 PRECEDING
) as avg_amount_last_10,
-- Time features
EXTRACT(HOUR FROM t.created_at) as hour_of_day,
EXTRACT(DOW FROM t.created_at) as day_of_week -- DOW: 0 = Sunday in Postgres
FROM transactions t
JOIN users u ON t.user_id = u.id
WHERE t.created_at >= '2024-01-01'
)
-- Keep only labeled rows for supervised training
SELECT * FROM training_data
WHERE fraud_label IS NOT NULL;
-- Export training dataset
-- Materialize the full-year point-in-time training set to Parquet for
-- consumption by the offline ML pipeline.
COPY (
SELECT * FROM get_training_features(
start_date => '2024-01-01',
end_date => '2024-12-31',
feature_groups => ARRAY['user_features', 'transaction_features']
)
) TO '/ml/training_data.parquet' WITH (FORMAT parquet);
Online Feature Serving (Inference)
-- Get latest features for real-time prediction
-- Returns the most recent feature row per entity id from the online store.
SELECT * FROM get_online_features(
entity_ids => ARRAY[123, 456, 789],
feature_groups => ARRAY['user_features', 'transaction_features']
);
-- REST API endpoint for feature serving
POST /api/features/online
{
"entity_ids": [123, 456, 789],
"feature_groups": ["user_features", "transaction_features"]
}
-- Response:
{
"features": [
{
"user_id": 123,
"age": 32,
"total_purchases": 1250.50,
"transaction_count_7d": 5,
"avg_transaction_amount_7d": 45.30
},
...
]
}
-- Cached feature serving with Redis
-- Features are automatically cached for low latency
-- ttl bounds staleness: online reads may lag the store by up to 5 minutes.
CREATE FEATURE CACHE ON user_features
WITH (
ttl => '5 minutes',
cache_backend => 'redis',
warm_up_on_startup => true
);
-- Batch feature serving
-- Each ROW(entity_id, feature_group) pair is resolved independently, so one
-- call can mix entities from different feature groups.
SELECT * FROM get_online_features_batch(
ARRAY[
ROW(123, 'user_features'),
ROW(456, 'user_features'),
ROW(789, 'transaction_features')
]
);
-- Feature vector for model prediction
-- Flattens the named features into an array whose order matches
-- feature_names, ready to hand to a model's predict endpoint.
SELECT entropy_feature_vector(
user_id => 123,
feature_groups => ARRAY['user_features', 'transaction_features'],
feature_names => ARRAY[
'age', 'total_purchases', 'transaction_count_7d',
'avg_transaction_amount_7d', 'hour_of_day'
]
) as feature_vector;
-- Returns: [32, 1250.50, 5, 45.30, 14]
Feature Versioning & Lineage
-- Version features CREATE FEATURE GROUP user_features_v2 ( user_id BIGINT PRIMARY KEY, age INTEGER, account_age_days INTEGER, -- New features in v2 credit_score INTEGER, income_bracket TEXT, -- Modified feature total_purchases_normalized DECIMAL, feature_timestamp TIMESTAMP ); -- Tag feature version ALTER FEATURE GROUP user_features_v2 SET TAG 'production_candidate'; -- Track feature lineage SELECT feature_name, source_table, transformation_sql, created_at, created_by, version FROM entropy_feature_lineage WHERE feature_group = 'user_features'; -- Compare feature versions SELECT v1.user_id, v1.total_purchases as v1_total, v2.total_purchases_normalized as v2_total_norm, ABS(v1.total_purchases - v2.total_purchases_normalized * 1000) as diff FROM user_features v1 JOIN user_features_v2 v2 ON v1.user_id = v2.user_id WHERE ABS(v1.total_purchases - v2.total_purchases_normalized * 1000) > 100; -- Feature schema evolution ALTER FEATURE GROUP user_features ADD COLUMN social_media_followers INTEGER, ADD COLUMN email_verified BOOLEAN; -- Rollback to previous version ALTER FEATURE GROUP user_features RESTORE VERSION '2024-01-15T10:30:00Z';
Feature Monitoring
-- Monitor feature drift
-- entropy_feature_drift returns one (feature_name, drift_score) row per
-- feature, so it belongs in FROM as a set-returning function. The original
-- aliased a single call to two columns and had a WHERE with no FROM, which
-- is not valid SQL.
SELECT feature_name, drift_score
FROM entropy_feature_drift(
feature_group => 'user_features',
baseline_start => '2024-01-01',
baseline_end => '2024-01-31',
current_start => '2024-06-01',
current_end => '2024-06-30',
method => 'psi' -- Population Stability Index
)
-- PSI above 0.1 is a common "moderate drift, investigate" threshold
WHERE drift_score > 0.1;
-- Feature statistics over time
-- Daily summary statistics for the trailing 30 days; chart these to spot
-- sudden shifts in feature distributions before they hurt model quality.
SELECT
DATE(feature_timestamp) as date,
AVG(age) as avg_age,
STDDEV(age) as stddev_age,
AVG(total_purchases) as avg_purchases,
-- Median is robust to the outliers that skew avg_purchases
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_purchases) as median_purchases
FROM user_features
WHERE feature_timestamp > NOW() - INTERVAL '30 days'
GROUP BY DATE(feature_timestamp)
ORDER BY date;
-- Detect feature anomalies
-- Flag rows from the last day whose total_purchases sits more than three
-- standard deviations from the 30-day mean.
WITH feature_stats AS (
-- Baseline distribution over the trailing 30 days
SELECT
AVG(total_purchases) as mean,
STDDEV(total_purchases) as stddev
FROM user_features
WHERE feature_timestamp > NOW() - INTERVAL '30 days'
)
SELECT
uf.user_id,
uf.total_purchases,
-- NULLIF guards against division by zero when the feature is constant
-- (stddev = 0); such rows yield NULL and are excluded by the WHERE below.
(uf.total_purchases - fs.mean) / NULLIF(fs.stddev, 0) as z_score
FROM user_features uf
CROSS JOIN feature_stats fs
WHERE ABS((uf.total_purchases - fs.mean) / NULLIF(fs.stddev, 0)) > 3
AND uf.feature_timestamp > NOW() - INTERVAL '1 day';
-- Feature freshness monitoring
-- Surface feature groups whose latest row is older than the expected hourly
-- refresh cadence; an empty result means everything is fresh.
SELECT
feature_group,
MAX(feature_timestamp) as last_update,
NOW() - MAX(feature_timestamp) as staleness
FROM (
SELECT 'user_features' as feature_group, MAX(feature_timestamp) as feature_timestamp FROM user_features
UNION ALL
SELECT 'transaction_features', MAX(feature_timestamp) FROM transaction_features
) sub
GROUP BY feature_group
HAVING NOW() - MAX(feature_timestamp) > INTERVAL '1 hour';
Best Practices
Design
- Ensure point-in-time correctness
- Version all feature transformations
- Document feature definitions
- Track feature lineage
Operations
- Monitor feature drift
- Cache online features
- Automate feature refresh
- Test feature changes