Lesson 7.3: Building the Feature Store
Schema for Real-Time Features
CREATE TABLE feature_definitions (
id BIGSERIAL PRIMARY KEY,
name TEXT UNIQUE NOT NULL,
description TEXT,
data_type TEXT NOT NULL,
source TEXT,
refresh_interval_seconds INT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE TABLE user_features (
user_id BIGINT PRIMARY KEY,
messages_sent_7d INT DEFAULT 0,
messages_sent_30d INT DEFAULT 0,
avg_response_time_seconds FLOAT,
thumbs_up_count INT DEFAULT 0,
thumbs_down_count INT DEFAULT 0,
satisfaction_score FLOAT,
last_interaction_at TIMESTAMPTZ,
days_since_last_interaction INT,
computed_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX user_features_lookup_idx ON user_features(user_id)
INCLUDE (messages_sent_7d, messages_sent_30d, satisfaction_score);
CREATE MATERIALIZED VIEW feature_stats AS
SELECT
'messages_sent_7d' AS feature_name,
AVG(messages_sent_7d) AS avg_value,
STDDEV(messages_sent_7d) AS stddev,
MIN(messages_sent_7d) AS min_value,
MAX(messages_sent_7d) AS max_value,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY messages_sent_7d) AS median
FROM user_features
UNION ALL
SELECT
'satisfaction_score',
AVG(satisfaction_score),
STDDEV(satisfaction_score),
MIN(satisfaction_score),
MAX(satisfaction_score),
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY satisfaction_score)
FROM user_features;
Feature Computation Pipeline
CREATE OR REPLACE FUNCTION refresh_user_features()
RETURNS void AS $$
BEGIN
INSERT INTO user_features (
user_id,
messages_sent_7d,
messages_sent_30d,
last_interaction_at,
days_since_last_interaction,
computed_at
)
SELECT
user_id,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days') AS messages_7d,
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '30 days') AS messages_30d,
MAX(created_at) AS last_interaction,
EXTRACT(DAY FROM NOW() - MAX(created_at)) AS days_since,
NOW() AS computed
FROM messages
WHERE role = 'user'
GROUP BY user_id
ON CONFLICT (user_id) DO UPDATE SET
messages_sent_7d = EXCLUDED.messages_sent_7d,
messages_sent_30d = EXCLUDED.messages_sent_30d,
last_interaction_at = EXCLUDED.last_interaction_at,
days_since_last_interaction = EXCLUDED.days_since_last_interaction,
computed_at = EXCLUDED.computed_at;
END;
$$ LANGUAGE plpgsql;
Key Takeaways
- Feature store provides sub-100ms feature lookups for ML models
- Denormalized design trades storage for speed (pre-computed features)
- Covering index enables index-only scans for hot-path queries
- Feature definitions table stores metadata about feature computation
- Materialized views pre-compute expensive aggregations for analytics
- Refresh function uses UPSERT pattern to update features periodically
- Time-windowed features (7d, 30d) capture recency and engagement
- Computed_at timestamp tracks feature freshness for cache invalidation