Lesson 4.6: Schema Design Patterns for AI
Pattern 1: Feature Store Schema
-- Entities (users, products, etc.)
CREATE TABLE entities (
id BIGSERIAL PRIMARY KEY,
entity_type TEXT NOT NULL, -- 'user', 'product', 'session'
external_id TEXT NOT NULL,
UNIQUE(entity_type, external_id)
);
-- Features (pre-computed)
CREATE TABLE features (
entity_id BIGINT NOT NULL REFERENCES entities(id),
feature_name TEXT NOT NULL,
feature_value NUMERIC,
computed_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (entity_id, feature_name)
);
CREATE INDEX features_entity_computed_idx ON features(entity_id, computed_at DESC);
-- Fast feature lookup
SELECT feature_name, feature_value
FROM features
WHERE entity_id = 12345
AND computed_at = (SELECT MAX(computed_at) FROM features WHERE entity_id = 12345);
Pattern 2: RAG Document Store
-- Documents
CREATE TABLE documents (
id BIGSERIAL PRIMARY KEY,
source TEXT NOT NULL, -- 'upload', 'web', 'api'
title TEXT,
content TEXT,
metadata JSONB,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Document chunks (for embedding)
CREATE TABLE document_chunks (
id BIGSERIAL PRIMARY KEY,
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INT NOT NULL,
content TEXT NOT NULL,
token_count INT,
embedding VECTOR(1536),
UNIQUE(document_id, chunk_index)
);
CREATE INDEX chunks_embedding_idx ON document_chunks USING hnsw(embedding vector_cosine_ops);
-- Permissions
CREATE TABLE document_permissions (
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
user_id BIGINT NOT NULL,
permission TEXT NOT NULL, -- 'read', 'write', 'admin'
PRIMARY KEY (document_id, user_id)
);
Pattern 3: Model Prediction Logs
CREATE TABLE predictions (
id BIGSERIAL,
model_id BIGINT NOT NULL REFERENCES ml_models(id),
input_data JSONB NOT NULL,
prediction JSONB NOT NULL,
confidence FLOAT,
latency_ms INT,
created_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (id, created_at) -- Partition key
) PARTITION BY RANGE (created_at);
-- Monthly partitions
CREATE TABLE predictions_2024_01 PARTITION OF predictions
FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
-- Indexes per partition
CREATE INDEX predictions_2024_01_model_idx ON predictions_2024_01(model_id, created_at DESC);
Module 4 Summary
Key Principles
- Hybrid normalization: Normalize transactional data, denormalize ML features
- Avoid hot rows: Use sharding or materialized views
- Partition large tables: Time-based partitions for logs and predictions
- JSONB for flexibility: But columns for frequently queried data
- Soft deletes + audit: Never lose data, maintain history
- Metadata tables: Track models, features, datasets for reproducibility
Schema Design Checklist
- Identify hot paths (high-concurrency queries)
- Denormalize read paths, normalize write paths
- Add soft delete columns where needed
- Use JSONB only for truly flexible data
- Partition time-series tables (logs, events, predictions)
- Create metadata tables for ML artifacts
- Add audit tables for compliance
- Plan for growth (partitioning, archival strategy)
What's Next
Module 5 covers SQL performance optimization for AI workloads:
- Identifying slow query patterns
- Handling massive writes from ML systems
- Real-time ingestion strategies
- Scaling reads with replicas
Continue to Module 5: SQL Performance for AI

