Lesson 4.6: Schema Design Patterns for AI

Pattern 1: Feature Store Schema

-- Entities (users, products, etc.)
CREATE TABLE entities (
    id BIGSERIAL PRIMARY KEY,
    entity_type TEXT NOT NULL,  -- 'user', 'product', 'session'
    external_id TEXT NOT NULL,
    UNIQUE(entity_type, external_id)
);

-- Features (pre-computed)
CREATE TABLE features (
    entity_id BIGINT NOT NULL REFERENCES entities(id),
    feature_name TEXT NOT NULL,
    feature_value NUMERIC,
    computed_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (entity_id, feature_name)
);

CREATE INDEX features_entity_computed_idx ON features(entity_id, computed_at DESC);

-- Fast feature lookup
SELECT feature_name, feature_value
FROM features
WHERE entity_id = 12345
    AND computed_at = (SELECT MAX(computed_at) FROM features WHERE entity_id = 12345);

Pattern 2: RAG Document Store

-- Documents
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    source TEXT NOT NULL,  -- 'upload', 'web', 'api'
    title TEXT,
    content TEXT,
    metadata JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Document chunks (for embedding)
CREATE TABLE document_chunks (
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_index INT NOT NULL,
    content TEXT NOT NULL,
    token_count INT,
    embedding VECTOR(1536),
    UNIQUE(document_id, chunk_index)
);

CREATE INDEX chunks_embedding_idx ON document_chunks USING hnsw(embedding vector_cosine_ops);

-- Permissions
CREATE TABLE document_permissions (
    document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    user_id BIGINT NOT NULL,
    permission TEXT NOT NULL,  -- 'read', 'write', 'admin'
    PRIMARY KEY (document_id, user_id)
);

Pattern 3: Model Prediction Logs

CREATE TABLE predictions (
    id BIGSERIAL,
    model_id BIGINT NOT NULL REFERENCES ml_models(id),
    input_data JSONB NOT NULL,
    prediction JSONB NOT NULL,
    confidence FLOAT,
    latency_ms INT,
    created_at TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (id, created_at)  -- Partition key
) PARTITION BY RANGE (created_at);

-- Monthly partitions
CREATE TABLE predictions_2024_01 PARTITION OF predictions
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

-- Indexes per partition
CREATE INDEX predictions_2024_01_model_idx ON predictions_2024_01(model_id, created_at DESC);

Module 4 Summary

Key Principles

Hybrid normalization: Normalize transactional data, denormalize ML features
Avoid hot rows: Use sharding or materialized views
Partition large tables: Time-based partitions for logs and predictions
JSONB for flexibility: But columns for frequently queried data
Soft deletes + audit: Never lose data, maintain history
Metadata tables: Track models, features, datasets for reproducibility

Schema Design Checklist

Identify hot paths (high-concurrency queries)
Denormalize read paths, normalize write paths
Add soft delete columns where needed
Use JSONB only for truly flexible data
Partition time-series tables (logs, events, predictions)
Create metadata tables for ML artifacts
Add audit tables for compliance
Plan for growth (partitioning, archival strategy)

What's Next

Module 5 covers SQL performance optimization for AI workloads:

Identifying slow query patterns
Handling massive writes from ML systems
Real-time ingestion strategies
Scaling reads with replicas

Continue to Module 5: SQL Performance for AI

Pattern 1: Feature Store Schema

-- Entities (users, products, etc.) CREATE TABLE entities ( id BIGSERIAL PRIMARY KEY, entity_type TEXT NOT NULL, -- 'user', 'product', 'session' external_id TEXT NOT NULL, UNIQUE(entity_type, external_id) ); -- Features (pre-computed) CREATE TABLE features ( entity_id BIGINT NOT NULL REFERENCES entities(id), feature_name TEXT NOT NULL, feature_value NUMERIC, computed_at TIMESTAMPTZ DEFAULT NOW(), PRIMARY KEY (entity_id, feature_name) ); CREATE INDEX features_entity_computed_idx ON features(entity_id, computed_at DESC); -- Fast feature lookup SELECT feature_name, feature_value FROM features WHERE entity_id = 12345 AND computed_at = (SELECT MAX(computed_at) FROM features WHERE entity_id = 12345);

Pattern 2: RAG Document Store

-- Documents CREATE TABLE documents ( id BIGSERIAL PRIMARY KEY, source TEXT NOT NULL, -- 'upload', 'web', 'api' title TEXT, content TEXT, metadata JSONB, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW() ); -- Document chunks (for embedding) CREATE TABLE document_chunks ( id BIGSERIAL PRIMARY KEY, document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE, chunk_index INT NOT NULL, content TEXT NOT NULL, token_count INT, embedding VECTOR(1536), UNIQUE(document_id, chunk_index) ); CREATE INDEX chunks_embedding_idx ON document_chunks USING hnsw(embedding vector_cosine_ops); -- Permissions CREATE TABLE document_permissions ( document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE, user_id BIGINT NOT NULL, permission TEXT NOT NULL, -- 'read', 'write', 'admin' PRIMARY KEY (document_id, user_id) );

Pattern 3: Model Prediction Logs

CREATE TABLE predictions ( id BIGSERIAL, model_id BIGINT NOT NULL REFERENCES ml_models(id), input_data JSONB NOT NULL, prediction JSONB NOT NULL, confidence FLOAT, latency_ms INT, created_at TIMESTAMPTZ DEFAULT NOW(), PRIMARY KEY (id, created_at) -- Partition key ) PARTITION BY RANGE (created_at); -- Monthly partitions CREATE TABLE predictions_2024_01 PARTITION OF predictions FOR VALUES FROM ('2024-01-01') TO ('2024-02-01'); -- Indexes per partition CREATE INDEX predictions_2024_01_model_idx ON predictions_2024_01(model_id, created_at DESC);

Module 4 Summary

Key Principles

Hybrid normalization: Normalize transactional data, denormalize ML features

Avoid hot rows: Use sharding or materialized views

Partition large tables: Time-based partitions for logs and predictions

JSONB for flexibility: But columns for frequently queried data

Soft deletes + audit: Never lose data, maintain history

Metadata tables: Track models, features, datasets for reproducibility

Schema Design Checklist

Identify hot paths (high-concurrency queries)

Denormalize read paths, normalize write paths

Add soft delete columns where needed

Use JSONB only for truly flexible data

Partition time-series tables (logs, events, predictions)

Create metadata tables for ML artifacts

Add audit tables for compliance

Plan for growth (partitioning, archival strategy)

What's Next

Module 5 covers SQL performance optimization for AI workloads:

Identifying slow query patterns

Handling massive writes from ML systems

Real-time ingestion strategies

Scaling reads with replicas

Continue to Module 5: SQL Performance for AI