Lesson 4.2: How AI Pipelines Break Naive Schemas
Problem 1: Write Amplification
Naive schema:
-- Update user stats after every order
CREATE TABLE users (
  id BIGSERIAL PRIMARY KEY,
  order_count INT DEFAULT 0,
  total_spent NUMERIC DEFAULT 0
);
-- Every order = 2 writes (order + user update)
INSERT INTO orders (...) VALUES (...);
UPDATE users
SET order_count = order_count + 1,
    total_spent = total_spent + 49.99
WHERE id = 123;
At scale:
- 10k orders/second becomes 20k writes/second
- Row-level lock contention on hot users: concurrent orders for the same user serialize on the UPDATE (see the diagnostic sketch below)
- Order inserts stall inside their transactions, waiting for the user row lock
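You can watch this contention happen: Postgres exposes per-backend wait events in the standard pg_stat_activity view. A minimal diagnostic sketch:

-- Group active backends by wait event to spot lock pile-ups
SELECT wait_event_type, wait_event, count(*)
FROM pg_stat_activity
WHERE state = 'active'
GROUP BY wait_event_type, wait_event
ORDER BY count(*) DESC;
-- Many rows with wait_event_type = 'Lock' (e.g. transactionid or tuple)
-- point at hot-row updates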
Better schema:
-- Append-only orders (fast writes)
CREATE TABLE orders (
  id BIGSERIAL PRIMARY KEY,
  user_id BIGINT NOT NULL,
  total NUMERIC NOT NULL,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX orders_user_created_idx ON orders (user_id, created_at DESC);
-- Materialized view (refreshed periodically)
CREATE MATERIALIZED VIEW user_stats AS
SELECT
  user_id,
  COUNT(*) AS order_count,
  SUM(total) AS total_spent,
  MAX(created_at) AS last_order
FROM orders
GROUP BY user_id;
-- Unique index is required for REFRESH ... CONCURRENTLY
CREATE UNIQUE INDEX user_stats_user_idx ON user_stats (user_id);
-- Refresh every 5 minutes (stats lag slightly, but order writes stay fast)
REFRESH MATERIALIZED VIEW CONCURRENTLY user_stats;
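The refresh has to be scheduled somewhere. One option, sketched here under the assumption that the pg_cron extension is installed, is an in-database cron job; any external scheduler works just as well:

-- Run the refresh every 5 minutes via pg_cron (assumed installed)
SELECT cron.schedule(
  'refresh-user-stats',                                  -- job name
  '*/5 * * * *',                                         -- every 5 minutes
  $$REFRESH MATERIALIZED VIEW CONCURRENTLY user_stats$$  -- command to run
);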
Problem 2: Hot Row Contention
Naive schema:
-- Global counter (a disaster at scale)
CREATE TABLE stats (
  key TEXT PRIMARY KEY,
  value BIGINT
);
-- Every request updates the same row
UPDATE stats SET value = value + 1 WHERE key = 'page_views';
-- Row lock contention: only one transaction can update the row at a time;
-- every other writer queues behind it
Better schema:
-- Sharded counters (no single hot row)
CREATE TABLE stats_shards (
  key TEXT NOT NULL,
  shard INT NOT NULL,
  value BIGINT DEFAULT 0,
  PRIMARY KEY (key, shard)
);
-- Initialize 100 shards
INSERT INTO stats_shards (key, shard, value)
SELECT 'page_views', generate_series(0, 99), 0;
-- Increment one random shard (writes spread across 100 rows)
-- The subquery computes the shard once; a bare random() in WHERE is
-- re-evaluated per row and can match zero or several shards
UPDATE stats_shards
SET value = value + 1
WHERE key = 'page_views'
  AND shard = (SELECT floor(random() * 100)::int);
-- Read the total
SELECT SUM(value) FROM stats_shards WHERE key = 'page_views';
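A variant worth knowing: an upsert removes the initialization step entirely, creating each shard row on first touch. A sketch using standard ON CONFLICT:

-- Upsert variant: shards need no pre-initialization
INSERT INTO stats_shards (key, shard, value)
VALUES ('page_views', floor(random() * 100)::int, 1)
ON CONFLICT (key, shard)
DO UPDATE SET value = stats_shards.value + 1;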
Problem 3: Unbounded Growth
Naive schema:
-- Logs in a single table (grows forever)
CREATE TABLE model_predictions (
  id BIGSERIAL PRIMARY KEY,
  model_id TEXT,
  prediction JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW()
);
-- After 1 year: 1 billion rows, slow queries, huge indexes
Better schema:
-- Partitioned by time
-- Note: a primary key on a partitioned table must include the partition
-- key (e.g. PRIMARY KEY (id, created_at)), so the PK on id alone is dropped here
CREATE TABLE model_predictions (
  id BIGSERIAL,
  model_id TEXT,
  prediction JSONB,
  created_at TIMESTAMPTZ DEFAULT NOW()
) PARTITION BY RANGE (created_at);
-- Monthly partitions
CREATE TABLE model_predictions_2024_01 PARTITION OF model_predictions
  FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
CREATE TABLE model_predictions_2024_02 PARTITION OF model_predictions
  FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');
-- Auto-create partitions with the pg_partman extension
-- Dropping old data is a metadata operation, not a row-by-row DELETE
DROP TABLE model_predictions_2023_01; -- drops a whole month near-instantly
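Partitions must exist before rows arrive, so new ones are created ahead of time. pg_partman automates this; a dependency-free sketch in plain PL/pgSQL (run it from any scheduler; the variable names are illustrative) looks like:

-- Pre-create next month's partition (idempotent)
DO $$
DECLARE
  start_month date := date_trunc('month', now() + interval '1 month');
  end_month   date := start_month + interval '1 month';
  part_name   text := 'model_predictions_' || to_char(start_month, 'YYYY_MM');
BEGIN
  EXECUTE format(
    'CREATE TABLE IF NOT EXISTS %I PARTITION OF model_predictions
       FOR VALUES FROM (%L) TO (%L)',
    part_name, start_month, end_month);
END $$;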
Key Takeaways
- Write amplification occurs when one operation triggers multiple updates
- Use append-only tables with materialized views instead of live aggregates
- Hot row contention from global counters can be solved with sharding
- Unbounded table growth requires a partitioning strategy
- Partition large tables by time for easy archival and faster queries
- AI workloads generate massive write volume that breaks naive schemas

