Lesson 5.2: Handling Massive Writes
Bulk Loading Strategies
Slow: row-by-row inserts, one round trip and one statement per row:

for row in data:
    db.execute("INSERT INTO embeddings VALUES (%s, %s, %s)", row)
Fast: the COPY command streams all rows in a single operation:

import csv
import io

# Serialize the rows to an in-memory CSV buffer, then stream it via COPY
csv_buffer = io.StringIO()
writer = csv.writer(csv_buffer)
for row in data:
    writer.writerow(row)
csv_buffer.seek(0)
cursor.copy_expert("COPY embeddings FROM STDIN WITH CSV", csv_buffer)
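One detail to watch with pgvector: the vector column must arrive in pgvector's text representation, a bracketed comma-separated list such as [0.1,0.2,...]. A minimal sketch of a row formatter, assuming each row is an (id, document_id, embedding) tuple with the embedding as a Python list (the column layout is an assumption, not taken from the schema above):

def format_row(row):
    id_, document_id, embedding = row
    # pgvector parses its text form "[0.1,0.2,...]"; CSV quoting handles the commas
    vector_literal = "[" + ",".join(str(x) for x in embedding) + "]"
    return (id_, document_id, vector_literal)

# Write formatted rows into the buffer instead of the raw tuples
writer.writerows(format_row(r) for r in data)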
Faster: drop indexes, bulk load, then rebuild:

BEGIN;
DROP INDEX embeddings_vector_idx;
DROP INDEX embeddings_document_id_idx;
-- Server-side COPY: the path is read by the database server, not the client
COPY embeddings FROM '/path/to/data.csv' WITH CSV;
CREATE INDEX embeddings_vector_idx ON embeddings USING hnsw(vector vector_cosine_ops);
CREATE INDEX embeddings_document_id_idx ON embeddings(document_id);
COMMIT;
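Rebuilding the HNSW index usually dominates the reload time. PostgreSQL's maintenance_work_mem and max_parallel_maintenance_workers settings control how much memory and parallelism an index build may use, so raising them for the session before the rebuild can shorten it noticeably. A minimal sketch, assuming a psycopg2-style cursor; the specific values are placeholders to tune for your hardware:

# Session-local settings for the index build (values are illustrative)
cursor.execute("SET maintenance_work_mem = '2GB'")
cursor.execute("SET max_parallel_maintenance_workers = 4")
cursor.execute(
    "CREATE INDEX embeddings_vector_idx "
    "ON embeddings USING hnsw(vector vector_cosine_ops)"
)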
Write Optimization Techniques
1. Batch commits:

Committing after every row forces a WAL flush per row:

for row in data:
    db.execute("INSERT ...")
    db.commit()

Instead, commit once per batch of rows (here, every 1,000):

db.execute("BEGIN")
for i, row in enumerate(data):
    db.execute("INSERT ...")
    if (i + 1) % 1000 == 0:
        db.commit()
        db.execute("BEGIN")
db.commit()
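When COPY is not an option, multi-row INSERT statements are a useful middle ground between the two versions above. A hedged sketch using psycopg2's execute_values helper, which folds many rows into one INSERT ... VALUES statement per page; the driver choice and the (id, document_id, vector) column list are assumptions:

from psycopg2.extras import execute_values

# One INSERT statement per page_size rows, a single commit at the end
execute_values(
    cursor,
    "INSERT INTO embeddings (id, document_id, vector) VALUES %s",
    data,
    page_size=1000,
)
conn.commit()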
2. UNLOGGED tables (temporary data):
CREATE UNLOGGED TABLE temp_import (
id BIGINT,
vector VECTOR(1536)
);
COPY temp_import FROM '/data.csv' WITH CSV;
INSERT INTO embeddings SELECT * FROM temp_import;
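Because unlogged tables skip WAL, their contents are truncated after a crash, which is acceptable for staging data you can regenerate. Once the rows have been copied into embeddings, drop the staging table; alternatively, if the staged table should itself become permanent, it can be converted in place with ALTER TABLE ... SET LOGGED. A minimal cleanup sketch, assuming a psycopg2-style connection and cursor:

# The staging table is no longer needed after the INSERT ... SELECT
cursor.execute("DROP TABLE temp_import")
conn.commit()

# Alternative: keep the staged table and make it crash-safe in place
# cursor.execute("ALTER TABLE temp_import SET LOGGED")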
3. Parallel writes (multiple connections):

from concurrent.futures import ThreadPoolExecutor

def insert_chunk(chunk):
    conn = get_db_connection()  # one connection per worker
    buffer = io.StringIO()
    csv.writer(buffer).writerows(chunk)
    buffer.seek(0)
    conn.cursor().copy_expert("COPY embeddings FROM STDIN WITH CSV", buffer)
    conn.commit()

with ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(insert_chunk, data_chunks)
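The data_chunks variable above is assumed to be an iterable of row lists; a small, hypothetical helper for producing it (the chunk size is a starting point to benchmark, not a recommendation):

def chunked(rows, size=10_000):
    # Yield successive fixed-size slices of the row list
    for start in range(0, len(rows), size):
        yield rows[start:start + size]

data_chunks = list(chunked(data))

Throughput stops improving once the workers saturate the server's CPU or disk, so measure before raising max_workers further.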
Key Takeaways
- COPY can be orders of magnitude faster than row-by-row INSERT for bulk loads
- Drop indexes before bulk inserts, rebuild after
- Batch commits reduce WAL overhead
- UNLOGGED tables skip durability for temporary staging data
- Parallel writes use multiple connections for throughput
- ML embedding generation often requires bulk loading millions of vectors