Lesson 7.2: Designing the RAG Pipeline
Schema Design
CREATE TABLE documents (
id BIGSERIAL PRIMARY KEY,
title TEXT NOT NULL,
source TEXT NOT NULL,
content TEXT NOT NULL,
category TEXT,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
deleted_at TIMESTAMPTZ
);
CREATE INDEX documents_category_idx ON documents(category) WHERE deleted_at IS NULL;
CREATE INDEX documents_created_idx ON documents(created_at DESC) WHERE deleted_at IS NULL;
CREATE TABLE document_chunks (
id BIGSERIAL PRIMARY KEY,
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INT NOT NULL,
content TEXT NOT NULL,
token_count INT NOT NULL,
embedding VECTOR(1536) NOT NULL,
UNIQUE(document_id, chunk_index)
);
CREATE INDEX chunks_embedding_hnsw ON document_chunks
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
CREATE INDEX chunks_document_idx ON document_chunks(document_id);
CREATE TABLE document_permissions (
document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
user_id BIGINT NOT NULL,
permission_level TEXT NOT NULL DEFAULT 'read',
granted_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (document_id, user_id)
);
CREATE INDEX document_permissions_user_idx ON document_permissions(user_id);
Query Patterns
SELECT
d.id AS document_id,
d.title,
c.content AS chunk_content,
c.embedding <=> $1::vector AS similarity_score
FROM document_chunks c
JOIN documents d ON c.document_id = d.id
JOIN document_permissions p ON d.id = p.document_id
WHERE
p.user_id = $2
AND p.permission_level IN ('read', 'write', 'admin')
AND d.category = $3
AND d.deleted_at IS NULL
AND c.embedding <=> $1::vector < 0.75
ORDER BY similarity_score
LIMIT 10;
Key Takeaways
- Three-table design: documents, document_chunks, document_permissions
- Soft deletes preserve data while hiding it from queries (deleted_at column)
- Chunking strategy splits documents for better embedding and retrieval
- HNSW index on embeddings enables fast vector similarity search
- Permission system ensures users only see authorized documents
- Partial indexes (WHERE deleted_at IS NULL) improve query performance
- CASCADE deletes automatically clean up chunks and permissions when documents are deleted
- Query combines vector search, permissions, and metadata filters in one SQL statement