Lesson 6.5: Production RAG Architecture
Complete RAG Stack
-- 1. Documents table
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    title TEXT,
    source_url TEXT,
    content TEXT,
    metadata JSONB,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
-- 2. Chunks (documents split for embedding)
CREATE TABLE chunks (
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    chunk_index INT NOT NULL,
    content TEXT NOT NULL,
    token_count INT,
    embedding VECTOR(1536),
    UNIQUE(document_id, chunk_index)
);
CREATE INDEX chunks_embedding_hnsw ON chunks USING hnsw(embedding vector_cosine_ops);
CREATE INDEX chunks_document_idx ON chunks(document_id);
-- 3. User queries (for analytics)
CREATE TABLE rag_queries (
    id BIGSERIAL PRIMARY KEY,
    user_id BIGINT,
    query TEXT NOT NULL,
    embedding VECTOR(1536),
    chunks_retrieved INT[], -- IDs of chunks used
    llm_response TEXT,
    latency_ms INT,
    created_at TIMESTAMPTZ DEFAULT NOW()
);
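The schema above only covers storage; getting data in is an application concern: split each document into chunks, embed the chunks, and insert both rows. Below is a minimal ingestion sketch in Python. It assumes an open psycopg (v3) connection and the OpenAI Python client; the fixed-size character splitter and the text-embedding-3-small model (1536 dimensions) are illustrative choices, not the only options.
# Ingestion sketch (illustrative): chunk a document, embed the chunks, store both.
from openai import OpenAI

client = OpenAI()

def chunk_text(text, size=500, overlap=50):
    # Naive fixed-size character splitter; use a token-aware splitter in practice.
    step = size - overlap
    return [text[i:i + size] for i in range(0, len(text), step)]

def ingest_document(conn, title, source_url, content):
    # conn is an open psycopg (v3) connection to the database defined above.
    chunks = chunk_text(content)
    # One embeddings call for all chunks; text-embedding-3-small returns 1536 dims.
    resp = client.embeddings.create(model="text-embedding-3-small", input=chunks)
    with conn.cursor() as cur:
        cur.execute(
            "INSERT INTO documents (title, source_url, content) "
            "VALUES (%s, %s, %s) RETURNING id",
            (title, source_url, content),
        )
        doc_id = cur.fetchone()[0]
        for i, (chunk, item) in enumerate(zip(chunks, resp.data)):
            # pgvector accepts the textual form '[x1,x2,...]' cast to vector.
            vec = "[" + ",".join(str(x) for x in item.embedding) + "]"
            cur.execute(
                "INSERT INTO chunks (document_id, chunk_index, content, embedding) "
                "VALUES (%s, %s, %s, %s::vector)",
                (doc_id, i, chunk, vec),
            )
    conn.commit()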
RAG Query Pipeline
# Step 1: Embed the query (application code, e.g. via the OpenAI embeddings API)
query_embedding = openai.embeddings.create(...)
-- Step 2: Retrieve relevant chunks (query_embedding is bound as a parameter by the application)
SELECT
    c.id,
    c.content,
    c.chunk_index,
    d.title,
    d.source_url,
    c.embedding <=> query_embedding AS distance
FROM chunks c
JOIN documents d ON c.document_id = d.id
WHERE
    -- Permission / metadata filter (if needed)
    d.metadata->>'department' = 'engineering'
    -- Distance threshold (cosine distance: lower means more similar)
    AND c.embedding <=> query_embedding < 0.7
ORDER BY distance
LIMIT 5;
# Step 3: Format the retrieved chunks as context for the LLM
context = "\n\n".join([chunk['content'] for chunk in results])
# Step 4: Send the prompt to the LLM
prompt = f"Context:\n{context}\n\nQuestion: {user_query}\n\nAnswer:"
response = openai.chat.completions.create(...)
-- Step 5: Log the query (user_query, query_embedding, chunk_ids, response and latency are bound from the application; chunk_ids is an INT[] of the retrieved chunk IDs)
INSERT INTO rag_queries (user_id, query, embedding, chunks_retrieved, llm_response, latency_ms)
VALUES (123, user_query, query_embedding, chunk_ids, response, latency);
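Tied together in application code, the five steps look roughly like the sketch below. It assumes an open psycopg (v3) connection and the OpenAI Python client; the model names, the engineering-department filter, and the 0.7 distance cutoff simply mirror the illustrative values above.
import time
from openai import OpenAI

client = OpenAI()

RETRIEVE_SQL = """
    SELECT c.id, c.content, d.title, d.source_url,
           c.embedding <=> %(q)s::vector AS distance
    FROM chunks c
    JOIN documents d ON c.document_id = d.id
    WHERE d.metadata->>'department' = 'engineering'
      AND c.embedding <=> %(q)s::vector < 0.7
    ORDER BY distance
    LIMIT 5
"""

def answer_question(conn, user_id, user_query):
    # conn is an open psycopg (v3) connection.
    start = time.monotonic()
    # Step 1: embed the query
    emb = client.embeddings.create(model="text-embedding-3-small", input=user_query)
    q_vec = "[" + ",".join(str(x) for x in emb.data[0].embedding) + "]"
    # Step 2: retrieve relevant chunks
    rows = conn.execute(RETRIEVE_SQL, {"q": q_vec}).fetchall()
    chunk_ids = [row[0] for row in rows]
    # Step 3: format the retrieved chunks as context
    context = "\n\n".join(row[1] for row in rows)
    # Step 4: generate the answer
    prompt = f"Context:\n{context}\n\nQuestion: {user_query}\n\nAnswer:"
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    llm_response = completion.choices[0].message.content
    latency_ms = int((time.monotonic() - start) * 1000)
    # Step 5: log the query for analytics
    conn.execute(
        "INSERT INTO rag_queries "
        "(user_id, query, embedding, chunks_retrieved, llm_response, latency_ms) "
        "VALUES (%s, %s, %s::vector, %s, %s, %s)",
        (user_id, user_query, q_vec, chunk_ids, llm_response, latency_ms),
    )
    conn.commit()
    return llm_response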
Re-Ranking for Better Results
-- Stage 1: Retrieve 50 candidates with a fast, approximate ANN search
WITH candidates AS (
    SELECT id, content, embedding <=> query_embedding AS distance
    FROM chunks
    WHERE embedding <=> query_embedding < 0.8
    ORDER BY distance
    LIMIT 50
)
-- Stage 2: Re-rank the candidates with a cross-encoder (slower, more accurate).
-- cross_encoder_score is illustrative: the scoring actually runs in the
-- application, not inside Postgres (see the sketch after this listing).
SELECT
    id,
    content,
    cross_encoder_score(content, query) AS relevance
FROM candidates
ORDER BY relevance DESC
LIMIT 5;
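Because the cross-encoder is a model rather than a SQL function, the second stage normally runs in the application. Here is a minimal sketch of the two-stage pattern, assuming psycopg (v3) and the sentence-transformers CrossEncoder class; the ms-marco-MiniLM-L-6-v2 model name is an illustrative choice.
from sentence_transformers import CrossEncoder

# Cross-encoder scores (query, passage) pairs jointly: slower than ANN, more accurate.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_reranked(conn, query, q_vec, top_k=5):
    # Stage 1: fast, approximate ANN search for 50 candidates (conn is psycopg v3).
    rows = conn.execute(
        "SELECT id, content FROM chunks "
        "WHERE embedding <=> %(q)s::vector < 0.8 "
        "ORDER BY embedding <=> %(q)s::vector "
        "LIMIT 50",
        {"q": q_vec},
    ).fetchall()
    if not rows:
        return []
    # Stage 2: re-rank the candidates with the cross-encoder, keep the best top_k.
    scores = reranker.predict([(query, content) for _, content in rows])
    ranked = sorted(zip(rows, scores), key=lambda pair: pair[1], reverse=True)
    return [row for row, _ in ranked[:top_k]]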
Key Takeaways
- A production RAG schema centers on three tables: documents, chunks, and query logs
- Chunking splits large documents for better embedding and retrieval
- HNSW indexes enable fast vector search on chunks table
- RAG pipeline: Embed query → Retrieve chunks → Format context → LLM generation → Log
- Query logging tracks which chunks were used for analytics and improvement
- Re-ranking uses cross-encoders to improve top-k results from ANN search
- Two-stage retrieval (fast ANN + slow re-ranking) balances speed and accuracy
- Metadata filters in WHERE clause ensure security and relevance
Next Steps
Ready to build a complete RAG application? Continue with our Full-Stack RAG with Next.js, Supabase & Gemini course to implement everything you've learned in a production Next.js application.

