Scaling Chains for Production
As usage grows, chains must scale reliably. This lesson covers strategies for handling increased load while maintaining quality.
Scaling Challenges
Prompt chains face scaling challenges beyond those of a typical web service:
- Rate limits imposed by API providers
- Context window limits
- Costs that grow linearly with request volume
- Latency degradation under load
- Error handling at scale
Rate Limit Management
Understanding Rate Limits
// Illustrative rate limit structures; actual limits vary by model and account tier
const rateLimits = {
  openai: {
    requestsPerMinute: 3500,  // example GPT-4 tier
    tokensPerMinute: 90000,   // example GPT-4 tier
    tokensPerDay: 1000000     // varies by tier
  },
  anthropic: {
    requestsPerMinute: 4000,  // example Claude tier
    tokensPerMinute: 100000
  }
};
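Because token-per-minute limits apply before the response comes back, you need to estimate a request's size before sending it. A minimal sketch, assuming the rough four-characters-per-token heuristic; a real implementation would use the provider's tokenizer instead:

// Rough token estimate: ~4 characters per token for English text.
// For accuracy, use the provider's tokenizer rather than this heuristic.
function estimateTokens(prompt, maxOutputTokens = 500) {
  const inputTokens = Math.ceil(prompt.length / 4);
  // Budget for the response too, since output tokens count against limits
  return inputTokens + maxOutputTokens;
}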
Rate Limiter Implementation
Token Bucket Algorithm
A token bucket holds up to a fixed capacity of tokens and refills at a steady rate; each request spends tokens, and callers wait whenever the bucket runs dry. This smooths bursts while enforcing the average rate.
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));

class TokenBucket {
  constructor(capacity, refillRate) {
    this.capacity = capacity;
    this.tokens = capacity;
    this.refillRate = refillRate; // tokens per second
    this.lastRefill = Date.now();
  }

  refill() {
    const now = Date.now();
    const elapsed = (now - this.lastRefill) / 1000;
    this.tokens = Math.min(
      this.capacity,
      this.tokens + elapsed * this.refillRate
    );
    this.lastRefill = now;
  }

  async acquire(tokens = 1) {
    this.refill();
    if (this.tokens >= tokens) {
      this.tokens -= tokens;
      return true;
    }
    // Not enough capacity: wait for the bucket to refill, then retry
    const deficit = tokens - this.tokens;
    const waitTime = (deficit / this.refillRate) * 1000;
    await sleep(waitTime);
    return this.acquire(tokens);
  }
}
// Usage with multiple limiters
class MultiLimiter {
  constructor(config) {
    this.requestLimiter = new TokenBucket(
      config.requestsPerMinute,
      config.requestsPerMinute / 60
    );
    this.tokenLimiter = new TokenBucket(
      config.tokensPerMinute,
      config.tokensPerMinute / 60
    );
  }

  async acquire(estimatedTokens) {
    // Wait until both the request budget and the token budget allow the call
    await Promise.all([
      this.requestLimiter.acquire(1),
      this.tokenLimiter.acquire(estimatedTokens)
    ]);
  }
}
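Putting the pieces together, a chain step acquires budget before every call. In this sketch, callModel stands in for whatever provider client you use, and the limit values are the illustrative ones from above:

const limiter = new MultiLimiter({
  requestsPerMinute: 3500,
  tokensPerMinute: 90000
});

async function rateLimitedCall(prompt) {
  const estimated = estimateTokens(prompt);
  // Block until both request and token budgets are available
  await limiter.acquire(estimated);
  return callModel(prompt); // hypothetical provider client
}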
Load Balancing
Multi-Provider Strategy
class ProviderLoadBalancer {
  // Each provider entry is expected to define: capabilities[], maxLoad,
  // avgLatency, and costPer1k, in addition to its client configuration
  constructor(providers) {
    this.providers = providers.map(p => ({
      ...p,
      currentLoad: 0,
      errorCount: 0,
      lastError: null
    }));
  }

  selectProvider(requirements) {
    // Keep providers that support the capability, have headroom,
    // and are not cooling down after recent errors
    const eligible = this.providers.filter(p =>
      p.capabilities.includes(requirements.capability) &&
      p.currentLoad < p.maxLoad &&
      !this.isInCooldown(p)
    );
    if (eligible.length === 0) {
      throw new Error('No available providers');
    }
    // Select based on strategy
    return this.selectByStrategy(eligible, requirements);
  }

  selectByStrategy(providers, requirements) {
    if (requirements.priority === 'latency') {
      // Select fastest
      return providers.sort((a, b) => a.avgLatency - b.avgLatency)[0];
    }
    if (requirements.priority === 'cost') {
      // Select cheapest
      return providers.sort((a, b) => a.costPer1k - b.costPer1k)[0];
    }
    // Default: least loaded
    return providers.sort((a, b) => a.currentLoad - b.currentLoad)[0];
  }

  isInCooldown(provider) {
    if (!provider.lastError) return false;
    // Exponential backoff per consecutive error, capped at one minute
    const cooldownMs = Math.min(
      60000,
      1000 * Math.pow(2, provider.errorCount)
    );
    return Date.now() - provider.lastError < cooldownMs;
  }
}
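The balancer only works if callers report outcomes back, since isInCooldown depends on errorCount and lastError. A usage sketch, assuming each provider object carries a call function:

async function executeWithFailover(balancer, request) {
  const provider = balancer.selectProvider({
    capability: 'chat',
    priority: 'latency'
  });
  provider.currentLoad++;
  try {
    const result = await provider.call(request); // assumed per-provider client
    provider.errorCount = 0; // a healthy response resets the backoff
    return result;
  } catch (error) {
    provider.errorCount++;
    provider.lastError = Date.now(); // triggers the cooldown window
    throw error;
  } finally {
    provider.currentLoad--;
  }
}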
Key-Based Distribution
class APIKeyManager {
  constructor(keys, requestsPerMinuteLimit = 3500) {
    this.requestLimit = requestsPerMinuteLimit; // per-key request quota
    this.keys = keys.map(key => ({
      key,
      requestsThisMinute: 0,
      tokensThisMinute: 0,
      lastReset: Date.now()
    }));
  }

  getKey(estimatedTokens) {
    this.resetIfNeeded();
    // Find the key with the most remaining capacity; summing requests and
    // tokens is a crude load heuristic, but it spreads traffic well enough
    const available = this.keys
      .filter(k => k.requestsThisMinute < this.requestLimit)
      .sort((a, b) =>
        (a.requestsThisMinute + a.tokensThisMinute) -
        (b.requestsThisMinute + b.tokensThisMinute)
      );
    if (available.length === 0) {
      throw new Error('All API keys exhausted');
    }
    const selected = available[0];
    selected.requestsThisMinute++;
    selected.tokensThisMinute += estimatedTokens;
    return selected.key;
  }

  resetIfNeeded() {
    // Roll each key's counters once its one-minute window has elapsed
    const now = Date.now();
    for (const key of this.keys) {
      if (now - key.lastReset > 60000) {
        key.requestsThisMinute = 0;
        key.tokensThisMinute = 0;
        key.lastReset = now;
      }
    }
  }
}
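In use, the manager hands each request the key with the most headroom. The environment variable names and callModel client here are illustrative:

const keyManager = new APIKeyManager([
  process.env.OPENAI_KEY_1,
  process.env.OPENAI_KEY_2
]);

async function callWithRotation(prompt) {
  const estimated = estimateTokens(prompt);
  const apiKey = keyManager.getKey(estimated);
  return callModel(prompt, { apiKey }); // hypothetical client call
}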
Queue Management
Priority Queue System
Not all requests are equally urgent: interactive traffic should jump ahead of batch jobs and backfills, which a set of tiered queues makes explicit.
Implementation
class PriorityQueueManager {
  constructor() {
    // Queues are checked in insertion order, so list the highest priority first
    this.queues = {
      critical: new Queue({ priority: 0 }),
      realtime: new Queue({ priority: 1 }),
      batch: new Queue({ priority: 2 }),
      backfill: new Queue({ priority: 3 })
    };
    this.deadLetterQueue = [];
    this.processing = false;
  }

  enqueue(request, priority = 'batch') {
    const item = {
      id: generateId(), // assumed to be defined elsewhere
      request,
      enqueuedAt: Date.now(),
      priority,
      attempts: 0
    };
    this.queues[priority].push(item);
    this.processNext();
    return item.id;
  }

  hasItems() {
    return Object.values(this.queues).some(q => !q.isEmpty());
  }

  async processNext() {
    if (this.processing) return;
    this.processing = true;
    try {
      // Check queues in priority order
      for (const queue of Object.values(this.queues)) {
        if (!queue.isEmpty()) {
          const item = queue.pop();
          await this.process(item);
          break;
        }
      }
    } finally {
      this.processing = false;
      // Continue processing if more items remain
      if (this.hasItems()) {
        setImmediate(() => this.processNext());
      }
    }
  }

  async process(item) {
    // executeChain and onSuccess are assumed to be provided by the application
    try {
      const result = await this.executeChain(item.request);
      this.onSuccess(item, result);
    } catch (error) {
      item.attempts++;
      if (item.attempts < 3) {
        // Retry with exponential backoff, then resume the processing loop
        setTimeout(() => {
          this.queues[item.priority].push(item);
          this.processNext();
        }, 1000 * Math.pow(2, item.attempts));
      } else {
        // Move to the dead letter queue for later inspection
        this.deadLetterQueue.push({ ...item, error });
      }
    }
  }
}
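Callers then tag work with a tier when enqueuing; untagged work lands in the batch queue:

const manager = new PriorityQueueManager();

// A user-facing request jumps ahead of background work
const liveId = manager.enqueue({ prompt: 'Summarize this ticket' }, 'realtime');

// Nightly re-processing can wait behind everything else
const backfillId = manager.enqueue({ prompt: 'Re-index document 42' }, 'backfill');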
Horizontal Scaling
Worker Pool Architecture
class WorkerPool {
  constructor(size, chainExecutor) {
    this.workers = Array(size).fill(null).map((_, i) => ({
      id: i,
      busy: false,
      currentTask: null
    }));
    this.chainExecutor = chainExecutor;
    this.taskQueue = [];
  }

  async submit(task) {
    // Resolves or rejects when a worker eventually finishes the task
    return new Promise((resolve, reject) => {
      this.taskQueue.push({ task, resolve, reject });
      this.assignTasks();
    });
  }

  assignTasks() {
    for (const worker of this.workers) {
      if (!worker.busy && this.taskQueue.length > 0) {
        const { task, resolve, reject } = this.taskQueue.shift();
        this.runOnWorker(worker, task, resolve, reject);
      }
    }
  }

  async runOnWorker(worker, task, resolve, reject) {
    worker.busy = true;
    worker.currentTask = task;
    try {
      const result = await this.chainExecutor(task);
      resolve(result);
    } catch (error) {
      reject(error);
    } finally {
      worker.busy = false;
      worker.currentTask = null;
      // Pick up the next queued task, if any
      this.assignTasks();
    }
  }

  getStatus() {
    return {
      total: this.workers.length,
      busy: this.workers.filter(w => w.busy).length,
      idle: this.workers.filter(w => !w.busy).length,
      queueDepth: this.taskQueue.length
    };
  }
}
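Using the pool caps concurrency at the pool size no matter how much work arrives. In this sketch, executeChain and loadPendingRequests are assumed to exist elsewhere:

const pool = new WorkerPool(8, executeChain); // at most 8 concurrent chains

const requests = loadPendingRequests(); // hypothetical source of work
const results = await Promise.allSettled(
  requests.map(request => pool.submit(request))
);

console.log(pool.getStatus()); // e.g. { total: 8, busy: 8, idle: 0, queueDepth: 42 }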
Auto-Scaling
class AutoScaler {
  constructor(config) {
    this.config = config;
    this.metrics = [];
  }

  average(field) {
    if (this.metrics.length === 0) return 0;
    return this.metrics.reduce((sum, m) => sum + (m[field] || 0), 0) /
      this.metrics.length;
  }

  async evaluate(currentMetrics) {
    this.metrics.push({
      timestamp: Date.now(),
      ...currentMetrics
    });
    // Keep only the last 5 minutes of samples
    const fiveMinutesAgo = Date.now() - 5 * 60 * 1000;
    this.metrics = this.metrics.filter(m => m.timestamp > fiveMinutesAgo);

    const avgQueueDepth = this.average('queueDepth');
    const avgLatency = this.average('p95Latency');
    const avgUtilization = this.average('utilization');

    // Scale up if any signal crosses its threshold
    if (avgQueueDepth > this.config.scaleUpQueueThreshold ||
        avgLatency > this.config.scaleUpLatencyThreshold ||
        avgUtilization > this.config.scaleUpUtilizationThreshold) {
      return { action: 'scale_up', reason: 'High load detected' };
    }
    // Scale down only when both queue depth and utilization are low
    if (avgQueueDepth < this.config.scaleDownQueueThreshold &&
        avgUtilization < this.config.scaleDownUtilizationThreshold) {
      return { action: 'scale_down', reason: 'Low load detected' };
    }
    return { action: 'none' };
  }
}
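A periodic loop feeds the scaler fresh metrics and acts on its decision. The thresholds, latencyTracker, and resizePool hook below are illustrative; how you actually add or remove capacity depends on your deployment:

const scaler = new AutoScaler({
  scaleUpQueueThreshold: 50,
  scaleUpLatencyThreshold: 5000, // ms
  scaleUpUtilizationThreshold: 0.8,
  scaleDownQueueThreshold: 5,
  scaleDownUtilizationThreshold: 0.3
});

setInterval(async () => {
  const status = pool.getStatus();
  const decision = await scaler.evaluate({
    queueDepth: status.queueDepth,
    p95Latency: latencyTracker.p95(), // assumed metrics source
    utilization: status.busy / status.total
  });
  if (decision.action !== 'none') {
    resizePool(decision.action); // deployment-specific scaling hook
  }
}, 30000); // evaluate every 30 seconds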
Key Takeaways
- Implement rate limiting to stay within API quotas
- Use multiple API keys and providers to expand capacity
- Build priority queues for different request types
- Design for horizontal scaling from the start
- Implement auto-scaling based on queue depth and latency
- Plan for provider failover and degraded operation
- Monitor capacity metrics proactively
- Test scaling behavior before peak load hits
Congratulations on completing the Production Considerations module! You now have the knowledge to build and scale production-ready prompt chains.

