Monitoring and Debugging Chains
Production chains need observability. This lesson covers strategies for monitoring chain health and debugging issues.
Why Chains Need Special Monitoring
Chains are harder to debug than single prompts:
- Errors can occur at any step
- Issues may compound across steps
- Root cause often isn't in the failing step
- Context degrades in ways that aren't immediately visible
Observability Pillars
1. Logging
class ChainLogger {
  // Assumes generateRunId, countTokens, and truncate helpers exist elsewhere
  constructor(chainId) {
    this.chainId = chainId;
    this.runId = generateRunId(); // unique ID for this particular run of the chain
    this.logs = [];
  }

  log(level, step, message, data = {}) {
    const entry = {
      timestamp: new Date().toISOString(),
      chainId: this.chainId,
      runId: this.runId,
      level,
      step,
      message,
      data
    };
    this.logs.push(entry);
    // Also send to logging service; errors additionally go to alerting
    if (level === 'error') {
      this.sendToAlertSystem(entry);
    }
  }

  logStepStart(step, input) {
    this.log('info', step, 'Step started', {
      inputTokens: countTokens(input),
      inputPreview: truncate(input, 200)
    });
  }

  logStepComplete(step, output, duration) {
    this.log('info', step, 'Step completed', {
      outputTokens: countTokens(output),
      outputPreview: truncate(output, 200),
      durationMs: duration
    });
  }

  logStepError(step, error, input) {
    this.log('error', step, 'Step failed', {
      error: error.message,
      stack: error.stack,
      inputPreview: truncate(input, 500)
    });
  }

  // Placeholder: wire this up to your alerting backend
  sendToAlertSystem(entry) {
    // e.g. POST the entry to an alerting webhook
  }
}
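In practice every step call goes through the logger. A minimal wrapper sketch, assuming the helpers above exist; runStep and summarizeDocument are hypothetical names for illustration:

const logger = new ChainLogger('document-processor');

async function runStep(name, fn, input) {
  logger.logStepStart(name, input);
  const start = Date.now();
  try {
    const output = await fn(input);
    logger.logStepComplete(name, output, Date.now() - start);
    return output;
  } catch (error) {
    logger.logStepError(name, error, input);
    throw error;
  }
}

// e.g. const summary = await runStep('summarize', summarizeDocument, rawText);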
2. Tracing
Implementing Distributed Tracing
class Tracer {
  // Assumes generateTraceId and generateSpanId helpers exist elsewhere
  constructor() {
    this.traces = new Map();
  }

  startTrace(chainId) {
    const traceId = generateTraceId();
    this.traces.set(traceId, {
      traceId,
      chainId,
      spans: [],
      startTime: Date.now()
    });
    return traceId;
  }

  startSpan(traceId, name, parentSpanId = null) {
    const spanId = generateSpanId();
    const trace = this.traces.get(traceId);
    trace.spans.push({
      spanId,
      parentSpanId,
      name,
      startTime: Date.now(),
      endTime: null,
      status: 'running',
      attributes: {},
      events: []
    });
    return spanId;
  }

  // Look up a span within a trace by its ID
  getSpan(traceId, spanId) {
    return this.traces.get(traceId).spans.find(span => span.spanId === spanId);
  }

  addEvent(traceId, spanId, name, attributes = {}) {
    const span = this.getSpan(traceId, spanId);
    span.events.push({
      name,
      timestamp: Date.now(),
      attributes
    });
  }

  endSpan(traceId, spanId, status = 'ok', error = null) {
    const span = this.getSpan(traceId, spanId);
    span.endTime = Date.now();
    span.status = status;
    if (error) {
      span.error = {
        message: error.message,
        stack: error.stack
      };
    }
  }

  getTrace(traceId) {
    return this.traces.get(traceId);
  }
}
// Usage
async function tracedChain(input) {
  const tracer = new Tracer();
  const traceId = tracer.startTrace('document-processor');
  let currentSpan = null;
  try {
    currentSpan = tracer.startSpan(traceId, 'validation');
    const validated = await validate(input);
    tracer.endSpan(traceId, currentSpan);

    currentSpan = tracer.startSpan(traceId, 'processing');
    const result = await process(validated);
    tracer.endSpan(traceId, currentSpan);

    return result;
  } catch (error) {
    // Close whichever span was active when the error occurred
    if (currentSpan) {
      tracer.endSpan(traceId, currentSpan, 'error', error);
    }
    throw error;
  }
}
3. Metrics
class ChainMetrics {
  constructor() {
    this.counters = {};
    this.histograms = {};
    this.gauges = {};
  }

  // Build a stable key from the metric name and its labels
  makeKey(name, labels = {}) {
    const labelString = Object.entries(labels)
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([k, v]) => `${k}=${v}`)
      .join(',');
    return labelString ? `${name}{${labelString}}` : name;
  }

  // Count occurrences
  increment(name, labels = {}) {
    const key = this.makeKey(name, labels);
    this.counters[key] = (this.counters[key] || 0) + 1;
  }

  // Track distributions
  recordDuration(name, durationMs, labels = {}) {
    const key = this.makeKey(name, labels);
    if (!this.histograms[key]) {
      this.histograms[key] = [];
    }
    this.histograms[key].push(durationMs);
  }

  // Track current values
  setGauge(name, value, labels = {}) {
    const key = this.makeKey(name, labels);
    this.gauges[key] = value;
  }

  getMetrics() {
    return {
      counters: this.counters,
      histograms: Object.fromEntries(
        Object.entries(this.histograms).map(([k, v]) => [
          k,
          {
            count: v.length,
            avg: v.reduce((a, b) => a + b, 0) / v.length,
            p50: percentile(v, 50),
            p95: percentile(v, 95),
            p99: percentile(v, 99)
          }
        ])
      ),
      gauges: this.gauges
    };
  }
}
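getMetrics relies on a percentile helper that isn't shown above; a minimal nearest-rank sketch:

function percentile(values, p) {
  // Nearest-rank percentile over a sorted copy of the samples
  const sorted = [...values].sort((a, b) => a - b);
  const index = Math.ceil((p / 100) * sorted.length) - 1;
  return sorted[Math.min(sorted.length - 1, Math.max(0, index))];
}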
// Key metrics to track: step starts, completions, failures, and duration
const metrics = new ChainMetrics();

async function instrumentedStep(name, fn) {
  const start = Date.now();
  metrics.increment('chain.step.started', { step: name });
  try {
    const result = await fn();
    metrics.increment('chain.step.completed', { step: name });
    metrics.recordDuration('chain.step.duration', Date.now() - start, { step: name });
    return result;
  } catch (error) {
    metrics.increment('chain.step.failed', { step: name, error: error.name });
    throw error;
  }
}
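Every step then runs through the wrapper, so the step name appears as a label on each counter and duration. A usage sketch; summarize and buildReport are hypothetical step functions:

async function runPipeline(documentText) {
  // Hypothetical steps; any async function can be wrapped
  const summary = await instrumentedStep('summarize', () => summarize(documentText));
  const report = await instrumentedStep('report', () => buildReport(summary));

  console.log(metrics.getMetrics()); // counters, duration percentiles, gauges
  return report;
}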
Debugging Strategies
1. Replay Failed Runs
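The simplest replay setup persists every step's input and output during normal runs (the logger and tracer above already capture this), then re-executes a single step against the exact input it saw. A minimal sketch; the recordedRun shape and steps registry are assumptions rather than any specific library's API:

async function replayStep(recordedRun, stepName, steps) {
  // recordedRun.steps is assumed to map step names to captured inputs and outputs
  const recorded = recordedRun.steps[stepName];
  if (!recorded) {
    throw new Error(`No recorded data for step: ${stepName}`);
  }

  // Re-run only this step against the exact input it received in production
  const replayedOutput = await steps[stepName].execute(recorded.input);

  return {
    stepName,
    originalOutput: recorded.output,
    replayedOutput
  };
}

The original and replayed outputs can then be fed to the output comparison shown below.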
2. Step-by-Step Debugging
class ChainDebugger {
  constructor(chain) {
    this.chain = chain;
    this.breakpoints = new Set();
    this.snapshots = [];
  }

  setBreakpoint(stepName) {
    this.breakpoints.add(stepName);
  }

  // Placeholder hook: pause until the developer resumes execution.
  // In practice this might wait on stdin, a debugger UI, or a promise
  // resolved from the console.
  async waitForContinue() {
    return Promise.resolve();
  }

  async runWithDebug(input) {
    let currentInput = input;
    for (const step of this.chain.steps) {
      // Capture snapshot before step
      this.snapshots.push({
        step: step.name,
        input: structuredClone(currentInput),
        timestamp: Date.now()
      });

      // Check breakpoint
      if (this.breakpoints.has(step.name)) {
        console.log(`Breakpoint at ${step.name}`);
        console.log('Input:', JSON.stringify(currentInput, null, 2));
        await this.waitForContinue();
      }

      // Execute step
      try {
        currentInput = await step.execute(currentInput);
        // Capture output
        this.snapshots[this.snapshots.length - 1].output = structuredClone(currentInput);
      } catch (error) {
        this.snapshots[this.snapshots.length - 1].error = error;
        throw error;
      }
    }
    return currentInput;
  }

  getSnapshots() {
    return this.snapshots;
  }
}
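Assuming a chain object whose steps each expose name and execute (the shape runWithDebug expects), usage might look like this; documentChain, rawDocument, and the 'extract-entities' step name are hypothetical:

const chainDebugger = new ChainDebugger(documentChain);
chainDebugger.setBreakpoint('extract-entities'); // pause before this step

chainDebugger.runWithDebug(rawDocument)
  .then(output => console.log('Final output:', output))
  .catch(() => {
    // On failure, the snapshots show the exact input each step received
    console.dir(chainDebugger.getSnapshots(), { depth: null });
  });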
3. Output Comparison
async function compareOutputs(originalOutput, newOutput) {
  const prompt = `
Compare these two outputs and identify differences.

ORIGINAL OUTPUT:
${JSON.stringify(originalOutput, null, 2)}

NEW OUTPUT:
${JSON.stringify(newOutput, null, 2)}

Analyze:
1. What fields are different?
2. What values changed?
3. Are the differences significant?
4. Which output is more correct/complete?

Provide structured comparison.
`;

  return await llm.chat({ content: prompt });
}
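Combined with the replay sketch above, this turns regression debugging into a concrete diff; the variable names below come from that sketch:

// Inside an async debugging script:
const { originalOutput, replayedOutput } = await replayStep(recordedRun, 'extract-entities', steps);
const comparison = await compareOutputs(originalOutput, replayedOutput);
console.log(comparison);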
Alerting
Setting Up Alerts
class AlertManager {
  constructor(config) {
    this.thresholds = config.thresholds;
    this.channels = config.channels;
  }

  checkThresholds(metrics) {
    const alerts = [];

    // Error rate threshold
    const errorRate = metrics.errors / metrics.total;
    if (errorRate > this.thresholds.errorRate) {
      alerts.push({
        severity: 'critical',
        message: `Error rate ${(errorRate * 100).toFixed(1)}% exceeds threshold`,
        metric: 'error_rate',
        value: errorRate
      });
    }

    // Latency threshold
    if (metrics.p95Latency > this.thresholds.p95Latency) {
      alerts.push({
        severity: 'warning',
        message: `P95 latency ${metrics.p95Latency}ms exceeds threshold`,
        metric: 'p95_latency',
        value: metrics.p95Latency
      });
    }

    // Success rate drop
    if (metrics.successRate < this.thresholds.minSuccessRate) {
      alerts.push({
        severity: 'critical',
        message: `Success rate dropped to ${(metrics.successRate * 100).toFixed(1)}%`,
        metric: 'success_rate',
        value: metrics.successRate
      });
    }

    return alerts;
  }

  async sendAlerts(alerts) {
    for (const alert of alerts) {
      // Route each alert to the channels configured for its severity
      const channels = this.channels[alert.severity] || [];
      for (const channel of channels) {
        await channel.send(alert);
      }
    }
  }
}
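A configuration sketch, assuming hypothetical pagerChannel and slackChannel objects that each expose a send(alert) method; the threshold values are only illustrative:

const alertManager = new AlertManager({
  thresholds: {
    errorRate: 0.05,      // alert above 5% errors
    p95Latency: 30000,    // alert above 30s at p95
    minSuccessRate: 0.9   // alert below 90% success
  },
  channels: {
    critical: [pagerChannel, slackChannel],
    warning: [slackChannel]
  }
});

const alerts = alertManager.checkThresholds({
  errors: 12,
  total: 150,
  p95Latency: 42000,
  successRate: 0.88
});
alertManager.sendAlerts(alerts).catch(console.error);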
Exercise: Debug a Failing Chain
Key Takeaways
- Implement structured logging for all chain steps
- Use distributed tracing for complex chains
- Track key metrics: latency, error rate, token usage
- Build replay capability for debugging
- Set up alerts for threshold breaches
- Capture enough context to reproduce issues
- Compare outputs when debugging regressions
- Monitor for gradual degradation, not just failures
Next, we'll cover scaling chains for production.

