Recovery Patterns

Building resilient systems isn’t just about preventing failures—it’s about recovering gracefully when things go wrong. With AI assistance, you can implement sophisticated recovery patterns that minimize downtime, prevent cascading failures, and maintain service quality even during disruptions. This guide covers proven recovery patterns for production systems.

Modern systems must embrace failure as inevitable and design for recovery:

Fail Fast, Recover Fast

Detect failures quickly and initiate recovery immediately

Graceful Degradation

Maintain core functionality even when some features fail

Automated Healing

Systems should recover without human intervention when possible

Learn from Failures

Each failure is an opportunity to improve resilience

Circuit Breaker Pattern

Stop cascading failures by preventing calls to failing services.

// AI-generated circuit breaker implementation
class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  private failures = 0;
  private lastFailureTime?: Date;
  private successfulProbes = 0;

  constructor(
    private readonly options: {
      failureThreshold: number;
      resetTimeout: number;
      probeThreshold: number;
      onStateChange?: (oldState: string, newState: string) => void;
    }
  ) {}

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (this.shouldAttemptReset()) {
        this.transitionTo('HALF_OPEN');
      } else {
        throw new Error('Circuit breaker is OPEN');
      }
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  private onSuccess() {
    this.failures = 0;
    if (this.state === 'HALF_OPEN') {
      this.successfulProbes++;
      if (this.successfulProbes >= this.options.probeThreshold) {
        this.transitionTo('CLOSED');
      }
    }
  }

  private onFailure() {
    this.failures++;
    this.lastFailureTime = new Date();

    // A single failure while probing reopens the circuit immediately
    if (this.state === 'HALF_OPEN') {
      this.transitionTo('OPEN');
    } else if (this.failures >= this.options.failureThreshold) {
      this.transitionTo('OPEN');
    }
  }

  private shouldAttemptReset(): boolean {
    return (
      this.lastFailureTime !== undefined &&
      Date.now() - this.lastFailureTime.getTime() > this.options.resetTimeout
    );
  }

  private transitionTo(newState: 'CLOSED' | 'OPEN' | 'HALF_OPEN') {
    const oldState = this.state;
    this.state = newState;
    if (newState === 'HALF_OPEN') {
      this.successfulProbes = 0;
    }
    this.options.onStateChange?.(oldState, newState);
  }
}
// Usage with fallback
const paymentBreaker = new CircuitBreaker({
  failureThreshold: 5,
  resetTimeout: 60000, // 1 minute
  probeThreshold: 3,
  onStateChange: (oldState, newState) => {
    logger.warn('Circuit breaker state changed', { oldState, newState });
  }
});

async function processPayment(order: Order) {
  try {
    return await paymentBreaker.execute(
      () => paymentService.process(order)
    );
  } catch (error) {
    if (error.message === 'Circuit breaker is OPEN') {
      // Fallback to queued processing
      return await queuePaymentForLater(order);
    }
    throw error;
  }
}

Retry with Exponential Backoff

Automatically retry failed operations with increasing delays.

// Intelligent retry mechanism with jitter
class RetryStrategy {
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    options: {
      maxRetries: number;
      initialDelay: number;
      maxDelay: number;
      factor: number;
      jitter?: boolean;
      retryableErrors?: (error: any) => boolean;
    }
  ): Promise<T> {
    let lastError: any;

    for (let attempt = 0; attempt <= options.maxRetries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        lastError = error;

        // Check if error is retryable
        if (options.retryableErrors && !options.retryableErrors(error)) {
          throw error;
        }

        if (attempt < options.maxRetries) {
          const delay = this.calculateDelay(
            attempt,
            options.initialDelay,
            options.maxDelay,
            options.factor,
            options.jitter
          );

          logger.debug('Retrying operation', {
            attempt: attempt + 1,
            maxRetries: options.maxRetries,
            delay,
            error: error.message
          });

          await this.sleep(delay);
        }
      }
    }

    throw new Error(
      `Operation failed after ${options.maxRetries} retries: ${lastError.message}`
    );
  }

  private calculateDelay(
    attempt: number,
    initialDelay: number,
    maxDelay: number,
    factor: number,
    jitter?: boolean
  ): number {
    // Exponential backoff
    let delay = Math.min(initialDelay * Math.pow(factor, attempt), maxDelay);

    // Add jitter to prevent thundering herd
    if (jitter) {
      delay = delay * (0.5 + Math.random() * 0.5);
    }

    return Math.floor(delay);
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage with custom retry logic
const retry = new RetryStrategy();

async function fetchUserData(userId: string) {
  return retry.executeWithRetry(
    () => apiClient.getUser(userId),
    {
      maxRetries: 3,
      initialDelay: 1000,
      maxDelay: 10000,
      factor: 2,
      jitter: true,
      retryableErrors: (error) => {
        // Retry on network errors and 5xx status codes
        return error.code === 'ECONNRESET' ||
          error.code === 'ETIMEDOUT' ||
          (error.status >= 500 && error.status < 600);
      }
    }
  );
}

Bulkhead Pattern

Isolate failures to prevent them from affecting the entire system.

Resource Isolation Pattern

// Thread pool isolation for different operations
class BulkheadManager {
  private bulkheads = new Map<string, Bulkhead>();

  createBulkhead(name: string, config: BulkheadConfig) {
    const bulkhead = new Bulkhead(config);
    this.bulkheads.set(name, bulkhead);
    return bulkhead;
  }

  getBulkhead(name: string): Bulkhead {
    const bulkhead = this.bulkheads.get(name);
    if (!bulkhead) {
      throw new Error(`Bulkhead ${name} not found`);
    }
    return bulkhead;
  }
}
class Bulkhead {
  private semaphore: Semaphore;
  private activeRequests = 0;
  private waitingRequests = 0;

  constructor(private config: BulkheadConfig) {
    this.semaphore = new Semaphore(config.maxConcurrent);
  }

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    // Reject immediately if too many callers are already waiting
    if (this.waitingRequests >= this.config.maxQueueSize) {
      throw new Error('Bulkhead queue is full');
    }

    // Wait for an available slot
    this.waitingRequests++;
    try {
      await this.semaphore.acquire();
    } finally {
      this.waitingRequests--;
    }

    this.activeRequests++;
    try {
      return await operation();
    } finally {
      this.activeRequests--;
      this.semaphore.release();
    }
  }

  getMetrics() {
    return {
      activeRequests: this.activeRequests,
      queueSize: this.waitingRequests,
      maxConcurrent: this.config.maxConcurrent,
      maxQueueSize: this.config.maxQueueSize
    };
  }
}
// Usage example
const bulkheadManager = new BulkheadManager();

// Create separate bulkheads for different operations
bulkheadManager.createBulkhead('payment', {
  maxConcurrent: 10,
  maxQueueSize: 50
});
bulkheadManager.createBulkhead('search', {
  maxConcurrent: 20,
  maxQueueSize: 100
});

// Use bulkheads to isolate operations
async function processPayment(order: Order) {
  const bulkhead = bulkheadManager.getBulkhead('payment');
  return bulkhead.execute(() => paymentService.process(order));
}

async function searchProducts(query: string) {
  const bulkhead = bulkheadManager.getBulkhead('search');
  return bulkhead.execute(() => searchService.search(query));
}
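
The bulkhead code above assumes a BulkheadConfig type and a promise-based Semaphore class, neither of which is shown. A minimal sketch of what they might look like:

// Hypothetical supporting types for the bulkhead sketch above
interface BulkheadConfig {
  maxConcurrent: number;
  maxQueueSize: number;
}

// A minimal promise-based counting semaphore
class Semaphore {
  private available: number;
  private waiters: Array<() => void> = [];

  constructor(permits: number) {
    this.available = permits;
  }

  acquire(): Promise<void> {
    if (this.available > 0) {
      this.available--;
      return Promise.resolve();
    }
    // No permit free: queue the caller until release() is called
    return new Promise(resolve => this.waiters.push(resolve));
  }

  release(): void {
    const next = this.waiters.shift();
    if (next) {
      // Hand the permit directly to the next waiter
      next();
    } else {
      this.available++;
    }
  }
}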

Timeout Patterns

Prevent resource exhaustion from slow operations.

// Comprehensive timeout handling
// Minimal error type so callers can distinguish timeouts from other failures
class TimeoutError extends Error {}

class TimeoutManager {
  async executeWithTimeout<T>(
    operation: () => Promise<T>,
    timeoutMs: number,
    options?: {
      onTimeout?: () => void;
      cancelOnTimeout?: boolean;
    }
  ): Promise<T> {
    // Start the operation once and race it against the timeout
    const operationPromise = operation();

    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => {
        options?.onTimeout?.();
        reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
    });

    try {
      return await Promise.race([operationPromise, timeoutPromise]);
    } catch (error) {
      if (error instanceof TimeoutError && options?.cancelOnTimeout) {
        // Attempt to cancel the operation if possible
        this.cancelOperation(operation);
      }
      throw error;
    } finally {
      // Clean up the timer once the race settles
      clearTimeout(timer);
    }
  }

  private cancelOperation(operation: any) {
    // Implementation depends on the operation type:
    // for HTTP requests, abort the request;
    // for database queries, kill the query;
    // for async operations, set a cancellation flag
  }
}
// Cascading timeouts for distributed calls
class CascadingTimeout {
  constructor(private totalTimeout: number) {}

  async executeWithCascadingTimeout<T>(
    operations: Array<{
      name: string;
      operation: () => Promise<any>;
      weight: number; // Relative importance/expected duration
    }>
  ): Promise<T[]> {
    const results: T[] = [];
    let remainingTime = this.totalTimeout;
    const startTime = Date.now();
    const totalWeight = operations.reduce((sum, op) => sum + op.weight, 0);

    for (const { name, operation, weight } of operations) {
      const allocatedTime = Math.floor((weight / totalWeight) * this.totalTimeout);
      const actualTimeout = Math.min(allocatedTime, remainingTime);

      logger.debug('Executing operation with timeout', {
        name,
        allocatedTime,
        actualTimeout,
        remainingTime
      });

      try {
        const result = await new TimeoutManager().executeWithTimeout(
          operation,
          actualTimeout
        );
        results.push(result);

        // Update remaining time
        remainingTime = this.totalTimeout - (Date.now() - startTime);
      } catch (error) {
        logger.error('Operation failed', { name, error });
        throw error;
      }
    }

    return results;
  }
}
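
For completeness, a short usage sketch for CascadingTimeout; the downstream services and weights here are illustrative assumptions, not part of the original example:

// Hypothetical usage: give the whole page render a 2-second budget,
// split across three downstream calls according to their weights
const cascade = new CascadingTimeout(2000);

async function renderProductPage(productId: string) {
  return cascade.executeWithCascadingTimeout([
    { name: 'product', operation: () => productService.get(productId), weight: 2 },
    { name: 'pricing', operation: () => pricingService.get(productId), weight: 1 },
    { name: 'reviews', operation: () => reviewService.list(productId), weight: 1 }
  ]);
}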

Fallback Pattern

Provide alternative functionality when primary systems fail.

// Multi-level fallback strategy
class FallbackChain<T> {
  private strategies: Array<{
    name: string;
    execute: () => Promise<T>;
    condition?: (error: any) => boolean;
  }> = [];

  addStrategy(
    name: string,
    execute: () => Promise<T>,
    condition?: (error: any) => boolean
  ) {
    this.strategies.push({ name, execute, condition });
    return this;
  }

  async execute(): Promise<T> {
    const errors: Array<{ strategy: string; error: any }> = [];

    for (const strategy of this.strategies) {
      try {
        logger.debug('Attempting strategy', { name: strategy.name });
        const result = await strategy.execute();
        logger.info('Strategy succeeded', { name: strategy.name });
        return result;
      } catch (error) {
        errors.push({ strategy: strategy.name, error });

        // Check if we should try the next strategy
        if (strategy.condition && !strategy.condition(error)) {
          logger.error('Strategy failed with non-recoverable error', {
            strategy: strategy.name,
            error
          });
          throw error;
        }

        logger.warn('Strategy failed, trying next', {
          strategy: strategy.name,
          error: error.message
        });
      }
    }

    // All strategies failed
    throw new Error(
      `All fallback strategies failed: ${errors
        .map(e => `${e.strategy}: ${e.error?.message ?? e.error}`)
        .join('; ')}`
    );
  }
}
// Real-world example: User profile loading
async function getUserProfile(userId: string): Promise<UserProfile> {
  const fallback = new FallbackChain<UserProfile>();

  return fallback
    .addStrategy('primary-db', async () => {
      // Try primary database
      return await primaryDb.getUser(userId);
    })
    .addStrategy('replica-db', async () => {
      // Fallback to read replica
      logger.warn('Using read replica for user profile');
      return await replicaDb.getUser(userId);
    })
    .addStrategy('cache', async () => {
      // Fallback to cache (might be stale)
      logger.warn('Using cached user profile');
      const cached = await cache.get(`user:${userId}`);
      if (!cached) throw new Error('Not in cache');
      return { ...cached, stale: true };
    })
    .addStrategy('default', async () => {
      // Last resort: return minimal profile
      logger.error('Using default user profile');
      return {
        id: userId,
        name: 'User',
        avatar: '/default-avatar.png',
        limited: true
      };
    })
    .execute();
}

Self-Healing Systems

Implement automated recovery without human intervention.

// Self-healing service manager
class SelfHealingService {
  private healthChecks = new Map<string, HealthCheck>();
  private healingStrategies = new Map<string, HealingStrategy>();
  private isHealing = false;

  registerHealthCheck(name: string, check: HealthCheck) {
    this.healthChecks.set(name, check);
  }

  registerHealingStrategy(name: string, strategy: HealingStrategy) {
    this.healingStrategies.set(name, strategy);
  }

  async monitorAndHeal() {
    setInterval(async () => {
      if (this.isHealing) return;

      for (const [name, check] of this.healthChecks) {
        try {
          const isHealthy = await check.isHealthy();
          if (!isHealthy) {
            await this.attemptHealing(name, check);
          }
        } catch (error) {
          logger.error('Health check failed', { name, error });
        }
      }
    }, 30000); // Check every 30 seconds
  }

  private async attemptHealing(checkName: string, check: HealthCheck) {
    this.isHealing = true;
    try {
      const diagnosis = await check.diagnose();
      logger.warn('Unhealthy service detected', {
        check: checkName,
        diagnosis
      });

      // Find appropriate healing strategy
      const strategy = this.findHealingStrategy(diagnosis);
      if (strategy) {
        logger.info('Attempting self-healing', {
          check: checkName,
          strategy: strategy.name
        });

        await strategy.heal(diagnosis);

        // Verify healing was successful
        await this.sleep(5000);
        const isHealthy = await check.isHealthy();

        if (isHealthy) {
          logger.info('Self-healing successful', {
            check: checkName,
            strategy: strategy.name
          });
        } else {
          logger.error('Self-healing failed', {
            check: checkName,
            strategy: strategy.name
          });
          // Could escalate to alerts here
        }
      }
    } finally {
      this.isHealing = false;
    }
  }

  private findHealingStrategy(diagnosis: Diagnosis): HealingStrategy | null {
    for (const [, strategy] of this.healingStrategies) {
      if (strategy.canHeal(diagnosis)) {
        return strategy;
      }
    }
    return null;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example: Database connection pool healing
const dbHealing = new SelfHealingService();

dbHealing.registerHealthCheck('db-connections', {
  isHealthy: async () => {
    const stats = await db.getPoolStats();
    return stats.active < stats.max * 0.9 && stats.waiting === 0;
  },
  diagnose: async () => {
    const stats = await db.getPoolStats();
    return {
      type: 'connection-exhaustion',
      stats,
      timestamp: new Date()
    };
  }
});

dbHealing.registerHealingStrategy('connection-recovery', {
  name: 'connection-recovery',
  canHeal: (diagnosis) => diagnosis.type === 'connection-exhaustion',
  heal: async (diagnosis) => {
    // Clear idle connections
    await db.clearIdleConnections();
    // Increase pool size temporarily
    await db.setPoolSize(diagnosis.stats.max * 1.5);
    // Schedule pool size reduction
    setTimeout(async () => {
      await db.setPoolSize(diagnosis.stats.max);
    }, 300000); // 5 minutes
  }
});
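
The self-healing code relies on HealthCheck, HealingStrategy, and Diagnosis types that are not defined above. One possible shape for them, as an assumption:

// Hypothetical shapes for the types the self-healing sketch assumes
interface Diagnosis {
  type: string;
  stats?: any;
  timestamp: Date;
}

interface HealthCheck {
  isHealthy(): Promise<boolean>;
  diagnose(): Promise<Diagnosis>;
}

interface HealingStrategy {
  name: string;
  canHeal(diagnosis: Diagnosis): boolean;
  heal(diagnosis: Diagnosis): Promise<void>;
}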

Chaos Engineering

Test recovery mechanisms proactively.

// Chaos engineering framework for testing recovery
class ChaosMonkey {
  private experiments = new Map<string, ChaosExperiment>();

  registerExperiment(experiment: ChaosExperiment) {
    this.experiments.set(experiment.name, experiment);
  }

  async runExperiment(name: string, options?: RunOptions) {
    const experiment = this.experiments.get(name);
    if (!experiment) {
      throw new Error(`Experiment ${name} not found`);
    }

    logger.info('Starting chaos experiment', { name });

    // Record initial state
    const initialMetrics = await this.captureMetrics();

    // Inject failure
    const cleanup = await experiment.inject();

    try {
      // Let the system respond
      await this.sleep(options?.duration || 60000);

      // Measure impact
      const impactMetrics = await this.captureMetrics();

      // Verify recovery mechanisms kicked in
      const recoverySuccess = await experiment.verifyRecovery();

      return {
        experiment: name,
        initialMetrics,
        impactMetrics,
        recoverySuccess,
        observations: experiment.observations
      };
    } finally {
      // Always clean up
      await cleanup();
    }
  }

  private async captureMetrics() {
    return {
      errorRate: await metrics.getErrorRate(),
      responseTime: await metrics.getResponseTime(),
      throughput: await metrics.getThroughput(),
      availability: await metrics.getAvailability()
    };
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example experiment: Test circuit breaker
const chaosMonkey = new ChaosMonkey();

chaosMonkey.registerExperiment({
  name: 'payment-service-failure',
  description: 'Simulate payment service outage',
  inject: async () => {
    // Make payment service return errors
    await mockServer.setResponse('/api/payment', {
      status: 500,
      body: { error: 'Internal Server Error' }
    });
    // Return cleanup function
    return async () => {
      await mockServer.resetResponse('/api/payment');
    };
  },
  verifyRecovery: async () => {
    // Check if circuit breaker opened
    const circuitState = await getCircuitBreakerState('payment');
    // Check if fallback was used
    const fallbackMetrics = await metrics.getFallbackUsage();
    return {
      circuitBreakerOpened: circuitState === 'OPEN',
      fallbackUsed: fallbackMetrics.count > 0,
      userExperienceMaintained: await checkUserExperience()
    };
  },
  observations: []
});
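
The ChaosMonkey class assumes ChaosExperiment and RunOptions types. A minimal sketch of how they might be declared:

// One possible shape for the experiment types assumed above:
// inject() returns an async cleanup function
interface ChaosExperiment {
  name: string;
  description?: string;
  inject(): Promise<() => Promise<void>>;
  verifyRecovery(): Promise<any>;
  observations: any[];
}

interface RunOptions {
  duration?: number; // how long to let the failure run, in milliseconds
}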

Recovery Best Practices

Fast Detection

  • Use health checks and synthetic monitoring (see the sketch after this list)
  • Set appropriate timeouts
  • Monitor error rates and latencies
  • Alert on anomalies, not just thresholds
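
A minimal sketch of such a health check, suitable for polling by a monitor or load balancer; the individual probes (database, cache) are assumptions to adapt to your own dependencies:

// Hypothetical aggregate health check, polled by a monitor or load balancer
async function checkHealth(): Promise<{ healthy: boolean; checks: Record<string, boolean> }> {
  const checks = {
    // The individual probes are assumptions: swap in your real dependencies
    database: await db.testConnection().catch(() => false),
    cache: await cache.get('health:ping').then(() => true, () => false)
  };
  return { healthy: Object.values(checks).every(Boolean), checks };
}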

Graceful Degradation

  • Identify core vs. optional features
  • Implement feature flags for quick disabling (see the sketch after this list)
  • Provide meaningful fallbacks
  • Communicate degradation to users
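
A sketch of feature-flag-driven degradation; the featureFlags client, getCoreContent, and the recommendation service are illustrative assumptions:

// Hypothetical feature-flag check: operators can switch off an optional
// feature quickly while the core experience keeps working
async function getHomepage(userId: string) {
  const core = await getCoreContent(userId);

  let recommendations: any[] = [];
  let degraded = false;

  if (await featureFlags.isEnabled('recommendations')) {
    try {
      recommendations = await recommendationService.forUser(userId);
    } catch (error) {
      // Degrade silently: the page still renders without recommendations
      logger.warn('Recommendations unavailable, degrading', { error });
      degraded = true;
    }
  } else {
    degraded = true;
  }

  // Surfacing the degraded flag lets the UI tell users what is missing
  return { ...core, recommendations, degraded };
}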

Automated Recovery

  • Implement self-healing mechanisms
  • Use circuit breakers to prevent cascades
  • Automate common recovery procedures
  • Test recovery paths regularly

Learn and Improve

  • Conduct blameless postmortems
  • Update runbooks based on incidents
  • Implement fixes to prevent recurrence
  • Share learnings across teams

Recovery Metrics

Track these metrics to measure recovery success:

// Recovery metrics collector
class RecoveryMetrics {
  private metrics = {
    mttr: new Map<string, number[]>(), // Mean Time To Recovery samples
    recoveryAttempts: new Map<string, number>(),
    recoverySuccesses: new Map<string, number>(),
    fallbackUsage: new Map<string, number>(),
    circuitBreakerTrips: new Map<string, number>()
  };

  recordRecovery(service: string, duration: number, success: boolean) {
    // Track MTTR samples
    if (!this.metrics.mttr.has(service)) {
      this.metrics.mttr.set(service, []);
    }
    this.metrics.mttr.get(service)!.push(duration);

    // Track attempts and successes so a success rate can be derived
    this.increment(this.metrics.recoveryAttempts, service);
    if (success) {
      this.increment(this.metrics.recoverySuccesses, service);
    }
  }

  getReport(service: string) {
    const mttrValues = this.metrics.mttr.get(service) || [];
    const avgMttr = mttrValues.length
      ? mttrValues.reduce((a, b) => a + b, 0) / mttrValues.length
      : 0;

    return {
      averageMTTR: avgMttr,
      recoverySuccessRate: this.calculateSuccessRate(service),
      fallbackUsageRate: this.metrics.fallbackUsage.get(service) || 0,
      circuitBreakerTrips: this.metrics.circuitBreakerTrips.get(service) || 0
    };
  }

  private calculateSuccessRate(service: string): number {
    const attempts = this.metrics.recoveryAttempts.get(service) || 0;
    const successes = this.metrics.recoverySuccesses.get(service) || 0;
    return attempts === 0 ? 1 : successes / attempts;
  }

  private increment(map: Map<string, number>, key: string) {
    map.set(key, (map.get(key) || 0) + 1);
  }
}
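
A brief usage sketch; the service name and durations are illustrative:

// Hypothetical usage: record recoveries and read back the report
const recoveryMetrics = new RecoveryMetrics();

// A 12-second recovery that succeeded and a 45-second one that did not
recoveryMetrics.recordRecovery('payment', 12000, true);
recoveryMetrics.recordRecovery('payment', 45000, false);

logger.info('Recovery report', recoveryMetrics.getReport('payment'));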

Recovery Runbooks

Create automated runbooks for common failure scenarios:

// Automated recovery runbook
class RecoveryRunbook {
  constructor(
    private name: string,
    private steps: RecoveryStep[]
  ) {}

  async execute(context: FailureContext): Promise<RecoveryResult> {
    const results: StepResult[] = [];

    logger.info('Executing recovery runbook', {
      runbook: this.name,
      context
    });

    for (const step of this.steps) {
      try {
        logger.info('Executing recovery step', { step: step.name });

        const result = await step.execute(context);
        results.push(result);

        if (!result.success && step.critical) {
          logger.error('Critical step failed', {
            step: step.name,
            result
          });
          break;
        }
      } catch (error) {
        logger.error('Recovery step failed', {
          step: step.name,
          error
        });
        if (step.critical) break;
      }
    }

    return {
      runbook: this.name,
      success: results.every(r => r.success),
      steps: results,
      timestamp: new Date()
    };
  }
}

// Example: Database recovery runbook
const dbRecoveryRunbook = new RecoveryRunbook('database-recovery', [
  {
    name: 'verify-connectivity',
    critical: true,
    execute: async (context) => {
      const canConnect = await db.testConnection();
      return { success: canConnect };
    }
  },
  {
    name: 'clear-connection-pool',
    critical: false,
    execute: async (context) => {
      await db.clearPool();
      return { success: true };
    }
  },
  {
    name: 'failover-to-replica',
    critical: true,
    execute: async (context) => {
      if (context.severity === 'critical') {
        await db.failoverToReplica();
        return { success: true };
      }
      return { success: false, skipped: true };
    }
  },
  {
    name: 'notify-oncall',
    critical: false,
    execute: async (context) => {
      await alerting.notifyOncall({
        severity: context.severity,
        runbook: 'database-recovery',
        context
      });
      return { success: true };
    }
  }
]);
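
The runbook code assumes RecoveryStep, FailureContext, StepResult, and RecoveryResult types that are not defined above. One possible shape, as an assumption:

// Hypothetical shapes for the runbook types used above
interface FailureContext {
  severity: 'low' | 'medium' | 'high' | 'critical';
  [key: string]: any;
}

interface StepResult {
  success: boolean;
  skipped?: boolean;
}

interface RecoveryStep {
  name: string;
  critical: boolean;
  execute(context: FailureContext): Promise<StepResult>;
}

interface RecoveryResult {
  runbook: string;
  success: boolean;
  steps: StepResult[];
  timestamp: Date;
}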
