Recovery Patterns

Building resilient systems isn’t just about preventing failures—it’s about recovering gracefully when things go wrong. With AI assistance, you can implement sophisticated recovery patterns that minimize downtime, prevent cascading failures, and maintain service quality even during disruptions. This guide covers proven recovery patterns for production systems.

Modern systems must embrace failure as inevitable and design for recovery:

Fail Fast, Recover Fast

Detect failures quickly and initiate recovery immediately

Graceful Degradation

Maintain core functionality even when some features fail

Automated Healing

Systems should recover without human intervention when possible

Learn from Failures

Each failure is an opportunity to improve resilience

Circuit Breaker Pattern

Stop cascading failures by preventing calls to failing services.

// AI-generated circuit breaker implementation
class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  private failures = 0;
  private lastFailureTime?: Date;
  private successfulProbes = 0;

  constructor(
    private readonly options: {
      failureThreshold: number;
      resetTimeout: number;
      probeThreshold: number;
      onStateChange?: (oldState: string, newState: string) => void;
    }
  ) {}

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (this.shouldAttemptReset()) {
        this.transitionTo('HALF_OPEN');
      } else {
        throw new Error('Circuit breaker is OPEN');
      }
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  private onSuccess() {
    this.failures = 0;
    if (this.state === 'HALF_OPEN') {
      this.successfulProbes++;
      if (this.successfulProbes >= this.options.probeThreshold) {
        this.transitionTo('CLOSED');
      }
    }
  }

  private onFailure() {
    this.failures++;
    this.lastFailureTime = new Date();

    // A single failure while probing reopens the circuit immediately
    if (this.state === 'HALF_OPEN') {
      this.transitionTo('OPEN');
    } else if (this.failures >= this.options.failureThreshold) {
      this.transitionTo('OPEN');
    }
  }

  private shouldAttemptReset(): boolean {
    return (
      this.lastFailureTime !== undefined &&
      Date.now() - this.lastFailureTime.getTime() > this.options.resetTimeout
    );
  }

  private transitionTo(newState: 'CLOSED' | 'OPEN' | 'HALF_OPEN') {
    const oldState = this.state;
    this.state = newState;
    if (newState === 'HALF_OPEN') {
      this.successfulProbes = 0;
    }
    this.options.onStateChange?.(oldState, newState);
  }
}
// Usage with fallback
const paymentBreaker = new CircuitBreaker({
  failureThreshold: 5,
  resetTimeout: 60000, // 1 minute
  probeThreshold: 3,
  onStateChange: (oldState, newState) => {
    logger.warn('Circuit breaker state changed', { oldState, newState });
  }
});

async function processPayment(order: Order) {
  try {
    return await paymentBreaker.execute(
      () => paymentService.process(order)
    );
  } catch (error) {
    if (error.message === 'Circuit breaker is OPEN') {
      // Fallback to queued processing
      return await queuePaymentForLater(order);
    }
    throw error;
  }
}

Retry with Exponential Backoff

Automatically retry failed operations with increasing delays.

// Intelligent retry mechanism with jitter
class RetryStrategy {
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    options: {
      maxRetries: number;
      initialDelay: number;
      maxDelay: number;
      factor: number;
      jitter?: boolean;
      retryableErrors?: (error: any) => boolean;
    }
  ): Promise<T> {
    let lastError: any;

    for (let attempt = 0; attempt <= options.maxRetries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        lastError = error;

        // Check if error is retryable
        if (options.retryableErrors && !options.retryableErrors(error)) {
          throw error;
        }

        if (attempt < options.maxRetries) {
          const delay = this.calculateDelay(
            attempt,
            options.initialDelay,
            options.maxDelay,
            options.factor,
            options.jitter
          );

          logger.debug('Retrying operation', {
            attempt: attempt + 1,
            maxRetries: options.maxRetries,
            delay,
            error: error.message
          });

          await this.sleep(delay);
        }
      }
    }

    throw new Error(
      `Operation failed after ${options.maxRetries} retries: ${lastError.message}`
    );
  }

  private calculateDelay(
    attempt: number,
    initialDelay: number,
    maxDelay: number,
    factor: number,
    jitter?: boolean
  ): number {
    // Exponential backoff
    let delay = Math.min(initialDelay * Math.pow(factor, attempt), maxDelay);

    // Add jitter to prevent thundering herd
    if (jitter) {
      delay = delay * (0.5 + Math.random() * 0.5);
    }

    return Math.floor(delay);
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage with custom retry logic
const retry = new RetryStrategy();

async function fetchUserData(userId: string) {
  return retry.executeWithRetry(
    () => apiClient.getUser(userId),
    {
      maxRetries: 3,
      initialDelay: 1000,
      maxDelay: 10000,
      factor: 2,
      jitter: true,
      retryableErrors: (error) => {
        // Retry on network errors and 5xx status codes
        return error.code === 'ECONNRESET' ||
          error.code === 'ETIMEDOUT' ||
          (error.status >= 500 && error.status < 600);
      }
    }
  );
}

Bulkhead Pattern

Isolate failures to prevent them from affecting the entire system.

Resource Isolation Pattern

// Thread pool isolation for different operations
class BulkheadManager {
  private bulkheads = new Map<string, Bulkhead>();

  createBulkhead(name: string, config: BulkheadConfig) {
    const bulkhead = new Bulkhead(config);
    this.bulkheads.set(name, bulkhead);
    return bulkhead;
  }

  getBulkhead(name: string): Bulkhead {
    const bulkhead = this.bulkheads.get(name);
    if (!bulkhead) {
      throw new Error(`Bulkhead ${name} not found`);
    }
    return bulkhead;
  }
}
class Bulkhead {
  private semaphore: Semaphore;
  private activeRequests = 0;
  private waitingRequests = 0;

  constructor(private config: BulkheadConfig) {
    this.semaphore = new Semaphore(config.maxConcurrent);
  }

  async execute<T>(operation: () => Promise<T>): Promise<T> {
    // Reject immediately if too many callers are already waiting
    if (this.waitingRequests >= this.config.maxQueueSize) {
      throw new Error('Bulkhead queue is full');
    }

    // Wait for an available slot
    this.waitingRequests++;
    try {
      await this.semaphore.acquire();
    } finally {
      this.waitingRequests--;
    }

    this.activeRequests++;
    try {
      return await operation();
    } finally {
      this.activeRequests--;
      this.semaphore.release();
    }
  }

  getMetrics() {
    return {
      activeRequests: this.activeRequests,
      queueSize: this.waitingRequests,
      maxConcurrent: this.config.maxConcurrent,
      maxQueueSize: this.config.maxQueueSize
    };
  }
}
// Usage example
const bulkheadManager = new BulkheadManager();

// Create separate bulkheads for different operations
bulkheadManager.createBulkhead('payment', {
  maxConcurrent: 10,
  maxQueueSize: 50
});
bulkheadManager.createBulkhead('search', {
  maxConcurrent: 20,
  maxQueueSize: 100
});

// Use bulkheads to isolate operations
async function processPayment(order: Order) {
  const bulkhead = bulkheadManager.getBulkhead('payment');
  return bulkhead.execute(() => paymentService.process(order));
}

async function searchProducts(query: string) {
  const bulkhead = bulkheadManager.getBulkhead('search');
  return bulkhead.execute(() => searchService.search(query));
}
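
The bulkhead code above assumes a BulkheadConfig type and a promise-based Semaphore class, neither of which is shown. A minimal sketch of what they might look like:

// Hypothetical supporting types for the bulkhead sketch above
interface BulkheadConfig {
  maxConcurrent: number;
  maxQueueSize: number;
}

// A minimal promise-based counting semaphore
class Semaphore {
  private available: number;
  private waiters: Array<() => void> = [];

  constructor(permits: number) {
    this.available = permits;
  }

  acquire(): Promise<void> {
    if (this.available > 0) {
      this.available--;
      return Promise.resolve();
    }
    // No permit free: queue the caller until release() is called
    return new Promise(resolve => this.waiters.push(resolve));
  }

  release(): void {
    const next = this.waiters.shift();
    if (next) {
      // Hand the permit directly to the next waiter
      next();
    } else {
      this.available++;
    }
  }
}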

Timeout Patterns

Prevent resource exhaustion from slow operations.

// Comprehensive timeout handling
// Minimal error type so callers can distinguish timeouts from other failures
class TimeoutError extends Error {}

class TimeoutManager {
  async executeWithTimeout<T>(
    operation: () => Promise<T>,
    timeoutMs: number,
    options?: {
      onTimeout?: () => void;
      cancelOnTimeout?: boolean;
    }
  ): Promise<T> {
    // Start the operation once and race it against the timeout
    const operationPromise = operation();

    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => {
        options?.onTimeout?.();
        reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
    });

    try {
      return await Promise.race([operationPromise, timeoutPromise]);
    } catch (error) {
      if (error instanceof TimeoutError && options?.cancelOnTimeout) {
        // Attempt to cancel the operation if possible
        this.cancelOperation(operation);
      }
      throw error;
    } finally {
      // Clean up the timer once the race settles
      clearTimeout(timer);
    }
  }

  private cancelOperation(operation: any) {
    // Implementation depends on the operation type:
    // for HTTP requests, abort the request;
    // for database queries, kill the query;
    // for async operations, set a cancellation flag
  }
}
// Cascading timeouts for distributed calls
class CascadingTimeout {
  constructor(private totalTimeout: number) {}

  async executeWithCascadingTimeout<T>(
    operations: Array<{
      name: string;
      operation: () => Promise<any>;
      weight: number; // Relative importance/expected duration
    }>
  ): Promise<T[]> {
    const results: T[] = [];
    let remainingTime = this.totalTimeout;
    const startTime = Date.now();
    const totalWeight = operations.reduce((sum, op) => sum + op.weight, 0);

    for (const { name, operation, weight } of operations) {
      const allocatedTime = Math.floor((weight / totalWeight) * this.totalTimeout);
      const actualTimeout = Math.min(allocatedTime, remainingTime);

      logger.debug('Executing operation with timeout', {
        name,
        allocatedTime,
        actualTimeout,
        remainingTime
      });

      try {
        const result = await new TimeoutManager().executeWithTimeout(
          operation,
          actualTimeout
        );
        results.push(result);

        // Update remaining time
        remainingTime = this.totalTimeout - (Date.now() - startTime);
      } catch (error) {
        logger.error('Operation failed', { name, error });
        throw error;
      }
    }

    return results;
  }
}
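
For completeness, a short usage sketch for CascadingTimeout; the downstream services and weights here are illustrative assumptions, not part of the original example:

// Hypothetical usage: give the whole page render a 2-second budget,
// split across three downstream calls according to their weights
const cascade = new CascadingTimeout(2000);

async function renderProductPage(productId: string) {
  return cascade.executeWithCascadingTimeout([
    { name: 'product', operation: () => productService.get(productId), weight: 2 },
    { name: 'pricing', operation: () => pricingService.get(productId), weight: 1 },
    { name: 'reviews', operation: () => reviewService.list(productId), weight: 1 }
  ]);
}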

Fallback Pattern

Provide alternative functionality when primary systems fail.

// Multi-level fallback strategy
class FallbackChain<T> {
  private strategies: Array<{
    name: string;
    execute: () => Promise<T>;
    condition?: (error: any) => boolean;
  }> = [];

  addStrategy(
    name: string,
    execute: () => Promise<T>,
    condition?: (error: any) => boolean
  ) {
    this.strategies.push({ name, execute, condition });
    return this;
  }

  async execute(): Promise<T> {
    const errors: Array<{ strategy: string; error: any }> = [];

    for (const strategy of this.strategies) {
      try {
        logger.debug('Attempting strategy', { name: strategy.name });
        const result = await strategy.execute();
        logger.info('Strategy succeeded', { name: strategy.name });
        return result;
      } catch (error) {
        errors.push({ strategy: strategy.name, error });

        // Check if we should try the next strategy
        if (strategy.condition && !strategy.condition(error)) {
          logger.error('Strategy failed with non-recoverable error', {
            strategy: strategy.name,
            error
          });
          throw error;
        }

        logger.warn('Strategy failed, trying next', {
          strategy: strategy.name,
          error: error.message
        });
      }
    }

    // All strategies failed
    throw new Error(
      `All fallback strategies failed: ${errors
        .map(e => `${e.strategy}: ${e.error?.message ?? e.error}`)
        .join('; ')}`
    );
  }
}
// Real-world example: User profile loading
async function getUserProfile(userId: string): Promise<UserProfile> {
  const fallback = new FallbackChain<UserProfile>();

  return fallback
    .addStrategy('primary-db', async () => {
      // Try primary database
      return await primaryDb.getUser(userId);
    })
    .addStrategy('replica-db', async () => {
      // Fallback to read replica
      logger.warn('Using read replica for user profile');
      return await replicaDb.getUser(userId);
    })
    .addStrategy('cache', async () => {
      // Fallback to cache (might be stale)
      logger.warn('Using cached user profile');
      const cached = await cache.get(`user:${userId}`);
      if (!cached) throw new Error('Not in cache');
      return { ...cached, stale: true };
    })
    .addStrategy('default', async () => {
      // Last resort: return minimal profile
      logger.error('Using default user profile');
      return {
        id: userId,
        name: 'User',
        avatar: '/default-avatar.png',
        limited: true
      };
    })
    .execute();
}

Self-Healing Systems

Implement automated recovery without human intervention.

// Self-healing service manager
class SelfHealingService {
  private healthChecks = new Map<string, HealthCheck>();
  private healingStrategies = new Map<string, HealingStrategy>();
  private isHealing = false;

  registerHealthCheck(name: string, check: HealthCheck) {
    this.healthChecks.set(name, check);
  }

  registerHealingStrategy(name: string, strategy: HealingStrategy) {
    this.healingStrategies.set(name, strategy);
  }

  async monitorAndHeal() {
    setInterval(async () => {
      if (this.isHealing) return;

      for (const [name, check] of this.healthChecks) {
        try {
          const isHealthy = await check.isHealthy();
          if (!isHealthy) {
            await this.attemptHealing(name, check);
          }
        } catch (error) {
          logger.error('Health check failed', { name, error });
        }
      }
    }, 30000); // Check every 30 seconds
  }

  private async attemptHealing(checkName: string, check: HealthCheck) {
    this.isHealing = true;
    try {
      const diagnosis = await check.diagnose();
      logger.warn('Unhealthy service detected', {
        check: checkName,
        diagnosis
      });

      // Find appropriate healing strategy
      const strategy = this.findHealingStrategy(diagnosis);
      if (strategy) {
        logger.info('Attempting self-healing', {
          check: checkName,
          strategy: strategy.name
        });

        await strategy.heal(diagnosis);

        // Verify healing was successful
        await this.sleep(5000);
        const isHealthy = await check.isHealthy();

        if (isHealthy) {
          logger.info('Self-healing successful', {
            check: checkName,
            strategy: strategy.name
          });
        } else {
          logger.error('Self-healing failed', {
            check: checkName,
            strategy: strategy.name
          });
          // Could escalate to alerts here
        }
      }
    } finally {
      this.isHealing = false;
    }
  }

  private findHealingStrategy(diagnosis: Diagnosis): HealingStrategy | null {
    for (const [, strategy] of this.healingStrategies) {
      if (strategy.canHeal(diagnosis)) {
        return strategy;
      }
    }
    return null;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example: Database connection pool healing
const dbHealing = new SelfHealingService();

dbHealing.registerHealthCheck('db-connections', {
  isHealthy: async () => {
    const stats = await db.getPoolStats();
    return stats.active < stats.max * 0.9 && stats.waiting === 0;
  },
  diagnose: async () => {
    const stats = await db.getPoolStats();
    return {
      type: 'connection-exhaustion',
      stats,
      timestamp: new Date()
    };
  }
});

dbHealing.registerHealingStrategy('connection-recovery', {
  name: 'connection-recovery',
  canHeal: (diagnosis) => diagnosis.type === 'connection-exhaustion',
  heal: async (diagnosis) => {
    // Clear idle connections
    await db.clearIdleConnections();
    // Increase pool size temporarily
    await db.setPoolSize(diagnosis.stats.max * 1.5);
    // Schedule pool size reduction
    setTimeout(async () => {
      await db.setPoolSize(diagnosis.stats.max);
    }, 300000); // 5 minutes
  }
});
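
The self-healing code relies on HealthCheck, HealingStrategy, and Diagnosis types that are not defined above. One possible shape for them, as an assumption:

// Hypothetical shapes for the types the self-healing sketch assumes
interface Diagnosis {
  type: string;
  stats?: any;
  timestamp: Date;
}

interface HealthCheck {
  isHealthy(): Promise<boolean>;
  diagnose(): Promise<Diagnosis>;
}

interface HealingStrategy {
  name: string;
  canHeal(diagnosis: Diagnosis): boolean;
  heal(diagnosis: Diagnosis): Promise<void>;
}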

Chaos Engineering

Test recovery mechanisms proactively.

// Chaos engineering framework for testing recovery
class ChaosMonkey {
  private experiments = new Map<string, ChaosExperiment>();

  registerExperiment(experiment: ChaosExperiment) {
    this.experiments.set(experiment.name, experiment);
  }

  async runExperiment(name: string, options?: RunOptions) {
    const experiment = this.experiments.get(name);
    if (!experiment) {
      throw new Error(`Experiment ${name} not found`);
    }

    logger.info('Starting chaos experiment', { name });

    // Record initial state
    const initialMetrics = await this.captureMetrics();

    // Inject failure
    const cleanup = await experiment.inject();

    try {
      // Let the system respond
      await this.sleep(options?.duration || 60000);

      // Measure impact
      const impactMetrics = await this.captureMetrics();

      // Verify recovery mechanisms kicked in
      const recoverySuccess = await experiment.verifyRecovery();

      return {
        experiment: name,
        initialMetrics,
        impactMetrics,
        recoverySuccess,
        observations: experiment.observations
      };
    } finally {
      // Always clean up
      await cleanup();
    }
  }

  private async captureMetrics() {
    return {
      errorRate: await metrics.getErrorRate(),
      responseTime: await metrics.getResponseTime(),
      throughput: await metrics.getThroughput(),
      availability: await metrics.getAvailability()
    };
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example experiment: Test circuit breaker
const chaosMonkey = new ChaosMonkey();

chaosMonkey.registerExperiment({
  name: 'payment-service-failure',
  description: 'Simulate payment service outage',
  inject: async () => {
    // Make payment service return errors
    await mockServer.setResponse('/api/payment', {
      status: 500,
      body: { error: 'Internal Server Error' }
    });
    // Return cleanup function
    return async () => {
      await mockServer.resetResponse('/api/payment');
    };
  },
  verifyRecovery: async () => {
    // Check if circuit breaker opened
    const circuitState = await getCircuitBreakerState('payment');
    // Check if fallback was used
    const fallbackMetrics = await metrics.getFallbackUsage();
    return {
      circuitBreakerOpened: circuitState === 'OPEN',
      fallbackUsed: fallbackMetrics.count > 0,
      userExperienceMaintained: await checkUserExperience()
    };
  },
  observations: []
});
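
The ChaosMonkey class assumes ChaosExperiment and RunOptions types. A minimal sketch of how they might be declared:

// One possible shape for the experiment types assumed above:
// inject() returns an async cleanup function
interface ChaosExperiment {
  name: string;
  description?: string;
  inject(): Promise<() => Promise<void>>;
  verifyRecovery(): Promise<any>;
  observations: any[];
}

interface RunOptions {
  duration?: number; // how long to let the failure run, in milliseconds
}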

Recovery Best Practices

Fast Detection

  • Use health checks and synthetic monitoring (see the sketch after this list)
  • Set appropriate timeouts
  • Monitor error rates and latencies
  • Alert on anomalies, not just thresholds
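
A minimal sketch of such a health check, suitable for polling by a monitor or load balancer; the individual probes (database, cache) are assumptions to adapt to your own dependencies:

// Hypothetical aggregate health check, polled by a monitor or load balancer
async function checkHealth(): Promise<{ healthy: boolean; checks: Record<string, boolean> }> {
  const checks = {
    // The individual probes are assumptions: swap in your real dependencies
    database: await db.testConnection().catch(() => false),
    cache: await cache.get('health:ping').then(() => true, () => false)
  };
  return { healthy: Object.values(checks).every(Boolean), checks };
}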

Graceful Degradation

  • Identify core vs. optional features
  • Implement feature flags for quick disabling (see the sketch after this list)
  • Provide meaningful fallbacks
  • Communicate degradation to users
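
A sketch of feature-flag-driven degradation; the featureFlags client, getCoreContent, and the recommendation service are illustrative assumptions:

// Hypothetical feature-flag check: operators can switch off an optional
// feature quickly while the core experience keeps working
async function getHomepage(userId: string) {
  const core = await getCoreContent(userId);

  let recommendations: any[] = [];
  let degraded = false;

  if (await featureFlags.isEnabled('recommendations')) {
    try {
      recommendations = await recommendationService.forUser(userId);
    } catch (error) {
      // Degrade silently: the page still renders without recommendations
      logger.warn('Recommendations unavailable, degrading', { error });
      degraded = true;
    }
  } else {
    degraded = true;
  }

  // Surfacing the degraded flag lets the UI tell users what is missing
  return { ...core, recommendations, degraded };
}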

Automated Recovery

  • Implement self-healing mechanisms
  • Use circuit breakers to prevent cascades
  • Automate common recovery procedures
  • Test recovery paths regularly

Learn and Improve

  • Conduct blameless postmortems
  • Update runbooks based on incidents
  • Implement fixes to prevent recurrence
  • Share learnings across teams

Recovery Metrics

Track these metrics to measure recovery success:

// Recovery metrics collector
class RecoveryMetrics {
  private metrics = {
    mttr: new Map<string, number[]>(), // Mean Time To Recovery samples
    recoveryAttempts: new Map<string, number>(),
    recoverySuccesses: new Map<string, number>(),
    fallbackUsage: new Map<string, number>(),
    circuitBreakerTrips: new Map<string, number>()
  };

  recordRecovery(service: string, duration: number, success: boolean) {
    // Track MTTR samples
    if (!this.metrics.mttr.has(service)) {
      this.metrics.mttr.set(service, []);
    }
    this.metrics.mttr.get(service)!.push(duration);

    // Track attempts and successes so a success rate can be derived
    this.increment(this.metrics.recoveryAttempts, service);
    if (success) {
      this.increment(this.metrics.recoverySuccesses, service);
    }
  }

  getReport(service: string) {
    const mttrValues = this.metrics.mttr.get(service) || [];
    const avgMttr = mttrValues.length
      ? mttrValues.reduce((a, b) => a + b, 0) / mttrValues.length
      : 0;

    return {
      averageMTTR: avgMttr,
      recoverySuccessRate: this.calculateSuccessRate(service),
      fallbackUsageRate: this.metrics.fallbackUsage.get(service) || 0,
      circuitBreakerTrips: this.metrics.circuitBreakerTrips.get(service) || 0
    };
  }

  private calculateSuccessRate(service: string): number {
    const attempts = this.metrics.recoveryAttempts.get(service) || 0;
    const successes = this.metrics.recoverySuccesses.get(service) || 0;
    return attempts === 0 ? 1 : successes / attempts;
  }

  private increment(map: Map<string, number>, key: string) {
    map.set(key, (map.get(key) || 0) + 1);
  }
}
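
A brief usage sketch; the service name and durations are illustrative:

// Hypothetical usage: record recoveries and read back the report
const recoveryMetrics = new RecoveryMetrics();

// A 12-second recovery that succeeded and a 45-second one that did not
recoveryMetrics.recordRecovery('payment', 12000, true);
recoveryMetrics.recordRecovery('payment', 45000, false);

logger.info('Recovery report', recoveryMetrics.getReport('payment'));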

Recovery Runbooks

Create automated runbooks for common failure scenarios:

// Automated recovery runbook
class RecoveryRunbook {
  constructor(
    private name: string,
    private steps: RecoveryStep[]
  ) {}

  async execute(context: FailureContext): Promise<RecoveryResult> {
    const results: StepResult[] = [];

    logger.info('Executing recovery runbook', {
      runbook: this.name,
      context
    });

    for (const step of this.steps) {
      try {
        logger.info('Executing recovery step', { step: step.name });

        const result = await step.execute(context);
        results.push(result);

        if (!result.success && step.critical) {
          logger.error('Critical step failed', {
            step: step.name,
            result
          });
          break;
        }
      } catch (error) {
        logger.error('Recovery step failed', {
          step: step.name,
          error
        });
        if (step.critical) break;
      }
    }

    return {
      runbook: this.name,
      success: results.every(r => r.success),
      steps: results,
      timestamp: new Date()
    };
  }
}

// Example: Database recovery runbook
const dbRecoveryRunbook = new RecoveryRunbook('database-recovery', [
  {
    name: 'verify-connectivity',
    critical: true,
    execute: async (context) => {
      const canConnect = await db.testConnection();
      return { success: canConnect };
    }
  },
  {
    name: 'clear-connection-pool',
    critical: false,
    execute: async (context) => {
      await db.clearPool();
      return { success: true };
    }
  },
  {
    name: 'failover-to-replica',
    critical: true,
    execute: async (context) => {
      if (context.severity === 'critical') {
        await db.failoverToReplica();
        return { success: true };
      }
      return { success: false, skipped: true };
    }
  },
  {
    name: 'notify-oncall',
    critical: false,
    execute: async (context) => {
      await alerting.notifyOncall({
        severity: context.severity,
        runbook: 'database-recovery',
        context
      });
      return { success: true };
    }
  }
]);
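
The runbook code assumes RecoveryStep, FailureContext, StepResult, and RecoveryResult types that are not defined above. One possible shape, as an assumption:

// Hypothetical shapes for the runbook types used above
interface FailureContext {
  severity: 'low' | 'medium' | 'high' | 'critical';
  [key: string]: any;
}

interface StepResult {
  success: boolean;
  skipped?: boolean;
}

interface RecoveryStep {
  name: string;
  critical: boolean;
  execute(context: FailureContext): Promise<StepResult>;
}

interface RecoveryResult {
  runbook: string;
  success: boolean;
  steps: StepResult[];
  timestamp: Date;
}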
