Fail Fast, Recover Fast
Detect failures quickly and initiate recovery immediately
This content is not yet available in your language.
Building resilient systems isn’t just about preventing failures—it’s about recovering gracefully when things go wrong. With AI assistance, you can implement sophisticated recovery patterns that minimize downtime, prevent cascading failures, and maintain service quality even during disruptions. This guide covers proven recovery patterns for production systems.
Modern systems must embrace failure as inevitable and design for recovery:
Fail Fast, Recover Fast
Detect failures quickly and initiate recovery immediately
Graceful Degradation
Maintain core functionality even when some features fail
Automated Healing
Systems should recover without human intervention when possible
Learn from Failures
Each failure is an opportunity to improve resilience
Stop cascading failures by preventing calls to failing services.
// AI-generated circuit breaker implementation
/**
 * Circuit breaker that stops calling an operation once it keeps failing.
 *
 * - CLOSED: calls pass through; consecutive failures are counted.
 * - OPEN: calls are rejected with `Error('Circuit breaker is OPEN')` until more
 *   than `resetTimeout` ms have elapsed since the last recorded failure.
 * - HALF_OPEN: calls pass through as probes; `probeThreshold` successes close
 *   the circuit, a single failure re-opens it.
 */
class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  private failures = 0;
  private lastFailureTime?: Date;
  private successfulProbes = 0;

  /**
   * @param options.failureThreshold Failure count at which the circuit opens.
   * @param options.resetTimeout Milliseconds to stay OPEN before probing again.
   * @param options.probeThreshold Successful probes required to close again.
   * @param options.onStateChange Optional callback invoked on every real state change.
   */
  constructor(
    private readonly options: {
      failureThreshold: number;
      resetTimeout: number;
      probeThreshold: number;
      onStateChange?: (oldState: string, newState: string) => void;
    }
  ) {}

  /**
   * Runs `operation` through the breaker.
   *
   * @returns The value the operation resolves with.
   * @throws Error('Circuit breaker is OPEN') when the circuit is open and the
   *   reset timeout has not elapsed; otherwise rethrows the operation's error.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (!this.shouldAttemptReset()) {
        throw new Error('Circuit breaker is OPEN');
      }
      this.transitionTo('HALF_OPEN');
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** Resets the failure count and, while probing, counts towards closing. */
  private onSuccess(): void {
    this.failures = 0;

    if (this.state === 'HALF_OPEN') {
      this.successfulProbes++;
      if (this.successfulProbes >= this.options.probeThreshold) {
        this.transitionTo('CLOSED');
      }
    }
  }

  /** Records a failure and opens the circuit when warranted. */
  private onFailure(): void {
    this.failures++;
    this.lastFailureTime = new Date();

    // A failed probe re-opens immediately; otherwise open at the threshold.
    if (
      this.state === 'HALF_OPEN' ||
      this.failures >= this.options.failureThreshold
    ) {
      this.transitionTo('OPEN');
    }
  }

  /** True once more than `resetTimeout` ms have passed since the last failure. */
  private shouldAttemptReset(): boolean {
    if (this.lastFailureTime === undefined) {
      return false;
    }
    const elapsed = Date.now() - this.lastFailureTime.getTime();
    return elapsed > this.options.resetTimeout;
  }

  /** Moves to `newState` and notifies the listener; same-state calls are no-ops. */
  private transitionTo(newState: 'CLOSED' | 'OPEN' | 'HALF_OPEN'): void {
    const oldState = this.state;
    if (oldState === newState) {
      // In-flight calls that fail after the circuit already opened must not
      // emit a spurious OPEN -> OPEN notification.
      return;
    }
    this.state = newState;

    if (newState === 'HALF_OPEN') {
      this.successfulProbes = 0;
    }

    this.options.onStateChange?.(oldState, newState);
  }
}
// Usage with fallback
const paymentBreaker = new CircuitBreaker({
  failureThreshold: 5,
  resetTimeout: 60000, // 1 minute
  probeThreshold: 3,
  onStateChange: (oldState, newState) => {
    logger.warn('Circuit breaker state changed', { oldState, newState });
  }
});

/**
 * Processes a payment through the circuit breaker.
 *
 * When the breaker rejects the call because it is open, the order is queued
 * for later processing instead. Any other error is rethrown unchanged.
 */
async function processPayment(order: Order) {
  try {
    return await paymentBreaker.execute(
      () => paymentService.process(order)
    );
  } catch (error) {
    // Narrow first: the caught value is not guaranteed to be an Error, and
    // reading `.message` on null/undefined would mask the real failure.
    if (error instanceof Error && error.message === 'Circuit breaker is OPEN') {
      // Fallback to queued processing
      return await queuePaymentForLater(order);
    }
    throw error;
  }
}
// Using Resilience4j with Spring Boot@Servicepublic class PaymentService { private final CircuitBreaker circuitBreaker; private final PaymentClient paymentClient;
public PaymentService() { CircuitBreakerConfig config = CircuitBreakerConfig.custom() .failureRateThreshold(50) .waitDurationInOpenState(Duration.ofSeconds(60)) .slidingWindowSize(100) .permittedNumberOfCallsInHalfOpenState(3) .recordExceptions(IOException.class, TimeoutException.class) .ignoreExceptions(BusinessException.class) .build();
this.circuitBreaker = CircuitBreaker.of("payment", config);
// Add event listeners circuitBreaker.getEventPublisher() .onStateTransition(event -> log.warn("Circuit breaker state transition: {}", event)); }
public PaymentResult processPayment(Order order) { return circuitBreaker.executeSupplier( () -> paymentClient.process(order), throwable -> { log.error("Payment failed, using fallback", throwable); return queuePaymentForLater(order); } ); }}
Automatically retry failed operations with increasing delays.
// Intelligent retry mechanism with jitter
/** Retries a failing async operation with exponential backoff and optional jitter. */
class RetryStrategy {
  /**
   * Runs `operation`, retrying on failure.
   *
   * The operation is always attempted at least once, then retried up to
   * `maxRetries` more times. The delay before retry `n` (0-based) is
   * `min(initialDelay * factor^n, maxDelay)`, optionally scaled by jitter.
   *
   * @param options.retryableErrors When provided and it returns false for an
   *   error, that error is rethrown immediately without further retries.
   * @returns The first successful result.
   * @throws The original error if it is not retryable; otherwise an Error whose
   *   message starts with "Operation failed after N retries:" and whose `cause`
   *   property holds the last error.
   */
  async executeWithRetry<T>(
    operation: () => Promise<T>,
    options: {
      maxRetries: number;
      initialDelay: number;
      maxDelay: number;
      factor: number;
      jitter?: boolean;
      // `any` is kept so existing predicates that read error.code / error.status still type-check.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      retryableErrors?: (error: any) => boolean;
    }
  ): Promise<T> {
    // A negative or NaN retry count must still allow the initial attempt.
    const maxRetries = Number.isNaN(options.maxRetries)
      ? 0
      : Math.max(0, options.maxRetries);
    let lastError: unknown;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        lastError = error;

        // Check if error is retryable
        if (options.retryableErrors && !options.retryableErrors(error)) {
          throw error;
        }

        if (attempt < maxRetries) {
          const delay = this.calculateDelay(
            attempt,
            options.initialDelay,
            options.maxDelay,
            options.factor,
            options.jitter
          );

          logger.debug('Retrying operation', {
            attempt: attempt + 1,
            maxRetries,
            delay,
            error: this.describeError(error)
          });

          await this.sleep(delay);
        }
      }
    }

    // Keep the original failure reachable for callers that need to inspect it.
    throw Object.assign(
      new Error(
        `Operation failed after ${maxRetries} retries: ${this.describeError(lastError)}`
      ),
      { cause: lastError }
    );
  }

  /** Computes the capped exponential delay for a 0-based attempt, in whole ms. */
  private calculateDelay(
    attempt: number,
    initialDelay: number,
    maxDelay: number,
    factor: number,
    jitter?: boolean
  ): number {
    // Exponential backoff
    let delay = Math.min(initialDelay * Math.pow(factor, attempt), maxDelay);

    // Add jitter to prevent thundering herd: scales to [50%, 100%) of the delay
    if (jitter) {
      delay = delay * (0.5 + Math.random() * 0.5);
    }

    return Math.floor(delay);
  }

  /** Human-readable text for a thrown value, which may not be an Error. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Usage with custom retry logic
const retry = new RetryStrategy();

/** Loads a user through the API client, retrying transient failures. */
async function fetchUserData(userId: string) {
  // Retry on network errors and 5xx status codes
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const isTransient = (error: any): boolean => {
    if (error.code === 'ECONNRESET') {
      return true;
    }
    if (error.code === 'ETIMEDOUT') {
      return true;
    }
    return error.status >= 500 && error.status < 600;
  };

  const loadUser = () => apiClient.getUser(userId);

  return retry.executeWithRetry(loadUser, {
    maxRetries: 3,
    initialDelay: 1000,
    maxDelay: 10000,
    factor: 2,
    jitter: true,
    retryableErrors: isTransient
  });
}
Isolate failures to prevent them from affecting the entire system.
Resource Isolation Pattern
// Named registry of bulkheads, one per isolated kind of operation
class BulkheadManager {
  private bulkheads = new Map<string, Bulkhead>();

  /** Builds a bulkhead from `config`, stores it under `name`, and returns it. */
  createBulkhead(name: string, config: BulkheadConfig) {
    const created = new Bulkhead(config);
    this.bulkheads.set(name, created);
    return created;
  }

  /**
   * Looks up a previously created bulkhead.
   *
   * @throws Error when no bulkhead is stored under `name`.
   */
  getBulkhead(name: string): Bulkhead {
    const found = this.bulkheads.get(name);
    if (found) {
      return found;
    }
    throw new Error(`Bulkhead ${name} not found`);
  }
}
/**
 * Limits how many operations run at once and how many may wait for a slot.
 *
 * At most `config.maxConcurrent` operations run concurrently. Further callers
 * wait in a FIFO queue of at most `config.maxQueueSize` entries; beyond that,
 * calls are rejected immediately.
 */
class Bulkhead {
  // Resolvers of callers waiting for a slot, oldest first.
  private readonly queue: Array<() => void> = [];
  private activeRequests = 0;

  constructor(private readonly config: BulkheadConfig) {}

  /**
   * Runs `operation` once a slot is available.
   *
   * @returns The value the operation resolves with.
   * @throws Error('Bulkhead queue is full') when every slot is busy and the
   *   wait queue is already at `maxQueueSize`; otherwise rethrows the
   *   operation's own error.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.activeRequests >= this.config.maxConcurrent) {
      // Check if queue is full
      if (this.queue.length >= this.config.maxQueueSize) {
        throw new Error('Bulkhead queue is full');
      }

      // Wait for available slot. The finishing request hands its slot over
      // directly, so activeRequests already accounts for this caller on wake-up.
      await new Promise<void>(resolve => this.queue.push(resolve));
    } else {
      this.activeRequests++;
    }

    try {
      return await operation();
    } finally {
      const next = this.queue.shift();
      if (next) {
        // Transfer the slot to the oldest waiter without freeing it in between.
        next();
      } else {
        this.activeRequests--;
      }
    }
  }

  /** Snapshot of current load and the configured limits. */
  getMetrics() {
    return {
      activeRequests: this.activeRequests,
      queueSize: this.queue.length,
      maxConcurrent: this.config.maxConcurrent,
      maxQueueSize: this.config.maxQueueSize
    };
  }
}
// Usage example
const bulkheadManager = new BulkheadManager();

// Create separate bulkheads for different operations
bulkheadManager.createBulkhead('payment', { maxConcurrent: 10, maxQueueSize: 50 });
bulkheadManager.createBulkhead('search', { maxConcurrent: 20, maxQueueSize: 100 });

// Use bulkheads to isolate operations

/** Processes a payment inside the 'payment' bulkhead. */
async function processPayment(order: Order) {
  const runPayment = () => paymentService.process(order);
  return bulkheadManager.getBulkhead('payment').execute(runPayment);
}

/** Runs a product search inside the 'search' bulkhead. */
async function searchProducts(query: string) {
  const runSearch = () => searchService.search(query);
  return bulkheadManager.getBulkhead('search').execute(runSearch);
}
Prevent resource exhaustion from slow operations.
// Comprehensive timeout handling
/** Runs async operations with an upper bound on how long the caller waits. */
class TimeoutManager {
  /**
   * Races `operation` against a timer of `timeoutMs` milliseconds.
   *
   * The operation is started exactly once, and the timer is always cleared
   * once the race settles.
   *
   * @param options.onTimeout Called when the timer fires, before rejecting.
   * @param options.cancelOnTimeout When true, `cancelOperation` is invoked
   *   after a timeout.
   * @returns The operation's result when it settles first.
   * @throws TimeoutError when the timer fires first; otherwise the operation's
   *   own error.
   */
  async executeWithTimeout<T>(
    operation: () => Promise<T>,
    timeoutMs: number,
    options?: {
      onTimeout?: () => void;
      cancelOnTimeout?: boolean;
    }
  ): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;

    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => {
        options?.onTimeout?.();
        reject(new TimeoutError(`Operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
    });

    try {
      return await Promise.race([operation(), timeoutPromise]);
    } catch (error) {
      if (error instanceof TimeoutError && options?.cancelOnTimeout) {
        // Attempt to cancel the operation if possible
        this.cancelOperation(operation);
      }
      throw error;
    } finally {
      // Clean up the timer whichever side won the race
      clearTimeout(timer);
    }
  }

  /** Placeholder hook: performs no cancellation yet. */
  private cancelOperation(operation: any) {
    // Implementation depends on the operation type
    // For HTTP requests, abort the request
    // For database queries, kill the query
    // For async operations, set a cancellation flag
  }
}
// Cascading timeouts for distributed calls
/** Runs operations sequentially, splitting one total time budget between them. */
class CascadingTimeout {
  constructor(private totalTimeout: number) {}

  /**
   * Executes the operations in order, each under its own timeout.
   *
   * Each operation is allocated `weight / totalWeight` of the total budget,
   * capped by whatever time is still left. If the weights do not sum to a
   * positive number, the budget is split evenly instead.
   *
   * @returns The results in the same order as `operations`.
   * @throws Whatever the first failing (or timed-out) operation throws; later
   *   operations are not started.
   */
  async executeWithCascadingTimeout<T>(
    operations: Array<{
      name: string;
      operation: () => Promise<any>;
      weight: number; // Relative importance/expected duration
    }>
  ): Promise<T[]> {
    const results: T[] = [];
    let remainingTime = this.totalTimeout;
    const startTime = Date.now();
    const timeoutManager = new TimeoutManager();

    const totalWeight = operations.reduce((sum, op) => sum + op.weight, 0);

    for (const { name, operation, weight } of operations) {
      // Dividing by a zero total would make every timeout NaN.
      const share = totalWeight > 0 ? weight / totalWeight : 1 / operations.length;
      const allocatedTime = Math.floor(share * this.totalTimeout);
      // Never hand a negative delay to the timer once the budget is used up.
      const actualTimeout = Math.max(0, Math.min(allocatedTime, remainingTime));

      logger.debug('Executing operation with timeout', {
        name,
        allocatedTime,
        actualTimeout,
        remainingTime
      });

      try {
        const result = await timeoutManager.executeWithTimeout(
          operation,
          actualTimeout
        );
        results.push(result);

        // Update remaining time
        remainingTime = this.totalTimeout - (Date.now() - startTime);
      } catch (error) {
        logger.error('Operation failed', { name, error });
        throw error;
      }
    }

    return results;
  }
}
Provide alternative functionality when primary systems fail.
// Multi-level fallback strategy
/**
 * Ordered list of strategies tried one after another until one succeeds.
 *
 * A strategy may carry a `condition`; when that strategy fails and its
 * condition returns false for the error, the error is treated as
 * non-recoverable and rethrown without trying the remaining strategies.
 */
class FallbackChain<T> {
  private strategies: Array<{
    name: string;
    execute: () => Promise<T>;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    condition?: (error: any) => boolean;
  }> = [];

  /** Appends a strategy and returns the chain for fluent use. */
  addStrategy(
    name: string,
    execute: () => Promise<T>,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    condition?: (error: any) => boolean
  ) {
    this.strategies.push({ name, execute, condition });
    return this;
  }

  /**
   * Runs the strategies in insertion order.
   *
   * @returns The result of the first strategy that succeeds.
   * @throws The strategy's own error when its condition rejects falling back;
   *   otherwise an Error listing every strategy's failure message, with the
   *   raw errors attached as an `errors` property.
   */
  async execute(): Promise<T> {
    const errors: Array<{ strategy: string; error: unknown }> = [];

    for (const strategy of this.strategies) {
      try {
        logger.debug('Attempting strategy', { name: strategy.name });
        const result = await strategy.execute();
        logger.info('Strategy succeeded', { name: strategy.name });
        return result;
      } catch (error) {
        errors.push({ strategy: strategy.name, error });

        // Check if we should try the next strategy
        if (strategy.condition && !strategy.condition(error)) {
          logger.error('Strategy failed with non-recoverable error', {
            strategy: strategy.name,
            error
          });
          throw error;
        }

        logger.warn('Strategy failed, trying next', {
          strategy: strategy.name,
          error: FallbackChain.describeError(error)
        });
      }
    }

    // All strategies failed. Error objects serialize to "{}" under
    // JSON.stringify, so report their messages instead.
    const summary = errors.map(({ strategy, error }) => ({
      strategy,
      error: FallbackChain.describeError(error)
    }));
    throw Object.assign(
      new Error(`All fallback strategies failed: ${JSON.stringify(summary)}`),
      { errors }
    );
  }

  /** Human-readable text for a thrown value, which may not be an Error. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
// Real-world example: User profile loading
/**
 * Loads a user profile, degrading step by step: primary database, read
 * replica, cache (marked stale), and finally a minimal default profile.
 */
async function getUserProfile(userId: string): Promise<UserProfile> {
  const chain = new FallbackChain<UserProfile>();

  // Try primary database
  chain.addStrategy('primary-db', async () => primaryDb.getUser(userId));

  // Fallback to read replica
  chain.addStrategy('replica-db', async () => {
    logger.warn('Using read replica for user profile');
    return replicaDb.getUser(userId);
  });

  // Fallback to cache (might be stale)
  chain.addStrategy('cache', async () => {
    logger.warn('Using cached user profile');
    const cacheKey = `user:${userId}`;
    const cached = await cache.get(cacheKey);
    if (cached) {
      return { ...cached, stale: true };
    }
    throw new Error('Not in cache');
  });

  // Last resort: return minimal profile
  chain.addStrategy('default', async () => {
    logger.error('Using default user profile');
    return {
      id: userId,
      name: 'User',
      avatar: '/default-avatar.png',
      limited: true
    };
  });

  return chain.execute();
}
Implement automated recovery without human intervention.
// Self-healing service manager
/**
 * Periodically runs registered health checks and, for each unhealthy one,
 * applies the first registered healing strategy that accepts its diagnosis.
 */
class SelfHealingService {
  private static readonly CHECK_INTERVAL_MS = 30000; // Check every 30 seconds
  private static readonly VERIFY_DELAY_MS = 5000; // Settle time before re-checking

  private healthChecks = new Map<string, HealthCheck>();
  private healingStrategies = new Map<string, HealingStrategy>();
  private isHealing = false;
  private isChecking = false;
  private monitorTimer?: ReturnType<typeof setInterval>;

  /** Registers (or replaces) the health check stored under `name`. */
  registerHealthCheck(name: string, check: HealthCheck) {
    this.healthChecks.set(name, check);
  }

  /** Registers (or replaces) the healing strategy stored under `name`. */
  registerHealingStrategy(name: string, strategy: HealingStrategy) {
    this.healingStrategies.set(name, strategy);
  }

  /**
   * Starts the periodic monitoring loop. Calling it again while monitoring is
   * already active does nothing, so timers never pile up.
   */
  async monitorAndHeal() {
    if (this.monitorTimer !== undefined) {
      return;
    }
    this.monitorTimer = setInterval(() => {
      void this.runHealthChecks();
    }, SelfHealingService.CHECK_INTERVAL_MS);
  }

  /** Stops the monitoring loop started by `monitorAndHeal`. */
  stopMonitoring(): void {
    if (this.monitorTimer !== undefined) {
      clearInterval(this.monitorTimer);
      this.monitorTimer = undefined;
    }
  }

  /** Runs one pass over all health checks, healing those that are unhealthy. */
  private async runHealthChecks(): Promise<void> {
    // Skip this tick while a previous pass or a healing attempt is still running.
    if (this.isChecking || this.isHealing) return;

    this.isChecking = true;
    try {
      for (const [name, check] of this.healthChecks) {
        try {
          const isHealthy = await check.isHealthy();

          if (!isHealthy) {
            await this.attemptHealing(name, check);
          }
        } catch (error) {
          logger.error('Health check failed', { name, error });
        }
      }
    } finally {
      this.isChecking = false;
    }
  }

  /** Diagnoses an unhealthy check, applies a matching strategy, and verifies the outcome. */
  private async attemptHealing(
    checkName: string,
    check: HealthCheck
  ) {
    this.isHealing = true;

    try {
      // Diagnose inside the try block: if it throws, the flag must still be
      // cleared or monitoring would stay disabled permanently.
      const diagnosis = await check.diagnose();

      logger.warn('Unhealthy service detected', {
        check: checkName,
        diagnosis
      });

      // Find appropriate healing strategy
      const strategy = this.findHealingStrategy(diagnosis);

      if (!strategy) {
        logger.warn('No healing strategy available', { check: checkName });
        return;
      }

      logger.info('Attempting self-healing', {
        check: checkName,
        strategy: strategy.name
      });

      await strategy.heal(diagnosis);

      // Verify healing was successful
      await this.sleep(SelfHealingService.VERIFY_DELAY_MS);
      const isHealthy = await check.isHealthy();

      if (isHealthy) {
        logger.info('Self-healing successful', {
          check: checkName,
          strategy: strategy.name
        });
      } else {
        logger.error('Self-healing failed', {
          check: checkName,
          strategy: strategy.name
        });
        // Could escalate to alerts here
      }
    } finally {
      this.isHealing = false;
    }
  }

  /** Returns the first registered strategy whose `canHeal` accepts the diagnosis. */
  private findHealingStrategy(diagnosis: Diagnosis): HealingStrategy | null {
    for (const [, strategy] of this.healingStrategies) {
      if (strategy.canHeal(diagnosis)) {
        return strategy;
      }
    }
    return null;
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example: Database connection pool healing
const dbHealing = new SelfHealingService();

// Healthy means the pool is under 90% utilisation with nobody waiting.
dbHealing.registerHealthCheck('db-connections', {
  async isHealthy() {
    const stats = await db.getPoolStats();
    if (stats.active >= stats.max * 0.9) {
      return false;
    }
    return stats.waiting === 0;
  },
  async diagnose() {
    const stats = await db.getPoolStats();
    return {
      type: 'connection-exhaustion',
      stats,
      timestamp: new Date()
    };
  }
});

dbHealing.registerHealingStrategy('connection-recovery', {
  name: 'connection-recovery',
  canHeal(diagnosis) {
    return diagnosis.type === 'connection-exhaustion';
  },
  async heal(diagnosis) {
    // Clear idle connections
    await db.clearIdleConnections();

    // Increase pool size temporarily
    await db.setPoolSize(diagnosis.stats.max * 1.5);

    // Schedule pool size reduction
    const restorePoolSize = async () => {
      await db.setPoolSize(diagnosis.stats.max);
    };
    setTimeout(restorePoolSize, 300000); // 5 minutes
  }
});
Test recovery mechanisms proactively.
// Chaos engineering framework for testing recovery
/** Registry and runner for failure-injection experiments. */
class ChaosMonkey {
  private experiments = new Map<string, ChaosExperiment>();

  /** Stores the experiment under its own `name`, replacing any previous one. */
  registerExperiment(experiment: ChaosExperiment) {
    this.experiments.set(experiment.name, experiment);
  }

  /**
   * Runs a registered experiment: captures baseline metrics, injects the
   * failure, waits, captures metrics again, and asks the experiment to verify
   * recovery. The cleanup function returned by `inject` always runs.
   *
   * @param options.duration Milliseconds to wait after injection. Defaults to
   *   60000 when omitted; an explicit 0 means no wait.
   * @throws Error when no experiment is registered under `name`.
   */
  async runExperiment(name: string, options?: RunOptions) {
    const experiment = this.experiments.get(name);
    if (!experiment) {
      throw new Error(`Experiment ${name} not found`);
    }

    logger.info('Starting chaos experiment', { name });

    // Record initial state
    const initialMetrics = await this.captureMetrics();

    // Inject failure
    const cleanup = await experiment.inject();

    try {
      // Let the system respond. `??` keeps an explicit duration of 0.
      await this.sleep(options?.duration ?? 60000);

      // Measure impact
      const impactMetrics = await this.captureMetrics();

      // Verify recovery mechanisms kicked in
      const recoverySuccess = await experiment.verifyRecovery();

      return {
        experiment: name,
        initialMetrics,
        impactMetrics,
        recoverySuccess,
        observations: experiment.observations
      };
    } finally {
      // Always cleanup
      await cleanup();
    }
  }

  /** Reads the four tracked metrics concurrently so they describe the same moment. */
  private async captureMetrics() {
    const [errorRate, responseTime, throughput, availability] = await Promise.all([
      metrics.getErrorRate(),
      metrics.getResponseTime(),
      metrics.getThroughput(),
      metrics.getAvailability()
    ]);
    return { errorRate, responseTime, throughput, availability };
  }

  /** Resolves after `ms` milliseconds. */
  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Example experiment: Test circuit breaker
const chaosMonkey = new ChaosMonkey();

chaosMonkey.registerExperiment({
  name: 'payment-service-failure',
  description: 'Simulate payment service outage',

  async inject() {
    // Make payment service return errors
    const outageResponse = {
      status: 500,
      body: { error: 'Internal Server Error' }
    };
    await mockServer.setResponse('/api/payment', outageResponse);

    // Return cleanup function
    const restore = async () => {
      await mockServer.resetResponse('/api/payment');
    };
    return restore;
  },

  async verifyRecovery() {
    // Check if circuit breaker opened
    const circuitState = await getCircuitBreakerState('payment');

    // Check if fallback was used
    const fallbackMetrics = await metrics.getFallbackUsage();

    const circuitBreakerOpened = circuitState === 'OPEN';
    const fallbackUsed = fallbackMetrics.count > 0;
    const userExperienceMaintained = await checkUserExperience();

    return { circuitBreakerOpened, fallbackUsed, userExperienceMaintained };
  },

  observations: []
});
Fast Detection
Graceful Degradation
Automated Recovery
Learn and Improve
Track these metrics to measure recovery success:
// Recovery metrics collector
/** In-memory collector of per-service recovery statistics. */
class RecoveryMetrics {
  private metrics = {
    mttr: new Map<string, number[]>(), // Recovery durations, one entry per attempt
    recoverySuccess: new Map<string, number>(), // Count of successful recoveries
    fallbackUsage: new Map<string, number>(),
    circuitBreakerTrips: new Map<string, number>()
  };

  /**
   * Records one recovery attempt for `service`.
   *
   * @param duration How long the recovery took; every attempt counts towards
   *   the average, successful or not.
   * @param success Whether the recovery succeeded.
   */
  recordRecovery(service: string, duration: number, success: boolean) {
    // Track MTTR
    const durations = this.metrics.mttr.get(service) ?? [];
    durations.push(duration);
    this.metrics.mttr.set(service, durations);

    // Track success rate
    const current = this.metrics.recoverySuccess.get(service) ?? 0;
    this.metrics.recoverySuccess.set(service, success ? current + 1 : current);
  }

  /** Counts one fallback invocation for `service`. */
  recordFallback(service: string) {
    const current = this.metrics.fallbackUsage.get(service) ?? 0;
    this.metrics.fallbackUsage.set(service, current + 1);
  }

  /** Counts one circuit breaker trip for `service`. */
  recordCircuitBreakerTrip(service: string) {
    const current = this.metrics.circuitBreakerTrips.get(service) ?? 0;
    this.metrics.circuitBreakerTrips.set(service, current + 1);
  }

  /**
   * Summarises what has been recorded for `service`.
   *
   * With no recorded attempts, `averageMTTR` and `recoverySuccessRate` are 0
   * rather than NaN.
   */
  getReport(service: string) {
    const mttrValues = this.metrics.mttr.get(service) ?? [];
    const avgMttr = mttrValues.length === 0
      ? 0
      : mttrValues.reduce((a, b) => a + b, 0) / mttrValues.length;

    return {
      averageMTTR: avgMttr,
      recoverySuccessRate: this.calculateSuccessRate(service),
      fallbackUsageRate: this.metrics.fallbackUsage.get(service) ?? 0,
      circuitBreakerTrips: this.metrics.circuitBreakerTrips.get(service) ?? 0
    };
  }

  /** Fraction of recorded attempts that succeeded, in [0, 1]; 0 when none exist. */
  private calculateSuccessRate(service: string): number {
    // Every attempt appends one duration, so the list length is the attempt count.
    const attempts = this.metrics.mttr.get(service)?.length ?? 0;
    if (attempts === 0) {
      return 0;
    }
    const successes = this.metrics.recoverySuccess.get(service) ?? 0;
    return successes / attempts;
  }
}
Create automated runbooks for common failure scenarios:
// Automated recovery runbook
/** Named, ordered list of recovery steps executed against a failure context. */
class RecoveryRunbook {
  constructor(
    private name: string,
    private steps: RecoveryStep[]
  ) {}

  /**
   * Runs the steps in order.
   *
   * A step that returns `success: false` or throws is recorded as failed. If
   * that step is marked `critical`, the remaining steps are skipped.
   *
   * @returns The per-step results plus an overall `success` flag that is true
   *   only when every executed step succeeded.
   */
  async execute(context: FailureContext): Promise<RecoveryResult> {
    const results: StepResult[] = [];

    logger.info('Executing recovery runbook', {
      runbook: this.name,
      context
    });

    for (const step of this.steps) {
      try {
        logger.info('Executing recovery step', { step: step.name });

        const result = await step.execute(context);
        results.push(result);

        if (!result.success && step.critical) {
          logger.error('Critical step failed', { step: step.name, result });
          break;
        }
      } catch (error) {
        logger.error('Recovery step failed', { step: step.name, error });

        // Record the failure; otherwise a run whose steps threw could still
        // report overall success.
        results.push({ success: false });

        if (step.critical) break;
      }
    }

    return {
      runbook: this.name,
      success: results.every(r => r.success),
      steps: results,
      timestamp: new Date()
    };
  }
}
// Example: Database recovery runbook
const dbRecoveryRunbook = new RecoveryRunbook('database-recovery', [
  {
    name: 'verify-connectivity',
    critical: true,
    execute: async (context) => {
      const canConnect = await db.testConnection();
      return { success: canConnect };
    }
  },
  {
    name: 'clear-connection-pool',
    critical: false,
    execute: async (context) => {
      await db.clearPool();
      return { success: true };
    }
  },
  {
    name: 'failover-to-replica',
    critical: true,
    execute: async (context) => {
      if (context.severity === 'critical') {
        await db.failoverToReplica();
        return { success: true };
      }
      // Failover is only performed for critical incidents. Skipping it is not
      // a failure: reporting success: false from a critical step would abort
      // the runbook before on-call is notified.
      return { success: true, skipped: true };
    }
  },
  {
    name: 'notify-oncall',
    critical: false,
    execute: async (context) => {
      await alerting.notifyOncall({
        severity: context.severity,
        runbook: 'database-recovery',
        context
      });
      return { success: true };
    }
  }
]);
Master recovery patterns with: