Effective monitoring transforms reactive firefighting into proactive system management. With AI assistance, you can build sophisticated monitoring systems that detect issues before users notice, predict failures, and automatically initiate recovery. This guide covers battle-tested monitoring patterns for production systems.
Successful monitoring follows these principles:
- Alert on Symptoms: focus on user-facing issues, not every metric spike.
- Golden Signals: latency, traffic, errors, and saturation tell the story.
- Context is King: link metrics to traces, logs, and business impact.
- Automate Response: turn insights into automated remediation (a minimal remediation sketch follows this list).
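The "Automate Response" principle is the least concrete of the four, so here is a hedged sketch of an alert-webhook handler that maps firing alerts to remediation actions. The webhook shape follows the common Alertmanager payload; the `scaleOut` and `rollbackLatest` helpers are purely illustrative placeholders:

```javascript
const express = require('express');

// Map alert names to remediation actions; the handlers are hypothetical placeholders.
const remediations = {
  high_cpu_usage: async labels => scaleOut(labels.service),        // hypothetical helper
  high_error_rate: async labels => rollbackLatest(labels.service)  // hypothetical helper
};

const app = express();
app.use(express.json());

// Receives alert webhooks (e.g. from Alertmanager) and runs the matching remediation.
app.post('/alerts', async (req, res) => {
  const alerts = req.body.alerts || [];
  for (const alert of alerts) {
    const action = remediations[alert.labels?.alertname];
    if (alert.status === 'firing' && action) {
      await action(alert.labels);
    }
  }
  res.sendStatus(200);
});

app.listen(8080);
```

Keeping remediation behind explicit, per-alert mappings like this avoids the trap of blanket auto-restarts that mask the underlying problem.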
Google’s SRE book popularized the four golden signals, the key metrics that indicate overall system health.
```yaml
# Latency - request duration
- alert: high_latency
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (service, method, le)
    ) > 0.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High latency on {{ $labels.service }}"
    description: "95th percentile latency is {{ $value }}s"

# Traffic - request rate
- alert: traffic_spike
  expr: |
    sum(rate(http_requests_total[5m])) by (service)
      > 2 * avg_over_time(
        (sum(rate(http_requests_total[5m])) by (service))[1h:5m]
      )
  for: 10m
  labels:
    severity: info
  annotations:
    summary: "Traffic spike detected"

# Errors - error rate
- alert: high_error_rate
  expr: |
    sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
      /
    sum(rate(http_requests_total[5m])) by (service)
      > 0.05
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Error rate above 5%"

# Saturation - resource usage
- alert: high_cpu_usage
  expr: |
    100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "CPU usage above 80%"
```
```javascript
// AWS CloudWatch implementation
const AWS = require('aws-sdk');
const cloudwatch = new AWS.CloudWatch();

class GoldenSignalsMonitor {
  async createAlarms(serviceName) {
    // Latency alarm
    await cloudwatch.putMetricAlarm({
      AlarmName: `${serviceName}-high-latency`,
      MetricName: 'Duration',
      Namespace: 'AWS/Lambda',
      Statistic: 'Average',
      Period: 300,
      EvaluationPeriods: 2,
      Threshold: 1000,
      ComparisonOperator: 'GreaterThanThreshold',
      AlarmActions: [process.env.SNS_TOPIC_ARN]
    }).promise();

    // Error rate alarm
    await cloudwatch.putMetricAlarm({
      AlarmName: `${serviceName}-high-errors`,
      MetricName: 'Errors',
      Namespace: 'AWS/Lambda',
      Statistic: 'Sum',
      Period: 300,
      EvaluationPeriods: 1,
      Threshold: 10,
      ComparisonOperator: 'GreaterThanThreshold'
    }).promise();

    // Custom metric for traffic
    const params = {
      Namespace: 'CustomApp',
      MetricData: [{
        MetricName: 'RequestCount',
        Value: 1,
        Unit: 'Count',
        Dimensions: [
          { Name: 'Service', Value: serviceName }
        ]
      }]
    };
    await cloudwatch.putMetricData(params).promise();
  }
}
```
Monitor Service Level Indicators (SLIs) against Service Level Objectives (SLOs).
```javascript
// SLO monitoring implementation
class SLOMonitor {
  constructor(prometheus) {
    this.prometheus = prometheus;
    this.slos = new Map();
  }

  defineSLO(name, config) {
    this.slos.set(name, {
      name,
      description: config.description,
      sli: config.sli,
      target: config.target,
      window: config.window || '30d',
      burnRate: config.burnRate || {
        '1h': 14.4,  // 14.4x burn rate = 2% of budget in 1h
        '6h': 6,     // 6x burn rate = 5% of budget in 6h
        '1d': 3,     // 3x burn rate = 10% of budget in 1d
        '3d': 1      // 1x burn rate = 10% of budget in 3d
      }
    });
  }

  generateAlerts() {
    const alerts = [];

    for (const [name, slo] of this.slos) {
      // Multi-window, multi-burn-rate alerts (simplified, illustrative expressions)
      for (const [shortWindow, shortBurn] of Object.entries({ '5m': 14.4, '30m': 6 })) {
        for (const [longWindow, longBurn] of Object.entries({ '1h': 14.4, '6h': 6 })) {
          alerts.push({
            alert: `${name}_burn_rate`,
            expr: `(
              ${slo.sli}[${shortWindow}] < ${slo.target}
              AND
              ${slo.sli}[${longWindow}] < ${slo.target}
            )`,
            labels: { severity: 'page', slo: name },
            annotations: {
              summary: `SLO ${name} burn rate exceeded`,
              description: `Error budget burn rate is above threshold`
            }
          });
        }
      }
    }

    return alerts;
  }

  calculateErrorBudget(sloName, timeRange = '30d') {
    const slo = this.slos.get(sloName);
    // Observed error ratio over the window; compare with (1 - target) to see budget consumption
    const query = `
      1 - (
        sum(increase(${slo.sli}[${timeRange}]))
        /
        sum(increase(requests_total[${timeRange}]))
      )
    `;

    return this.prometheus.query(query);
  }
}

// Usage
const monitor = new SLOMonitor(prometheusClient);

monitor.defineSLO('api-availability', {
  description: 'API availability SLO',
  sli: 'http_requests_total{status!~"5.."}',
  target: 0.999,  // 99.9% availability
  window: '30d'
});

monitor.defineSLO('api-latency', {
  description: 'API latency SLO',
  sli: 'http_request_duration_seconds{quantile="0.95"} < 0.3',
  target: 0.95,  // 95% of requests under 300ms
  window: '30d'
});
```
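The burn-rate multipliers in the table above follow directly from budget arithmetic: the burn rate is the fraction of error budget consumed divided by the fraction of the SLO window that has elapsed. A minimal sketch of that calculation, assuming a 30-day SLO window (the numbers are the standard multi-window values, not tied to any particular service):

```javascript
// Burn rate = (fraction of error budget consumed) / (fraction of the SLO window elapsed).
// With a 99.9% target, a 14.4x burn rate corresponds to an error ratio of 14.4 * 0.001 ≈ 1.44%.
function burnRate(budgetFractionConsumed, alertWindowHours, sloWindowDays = 30) {
  const windowFraction = alertWindowHours / (sloWindowDays * 24);
  return budgetFractionConsumed / windowFraction;
}

console.log(burnRate(0.02, 1));  // 14.4 -> 2% of budget burned in 1 hour
console.log(burnRate(0.05, 6));  // 6    -> 5% of budget burned in 6 hours
console.log(burnRate(0.10, 72)); // 1    -> 10% of budget burned in 3 days
```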
Use machine learning to predict issues before they occur.
AI-Powered Anomaly Detection
```javascript
class PredictiveMonitor {
  constructor(metricsStore, alertManager) {
    this.metricsStore = metricsStore;
    this.alertManager = alertManager;
    this.models = new Map();
  }

  async trainModel(metricName, options = {}) {
    const historicalData = await this.metricsStore.query({
      metric: metricName,
      start: '-30d',
      end: 'now',
      step: options.step || '5m'
    });

    // Feature engineering
    const features = this.extractFeatures(historicalData);

    // Train model (simplified - use a proper ML library in production)
    const model = {
      metric: metricName,
      seasonality: this.detectSeasonality(features),
      trend: this.calculateTrend(features),
      stdDev: this.calculateStdDev(features),
      updateTime: Date.now()
    };

    this.models.set(metricName, model);
    return model;
  }

  extractFeatures(data) {
    return {
      hourOfDay: data.map(d => new Date(d.timestamp).getHours()),
      dayOfWeek: data.map(d => new Date(d.timestamp).getDay()),
      values: data.map(d => d.value),
      differences: data.slice(1).map((d, i) => d.value - data[i].value),
      movingAvg: this.movingAverage(data.map(d => d.value), 12)
    };
  }

  async detectAnomalies(metricName, realtimeValue) {
    const model = this.models.get(metricName);
    if (!model) {
      throw new Error(`No model trained for ${metricName}`);
    }

    const now = new Date();
    const expectedValue = this.predict(model, now);
    const threshold = model.stdDev * 3; // 3-sigma rule

    const anomaly = Math.abs(realtimeValue - expectedValue) > threshold;
    let confidence = null;

    if (anomaly) {
      confidence = this.calculateConfidence(realtimeValue, expectedValue, threshold);

      await this.alertManager.createAlert({
        title: `Anomaly detected in ${metricName}`,
        severity: confidence > 0.9 ? 'critical' : 'warning',
        details: {
          expected: expectedValue,
          actual: realtimeValue,
          deviation: Math.abs(realtimeValue - expectedValue),
          confidence,
          model: {
            lastUpdated: new Date(model.updateTime),
            accuracy: model.accuracy
          }
        }
      });
    }

    return { anomaly, expectedValue, confidence };
  }

  predict(model, timestamp) {
    const hour = timestamp.getHours();
    const day = timestamp.getDay();

    // Simplified prediction combining trend and seasonality
    let prediction = model.trend.baseline;

    // Add hourly seasonality
    if (model.seasonality.hourly) {
      prediction += model.seasonality.hourly[hour];
    }

    // Add weekly seasonality
    if (model.seasonality.weekly) {
      prediction += model.seasonality.weekly[day];
    }

    return prediction;
  }

  async forecastCapacity(resource, days = 30) {
    const model = this.models.get(`${resource}_usage`);
    const currentUsage = await this.getCurrentUsage(resource);
    const growthRate = model.trend.rate;

    const forecast = [];
    for (let d = 0; d < days; d++) {
      const predictedUsage = currentUsage * Math.pow(1 + growthRate, d);
      forecast.push({
        date: new Date(Date.now() + d * 24 * 60 * 60 * 1000),
        usage: predictedUsage,
        percentOfCapacity: predictedUsage // usage is already expressed as a percentage of capacity
      });
    }

    // Alert if capacity will be exceeded
    const capacityBreach = forecast.find(f => f.percentOfCapacity > 80);
    if (capacityBreach) {
      await this.alertManager.createAlert({
        title: `${resource} capacity warning`,
        severity: 'warning',
        details: {
          message: `${resource} will reach 80% capacity on ${capacityBreach.date}`,
          currentUsage: `${currentUsage}%`,
          projectedUsage: `${capacityBreach.usage}%`,
          daysUntilBreach: forecast.indexOf(capacityBreach)
        }
      });
    }

    return forecast;
  }
}
```
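A minimal usage sketch, assuming the `metricsStore` and `alertManager` objects passed to the constructor are already wired up and that a metric named `api_request_rate` exists (both are illustrative, and `metricsStore.latest` is a hypothetical helper):

```javascript
// Train once (or on a schedule), then score incoming values every minute.
const predictor = new PredictiveMonitor(metricsStore, alertManager);

async function watchRequestRate() {
  await predictor.trainModel('api_request_rate', { step: '5m' });

  setInterval(async () => {
    const current = await metricsStore.latest('api_request_rate'); // hypothetical helper
    const { anomaly, expectedValue } = await predictor.detectAnomalies('api_request_rate', current);
    if (anomaly) {
      console.log(`Expected ~${expectedValue.toFixed(1)} req/s, observed ${current}`);
    }
  }, 60000);
}

watchRequestRate();
```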
Link technical metrics to business outcomes.
```javascript
class BusinessMetricsMonitor {
  constructor(analytics, monitoring) {
    this.analytics = analytics;
    this.monitoring = monitoring;
  }

  async setupBusinessDashboard() {
    // Revenue impact monitoring
    await this.monitoring.createMetric({
      name: 'revenue_per_minute',
      query: `
        sum(rate(order_total_amount[1m]))
        * avg(order_conversion_rate)
      `,
      unit: 'dollars',
      alerts: [{
        condition: 'decrease > 20%',
        severity: 'critical',
        message: 'Revenue drop detected'
      }]
    });

    // User experience metrics (Apdex: satisfied plus half of tolerating, over all requests)
    await this.monitoring.createMetric({
      name: 'apdex_score',
      query: `
        (
          sum(rate(http_request_duration_seconds_bucket{le="0.5"}[5m]))
          +
          sum(rate(http_request_duration_seconds_bucket{le="2.0"}[5m]))
        ) / 2
        /
        sum(rate(http_request_duration_seconds_count[5m]))
      `,
      unit: 'ratio',
      alerts: [{
        condition: '< 0.8',
        severity: 'warning',
        message: 'User experience degraded (Apdex < 0.8)'
      }]
    });

    // Conversion funnel monitoring
    const funnelSteps = [
      'page_view',
      'add_to_cart',
      'checkout_start',
      'payment_complete'
    ];

    for (let i = 1; i < funnelSteps.length; i++) {
      const fromStep = funnelSteps[i - 1];
      const toStep = funnelSteps[i];

      await this.monitoring.createMetric({
        name: `conversion_${fromStep}_to_${toStep}`,
        query: `
          sum(rate(events_total{event="${toStep}"}[5m]))
          /
          sum(rate(events_total{event="${fromStep}"}[5m]))
        `,
        unit: 'ratio',
        alerts: [{
          condition: 'decrease > 15%',
          severity: 'warning',
          message: `Conversion drop: ${fromStep} → ${toStep}`
        }]
      });
    }
  }

  async correlateWithTechnical(businessMetric, timeRange) {
    // Find technical metrics that correlate with the business metric
    const businessData = await this.analytics.getMetric(businessMetric, timeRange);
    const technicalMetrics = await this.monitoring.getAllMetrics();

    const correlations = [];

    for (const techMetric of technicalMetrics) {
      const techData = await this.monitoring.getMetric(techMetric.name, timeRange);
      const correlation = this.calculateCorrelation(businessData, techData);

      if (Math.abs(correlation) > 0.7) {
        correlations.push({
          technical: techMetric.name,
          business: businessMetric,
          correlation,
          impact: this.estimateImpact(correlation, techData, businessData)
        });
      }
    }

    return correlations.sort((a, b) => Math.abs(b.correlation) - Math.abs(a.correlation));
  }
}
```
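The `calculateCorrelation` helper is left undefined above. A straightforward Pearson correlation over two aligned numeric series would look roughly like this, assuming both series are arrays of values sampled on the same timestamps:

```javascript
// Pearson correlation coefficient between two equally sampled numeric series.
// Returns a value in [-1, 1]; 0 is returned for degenerate (constant) series.
function calculateCorrelation(seriesA, seriesB) {
  const n = Math.min(seriesA.length, seriesB.length);
  const a = seriesA.slice(0, n);
  const b = seriesB.slice(0, n);

  const mean = xs => xs.reduce((s, x) => s + x, 0) / xs.length;
  const meanA = mean(a);
  const meanB = mean(b);

  let cov = 0, varA = 0, varB = 0;
  for (let i = 0; i < n; i++) {
    const da = a[i] - meanA;
    const db = b[i] - meanB;
    cov += da * db;
    varA += da * da;
    varB += db * db;
  }

  const denom = Math.sqrt(varA * varB);
  return denom === 0 ? 0 : cov / denom;
}
```

In practice both series should be resampled onto a common time grid before correlating, since business and technical metrics rarely share the same scrape interval.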
Connect metrics with traces for deep insights.
```javascript
// OpenTelemetry integration
const { MeterProvider } = require('@opentelemetry/sdk-metrics');
const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus');
const { trace } = require('@opentelemetry/api');

class TracingMetricsCollector {
  constructor() {
    this.exporter = new PrometheusExporter({ port: 9090 });
    this.meterProvider = new MeterProvider();
    this.meterProvider.addMetricReader(this.exporter);

    this.setupMetrics();
  }

  setupMetrics() {
    this.meter = this.meterProvider.getMeter('app-metrics');

    // Request duration histogram linked to traces
    this.requestDuration = this.meter.createHistogram('http_request_duration', {
      description: 'HTTP request duration in seconds',
      unit: 's'
    });

    // Active spans gauge
    this.activeSpans = this.meter.createUpDownCounter('active_spans', {
      description: 'Number of active spans'
    });

    // Error counter with trace context
    this.errors = this.meter.createCounter('errors_total', {
      description: 'Total number of errors with trace context'
    });
  }

  recordRequest(duration, attributes) {
    const span = trace.getActiveSpan();
    const spanContext = span?.spanContext();

    this.requestDuration.record(duration, {
      ...attributes,
      trace_id: spanContext?.traceId,
      span_id: spanContext?.spanId,
      has_error: span?.status?.code === 2
    });
  }

  async monitorTraceHealth() {
    // Monitor trace sampling effectiveness (synchronous gauges need a recent @opentelemetry/api)
    const samplingRate = this.meter.createGauge('trace_sampling_rate', {
      description: 'Current trace sampling rate'
    });

    // Monitor trace completion
    const traceCompleteness = this.meter.createGauge('trace_completeness', {
      description: 'Percentage of complete traces'
    });

    setInterval(async () => {
      const stats = await this.calculateTraceStats();
      samplingRate.record(stats.samplingRate);
      traceCompleteness.record(stats.completeness);
    }, 60000);
  }
}
```
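To feed `recordRequest` with real traffic, the collector can be wired into an HTTP framework. A minimal sketch using Express middleware, where the route labels and port are illustrative:

```javascript
const express = require('express');

const app = express();
const collector = new TracingMetricsCollector();

// Record duration and trace context for every request once the response finishes.
app.use((req, res, next) => {
  const start = process.hrtime.bigint();
  res.on('finish', () => {
    const seconds = Number(process.hrtime.bigint() - start) / 1e9;
    collector.recordRequest(seconds, {
      method: req.method,
      route: req.route ? req.route.path : req.path,
      status_code: res.statusCode
    });
  });
  next();
});

app.get('/healthz', (req, res) => res.send('ok'));
app.listen(3000);
```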
Proactively test critical user journeys.
```javascript
class SyntheticMonitor {
  constructor(monitoring, alerting) {
    this.monitoring = monitoring;
    this.alerting = alerting;
    this.scenarios = new Map();
  }

  defineScenario(name, scenario) {
    this.scenarios.set(name, {
      name,
      interval: scenario.interval || 300000, // 5 minutes default
      timeout: scenario.timeout || 30000,
      steps: scenario.steps,
      assertions: scenario.assertions,
      regions: scenario.regions || ['us-east-1']
    });
  }

  async runScenario(name) {
    const scenario = this.scenarios.get(name);
    const results = new Map();

    for (const region of scenario.regions) {
      const startTime = Date.now();
      const stepResults = [];
      let success = true;

      try {
        for (const step of scenario.steps) {
          const stepStart = Date.now();
          const result = await this.executeStep(step, region);

          stepResults.push({
            name: step.name,
            duration: Date.now() - stepStart,
            success: result.success,
            details: result.details
          });

          if (!result.success) {
            success = false;
            break;
          }
        }

        // Run assertions
        if (success && scenario.assertions) {
          for (const assertion of scenario.assertions) {
            if (!await this.checkAssertion(assertion, stepResults)) {
              success = false;
              break;
            }
          }
        }
      } catch (error) {
        success = false;
        stepResults.push({
          name: 'error',
          error: error.message,
          stack: error.stack
        });
      }

      const totalDuration = Date.now() - startTime;

      results.set(region, {
        success,
        duration: totalDuration,
        steps: stepResults,
        timestamp: new Date()
      });

      // Record metrics
      await this.recordMetrics(name, region, {
        success,
        duration: totalDuration,
        steps: stepResults
      });
    }

    // Alert on failures
    await this.checkAlerts(name, results);

    return results;
  }

  async executeStep(step, region) {
    switch (step.type) {
      case 'http':
        return await this.executeHttpStep(step, region);
      case 'browser':
        return await this.executeBrowserStep(step, region);
      case 'api':
        return await this.executeApiStep(step, region);
      default:
        throw new Error(`Unknown step type: ${step.type}`);
    }
  }

  async recordMetrics(scenarioName, region, result) {
    // Success rate
    await this.monitoring.recordMetric({
      name: 'synthetic_success_rate',
      value: result.success ? 1 : 0,
      labels: { scenario: scenarioName, region, type: 'synthetic' }
    });

    // Duration
    await this.monitoring.recordMetric({
      name: 'synthetic_duration_seconds',
      value: result.duration / 1000,
      labels: { scenario: scenarioName, region, success: result.success }
    });

    // Step-level metrics
    for (const step of result.steps) {
      await this.monitoring.recordMetric({
        name: 'synthetic_step_duration_seconds',
        value: step.duration / 1000,
        labels: {
          scenario: scenarioName,
          step: step.name,
          region,
          success: step.success
        }
      });
    }
  }
}

// Example scenario
const monitor = new SyntheticMonitor(monitoring, alerting);

monitor.defineScenario('checkout-flow', {
  interval: 300000, // Run every 5 minutes
  timeout: 30000,   // 30 second timeout
  regions: ['us-east-1', 'eu-west-1', 'ap-southeast-1'],
  steps: [
    {
      name: 'load-homepage',
      type: 'http',
      url: 'https://example.com',
      expectedStatus: 200
    },
    {
      name: 'search-product',
      type: 'api',
      endpoint: '/api/search',
      method: 'GET',
      params: { q: 'test-product' },
      expectedStatus: 200,
      validateResponse: (res) => res.results.length > 0
    },
    {
      name: 'add-to-cart',
      type: 'api',
      endpoint: '/api/cart',
      method: 'POST',
      body: { productId: 'test-123', quantity: 1 },
      expectedStatus: 201
    },
    {
      name: 'checkout',
      type: 'browser',
      script: async (page) => {
        await page.goto('https://example.com/checkout');
        await page.fill('#email', 'test@example.com');
        await page.click('button[type="submit"]');
        await page.waitForSelector('.success-message');
      }
    }
  ],
  assertions: [
    {
      name: 'total-time-under-5s',
      check: (results) => {
        const totalTime = results.reduce((sum, r) => sum + r.duration, 0);
        return totalTime < 5000;
      }
    }
  ]
});
```
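The `interval` stored by `defineScenario` is not used anywhere above. A small scheduler sketch that runs each scenario on its configured interval might look like this (error handling kept minimal, and the scheduler class itself is an assumption, not part of the monitor):

```javascript
// Hypothetical scheduler: runs every defined scenario on its own interval.
class SyntheticScheduler {
  constructor(syntheticMonitor) {
    this.monitor = syntheticMonitor;
    this.timers = new Map();
  }

  start() {
    for (const [name, scenario] of this.monitor.scenarios) {
      const timer = setInterval(() => {
        this.monitor.runScenario(name).catch(err => {
          console.error(`Synthetic scenario ${name} failed to run:`, err.message);
        });
      }, scenario.interval);
      this.timers.set(name, timer);
    }
  }

  stop() {
    for (const timer of this.timers.values()) clearInterval(timer);
    this.timers.clear();
  }
}

// new SyntheticScheduler(monitor).start();
```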
Best practices cover alert fatigue prevention, dashboard design, metric naming (a small naming-helper sketch follows the dashboard generator below), and cost optimization.
Create alerts that are actionable and reduce noise:
```yaml
# Good alert example
groups:
  - name: api_alerts
    rules:
      - alert: APIHighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
            /
            sum(rate(http_requests_total[5m])) by (service)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
          pager: "true"
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} for {{ $labels.service }}.

            Dashboard: https://grafana.example.com/d/api-errors
            Runbook: https://wiki.example.com/runbooks/api-errors
            Recent changes: https://github.com/example/{{ $labels.service }}/commits
          impact: "Users experiencing failures when using {{ $labels.service }}"
          action: |
            1. Check service logs for error details
            2. Verify upstream dependencies
            3. Consider rolling back recent deployments
            4. Scale up if load-related
```
```javascript
// Grafana dashboard as code
const dashboardConfig = {
  title: 'Service Health Overview',
  panels: [
    // Row 1: Key metrics at a glance
    {
      title: 'Service Status',
      type: 'stat',
      gridPos: { x: 0, y: 0, w: 6, h: 4 },
      targets: [{ expr: 'up{job="api"}', format: 'table' }],
      thresholds: {
        mode: 'absolute',
        steps: [
          { color: 'red', value: 0 },
          { color: 'green', value: 1 }
        ]
      }
    },
    {
      title: 'Current QPS',
      type: 'graph',
      gridPos: { x: 6, y: 0, w: 6, h: 4 },
      targets: [{
        expr: 'sum(rate(http_requests_total[1m]))',
        legendFormat: 'Requests/sec'
      }]
    },
    {
      title: 'Error Rate',
      type: 'gauge',
      gridPos: { x: 12, y: 0, w: 6, h: 4 },
      targets: [{
        expr: '100 * sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
      }],
      thresholds: {
        mode: 'percentage',
        steps: [
          { color: 'green', value: 0 },
          { color: 'yellow', value: 1 },
          { color: 'red', value: 5 }
        ]
      }
    },
    {
      title: 'P95 Latency',
      type: 'stat',
      gridPos: { x: 18, y: 0, w: 6, h: 4 },
      targets: [{
        expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))'
      }],
      unit: 's',
      thresholds: {
        mode: 'absolute',
        steps: [
          { color: 'green', value: 0 },
          { color: 'yellow', value: 0.5 },
          { color: 'red', value: 1 }
        ]
      }
    },

    // Row 2: Detailed views
    {
      title: 'Request Rate by Endpoint',
      type: 'graph',
      gridPos: { x: 0, y: 4, w: 12, h: 8 },
      targets: [{
        expr: 'sum(rate(http_requests_total[5m])) by (handler)',
        legendFormat: '{{ handler }}'
      }]
    },
    {
      title: 'Latency Distribution',
      type: 'heatmap',
      gridPos: { x: 12, y: 4, w: 12, h: 8 },
      targets: [{
        expr: 'sum(rate(http_request_duration_seconds_bucket[5m])) by (le)',
        format: 'heatmap'
      }]
    }
  ]
};
```
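A dashboard defined as code still has to reach Grafana; one common route is the dashboard HTTP API. A hedged sketch, assuming Node 18+ (for the global fetch) and a Grafana base URL plus API token supplied via environment variables (the variable names are illustrative):

```javascript
// Push the dashboard definition to Grafana's dashboard HTTP API.
// GRAFANA_URL and GRAFANA_TOKEN are assumed environment variables.
async function provisionDashboard(dashboard) {
  const response = await fetch(`${process.env.GRAFANA_URL}/api/dashboards/db`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${process.env.GRAFANA_TOKEN}`
    },
    body: JSON.stringify({ dashboard, overwrite: true })
  });

  if (!response.ok) {
    throw new Error(`Dashboard provisioning failed: ${response.status} ${await response.text()}`);
  }
  return response.json();
}

// provisionDashboard(dashboardConfig);
```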
Automate monitoring tasks to reduce toil:
```javascript
// Automated dashboard generation
class DashboardGenerator {
  generateServiceDashboard(serviceName) {
    return {
      title: `${serviceName} Service Dashboard`,
      uid: `${serviceName}-overview`,
      panels: [
        // generateREDPanel returns an array of panels, so spread it in
        ...this.generateREDPanel(serviceName),
        this.generateResourcePanel(serviceName),
        this.generateDependencyPanel(serviceName)
      ],
      templating: {
        list: [{
          name: 'namespace',
          type: 'query',
          query: 'label_values(namespace)'
        }]
      }
    };
  }

  generateREDPanel(service) {
    // Rate, Errors, Duration panels
    return [
      {
        title: 'Request Rate',
        targets: [{ expr: `sum(rate(http_requests_total{service="${service}"}[5m]))` }]
      },
      {
        title: 'Error Rate',
        targets: [{ expr: `sum(rate(http_requests_total{service="${service}",status=~"5.."}[5m]))` }]
      },
      {
        title: 'Duration (P50/P95/P99)',
        targets: [
          {
            expr: `histogram_quantile(0.5, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P50'
          },
          {
            expr: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P95'
          },
          {
            expr: `histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P99'
          }
        ]
      }
    ];
  }
}
```
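For the metric-naming practice referenced above, one lightweight option is a small helper that enforces Prometheus-style names (snake_case, base units as a suffix, `_total` for counters). The exact convention here is a sketch, not a mandated standard:

```javascript
// Build metric names of the form <namespace>_<subsystem>_<name>_<unit>[_total].
// Follows common Prometheus conventions: snake_case, base units, _total for counters.
function metricName({ namespace, subsystem, name, unit, isCounter = false }) {
  const parts = [namespace, subsystem, name, unit]
    .filter(Boolean)
    .map(p => p.toLowerCase().replace(/[^a-z0-9]+/g, '_'));
  const base = parts.join('_');
  return isCounter ? `${base}_total` : base;
}

console.log(metricName({ namespace: 'shop', subsystem: 'checkout', name: 'request_duration', unit: 'seconds' }));
// -> shop_checkout_request_duration_seconds
console.log(metricName({ namespace: 'shop', subsystem: 'checkout', name: 'requests', isCounter: true }));
// -> shop_checkout_requests_total
```

Consistent names keep dashboards and alert expressions portable across services and also make it easier to spot high-cardinality metrics when optimizing cost.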
Master monitoring by alerting on symptoms, tracking the golden signals against SLOs, predicting issues before they surface, and tying every technical metric back to user and business impact.