Monitoring Patterns

Effective monitoring transforms reactive firefighting into proactive system management. With AI assistance, you can build sophisticated monitoring systems that detect issues before users notice, predict failures, and automatically initiate recovery. This guide covers battle-tested monitoring patterns for production systems.

Successful monitoring follows these principles:

  • Alert on symptoms: focus on user-facing issues, not every metric spike
  • Golden signals: latency, traffic, errors, and saturation tell the story
  • Context is king: link metrics to traces, logs, and business impact
  • Automate response: turn insights into automated remediation

Google's SRE book popularized four key metrics, the golden signals, that indicate system health. The Prometheus alerting rules below cover each of them:

# Latency - Request duration
- alert: high_latency
  expr: |
    histogram_quantile(0.95,
      sum(rate(http_request_duration_seconds_bucket[5m])) by (service, method, le)
    ) > 0.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High latency on {{ $labels.service }}"
    description: "95th percentile latency is {{ $value }}s"

# Traffic - Request rate
- alert: traffic_spike
  expr: |
    sum by (service) (rate(http_requests_total[5m]))
      > 2 * avg_over_time(
        sum by (service) (rate(http_requests_total[5m]))[1h:5m]
      )
  for: 10m
  labels:
    severity: info
  annotations:
    summary: "Traffic spike detected"

# Errors - Error rate
- alert: high_error_rate
  expr: |
    sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
      /
    sum by (service) (rate(http_requests_total[5m]))
      > 0.05
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Error rate above 5%"

# Saturation - Resource usage
- alert: high_cpu_usage
  expr: |
    100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "CPU usage above 80%"

Monitor Service Level Indicators (SLIs) against Service Level Objectives (SLOs).

// SLO monitoring implementation
class SLOMonitor {
  constructor(prometheus) {
    this.prometheus = prometheus;
    this.slos = new Map();
  }

  defineSLO(name, config) {
    this.slos.set(name, {
      name,
      description: config.description,
      sli: config.sli,
      target: config.target,
      window: config.window || '30d',
      // Burn-rate thresholds for a 30-day error budget
      burnRate: config.burnRate || {
        '1h': 14.4, // 14.4x burn rate = 2% of the budget in 1h
        '6h': 6,    // 6x burn rate = 5% of the budget in 6h
        '1d': 3,    // 3x burn rate = 10% of the budget in 1d
        '3d': 1     // 1x burn rate = 10% of the budget in 3d
      }
    });
  }

  generateAlerts() {
    const alerts = [];
    // Multi-window, multi-burn-rate alerts: each fast window is paired with
    // a slower confirmation window at the same burn-rate threshold.
    const windowPairs = [
      { short: '5m', long: '1h', burn: 14.4 },
      { short: '30m', long: '6h', burn: 6 }
    ];

    for (const [name, slo] of this.slos) {
      for (const { short, long, burn } of windowPairs) {
        alerts.push({
          alert: `${name}_burn_rate`,
          // Schematic expression: in practice `slo.sli` would reference
          // recording rules that expose the SLI ratio per window.
          expr: `(
            ${slo.sli}[${short}] < ${slo.target}
            and
            ${slo.sli}[${long}] < ${slo.target}
          )`,
          labels: {
            severity: 'page',
            slo: name,
            burn_rate: String(burn)
          },
          annotations: {
            summary: `SLO ${name} burn rate exceeded`,
            description: `Error budget burn rate is above the ${burn}x threshold`
          }
        });
      }
    }
    return alerts;
  }

  calculateErrorBudget(sloName, timeRange = '30d') {
    const slo = this.slos.get(sloName);
    // Error ratio over the window; budget consumed is this ratio
    // divided by the allowed error rate (1 - target).
    const query = `
      1 - (
        sum(increase(${slo.sli}[${timeRange}]))
        /
        sum(increase(http_requests_total[${timeRange}]))
      )
    `;
    return this.prometheus.query(query);
  }
}

// Usage
const monitor = new SLOMonitor(prometheusClient);

monitor.defineSLO('api-availability', {
  description: 'API availability SLO',
  sli: 'http_requests_total{status!~"5.."}',
  target: 0.999, // 99.9% availability
  window: '30d'
});

monitor.defineSLO('api-latency', {
  description: 'API latency SLO',
  sli: 'http_request_duration_seconds{quantile="0.95"} < 0.3',
  target: 0.95, // 95% of requests under 300ms
  window: '30d'
});

Use machine learning to predict issues before they occur.

AI-Powered Anomaly Detection

class PredictiveMonitor {
  constructor(metricsStore, alertManager) {
    this.metricsStore = metricsStore;
    this.alertManager = alertManager;
    this.models = new Map();
  }

  async trainModel(metricName, options = {}) {
    const historicalData = await this.metricsStore.query({
      metric: metricName,
      start: '-30d',
      end: 'now',
      step: options.step || '5m'
    });

    // Feature engineering
    const features = this.extractFeatures(historicalData);

    // Train model (simplified - use a proper ML library in production)
    const model = {
      metric: metricName,
      seasonality: this.detectSeasonality(features),
      trend: this.calculateTrend(features),
      stdDev: this.calculateStdDev(features),
      updateTime: Date.now()
    };

    this.models.set(metricName, model);
    return model;
  }

  extractFeatures(data) {
    return {
      hourOfDay: data.map(d => new Date(d.timestamp).getHours()),
      dayOfWeek: data.map(d => new Date(d.timestamp).getDay()),
      values: data.map(d => d.value),
      differences: data.slice(1).map((d, i) => d.value - data[i].value),
      movingAvg: this.movingAverage(data.map(d => d.value), 12)
    };
  }

  async detectAnomalies(metricName, realtimeValue) {
    const model = this.models.get(metricName);
    if (!model) {
      throw new Error(`No model trained for ${metricName}`);
    }

    const now = new Date();
    const expectedValue = this.predict(model, now);
    const threshold = model.stdDev * 3; // 3-sigma rule

    const anomaly = Math.abs(realtimeValue - expectedValue) > threshold;
    let confidence = null;

    if (anomaly) {
      confidence = this.calculateConfidence(
        realtimeValue,
        expectedValue,
        threshold
      );

      await this.alertManager.createAlert({
        title: `Anomaly detected in ${metricName}`,
        severity: confidence > 0.9 ? 'critical' : 'warning',
        details: {
          expected: expectedValue,
          actual: realtimeValue,
          deviation: Math.abs(realtimeValue - expectedValue),
          confidence,
          model: {
            lastUpdated: new Date(model.updateTime)
          }
        }
      });
    }

    return { anomaly, expectedValue, confidence };
  }

  predict(model, timestamp) {
    const hour = timestamp.getHours();
    const day = timestamp.getDay();

    // Simplified prediction combining trend and seasonality
    let prediction = model.trend.baseline;

    // Add hourly seasonality
    if (model.seasonality.hourly) {
      prediction += model.seasonality.hourly[hour];
    }

    // Add weekly seasonality
    if (model.seasonality.weekly) {
      prediction += model.seasonality.weekly[day];
    }

    return prediction;
  }

  async forecastCapacity(resource, days = 30) {
    const model = this.models.get(`${resource}_usage`);
    const currentUsage = await this.getCurrentUsage(resource); // percent of capacity
    const growthRate = model.trend.rate;

    const forecast = [];
    for (let d = 0; d < days; d++) {
      // Compound growth from today's usage
      const predictedUsage = currentUsage * Math.pow(1 + growthRate, d);
      forecast.push({
        date: new Date(Date.now() + d * 24 * 60 * 60 * 1000),
        usage: predictedUsage,
        percentOfCapacity: predictedUsage // usage is already expressed as a percentage
      });
    }

    // Alert if capacity will be exceeded
    const capacityBreach = forecast.find(f => f.percentOfCapacity > 80);
    if (capacityBreach) {
      await this.alertManager.createAlert({
        title: `${resource} capacity warning`,
        severity: 'warning',
        details: {
          message: `${resource} will reach 80% capacity on ${capacityBreach.date}`,
          currentUsage: `${currentUsage}%`,
          projectedUsage: `${capacityBreach.usage.toFixed(1)}%`,
          daysUntilBreach: forecast.indexOf(capacityBreach)
        }
      });
    }

    return forecast;
  }
}

Link technical metrics to business outcomes.

class BusinessMetricsMonitor {
  constructor(analytics, monitoring) {
    this.analytics = analytics;
    this.monitoring = monitoring;
  }

  async setupBusinessDashboard() {
    // Revenue impact monitoring
    await this.monitoring.createMetric({
      name: 'revenue_per_minute',
      query: `
        sum(rate(order_total_amount[1m]))
        *
        avg(order_conversion_rate)
      `,
      unit: 'dollars',
      alerts: [{
        condition: 'decrease > 20%',
        severity: 'critical',
        message: 'Revenue drop detected'
      }]
    });

    // User experience metrics
    // Apdex = (satisfied + tolerating/2) / total; with cumulative histogram
    // buckets this simplifies to (le="0.5" + le="2.0") / 2 / count.
    await this.monitoring.createMetric({
      name: 'apdex_score',
      query: `
        (
          sum(rate(http_request_duration_seconds_bucket{le="0.5"}[5m]))
          +
          sum(rate(http_request_duration_seconds_bucket{le="2.0"}[5m]))
        ) / 2 / sum(rate(http_request_duration_seconds_count[5m]))
      `,
      unit: 'ratio',
      alerts: [{
        condition: '< 0.8',
        severity: 'warning',
        message: 'User experience degraded (Apdex < 0.8)'
      }]
    });

    // Conversion funnel monitoring
    const funnelSteps = [
      'page_view',
      'add_to_cart',
      'checkout_start',
      'payment_complete'
    ];

    for (let i = 1; i < funnelSteps.length; i++) {
      const fromStep = funnelSteps[i - 1];
      const toStep = funnelSteps[i];

      await this.monitoring.createMetric({
        name: `conversion_${fromStep}_to_${toStep}`,
        query: `
          sum(rate(events_total{event="${toStep}"}[5m]))
          /
          sum(rate(events_total{event="${fromStep}"}[5m]))
        `,
        unit: 'ratio',
        alerts: [{
          condition: 'decrease > 15%',
          severity: 'warning',
          message: `Conversion drop: ${fromStep} -> ${toStep}`
        }]
      });
    }
  }

  async correlateWithTechnical(businessMetric, timeRange) {
    // Find technical metrics that correlate with the business metric
    const businessData = await this.analytics.getMetric(businessMetric, timeRange);
    const technicalMetrics = await this.monitoring.getAllMetrics();

    const correlations = [];
    for (const techMetric of technicalMetrics) {
      const techData = await this.monitoring.getMetric(techMetric.name, timeRange);
      const correlation = this.calculateCorrelation(businessData, techData);

      if (Math.abs(correlation) > 0.7) {
        correlations.push({
          technical: techMetric.name,
          business: businessMetric,
          correlation,
          impact: this.estimateImpact(correlation, techData, businessData)
        });
      }
    }

    return correlations.sort((a, b) => Math.abs(b.correlation) - Math.abs(a.correlation));
  }
}

Connect metrics with traces for deep insights.

// OpenTelemetry integration
const { MeterProvider } = require('@opentelemetry/sdk-metrics');
const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus');
const { trace } = require('@opentelemetry/api');

class TracingMetricsCollector {
  constructor() {
    this.exporter = new PrometheusExporter({ port: 9090 });
    this.meterProvider = new MeterProvider();
    this.meterProvider.addMetricReader(this.exporter);
    this.setupMetrics();
  }

  setupMetrics() {
    this.meter = this.meterProvider.getMeter('app-metrics');

    // Request duration histogram linked to traces
    this.requestDuration = this.meter.createHistogram('http_request_duration', {
      description: 'HTTP request duration in seconds',
      unit: 's'
    });

    // Active spans gauge
    this.activeSpans = this.meter.createUpDownCounter('active_spans', {
      description: 'Number of active spans'
    });

    // Error counter with trace context
    this.errors = this.meter.createCounter('errors_total', {
      description: 'Total number of errors with trace context'
    });
  }

  recordRequest(duration, attributes) {
    const span = trace.getActiveSpan();
    const spanContext = span?.spanContext();

    this.requestDuration.record(duration, {
      ...attributes,
      trace_id: spanContext?.traceId,
      span_id: spanContext?.spanId,
      has_error: span?.status?.code === 2 // SpanStatusCode.ERROR
    });
  }

  async monitorTraceHealth() {
    // Monitor trace sampling effectiveness
    const samplingRate = this.meter.createGauge('trace_sampling_rate', {
      description: 'Current trace sampling rate'
    });

    // Monitor trace completion
    const traceCompleteness = this.meter.createGauge('trace_completeness', {
      description: 'Percentage of complete traces'
    });

    setInterval(async () => {
      const stats = await this.calculateTraceStats();
      samplingRate.record(stats.samplingRate);
      traceCompleteness.record(stats.completeness);
    }, 60000);
  }
}

Proactively test critical user journeys.

class SyntheticMonitor {
  constructor(monitoring, alerting) {
    this.monitoring = monitoring;
    this.alerting = alerting;
    this.scenarios = new Map();
  }

  defineScenario(name, scenario) {
    this.scenarios.set(name, {
      name,
      interval: scenario.interval || 300000, // 5 minutes default
      timeout: scenario.timeout || 30000,
      steps: scenario.steps,
      assertions: scenario.assertions,
      regions: scenario.regions || ['us-east-1']
    });
  }

  async runScenario(name) {
    const scenario = this.scenarios.get(name);
    const results = new Map();

    for (const region of scenario.regions) {
      const startTime = Date.now();
      const stepResults = [];
      let success = true;

      try {
        for (const step of scenario.steps) {
          const stepStart = Date.now();
          const result = await this.executeStep(step, region);

          stepResults.push({
            name: step.name,
            duration: Date.now() - stepStart,
            success: result.success,
            details: result.details
          });

          if (!result.success) {
            success = false;
            break;
          }
        }

        // Run assertions
        if (success && scenario.assertions) {
          for (const assertion of scenario.assertions) {
            if (!await this.checkAssertion(assertion, stepResults)) {
              success = false;
              break;
            }
          }
        }
      } catch (error) {
        success = false;
        stepResults.push({
          name: 'error',
          error: error.message,
          stack: error.stack
        });
      }

      const totalDuration = Date.now() - startTime;
      results.set(region, {
        success,
        duration: totalDuration,
        steps: stepResults,
        timestamp: new Date()
      });

      // Record metrics
      await this.recordMetrics(name, region, {
        success,
        duration: totalDuration,
        steps: stepResults
      });
    }

    // Alert on failures
    await this.checkAlerts(name, results);

    return results;
  }

  async executeStep(step, region) {
    switch (step.type) {
      case 'http':
        return await this.executeHttpStep(step, region);
      case 'browser':
        return await this.executeBrowserStep(step, region);
      case 'api':
        return await this.executeApiStep(step, region);
      default:
        throw new Error(`Unknown step type: ${step.type}`);
    }
  }

  async recordMetrics(scenarioName, region, result) {
    // Success rate
    await this.monitoring.recordMetric({
      name: 'synthetic_success_rate',
      value: result.success ? 1 : 0,
      labels: {
        scenario: scenarioName,
        region,
        type: 'synthetic'
      }
    });

    // Duration
    await this.monitoring.recordMetric({
      name: 'synthetic_duration_seconds',
      value: result.duration / 1000,
      labels: {
        scenario: scenarioName,
        region,
        success: result.success
      }
    });

    // Step-level metrics
    for (const step of result.steps) {
      await this.monitoring.recordMetric({
        name: 'synthetic_step_duration_seconds',
        value: step.duration / 1000,
        labels: {
          scenario: scenarioName,
          step: step.name,
          region,
          success: step.success
        }
      });
    }
  }
}

// Example scenario
const syntheticMonitor = new SyntheticMonitor(monitoring, alerting); // existing monitoring/alerting clients

syntheticMonitor.defineScenario('checkout-flow', {
  interval: 300000, // Run every 5 minutes
  timeout: 30000,   // 30 second timeout
  regions: ['us-east-1', 'eu-west-1', 'ap-southeast-1'],
  steps: [
    {
      name: 'load-homepage',
      type: 'http',
      url: 'https://example.com',
      expectedStatus: 200
    },
    {
      name: 'search-product',
      type: 'api',
      endpoint: '/api/search',
      method: 'GET',
      params: { q: 'test-product' },
      expectedStatus: 200,
      validateResponse: (res) => res.results.length > 0
    },
    {
      name: 'add-to-cart',
      type: 'api',
      endpoint: '/api/cart',
      method: 'POST',
      body: { productId: 'test-123', quantity: 1 },
      expectedStatus: 201
    },
    {
      name: 'checkout',
      type: 'browser',
      script: async (page) => {
        await page.goto('https://example.com/checkout');
        await page.fill('#email', 'test@example.com');
        await page.click('button[type="submit"]');
        await page.waitForSelector('.success-message');
      }
    }
  ],
  assertions: [
    {
      name: 'total-time-under-5s',
      check: (results) => {
        const totalTime = results.reduce((sum, r) => sum + r.duration, 0);
        return totalTime < 5000;
      }
    }
  ]
});

Alert Fatigue Prevention

  • Use alert routing and deduplication (see the sketch after this list)
  • Implement alert suppression windows
  • Group related alerts together
  • Set appropriate severity levels
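
The sketch below shows how deduplication, grouping, and a suppression window can be layered in front of a notification channel. It is a minimal illustration, not the API of any specific alerting tool: the AlertRouter class, its fingerprinting scheme, and the notify callback are assumptions.

// Hypothetical alert router: dedupes by fingerprint, groups by service,
// and enforces a suppression window before re-notifying.
class AlertRouter {
  constructor(notify, { suppressionMs = 15 * 60 * 1000 } = {}) {
    this.notify = notify;               // e.g. a pager or chat callback
    this.suppressionMs = suppressionMs; // minimum gap between identical notifications
    this.lastSent = new Map();          // fingerprint -> timestamp
    this.groups = new Map();            // service -> pending alerts
  }

  fingerprint(alert) {
    // Identical alerts share a fingerprint and are deduplicated
    return `${alert.name}:${alert.service}:${alert.severity}`;
  }

  route(alert) {
    const fp = this.fingerprint(alert);
    const last = this.lastSent.get(fp) || 0;
    if (Date.now() - last < this.suppressionMs) {
      return; // suppressed: already notified for this recently
    }
    this.lastSent.set(fp, Date.now());

    // Group related alerts per service so they are sent together
    const group = this.groups.get(alert.service) || [];
    group.push(alert);
    this.groups.set(alert.service, group);
  }

  flush() {
    for (const [service, alerts] of this.groups) {
      // One notification per service, highest severity first
      const rank = s => (s === 'critical' ? 0 : 1);
      alerts.sort((a, b) => rank(a.severity) - rank(b.severity));
      this.notify({ service, count: alerts.length, alerts });
    }
    this.groups.clear();
  }
}

// Usage: route incoming alerts, flush grouped notifications periodically
const router = new AlertRouter(payload => console.log('PAGE:', payload));
router.route({ name: 'high_error_rate', service: 'api', severity: 'critical' });
router.route({ name: 'high_error_rate', service: 'api', severity: 'critical' }); // deduplicated
setInterval(() => router.flush(), 30000);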

Dashboard Design

  • Start with overview, drill down to details
  • Use consistent color schemes
  • Include relevant time ranges
  • Add context and documentation

Metric Naming

  • Use consistent naming conventions
  • Include units in metric names (example below)
  • Follow Prometheus best practices
  • Document custom metrics
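
As a small illustration of these conventions, the sketch below uses the Node.js prom-client library (chosen here as an example; any metrics client works the same way): base units in the name, a `_total` suffix for counters, documented help strings, and bounded label sets.

const client = require('prom-client');

// Good: base unit (seconds) in the name, documented, low-cardinality labels
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds, labeled by method, route, and status',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5]
});

// Good: counters end in _total and state their meaning in the help text
const responseBytes = new client.Counter({
  name: 'http_response_bytes_total',
  help: 'Total bytes sent in HTTP responses'
});

// Avoid names like 'latencyMs' (camelCase, non-base unit) or 'errors' (no _total suffix)
httpRequestDuration.observe({ method: 'GET', route: '/api/search', status: '200' }, 0.123);
responseBytes.inc(512);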

Cost Optimization

  • Sample high-cardinality metrics (see the sketch below)
  • Set appropriate retention policies
  • Use metric aggregation
  • Monitor monitoring costs
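
One practical way to keep cardinality and storage costs down is to normalize label values before they reach the metrics backend. This is an assumption-level sketch (the normalizeLabels helper and its id patterns are illustrative, not part of any client library):

// Collapse unbounded label values (user ids, UUIDs) into templates so each
// route produces a bounded set of label combinations.
function normalizeLabels(labels) {
  const normalized = { ...labels };
  if (normalized.path) {
    normalized.path = normalized.path
      .replace(/\/[0-9a-f]{8}-[0-9a-f-]{27,}/gi, '/:uuid') // UUID path segments
      .replace(/\/\d+/g, '/:id');                          // numeric ids
  }
  // Drop labels that are known to explode cardinality
  delete normalized.session_id;
  delete normalized.request_id;
  return normalized;
}

// Usage: '/users/12345/orders/9f8e7d6c-...' becomes '/users/:id/orders/:uuid'
console.log(normalizeLabels({
  path: '/users/12345/orders/9f8e7d6c-1a2b-4c3d-8e9f-0a1b2c3d4e5f',
  session_id: 'abc123'
}));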

Create alerts that are actionable and reduce noise:

# Good alert example
groups:
  - name: api_alerts
    rules:
      - alert: APIHighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m])) by (service)
            /
            sum(rate(http_requests_total[5m])) by (service)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
          pager: "true"
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: |
            Error rate is {{ $value | humanizePercentage }} for {{ $labels.service }}.
            Dashboard: https://grafana.example.com/d/api-errors
            Runbook: https://wiki.example.com/runbooks/api-errors
            Recent changes: https://github.com/example/{{ $labels.service }}/commits
          impact: "Users experiencing failures when using {{ $labels.service }}"
          action: |
            1. Check service logs for error details
            2. Verify upstream dependencies
            3. Consider rolling back recent deployments
            4. Scale up if load-related

Pair actionable alerts with dashboards defined as code:

// Grafana dashboard as code
const dashboardConfig = {
  title: 'Service Health Overview',
  panels: [
    // Row 1: Key metrics at a glance
    {
      title: 'Service Status',
      type: 'stat',
      gridPos: { x: 0, y: 0, w: 6, h: 4 },
      targets: [{
        expr: 'up{job="api"}',
        format: 'table'
      }],
      thresholds: {
        mode: 'absolute',
        steps: [
          { color: 'red', value: 0 },
          { color: 'green', value: 1 }
        ]
      }
    },
    {
      title: 'Current QPS',
      type: 'graph',
      gridPos: { x: 6, y: 0, w: 6, h: 4 },
      targets: [{
        expr: 'sum(rate(http_requests_total[1m]))',
        legendFormat: 'Requests/sec'
      }]
    },
    {
      title: 'Error Rate',
      type: 'gauge',
      gridPos: { x: 12, y: 0, w: 6, h: 4 },
      targets: [{
        expr: '100 * sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
      }],
      thresholds: {
        mode: 'percentage',
        steps: [
          { color: 'green', value: 0 },
          { color: 'yellow', value: 1 },
          { color: 'red', value: 5 }
        ]
      }
    },
    {
      title: 'P95 Latency',
      type: 'stat',
      gridPos: { x: 18, y: 0, w: 6, h: 4 },
      targets: [{
        expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))'
      }],
      unit: 's',
      thresholds: {
        mode: 'absolute',
        steps: [
          { color: 'green', value: 0 },
          { color: 'yellow', value: 0.5 },
          { color: 'red', value: 1 }
        ]
      }
    },
    // Row 2: Detailed views
    {
      title: 'Request Rate by Endpoint',
      type: 'graph',
      gridPos: { x: 0, y: 4, w: 12, h: 8 },
      targets: [{
        expr: 'sum(rate(http_requests_total[5m])) by (handler)',
        legendFormat: '{{ handler }}'
      }]
    },
    {
      title: 'Latency Distribution',
      type: 'heatmap',
      gridPos: { x: 12, y: 4, w: 12, h: 8 },
      targets: [{
        expr: 'sum(rate(http_request_duration_seconds_bucket[5m])) by (le)',
        format: 'heatmap'
      }]
    }
  ]
};

Automate monitoring tasks to reduce toil:

// Automated dashboard generation
class DashboardGenerator {
  generateServiceDashboard(serviceName) {
    return {
      title: `${serviceName} Service Dashboard`,
      uid: `${serviceName}-overview`,
      panels: [
        // generateREDPanel returns an array of panels, so spread it
        ...this.generateREDPanel(serviceName),
        this.generateResourcePanel(serviceName),
        this.generateDependencyPanel(serviceName)
      ],
      templating: {
        list: [{
          name: 'namespace',
          type: 'query',
          query: 'label_values(namespace)'
        }]
      }
    };
  }

  generateREDPanel(service) {
    // Rate, Errors, Duration panels
    return [
      {
        title: 'Request Rate',
        targets: [{
          expr: `sum(rate(http_requests_total{service="${service}"}[5m]))`
        }]
      },
      {
        title: 'Error Rate',
        targets: [{
          expr: `sum(rate(http_requests_total{service="${service}",status=~"5.."}[5m]))`
        }]
      },
      {
        title: 'Duration (P50/P95/P99)',
        targets: [
          {
            expr: `histogram_quantile(0.5, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P50'
          },
          {
            expr: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P95'
          },
          {
            expr: `histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="${service}"}[5m])) by (le))`,
            legendFormat: 'P99'
          }
        ]
      }
    ];
  }

  // generateResourcePanel and generateDependencyPanel follow the same pattern
  // and are omitted here for brevity.
}

Master monitoring by layering these patterns: golden-signal alerts, SLO burn-rate tracking, predictive anomaly detection, business metric correlation, and synthetic checks, all backed by actionable alerts and dashboards as code.