
Observability Setup

Observability is the difference between guessing and knowing what’s happening in your systems. Whether you’re tracking performance, debugging issues, or optimizing costs, Claude Code transforms monitoring from an afterthought into a first-class development practice. This lesson explores how to leverage AI assistance for comprehensive observability.

Scenario: Your application serves millions of requests daily across dozens of microservices. When something goes wrong, you need answers in seconds, not hours. Traditional approach: grep through logs, guess at correlations, hope for the best. With Claude Code: intelligent monitoring that tells you what’s wrong before users notice.

Week 1: Basic Logging
- Console.log everywhere
- No structure or consistency
- Logs lost on container restart
Week 2: Metrics Setup
- Manual metric instrumentation
- Basic CPU/memory graphs
- No custom business metrics
Week 3: Debugging Crisis
- Production issue at 3 AM
- Grep through gigabytes of logs
- No correlation between services
Week 4: Post-Mortem
- "We need better monitoring"
- Start over with new tools
> Set up OpenTelemetry for our Node.js application:
> - Automatic instrumentation
> - Custom spans and metrics
> - Log correlation
> - Export to multiple backends
> - Environment-specific configuration

Claude generates a comprehensive setup:

  1. Install Dependencies
Terminal window
npm install @opentelemetry/api \
@opentelemetry/sdk-node \
@opentelemetry/auto-instrumentations-node \
@opentelemetry/exporter-metrics-otlp-http \
@opentelemetry/exporter-trace-otlp-http \
@opentelemetry/instrumentation-http \
@opentelemetry/instrumentation-express \
@opentelemetry/instrumentation-winston
  2. Create Telemetry Configuration

telemetry.js

const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http');
const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-http');
const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics');

// Configure resource attributes
const resource = Resource.default().merge(
  new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: process.env.SERVICE_NAME || 'myapp',
    [SemanticResourceAttributes.SERVICE_VERSION]: process.env.SERVICE_VERSION || '1.0.0',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development',
    'team.name': 'platform',
    'team.owner': 'engineering'
  })
);

// Configure trace exporter
const traceExporter = new OTLPTraceExporter({
  url: process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT || 'http://localhost:4318/v1/traces',
  headers: {
    'api-key': process.env.OTEL_EXPORTER_API_KEY
  }
});

// Configure metric exporter
const metricExporter = new OTLPMetricExporter({
  url: process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT || 'http://localhost:4318/v1/metrics',
  headers: {
    'api-key': process.env.OTEL_EXPORTER_API_KEY
  }
});

// Initialize SDK
const sdk = new NodeSDK({
  resource,
  traceExporter,
  metricReader: new PeriodicExportingMetricReader({
    exporter: metricExporter,
    exportIntervalMillis: 10000
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-fs': {
        enabled: false // Disable noisy fs instrumentation
      }
    })
  ]
});

// Initialize and export
sdk.start()
  .then(() => console.log('OpenTelemetry initialized'))
  .catch((error) => console.error('Failed to initialize OpenTelemetry', error));

module.exports = { sdk };
  3. Add to Application Entry Point

// index.js - Must be first import!
require('./telemetry');

const express = require('express');
const { trace, metrics } = require('@opentelemetry/api');

// Get tracer and meter
const tracer = trace.getTracer('myapp');
const meter = metrics.getMeter('myapp');

// Create custom metrics
const requestCounter = meter.createCounter('http_requests', {
  description: 'Count of HTTP requests',
  unit: '1'
});

const requestDuration = meter.createHistogram('http_request_duration', {
  description: 'Duration of HTTP requests',
  unit: 'ms'
});

const app = express();

// Middleware for custom metrics
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', () => {
    const duration = Date.now() - start;
    const labels = {
      method: req.method,
      route: req.route?.path || 'unknown',
      status_code: res.statusCode.toString()
    };
    requestCounter.add(1, labels);
    requestDuration.record(duration, labels);
  });
  next();
});

// Register routes here, then start the server
app.listen(process.env.PORT || 3000);
  4. Add Custom Spans

// Custom instrumentation example
// Uses `tracer` and `meter` from the entry point above
const { SpanStatusCode } = require('@opentelemetry/api');

const orderProcessingCounter = meter.createCounter('orders_processed', {
  description: 'Count of processed orders',
  unit: '1'
});

async function processOrder(orderId) {
  // Start a new span
  return tracer.startActiveSpan('process_order', async (span) => {
    try {
      // Add span attributes
      span.setAttributes({
        'order.id': orderId,
        'order.processing_step': 'validation'
      });

      // Validate order
      const order = await validateOrder(orderId);

      // Create child span for payment
      await tracer.startActiveSpan('process_payment', async (paymentSpan) => {
        try {
          paymentSpan.setAttributes({
            'payment.amount': order.total,
            'payment.currency': order.currency
          });
          await processPayment(order);
          paymentSpan.setStatus({ code: SpanStatusCode.OK });
        } finally {
          paymentSpan.end();
        }
      });

      // Record custom metric
      orderProcessingCounter.add(1, {
        status: 'success',
        payment_method: order.paymentMethod
      });

      span.setStatus({ code: SpanStatusCode.OK });
      return order;
    } catch (error) {
      span.recordException(error);
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: error.message
      });
      throw error;
    } finally {
      span.end();
    }
  });
}
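
To run the instrumented service, point the exporters at your collector using the environment variables the configuration above reads. A minimal example; the endpoint and API key values are placeholders:

Terminal window
SERVICE_NAME=myapp \
SERVICE_VERSION=1.2.3 \
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://otel-collector:4318/v1/traces \
OTEL_EXPORTER_OTLP_METRICS_ENDPOINT=http://otel-collector:4318/v1/metrics \
OTEL_EXPORTER_API_KEY=your-api-key \
node index.js
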
> Set up distributed tracing across our microservices:
> - Trace context propagation
> - Service dependency mapping
> - Critical path analysis
> - Performance bottleneck identification
// Trace context propagation
const { propagation, trace, context } = require('@opentelemetry/api');

// HTTP client with trace propagation
async function callDownstreamService(url, data) {
  const span = trace.getActiveSpan();

  // Create headers with trace context
  const headers = {};
  propagation.inject(context.active(), headers);

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        ...headers,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(data)
    });

    span?.addEvent('downstream_service_called', {
      'http.url': url,
      'http.status_code': response.status
    });

    return response.json();
  } catch (error) {
    span?.recordException(error);
    throw error;
  }
}

// Extract trace context in receiving service
app.use((req, res, next) => {
  // Extract trace context from incoming request
  const extractedContext = propagation.extract(
    context.active(),
    req.headers
  );

  // Continue trace with extracted context
  context.with(extractedContext, () => {
    next();
  });
});
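
On the wire, the default W3C Trace Context propagator carries this as a traceparent header, which the extract middleware in the receiving service reads to continue the same trace. The IDs below are illustrative:

traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
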
> Set up structured logging with:
> - JSON format for easy parsing
> - Correlation IDs
> - Log levels and sampling
> - Integration with OpenTelemetry
> - Sensitive data masking
logger.js

const winston = require('winston');
const { trace, context, createContextKey } = require('@opentelemetry/api');

// Context key for the request ID (OpenTelemetry context keys are created via createContextKey, not plain strings)
const REQUEST_ID_KEY = createContextKey('requestId');

// Custom format that includes trace information
const traceFormat = winston.format((info) => {
  const span = trace.getActiveSpan();
  if (span) {
    const spanContext = span.spanContext();
    info.traceId = spanContext.traceId;
    info.spanId = spanContext.spanId;
  }

  // Add request ID if available
  const requestId = context.active().getValue(REQUEST_ID_KEY);
  if (requestId) {
    info.requestId = requestId;
  }

  return info;
});

// Sensitive data masking
const maskSensitive = winston.format((info) => {
  const sensitive = ['password', 'token', 'apiKey', 'ssn', 'creditCard'];

  const mask = (obj) => {
    if (typeof obj !== 'object' || obj === null) return obj;
    const masked = Array.isArray(obj) ? [] : {};
    for (const [key, value] of Object.entries(obj)) {
      if (sensitive.some(s => key.toLowerCase().includes(s.toLowerCase()))) {
        masked[key] = '***REDACTED***';
      } else if (typeof value === 'object' && value !== null) {
        masked[key] = mask(value);
      } else {
        masked[key] = value;
      }
    }
    return masked;
  };

  if (info.meta) {
    info.meta = mask(info.meta);
  }

  return info;
});

// Create logger instance
const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    traceFormat(),
    maskSensitive(),
    winston.format.json()
  ),
  defaultMeta: {
    service: process.env.SERVICE_NAME || 'myapp',
    environment: process.env.NODE_ENV || 'development'
  },
  transports: [
    new winston.transports.Console({
      format: process.env.NODE_ENV === 'development'
        ? winston.format.combine(
            winston.format.colorize(),
            winston.format.simple()
          )
        : winston.format.json()
    })
  ]
});

// Add log sampling for high-volume logs
logger.sample = (rate = 0.1) => {
  return {
    log: (level, message, meta) => {
      if (Math.random() < rate) {
        logger.log(level, message, { ...meta, sampled: true });
      }
    }
  };
};

module.exports = logger;
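
A quick usage sketch (field values are illustrative): trace and span IDs are attached automatically whenever a span is active, and the sampling helper suits very chatty code paths.

const logger = require('./logger');

// Structured, correlated log entry
logger.info('Order created', { orderId: 'ord_123', total: 49.99 });

// Log roughly 1% of a high-volume event instead of every occurrence
const sampled = logger.sample(0.01);
sampled.log('debug', 'Cache lookup', { key: 'user:42', hit: true });
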
// Log aggregation patterns
class LogAggregator {
  constructor() {
    this.buffers = new Map();
    this.flushInterval = 5000; // 5 seconds
    setInterval(() => this.flush(), this.flushInterval);
  }

  aggregate(key, data) {
    if (!this.buffers.has(key)) {
      this.buffers.set(key, {
        count: 0,
        firstSeen: Date.now(),
        lastSeen: Date.now(),
        samples: []
      });
    }

    const buffer = this.buffers.get(key);
    buffer.count++;
    buffer.lastSeen = Date.now();

    // Keep only first 5 samples
    if (buffer.samples.length < 5) {
      buffer.samples.push(data);
    }
  }

  flush() {
    for (const [key, buffer] of this.buffers) {
      if (buffer.count > 0) {
        logger.info('Aggregated log entry', {
          key,
          count: buffer.count,
          duration: buffer.lastSeen - buffer.firstSeen,
          samples: buffer.samples
        });
      }
    }
    this.buffers.clear();
  }
}

// Usage for high-frequency events
const aggregator = new LogAggregator();

// Instead of logging every request
app.use((req, res, next) => {
  aggregator.aggregate(`request:${req.method}:${req.path}`, {
    userAgent: req.headers['user-agent'],
    ip: req.ip
  });
  next();
});
> Implement custom business metrics:
> - Order processing time
> - Revenue per minute
> - Cart abandonment rate
> - API success rates
> - Feature adoption metrics
metrics.js

const { metrics } = require('@opentelemetry/api');

const meter = metrics.getMeter('business-metrics');

// Business metric definitions
const orderCounter = meter.createCounter('orders_total', {
  description: 'Total number of orders',
  unit: '1'
});

const revenueCounter = meter.createCounter('revenue_total', {
  description: 'Total revenue',
  unit: 'USD'
});

const cartAbandonmentGauge = meter.createUpDownCounter('cart_abandonment', {
  description: 'Number of abandoned carts',
  unit: '1'
});

const apiLatencyHistogram = meter.createHistogram('api_latency', {
  description: 'API endpoint latency',
  unit: 'ms'
});

const activeUsersGauge = meter.createObservableGauge('active_users', {
  description: 'Number of active users'
});

// Set up observable gauge callback
activeUsersGauge.addCallback(async (observableResult) => {
  const count = await getActiveUserCount();
  observableResult.observe(count, {
    period: '5m'
  });
});

// Business metric helpers
class BusinessMetrics {
  static recordOrder(order) {
    orderCounter.add(1, {
      status: order.status,
      payment_method: order.paymentMethod,
      customer_type: order.isNewCustomer ? 'new' : 'returning'
    });

    revenueCounter.add(order.total, {
      currency: order.currency,
      region: order.region
    });
  }

  static recordCartAbandonment(cart) {
    cartAbandonmentGauge.add(1, {
      value: cart.total,
      items_count: cart.items.length,
      reason: cart.abandonmentReason || 'unknown'
    });
  }

  static recordApiCall(endpoint, method, duration, success) {
    apiLatencyHistogram.record(duration, {
      endpoint,
      method,
      success: success.toString()
    });
  }

  static async recordFeatureUsage(feature, userId) {
    const featureCounter = meter.createCounter(`feature_usage_${feature}`, {
      description: `Usage of ${feature} feature`
    });

    featureCounter.add(1, {
      user_segment: await getUserSegment(userId),
      first_time: await isFirstTimeUsage(userId, feature)
    });
  }
}

module.exports = BusinessMetrics;
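
As a usage sketch, an order endpoint might record business metrics alongside the request. The route, the createOrder helper, and the order fields below are illustrative, not part of the generated module:

const BusinessMetrics = require('./metrics');

app.post('/orders', async (req, res) => {
  const start = Date.now();
  try {
    const order = await createOrder(req.body); // your existing order creation logic
    BusinessMetrics.recordOrder(order);
    BusinessMetrics.recordApiCall('/orders', 'POST', Date.now() - start, true);
    res.status(201).json(order);
  } catch (error) {
    BusinessMetrics.recordApiCall('/orders', 'POST', Date.now() - start, false);
    res.status(500).json({ error: 'Order processing failed' });
  }
});
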
// Performance monitoring
const { PerformanceObserver, performance } = require('perf_hooks'); // globals in recent Node versions

const performanceObserver = new PerformanceObserver((list) => {
  for (const entry of list.getEntries()) {
    // Record performance metrics
    const histogram = meter.createHistogram(`performance_${entry.entryType}`, {
      description: `Performance timing for ${entry.entryType}`,
      unit: 'ms'
    });

    histogram.record(entry.duration, {
      name: entry.name,
      type: entry.entryType
    });
  }
});

performanceObserver.observe({
  entryTypes: ['measure', 'navigation', 'resource']
});

// Database query performance
const dbQueryHistogram = meter.createHistogram('db_query_duration', {
  description: 'Database query execution time',
  unit: 'ms'
});

// Wrap database queries
async function instrumentedQuery(sql, params) {
  const startTime = performance.now();
  const labels = {
    operation: sql.split(' ')[0].toUpperCase(),
    table: extractTableName(sql)
  };

  try {
    const result = await db.query(sql, params);
    labels.success = 'true';
    return result;
  } catch (error) {
    labels.success = 'false';
    labels.error_type = error.constructor.name;
    throw error;
  } finally {
    const duration = performance.now() - startTime;
    dbQueryHistogram.record(duration, labels);
  }
}
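
The extractTableName helper isn't defined above; a naive sketch that covers simple single-table statements (anything more complex warrants a real SQL parser):

// Hypothetical helper: grab the identifier after FROM/INTO/UPDATE/JOIN
function extractTableName(sql) {
  const match = sql.match(/\b(?:from|into|update|join)\s+[`"]?([\w.]+)[`"]?/i);
  return match ? match[1] : 'unknown';
}
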
> Set up Prometheus for metrics collection:
> - Scrape configuration
> - Recording rules
> - Alerting rules
> - Federation setup
prometheus.yml

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-east-1'

# Alerting configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Recording rules
rule_files:
  - 'recording_rules.yml'
  - 'alerting_rules.yml'

# Scrape configurations
scrape_configs:
  # Application metrics
  - job_name: 'myapp'
    static_configs:
      - targets: ['myapp:9090']
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        regex: '([^:]+):.*'
        replacement: '${1}'

  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
# recording_rules.yml
groups:
  - name: myapp_recording_rules
    interval: 30s
    rules:
      # Request rate
      - record: myapp:http_requests:rate5m
        expr: rate(http_requests_total[5m])

      # Error rate
      - record: myapp:http_errors:rate5m
        expr: rate(http_requests_total{status=~"5.."}[5m])

      # P95 latency
      - record: myapp:http_latency:p95
        expr: histogram_quantile(0.95, rate(http_request_duration_bucket[5m]))

      # Business metrics
      - record: myapp:orders:rate1h
        expr: rate(orders_total[1h])

      - record: myapp:revenue:rate1h
        expr: rate(revenue_total[1h])
# alerting_rules.yml
groups:
  - name: myapp_alerts
    rules:
      - alert: HighErrorRate
        expr: myapp:http_errors:rate5m > 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.instance }}"

      - alert: HighLatency
        expr: myapp:http_latency:p95 > 1000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "P95 latency is {{ $value }}ms"

      - alert: LowOrderRate
        expr: myapp:orders:rate1h < 10
        for: 30m
        labels:
          severity: warning
          team: business
        annotations:
          summary: "Low order rate"
          description: "Order rate dropped to {{ $value }} orders/hour"
> Create comprehensive Grafana dashboards:
> - System overview
> - Business metrics
> - Performance analysis
> - Error tracking
> - SLA monitoring
dashboard.json

{
  "dashboard": {
    "title": "MyApp Production Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (method)",
            "legendFormat": "{{method}}"
          }
        ],
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "legendFormat": "Error %"
          }
        ],
        "alert": {
          "conditions": [
            {
              "evaluator": { "params": [5], "type": "gt" },
              "operator": { "type": "and" },
              "query": { "params": ["A", "5m", "now"] },
              "reducer": { "params": [], "type": "avg" }
            }
          ]
        },
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
      },
      {
        "title": "Response Time Percentiles",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_bucket[5m])) by (le))",
            "legendFormat": "p99"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_bucket[5m])) by (le))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_bucket[5m])) by (le))",
            "legendFormat": "p50"
          }
        ],
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }
      },
      {
        "title": "Business KPIs",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(orders_total[1h])) * 3600",
            "legendFormat": "Orders/Hour"
          },
          {
            "expr": "sum(rate(revenue_total[1h])) * 3600",
            "legendFormat": "Revenue/Hour"
          },
          {
            "expr": "sum(active_users)",
            "legendFormat": "Active Users"
          }
        ],
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }
      }
    ]
  }
}
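
One way to provision this dashboard is Grafana's HTTP API; the URL and token below are placeholders. The JSON above already wraps the panels in the top-level "dashboard" object the endpoint expects:

Terminal window
curl -X POST http://localhost:3000/api/dashboards/db \
  -H "Authorization: Bearer $GRAFANA_API_TOKEN" \
  -H "Content-Type: application/json" \
  -d @dashboard.json
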
> Set up error tracking with:
> - Automatic error capture
> - Stack trace collection
> - User context
> - Release tracking
> - Error grouping
error-tracking.js

const Sentry = require('@sentry/node');
const { ProfilingIntegration } = require('@sentry/profiling-node');

// Initialize Sentry (after the Express `app` is created, since the Express integration needs it)
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.SERVICE_VERSION,
  integrations: [
    new Sentry.Integrations.Http({ tracing: true }),
    new Sentry.Integrations.Express({ app }),
    new ProfilingIntegration()
  ],
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  profilesSampleRate: 0.1,
  beforeSend(event, hint) {
    // Filter out known issues
    if (event.exception?.values?.[0]?.type === 'NetworkError') {
      return null;
    }

    // Add custom context
    event.extra = {
      ...event.extra,
      nodeVersion: process.version,
      memory: process.memoryUsage()
    };

    return event;
  }
});

// Error handler middleware
app.use((err, req, res, next) => {
  // Log to our logger
  logger.error('Unhandled error', {
    error: err.message,
    stack: err.stack,
    url: req.url,
    method: req.method,
    ip: req.ip,
    userAgent: req.get('user-agent')
  });

  // Send to Sentry with context
  let eventId;
  Sentry.withScope((scope) => {
    scope.setContext('request', {
      url: req.url,
      method: req.method,
      headers: req.headers,
      query: req.query,
      body: req.body
    });

    scope.setUser({
      id: req.user?.id,
      email: req.user?.email,
      ip_address: req.ip
    });

    scope.setTag('endpoint', req.route?.path || 'unknown');
    scope.setLevel('error');

    eventId = Sentry.captureException(err);
  });

  // Send error response, including the Sentry event ID for support lookups
  res.status(err.status || 500).json({
    error: {
      message: process.env.NODE_ENV === 'production'
        ? 'Internal server error'
        : err.message,
      id: eventId
    }
  });
});
alertmanager.yml

global:
  resolve_timeout: 5m
  slack_api_url: 'YOUR_SLACK_WEBHOOK'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: pagerduty
      continue: true
    - match:
        team: platform
      receiver: platform-slack
    - match:
        team: business
      receiver: business-alerts

receivers:
  - name: 'default'
    slack_configs:
      - channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'

  - name: 'platform-slack'
    slack_configs:
      - channel: '#platform-alerts'
        send_resolved: true
        title: '🚨 {{ .GroupLabels.alertname }}'
        text: |
          *Alert:* {{ .GroupLabels.alertname }}
          *Severity:* {{ .CommonLabels.severity }}
          *Description:* {{ .CommonAnnotations.description }}
          *Runbook:* <{{ .CommonAnnotations.runbook_url }}|View Runbook>

  # Referenced by the `team: business` route above; the channel name is a placeholder
  - name: 'business-alerts'
    slack_configs:
      - channel: '#business-alerts'
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'service']
> Set up Claude Code telemetry monitoring:
> - Usage metrics
> - Cost tracking
> - Performance analysis
> - Team adoption metrics
Terminal window
# Enable Claude Code telemetry
export CLAUDE_CODE_ENABLE_TELEMETRY=1
export OTEL_METRICS_EXPORTER=otlp
export OTEL_LOGS_EXPORTER=otlp
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
export OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer your-token"
# Add custom attributes for team tracking
export OTEL_RESOURCE_ATTRIBUTES="department=engineering,team.id=platform,cost_center=eng-123"
// Claude Code metrics dashboard queries
const claudeMetrics = {
  // Developer productivity
  linesOfCode: `
    sum(rate(claude_code.lines_of_code.count[1d])) by (user_account_uuid, type)
  `,

  // Cost tracking
  costPerTeam: `
    sum(claude_code.cost.usage) by (department, team_id)
  `,

  // Tool acceptance rate
  acceptanceRate: `
    sum(rate(claude_code.code_edit_tool.decision{decision="accept"}[1h])) /
    sum(rate(claude_code.code_edit_tool.decision[1h])) * 100
  `,

  // Active developers
  activeDevelopers: `
    count(count by (user_account_uuid)(claude_code.session.count))
  `,

  // Language usage
  languageDistribution: `
    sum(claude_code.code_edit_tool.decision) by (language)
  `
};
> Set up comprehensive APM:
> - Transaction tracing
> - Database query analysis
> - External service monitoring
> - Resource utilization
// APM integration
const apm = require('elastic-apm-node').start({
  serviceName: process.env.SERVICE_NAME,
  secretToken: process.env.ELASTIC_APM_SECRET_TOKEN,
  serverUrl: process.env.ELASTIC_APM_SERVER_URL,
  environment: process.env.NODE_ENV,
  transactionSampleRate: 0.1,
  captureBody: 'errors',
  errorOnAbortedRequests: true,
  captureErrorLogStackTraces: 'always',
  usePathAsTransactionName: false
});

// Custom transaction tracking
async function complexBusinessOperation(data) {
  const transaction = apm.startTransaction('process_order', 'business');

  try {
    // Track database operations
    const span = apm.startSpan('validate_inventory', 'db');
    const inventory = await checkInventory(data.items);
    span.end();

    // Track external API calls
    const paymentSpan = apm.startSpan('process_payment', 'external');
    const payment = await processPayment(data.payment);
    paymentSpan.end();

    // Track custom operations
    const fulfillmentSpan = apm.startSpan('create_fulfillment', 'custom');
    const order = await createFulfillmentOrder(data, payment);
    fulfillmentSpan.end();

    transaction.result = 'success';
    return order;
  } catch (error) {
    apm.captureError(error);
    transaction.result = 'error';
    throw error;
  } finally {
    transaction.end();
  }
}
> Implement SLO monitoring:
> - Availability targets
> - Latency objectives
> - Error budgets
> - Burn rate alerts
slo_rules.yml

groups:
  - name: slo_rules
    interval: 30s
    rules:
      # Availability SLO - 99.9%
      - record: slo:availability:ratio
        expr: |
          sum(rate(http_requests_total{status!~"5.."}[5m])) /
          sum(rate(http_requests_total[5m]))

      # Error budget burn rate: error ratio over the window divided by the allowed error ratio
      - record: slo:error_budget:burn_rate_1h
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[1h])) /
            sum(rate(http_requests_total[1h]))
          ) / (1 - 0.999)

      - record: slo:error_budget:burn_rate_6h
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[6h])) /
            sum(rate(http_requests_total[6h]))
          ) / (1 - 0.999)

      # Latency SLO - 95% of requests under 500ms (histogram buckets are in ms)
      - record: slo:latency:ratio
        expr: |
          sum(rate(http_request_duration_bucket{le="500"}[5m])) /
          sum(rate(http_request_duration_count[5m]))

      # Multi-window multi-burn-rate alerts
      - alert: ErrorBudgetBurn
        expr: |
          (
            slo:error_budget:burn_rate_1h > 14.4
            and
            slo:error_budget:burn_rate_6h > 6
          )
        labels:
          severity: critical
          slo: availability
        annotations:
          summary: "Error budget is burning too fast"
          description: "Error budget burn rate is {{ $value }} times normal"
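
For context on those thresholds: a 99.9% objective over 30 days allows roughly 43 minutes of errors (30 × 24 × 60 × 0.001). A burn rate of 14.4 sustained for an hour consumes about 2% of that monthly budget (14.4 / 720) and would exhaust it in roughly two days, while the 6x-over-6-hours condition catches slower, sustained burns.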

You’ve learned how to leverage Claude Code for comprehensive observability - from instrumentation to visualization to alerting. The key is treating monitoring as a first-class citizen in your development process, not an afterthought.

Remember: You can’t fix what you can’t see. Use Claude Code to build observability into your applications from the start, creating systems that tell you what’s wrong before your users do. With proper monitoring, you’ll ship with confidence and sleep better at night.