Effective monitoring and observability are crucial for maintaining reliable systems. This lesson demonstrates how Cursor IDE’s AI capabilities help you implement comprehensive monitoring solutions, from basic metrics to advanced distributed tracing.
Modern applications require sophisticated monitoring across metrics, logs, and traces. AI assistance makes implementing these complex systems accessible, helping you build production-grade observability from day one.
Configure monitoring MCP servers:
{
  "mcpServers": {
    "sentry": {
      "command": "npx",
      "args": ["-y", "sentry-mcp"],
      "env": {
        "SENTRY_AUTH_TOKEN": "${SENTRY_TOKEN}",
        "SENTRY_ORG": "your-org"
      }
    },
    "grafana": {
      "command": "npx",
      "args": ["-y", "grafana-mcp"],
      "env": {
        "GRAFANA_URL": "https://grafana.company.com",
        "GRAFANA_API_KEY": "${GRAFANA_KEY}"
      }
    },
    "sonarqube": {
      "command": "npx",
      "args": ["-y", "sonarqube-mcp-server"],
      "env": {
        "SONAR_HOST_URL": "https://sonar.company.com",
        "SONAR_TOKEN": "${SONAR_TOKEN}"
      }
    }
  }
}
Metrics Collection
AI generates metric instrumentation and dashboards
Log Aggregation
AI creates structured logging and parsing rules
Distributed Tracing
AI implements tracing across microservices
Alerting Rules
AI creates intelligent alerting strategies
Application Instrumentation
// Ask AI to instrument application
"Add Prometheus metrics to Express API:
- HTTP request duration histogram
- Request counter by status code
- Active connections gauge
- Custom business metrics
- Error rate tracking"

// AI generates instrumentation
import { register, Counter, Histogram, Gauge } from 'prom-client';
import express from 'express';

// Define metrics
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10]
});

const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections'
});

// Middleware to track metrics
export const metricsMiddleware = (req: express.Request, res: express.Response, next: express.NextFunction) => {
  const start = Date.now();

  // Track active connections
  activeConnections.inc();

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const route = req.route?.path || 'unknown';
    const labels = {
      method: req.method,
      route,
      status_code: res.statusCode.toString()
    };

    httpRequestDuration.observe(labels, duration);
    httpRequestTotal.inc(labels);
    activeConnections.dec();
  });

  next();
};

// Metrics endpoint
export const metricsEndpoint = async (req: express.Request, res: express.Response) => {
  res.set('Content-Type', register.contentType);
  res.send(await register.metrics());
};
Custom Business Metrics
// AI implements business metrics
"Create custom metrics for e-commerce:
- Order processing time
- Payment success rate
- Cart abandonment rate
- Product view tracking
- Revenue metrics"

// AI generates business metrics
const orderProcessingTime = new Histogram({
  name: 'order_processing_duration_seconds',
  help: 'Time to process orders',
  labelNames: ['payment_method', 'shipping_type'],
  buckets: [1, 5, 10, 30, 60, 120, 300]
});

const paymentSuccessRate = new Gauge({
  name: 'payment_success_rate',
  help: 'Percentage of successful payments',
  labelNames: ['payment_provider']
});

const cartAbandonment = new Counter({
  name: 'cart_abandonment_total',
  help: 'Number of abandoned carts',
  labelNames: ['reason']
});

const revenue = new Counter({
  name: 'revenue_total',
  help: 'Total revenue',
  labelNames: ['currency', 'product_category']
});

// Usage in business logic
export class OrderService {
  async processOrder(order: Order): Promise<ProcessedOrder> {
    const timer = orderProcessingTime.startTimer({
      payment_method: order.paymentMethod,
      shipping_type: order.shippingType
    });

    try {
      const result = await this.processPayment(order);

      if (result.success) {
        revenue.inc({
          currency: order.currency,
          product_category: order.category
        }, order.total);

        this.updatePaymentSuccessRate(order.paymentProvider, true);
      } else {
        this.updatePaymentSuccessRate(order.paymentProvider, false);
      }

      return result;
    } finally {
      timer();
    }
  }
}
Grafana Dashboard Creation
// AI creates Grafana dashboard
"Generate Grafana dashboard for:
- Service health overview
- Request rate and latency
- Error tracking
- Business metrics
- SLA compliance"

// AI provides dashboard JSON
{
  "dashboard": {
    "title": "Service Health Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (service)",
            "legendFormat": "{{service}}"
          }
        ],
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
      },
      {
        "title": "P95 Latency",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))",
            "legendFormat": "{{service}}"
          }
        ],
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
      },
      {
        "title": "Error Rate",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "legendFormat": "Error %"
          }
        ],
        "alert": {
          "conditions": [
            {
              "evaluator": { "params": [5], "type": "gt" },
              "operator": { "type": "and" },
              "query": { "params": ["A", "5m", "now"] },
              "reducer": { "params": [], "type": "avg" },
              "type": "query"
            }
          ],
          "name": "High Error Rate"
        },
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }
      }
    ]
  }
}
Real-time Issue Investigation with Sentry MCP
"Using Sentry MCP, show me:
- Recent errors in production
- Error trends over last 24 hours
- Most affected users
- Stack traces for PaymentError"

// Direct issue resolution
"Using Sentry MCP:
- Find all instances of 'connection timeout'
- Group by service
- Show resolution status
- Create issue for unresolved errors"
Dashboard Management with Grafana MCP
"Using Grafana MCP:
- Search dashboards for 'API performance'
- Run query: rate(http_requests_total[5m])
- Create alert for response time > 1s
- Export dashboard JSON for backup"

// Query optimization
"Using Grafana MCP, optimize this PromQL query:
sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
Make it more efficient for large datasets"
Code Quality with SonarQube MCP
"Using SonarQube MCP:
- Get quality gate status for main branch
- List critical security hotspots
- Show code coverage trends
- Find duplicated code blocks"

// Pre-deployment checks
"Using SonarQube MCP, verify:
- No new critical issues
- Code coverage > 80%
- No security vulnerabilities
- Technical debt ratio < 5%"
Performance Monitoring with Dynatrace MCP
"Using Dynatrace MCP:
- Show current problem feed
- Get service flow for checkout process
- Analyze database query performance
- Find memory leak indicators"
Unified Monitoring Workflows
"Coordinate monitoring tools:
1. When Sentry reports error spike
2. Check Grafana metrics for correlation
3. Review SonarQube for recent changes
4. Analyze Dynatrace for root cause"

// Direct query from IDE
"Using Sentry MCP, find all errors:
- Related to payment processing
- In last 6 hours
- Affecting > 10 users
Create Jira ticket for each"

// Instant access to data
// No context switching
// Automated workflows

// Manual process:
// 1. Open Sentry dashboard
// 2. Navigate to issues
// 3. Apply filters manually
// 4. Copy error details
// 5. Switch to Jira
// 6. Create tickets manually

// Time consuming
// Context switching
// Error-prone
// AI implements OpenTelemetry
"Set up OpenTelemetry with:
- Automatic instrumentation
- Custom spans
- Context propagation
- Multiple exporters
- Sampling strategies"

import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
import { PrometheusExporter } from '@opentelemetry/exporter-prometheus';
import { trace, context, SpanStatusCode } from '@opentelemetry/api';

// Configure SDK
const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: process.env.VERSION || '1.0.0',
    environment: process.env.NODE_ENV
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-fs': {
        enabled: false // Disable noisy fs instrumentation
      }
    })
  ],
  traceExporter: new JaegerExporter({
    endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces'
  }),
  // PrometheusExporter acts as the metric reader, serving /metrics on :9090
  metricReader: new PrometheusExporter({ port: 9090 })
});

// Initialize SDK
sdk.start();

// Custom instrumentation
const tracer = trace.getTracer('api-service');

export async function processUserRequest(userId: string, request: any) {
  const span = tracer.startSpan('process_user_request', {
    attributes: {
      'user.id': userId,
      'request.type': request.type
    }
  });

  try {
    // Add events
    span.addEvent('validation_started');
    await validateRequest(request);
    span.addEvent('validation_completed');

    // Create child span; parentage is established via the context argument
    const childSpan = tracer.startSpan(
      'database_operation',
      {},
      trace.setSpan(context.active(), span)
    );

    try {
      const result = await databaseOperation(userId, request);
      childSpan.setStatus({ code: SpanStatusCode.OK });
      return result;
    } finally {
      childSpan.end();
    }
  } catch (error) {
    span.recordException(error);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: error.message
    });
    throw error;
  } finally {
    span.end();
  }
}
// AI configures Datadog APM
"Configure Datadog APM with:
- Trace collection
- Custom tags
- Error tracking
- Performance monitoring
- Log correlation"

import tracer from 'dd-trace';

// Initialize tracer
tracer.init({
  env: process.env.NODE_ENV,
  service: 'api-service',
  version: process.env.VERSION,
  logInjection: true,
  runtimeMetrics: true,
  profiling: true,
  tags: {
    team: 'platform',
    component: 'api'
  }
});

// Custom instrumentation
export class PaymentService {
  async processPayment(payment: Payment) {
    const span = tracer.startSpan('payment.process', {
      tags: {
        'resource.name': payment.provider,
        'payment.amount': payment.amount,
        'payment.currency': payment.currency,
        'payment.method': payment.method
      }
    });

    try {
      // Track custom metrics
      tracer.dogstatsd.increment('payment.attempt', 1, [
        `provider:${payment.provider}`,
        `method:${payment.method}`
      ]);

      const startTime = Date.now();
      const result = await this.callPaymentProvider(payment);

      // Track timing
      tracer.dogstatsd.histogram('payment.duration', Date.now() - startTime, [
        `provider:${payment.provider}`,
        `success:${result.success}`
      ]);

      if (result.success) {
        tracer.dogstatsd.increment('payment.success', 1, [
          `provider:${payment.provider}`
        ]);
      } else {
        span.setTag('error', true);
        span.setTag('error.message', result.error);
      }

      return result;
    } finally {
      span.finish();
    }
  }
}
// AI sets up New Relic
"Implement New Relic monitoring:
- APM configuration
- Custom events
- Browser monitoring
- Infrastructure monitoring
- Synthetic monitoring"

import newrelic from 'newrelic';

// Custom instrumentation
export class OrderProcessor {
  async processOrder(order: Order) {
    // Start transaction segment
    return newrelic.startSegment('order:process', true, async () => {
      // Add custom attributes
      newrelic.addCustomAttributes({
        orderId: order.id,
        customerId: order.customerId,
        orderTotal: order.total,
        itemCount: order.items.length
      });

      try {
        // Track custom events
        newrelic.recordCustomEvent('OrderProcessing', {
          orderId: order.id,
          stage: 'started',
          timestamp: Date.now()
        });

        // Process order steps
        await this.validateOrder(order);
        await this.checkInventory(order);
        await this.processPayment(order);
        await this.createShipment(order);

        // Record success
        newrelic.recordCustomEvent('OrderProcessing', {
          orderId: order.id,
          stage: 'completed',
          processingTime: Date.now() - order.createdAt,
          success: true
        });

        return { success: true, orderId: order.id };
      } catch (error) {
        // Track errors
        newrelic.noticeError(error, {
          orderId: order.id,
          stage: 'processing',
          customerId: order.customerId
        });

        throw error;
      }
    });
  }
}
// AI creates structured logging
"Implement structured logging with:
- JSON format
- Correlation IDs
- Context propagation
- Log levels
- Sensitive data masking"

import winston from 'winston';
import { v4 as uuidv4 } from 'uuid';
import { AsyncLocalStorage } from 'node:async_hooks';
import type { Request, Response, NextFunction } from 'express';

interface Context {
  requestId: string;
  userId?: string;
  sessionId?: string;
  userAgent?: string;
  ip?: string;
}

// Create logger factory
export class LoggerFactory {
  static createLogger(service: string) {
    return winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service,
        environment: process.env.NODE_ENV,
        version: process.env.VERSION
      },
      transports: [
        new winston.transports.Console({
          format: winston.format.combine(
            winston.format.colorize(),
            winston.format.simple()
          )
        })
      ]
    });
  }
}

// Request context middleware
export class RequestContext {
  private static storage = new AsyncLocalStorage<Context>();

  static middleware() {
    return (req: Request, res: Response, next: NextFunction) => {
      const context: Context = {
        requestId: (req.headers['x-request-id'] as string) || uuidv4(),
        userId: req.user?.id,
        sessionId: req.session?.id,
        userAgent: req.headers['user-agent'],
        ip: req.ip
      };

      RequestContext.storage.run(context, () => {
        req.context = context;
        next();
      });
    };
  }

  static getContext(): Context | undefined {
    return this.storage.getStore();
  }
}
// Enhanced logger with context
export class ContextualLogger {
  constructor(private logger: winston.Logger) {}

  private enrichLog(level: string, message: string, meta?: any) {
    const context = RequestContext.getContext();

    return this.logger.log({
      level,
      message,
      ...context,
      ...this.sanitizeMeta(meta)
    });
  }

  private sanitizeMeta(meta: any): any {
    if (!meta) return {};

    // AI implements sensitive data masking
    // Entries are lowercase so they match the lowercased keys below
    const sensitive = ['password', 'token', 'apikey', 'secret', 'creditcard'];
    const sanitized = { ...meta };

    for (const key of Object.keys(sanitized)) {
      if (sensitive.some(s => key.toLowerCase().includes(s))) {
        sanitized[key] = '[REDACTED]';
      }
    }

    return sanitized;
  }

  info(message: string, meta?: any) {
    this.enrichLog('info', message, meta);
  }

  error(message: string, error?: Error, meta?: any) {
    this.enrichLog('error', message, {
      ...meta,
      error: {
        message: error?.message,
        stack: error?.stack,
        name: error?.name
      }
    });
  }
}
# AI creates ELK configuration
"Set up ELK stack with:
- Filebeat for collection
- Logstash for processing
- Elasticsearch for storage
- Kibana for visualization"

# filebeat.yml
filebeat.inputs:
- type: container
  paths:
    - '/var/lib/docker/containers/*/*.log'
  processors:
    - add_docker_metadata:
        host: "unix:///var/run/docker.sock"
    - decode_json_fields:
        fields: ["message"]
        target: "json"
        overwrite_keys: true
    - drop_event:
        when:
          or:
            - equals:
                json.level: "debug"
            - contains:
                json.path: "/health"

output.logstash:
  hosts: ["logstash:5044"]
  ssl.certificate_authorities: ["/etc/ca.crt"]
  ssl.certificate: "/etc/client.crt"
  ssl.key: "/etc/client.key"

# logstash.conf
input {
  beats {
    port => 5044
    ssl => true
    ssl_certificate => "/etc/logstash/server.crt"
    ssl_key => "/etc/logstash/server.key"
  }
}

filter {
  # Parse timestamp
  date {
    match => [ "[json][timestamp]", "ISO8601" ]
    target => "@timestamp"
  }

  # Add GeoIP data
  geoip {
    source => "[json][ip]"
    target => "geoip"
  }

  # Extract fields
  mutate {
    add_field => {
      "service" => "%{[json][service]}"
      "environment" => "%{[json][environment]}"
      "request_id" => "%{[json][requestId]}"
    }
  }

  # Parse user agent
  useragent {
    source => "[json][userAgent]"
    target => "user_agent"
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "logs-%{[service]}-%{+YYYY.MM.dd}"
    template_name => "logs"
    template => "/etc/logstash/templates/logs.json"
  }
}
# AI configures Loki stack
"Configure Loki with:
- Promtail for collection
- Loki for storage
- Grafana for queries
- LogQL queries"

# promtail-config.yaml
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: containers
    static_configs:
      - targets:
          - localhost
        labels:
          job: containerlogs
          __path__: /var/log/containers/*/*.log

    pipeline_stages:
      - json:
          expressions:
            output: log
            level: level
            timestamp: timestamp
            service: service
            request_id: requestId

      - labels:
          level:
          service:
          environment:

      - timestamp:
          format: RFC3339Nano
          source: timestamp

      - metrics:
          log_lines_total:
            type: Counter
            description: "Total log lines"
            source: level
            config:
              action: inc

          http_request_duration_seconds:
            type: Histogram
            description: "HTTP request duration"
            source: duration
            config:
              buckets: [0.1, 0.5, 1, 2, 5, 10]
# loki-config.yaml
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  wal:
    enabled: true
    dir: /loki/wal
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1

schema_config:
  configs:
    - from: 2024-01-01
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  retention_period: 30d
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h
// AI implements distributed tracing
"Create distributed tracing for microservices:
- Trace context propagation
- Service mesh integration
- Database query tracing
- External API tracing
- Performance analysis"

import {
  Tracer,
  trace,
  context,
  SpanStatusCode,
  defaultTextMapGetter,
  defaultTextMapSetter
} from '@opentelemetry/api';
import { W3CTraceContextPropagator } from '@opentelemetry/core';

export class TracingService {
  private tracer: Tracer;
  private propagator = new W3CTraceContextPropagator();

  async handleRequest(req: Request, res: Response) {
    // Extract parent context from incoming headers
    const parentContext = this.propagator.extract(
      context.active(),
      req.headers,
      defaultTextMapGetter
    );

    // Start new span
    const span = this.tracer.startSpan('http.request', {
      attributes: {
        'http.method': req.method,
        'http.url': req.url,
        'http.target': req.path,
        'http.host': req.hostname,
        'http.scheme': req.protocol,
        'http.user_agent': req.headers['user-agent']
      }
    }, parentContext);

    // Propagate to downstream services
    const headers = {};
    this.propagator.inject(
      trace.setSpan(context.active(), span),
      headers,
      defaultTextMapSetter
    );

    try {
      // Make downstream call
      const response = await this.callDownstreamService({
        headers,
        ...requestData
      });

      span.setAttributes({
        'http.status_code': response.statusCode,
        'http.response_content_length': response.contentLength
      });

      return response;
    } catch (error) {
      span.recordException(error);
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: error.message
      });
      throw error;
    } finally {
      span.end();
    }
  }
}
// Database tracing
export class DatabaseTracer {
  async query(sql: string, params: any[]) {
    const span = tracer.startSpan('db.query', {
      attributes: {
        'db.system': 'postgresql',
        'db.statement': this.sanitizeSql(sql),
        'db.operation': this.extractOperation(sql)
      }
    });

    try {
      const result = await this.pool.query(sql, params);

      span.setAttributes({
        'db.rows_affected': result.rowCount
      });

      return result;
    } catch (error) {
      span.recordException(error);
      throw error;
    } finally {
      span.end();
    }
  }

  private sanitizeSql(sql: string): string {
    // Mask long numeric literals that may be sensitive
    return sql.replace(/\b\d{4,}\b/g, '?');
  }
}
SLO-based Alerts
Alert based on Service Level Objectives
Anomaly Detection
ML-powered anomaly detection
Multi-channel Alerts
Slack, PagerDuty, email integration
Alert Grouping
Intelligent alert correlation
# AI creates Prometheus alert rules
"Generate alert rules for:
- SLO violations
- Error budget consumption
- Resource exhaustion
- Security incidents
- Business metrics"

groups:
  - name: slo_alerts
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
          runbook_url: "https://wiki.company.com/runbooks/high-error-rate"

      - alert: SLOBudgetBurn
        expr: |
          (
            1 - (
              sum(rate(http_requests_total{status_code!~"5.."}[1h]))
              /
              sum(rate(http_requests_total[1h]))
            )
          ) > (1 - 0.999) * 14.4
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "SLO error budget burn rate too high"
          description: "Error ratio over the last hour is {{ $value | humanizePercentage }}, above the 14.4x budget burn threshold"
      - alert: HighMemoryUsage
        expr: |
          (
            container_memory_usage_bytes{pod=~"api-.*"}
            /
            container_spec_memory_limit_bytes{pod=~"api-.*"}
          ) > 0.9
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Pod {{ $labels.pod }} memory usage above 90%"
          description: "Memory usage is {{ $value | humanizePercentage }}"

  - name: business_alerts
    rules:
      - alert: PaymentFailureRate
        expr: |
          (
            sum(rate(payment_failures_total[5m]))
            /
            sum(rate(payment_attempts_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: critical
          team: payments
        annotations:
          summary: "High payment failure rate"
          description: "Payment failure rate is {{ $value | humanizePercentage }}"
          impact: "Likely revenue loss; {{ $value | humanizePercentage }} of payment attempts are failing"
// AI creates incident response system
"Build incident response automation:
- Automatic incident creation
- Runbook execution
- Status page updates
- Post-mortem generation
- Root cause analysis"

export class IncidentManager {
  async handleAlert(alert: Alert) {
    // Create incident
    const incident = await this.createIncident({
      title: alert.annotations.summary,
      severity: alert.labels.severity,
      team: alert.labels.team,
      alertName: alert.alertname,
      startTime: new Date()
    });

    // Execute runbook if available
    if (alert.annotations.runbook_url) {
      await this.executeRunbook(alert.annotations.runbook_url, incident);
    }

    // Notify team
    await this.notifyTeam(incident);

    // Update status page
    await this.updateStatusPage({
      component: this.getAffectedComponent(alert),
      status: 'degraded',
      message: alert.annotations.summary
    });

    // Start diagnostic collection
    await this.collectDiagnostics(incident);
  }

  private async executeRunbook(runbookUrl: string, incident: Incident) {
    const runbook = await this.fetchRunbook(runbookUrl);

    for (const step of runbook.steps) {
      try {
        const result = await this.executeStep(step);

        await this.addIncidentNote(incident, {
          type: 'runbook_step',
          step: step.name,
          result: result,
          timestamp: new Date()
        });

        if (result.resolved) {
          await this.resolveIncident(incident, {
            resolution: 'Automated runbook resolution',
            steps: runbook.steps
          });
          break;
        }
      } catch (error) {
        await this.addIncidentNote(incident, {
          type: 'runbook_error',
          step: step.name,
          error: error.message
        });
      }
    }
  }
}
// AI implements performance monitoring
"Create performance monitoring for:
- Response time tracking
- Database query performance
- Cache hit rates
- Resource utilization
- User experience metrics"

export class PerformanceMonitor {
  private metrics = new MetricsCollector();
  private logger = LoggerFactory.createLogger('performance-monitor');

  async monitorEndpoint(
    handler: RequestHandler
  ): Promise<RequestHandler> {
    return async (req, res, next) => {
      const timer = this.metrics.startTimer('http_request_duration');
      const startMemory = process.memoryUsage();
      const startCpu = process.cpuUsage();

      // Track database queries
      const queryTracker = this.trackDatabaseQueries();

      try {
        await handler(req, res, next);
      } finally {
        const duration = timer.end();
        const endMemory = process.memoryUsage();
        const endCpu = process.cpuUsage();

        // Record metrics
        this.metrics.record({
          endpoint: req.path,
          method: req.method,
          statusCode: res.statusCode,
          duration,
          memoryDelta: endMemory.heapUsed - startMemory.heapUsed,
          cpuUser: endCpu.user - startCpu.user,
          cpuSystem: endCpu.system - startCpu.system,
          dbQueries: queryTracker.getQueries(),
          dbDuration: queryTracker.getTotalDuration()
        });

        // Check for performance issues
        if (duration > 1000) {
          this.logger.warn('Slow endpoint detected', {
            endpoint: req.path,
            duration,
            queries: queryTracker.getSlowQueries()
          });
        }
      }
    };
  }

  private trackDatabaseQueries() {
    const queries: QueryInfo[] = [];

    // Hook into database driver
    const originalQuery = db.query;
    db.query = async function(...args) {
      const start = Date.now();
      const result = await originalQuery.apply(this, args);
      const duration = Date.now() - start;

      queries.push({
        sql: args[0],
        duration,
        rows: result.rowCount
      });

      return result;
    };

    return {
      getQueries: () => queries,
      getTotalDuration: () => queries.reduce((sum, q) => sum + q.duration, 0),
      getSlowQueries: () => queries.filter(q => q.duration > 100)
    };
  }
}
Instrument Early
Add monitoring from the start, not as an afterthought
Alert on Symptoms
Alert on user-facing issues, not just technical metrics
Context is King
Include relevant context in logs and traces
Automate Response
Automate common incident responses
Migration Strategies
Migrating legacy systems to modern monitoring
Architecture Patterns
Designing for observability from the start
Advanced DevOps
Integrating monitoring with CI/CD