Monitoring and Observability with AI

Effective monitoring and observability are crucial for maintaining reliable systems. This lesson demonstrates how Cursor IDE’s AI capabilities help you implement comprehensive monitoring solutions, from basic metrics to advanced distributed tracing.

Modern applications require monitoring across all three pillars of observability: metrics, logs, and traces. AI assistance makes implementing these systems far more approachable, helping you build production-grade observability from day one.

Configure monitoring MCP servers:

~/.cursor/mcp.json
{
  "mcpServers": {
    "sentry": {
      "command": "npx",
      "args": ["-y", "sentry-mcp"],
      "env": {
        "SENTRY_AUTH_TOKEN": "${SENTRY_TOKEN}",
        "SENTRY_ORG": "your-org"
      }
    },
    "grafana": {
      "command": "npx",
      "args": ["-y", "grafana-mcp"],
      "env": {
        "GRAFANA_URL": "https://grafana.company.com",
        "GRAFANA_API_KEY": "${GRAFANA_KEY}"
      }
    },
    "sonarqube": {
      "command": "npx",
      "args": ["-y", "sonarqube-mcp-server"],
      "env": {
        "SONAR_HOST_URL": "https://sonar.company.com",
        "SONAR_TOKEN": "${SONAR_TOKEN}"
      }
    }
  }
}

Metrics Collection

AI generates metric instrumentation and dashboards

Log Aggregation

AI creates structured logging and parsing rules

Distributed Tracing

AI implements tracing across microservices

Alerting Rules

AI creates intelligent alerting strategies

  1. Application Instrumentation

    // Ask AI to instrument application
    "Add Prometheus metrics to Express API:
    - HTTP request duration histogram
    - Request counter by status code
    - Active connections gauge
    - Custom business metrics
    - Error rate tracking"

    // AI generates instrumentation
    // (see the wiring sketch after this list for registering the middleware and /metrics endpoint)
    import { register, Counter, Histogram, Gauge } from 'prom-client';
    import express from 'express';

    // Define metrics
    const httpRequestDuration = new Histogram({
      name: 'http_request_duration_seconds',
      help: 'Duration of HTTP requests in seconds',
      labelNames: ['method', 'route', 'status_code'],
      buckets: [0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10]
    });

    const httpRequestTotal = new Counter({
      name: 'http_requests_total',
      help: 'Total number of HTTP requests',
      labelNames: ['method', 'route', 'status_code']
    });

    const activeConnections = new Gauge({
      name: 'active_connections',
      help: 'Number of active connections'
    });

    // Middleware to track metrics
    export const metricsMiddleware = (req: express.Request, res: express.Response, next: express.NextFunction) => {
      const start = Date.now();

      // Track active connections
      activeConnections.inc();

      res.on('finish', () => {
        const duration = (Date.now() - start) / 1000;
        const route = req.route?.path || 'unknown';
        const labels = {
          method: req.method,
          route,
          status_code: res.statusCode.toString()
        };

        httpRequestDuration.observe(labels, duration);
        httpRequestTotal.inc(labels);
        activeConnections.dec();
      });

      next();
    };

    // Metrics endpoint
    export const metricsEndpoint = async (req: express.Request, res: express.Response) => {
      res.set('Content-Type', register.contentType);
      res.send(await register.metrics());
    };
  2. Custom Business Metrics

    // AI implements business metrics
    "Create custom metrics for e-commerce:
    - Order processing time
    - Payment success rate
    - Cart abandonment rate
    - Product view tracking
    - Revenue metrics"

    // AI generates business metrics
    const orderProcessingTime = new Histogram({
      name: 'order_processing_duration_seconds',
      help: 'Time to process orders',
      labelNames: ['payment_method', 'shipping_type'],
      buckets: [1, 5, 10, 30, 60, 120, 300]
    });

    const paymentSuccessRate = new Gauge({
      name: 'payment_success_rate',
      help: 'Percentage of successful payments',
      labelNames: ['payment_provider']
    });

    const cartAbandonment = new Counter({
      name: 'cart_abandonment_total',
      help: 'Number of abandoned carts',
      labelNames: ['reason']
    });

    const revenue = new Counter({
      name: 'revenue_total',
      help: 'Total revenue',
      labelNames: ['currency', 'product_category']
    });

    // Usage in business logic
    export class OrderService {
      async processOrder(order: Order): Promise<ProcessedOrder> {
        const timer = orderProcessingTime.startTimer({
          payment_method: order.paymentMethod,
          shipping_type: order.shippingType
        });

        try {
          const result = await this.processPayment(order);

          if (result.success) {
            revenue.inc({
              currency: order.currency,
              product_category: order.category
            }, order.total);

            this.updatePaymentSuccessRate(order.paymentProvider, true);
          } else {
            this.updatePaymentSuccessRate(order.paymentProvider, false);
          }

          return result;
        } finally {
          timer();
        }
      }
    }
  3. Grafana Dashboard Creation

    // AI creates Grafana dashboard
    "Generate Grafana dashboard for:
    - Service health overview
    - Request rate and latency
    - Error tracking
    - Business metrics
    - SLA compliance"

    // AI provides dashboard JSON
    {
      "dashboard": {
        "title": "Service Health Dashboard",
        "panels": [
          {
            "title": "Request Rate",
            "targets": [
              {
                "expr": "sum(rate(http_requests_total[5m])) by (service)",
                "legendFormat": "{{service}}"
              }
            ],
            "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
          },
          {
            "title": "P95 Latency",
            "targets": [
              {
                "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le))",
                "legendFormat": "{{service}}"
              }
            ],
            "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
          },
          {
            "title": "Error Rate",
            "targets": [
              {
                "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
                "legendFormat": "Error %"
              }
            ],
            "alert": {
              "conditions": [
                {
                  "evaluator": { "params": [5], "type": "gt" },
                  "operator": { "type": "and" },
                  "query": { "params": ["A", "5m", "now"] },
                  "reducer": { "params": [], "type": "avg" },
                  "type": "query"
                }
              ],
              "name": "High Error Rate"
            },
            "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }
          }
        ]
      }
    }
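
The instrumentation from step 1 still has to be wired into the application. Below is a minimal wiring sketch, assuming the middleware and endpoint above are exported from a local ./metrics module (the module path and port 3000 are illustrative):

// app.ts - register the Prometheus instrumentation from step 1
import express from 'express';
import { metricsMiddleware, metricsEndpoint } from './metrics'; // hypothetical local module

const app = express();

// Record duration, request counts, and active connections for every request
app.use(metricsMiddleware);

// Expose metrics for Prometheus to scrape
app.get('/metrics', metricsEndpoint);

app.get('/health', (_req, res) => res.json({ status: 'ok' }));

app.listen(3000, () => console.log('API listening on :3000'));

Prometheus then scrapes the /metrics path of this service to populate the dashboards above.
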
  1. Real-time Issue Investigation with Sentry MCP

    "Using Sentry MCP, show me:
    - Recent errors in production
    - Error trends over last 24 hours
    - Most affected users
    - Stack traces for PaymentError"
    // Direct issue resolution
    "Using Sentry MCP:
    - Find all instances of 'connection timeout'
    - Group by service
    - Show resolution status
    - Create issue for unresolved errors"
  2. Dashboard Management with Grafana MCP

    "Using Grafana MCP:
    - Search dashboards for 'API performance'
    - Run query: rate(http_requests_total[5m])
    - Create alert for response time > 1s
    - Export dashboard JSON for backup"
    // Query optimization
    "Using Grafana MCP, optimize this PromQL query:
    sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
    Make it more efficient for large datasets"
  3. Code Quality with SonarQube MCP

    "Using SonarQube MCP:
    - Get quality gate status for main branch
    - List critical security hotspots
    - Show code coverage trends
    - Find duplicated code blocks"
    // Pre-deployment checks
    "Using SonarQube MCP, verify:
    - No new critical issues
    - Code coverage > 80%
    - No security vulnerabilities
    - Technical debt ratio < 5%"
  4. Performance Monitoring with Dynatrace MCP

    "Using Dynatrace MCP:
    - Show current problem feed
    - Get service flow for checkout process
    - Analyze database query performance
    - Find memory leak indicators"
  5. Unified Monitoring Workflows

    "Coordinate monitoring tools:
    1. When Sentry reports error spike
    2. Check Grafana metrics for correlation
    3. Review SonarQube for recent changes
    4. Analyze Dynatrace for root cause"
// Direct query from IDE
"Using Sentry MCP, find all errors:
- Related to payment processing
- In last 6 hours
- Affecting > 10 users
Create Jira ticket for each"
// Instant access to data
// No context switching
// Automated workflows

// AI implements OpenTelemetry
"Set up OpenTelemetry with:
- Automatic instrumentation
- Custom spans
- Context propagation
- Multiple exporters
- Sampling strategies"

import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
import { PrometheusExporter } from '@opentelemetry/exporter-prometheus';

// Configure SDK
const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'api-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: process.env.VERSION || '1.0.0',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development'
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-fs': {
        enabled: false // Disable noisy fs instrumentation
      }
    })
  ],
  traceExporter: new JaegerExporter({
    endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces'
  }),
  // PrometheusExporter acts as a MetricReader in recent SDK versions
  metricReader: new PrometheusExporter({
    port: 9090
  })
});

// Initialize SDK
sdk.start();

// Custom instrumentation
import { trace, context, SpanStatusCode } from '@opentelemetry/api';

const tracer = trace.getTracer('api-service');

export async function processUserRequest(userId: string, request: any) {
  const span = tracer.startSpan('process_user_request', {
    attributes: {
      'user.id': userId,
      'request.type': request.type
    }
  });

  try {
    // Add events
    span.addEvent('validation_started');
    await validateRequest(request);
    span.addEvent('validation_completed');

    // Create child span within the parent span's context
    const childSpan = tracer.startSpan(
      'database_operation',
      undefined,
      trace.setSpan(context.active(), span)
    );

    try {
      const result = await databaseOperation(userId, request);
      childSpan.setStatus({ code: SpanStatusCode.OK });
      return result;
    } finally {
      childSpan.end();
    }
  } catch (error) {
    span.recordException(error as Error);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: (error as Error).message
    });
    throw error;
  } finally {
    span.end();
  }
}
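
The prompt above also asks for sampling strategies, which the generated setup does not show. A minimal sketch, assuming the parent-based ratio sampler exported by @opentelemetry/sdk-trace-node (the 10% ratio is an illustrative value):

import { NodeSDK } from '@opentelemetry/sdk-node';
import { ParentBasedSampler, TraceIdRatioBasedSampler } from '@opentelemetry/sdk-trace-node';

// Sample 10% of root traces; child spans follow their parent's sampling decision
const sampler = new ParentBasedSampler({
  root: new TraceIdRatioBasedSampler(0.1)
});

const sampledSdk = new NodeSDK({
  sampler,
  // ...resource, instrumentations, and exporters as configured above
});
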
// AI creates structured logging
"Implement structured logging with:
- JSON format
- Correlation IDs
- Context propagation
- Log levels
- Sensitive data masking"

import winston from 'winston';
import { v4 as uuidv4 } from 'uuid';
import { AsyncLocalStorage } from 'node:async_hooks';
import { Request, Response, NextFunction } from 'express';

// Request-scoped context carried through async calls
interface Context {
  requestId: string;
  userId?: string;
  sessionId?: string;
  userAgent?: string;
  ip?: string;
}

// Create logger factory
export class LoggerFactory {
  static createLogger(service: string) {
    return winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      defaultMeta: {
        service,
        environment: process.env.NODE_ENV,
        version: process.env.VERSION
      },
      transports: [
        new winston.transports.Console({
          format: winston.format.combine(
            winston.format.colorize(),
            winston.format.simple()
          )
        })
      ]
    });
  }
}

// Request context middleware
export class RequestContext {
  private static storage = new AsyncLocalStorage<Context>();

  static middleware() {
    return (req: Request, res: Response, next: NextFunction) => {
      const context: Context = {
        requestId: (req.headers['x-request-id'] as string) || uuidv4(),
        userId: req.user?.id,       // provided by auth middleware type augmentation
        sessionId: req.session?.id, // provided by session middleware type augmentation
        userAgent: req.headers['user-agent'],
        ip: req.ip
      };

      RequestContext.storage.run(context, () => {
        req.context = context; // assumes Express Request is augmented with a `context` field
        next();
      });
    };
  }

  static getContext(): Context | undefined {
    return this.storage.getStore();
  }
}

// Enhanced logger with context
export class ContextualLogger {
  constructor(private logger: winston.Logger) {}

  private enrichLog(level: string, message: string, meta?: any) {
    const context = RequestContext.getContext();

    return this.logger.log({
      level,
      message,
      ...context,
      ...this.sanitizeMeta(meta)
    });
  }

  private sanitizeMeta(meta: any): any {
    if (!meta) return {};

    // AI implements sensitive data masking
    const sensitive = ['password', 'token', 'apiKey', 'secret', 'creditCard'];
    const sanitized = { ...meta };

    for (const key of Object.keys(sanitized)) {
      if (sensitive.some(s => key.toLowerCase().includes(s.toLowerCase()))) {
        sanitized[key] = '[REDACTED]';
      }
    }

    return sanitized;
  }

  info(message: string, meta?: any) {
    this.enrichLog('info', message, meta);
  }

  error(message: string, error?: Error, meta?: any) {
    this.enrichLog('error', message, {
      ...meta,
      error: {
        message: error?.message,
        stack: error?.stack,
        name: error?.name
      }
    });
  }
}
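
A short usage sketch tying the classes above together, assuming an Express app; the service name, route, and metadata are illustrative:

import express from 'express';

const app = express();
app.use(express.json());
app.use(RequestContext.middleware());

const logger = new ContextualLogger(LoggerFactory.createLogger('checkout-service'));

app.post('/orders', (req, res) => {
  // requestId, userId, etc. are attached automatically from the request context,
  // and the apiKey field is written out as [REDACTED] by the masking logic
  logger.info('Order received', { items: 3, apiKey: 'sk-live-123' });
  res.status(202).json({ accepted: true });
});
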
# AI creates ELK configuration
"Set up ELK stack with:
- Filebeat for collection
- Logstash for processing
- Elasticsearch for storage
- Kibana for visualization"

# filebeat.yml
filebeat.inputs:
  - type: container
    paths:
      - '/var/lib/docker/containers/*/*.log'
    processors:
      - add_docker_metadata:
          host: "unix:///var/run/docker.sock"
      - decode_json_fields:
          fields: ["message"]
          target: "json"
          overwrite_keys: true
      - drop_event:
          when:
            or:
              - equals:
                  json.level: "debug"
              - contains:
                  json.path: "/health"

output.logstash:
  hosts: ["logstash:5044"]
  ssl.certificate_authorities: ["/etc/ca.crt"]
  ssl.certificate: "/etc/client.crt"
  ssl.key: "/etc/client.key"

# logstash.conf
input {
  beats {
    port => 5044
    ssl => true
    ssl_certificate => "/etc/logstash/server.crt"
    ssl_key => "/etc/logstash/server.key"
  }
}

filter {
  # Parse timestamp
  date {
    match => [ "[json][timestamp]", "ISO8601" ]
    target => "@timestamp"
  }

  # Add GeoIP data
  geoip {
    source => "[json][ip]"
    target => "geoip"
  }

  # Extract fields
  mutate {
    add_field => {
      "service" => "%{[json][service]}"
      "environment" => "%{[json][environment]}"
      "request_id" => "%{[json][requestId]}"
    }
  }

  # Parse user agent
  useragent {
    source => "[json][userAgent]"
    target => "user_agent"
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "logs-%{[service]}-%{+YYYY.MM.dd}"
    template_name => "logs"
    template => "/etc/logstash/templates/logs.json"
  }
}
// AI implements distributed tracing
"Create distributed tracing for microservices:
- Trace context propagation
- Service mesh integration
- Database query tracing
- External API tracing
- Performance analysis"

import { Tracer, trace, context, SpanStatusCode, defaultTextMapGetter, defaultTextMapSetter } from '@opentelemetry/api';
import { W3CTraceContextPropagator } from '@opentelemetry/core';
import { Request, Response } from 'express';
import { Pool } from 'pg';

export class TracingService {
  private tracer: Tracer = trace.getTracer('api-service');
  private propagator = new W3CTraceContextPropagator();

  // callDownstreamService is an app-specific HTTP client wrapper, injected here
  constructor(private callDownstreamService: (payload: any) => Promise<any>) {}

  async handleRequest(req: Request, res: Response) {
    // Extract parent context from the incoming W3C trace headers
    const parentContext = this.propagator.extract(
      context.active(),
      req.headers,
      defaultTextMapGetter
    );

    // Start a new span as a child of the extracted context
    const span = this.tracer.startSpan('http.request', {
      attributes: {
        'http.method': req.method,
        'http.url': req.url,
        'http.target': req.path,
        'http.host': req.hostname,
        'http.scheme': req.protocol,
        'http.user_agent': req.headers['user-agent']
      }
    }, parentContext);

    // Propagate to downstream services
    const headers: Record<string, string> = {};
    this.propagator.inject(
      trace.setSpan(context.active(), span),
      headers,
      defaultTextMapSetter
    );

    try {
      // Make downstream call with the propagated headers
      const requestData = req.body; // app-specific payload forwarded downstream
      const response = await this.callDownstreamService({
        headers,
        ...requestData
      });

      span.setAttributes({
        'http.status_code': response.statusCode,
        'http.response_content_length': response.contentLength
      });

      return response;
    } catch (error) {
      span.recordException(error as Error);
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: (error as Error).message
      });
      throw error;
    } finally {
      span.end();
    }
  }
}

// Database tracing
export class DatabaseTracer {
  private tracer: Tracer = trace.getTracer('api-service');

  constructor(private pool: Pool) {} // assumes a pg connection pool

  async query(sql: string, params: any[]) {
    const span = this.tracer.startSpan('db.query', {
      attributes: {
        'db.system': 'postgresql',
        'db.statement': this.sanitizeSql(sql),
        'db.operation': this.extractOperation(sql)
      }
    });

    try {
      const result = await this.pool.query(sql, params);
      span.setAttributes({
        'db.rows_affected': result.rowCount ?? 0
      });
      return result;
    } catch (error) {
      span.recordException(error as Error);
      throw error;
    } finally {
      span.end();
    }
  }

  private extractOperation(sql: string): string {
    // The first keyword (SELECT, INSERT, UPDATE, ...) identifies the operation
    return sql.trim().split(/\s+/)[0].toUpperCase();
  }

  private sanitizeSql(sql: string): string {
    // Remove sensitive data
    return sql.replace(/\b\d{4,}\b/g, '?');
  }
}
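
The prompt also lists external API tracing, which the generated code does not cover. A minimal sketch, assuming Node 18+ global fetch and the same W3C propagator; the span name and attributes follow common HTTP client conventions:

import { trace, context, SpanKind, SpanStatusCode, defaultTextMapSetter } from '@opentelemetry/api';
import { W3CTraceContextPropagator } from '@opentelemetry/core';

const clientTracer = trace.getTracer('api-service');
const clientPropagator = new W3CTraceContextPropagator();

// Wraps an outgoing HTTP call in a CLIENT span and propagates trace headers
export async function tracedFetch(url: string, init: RequestInit = {}): Promise<Response> {
  const span = clientTracer.startSpan('http.client.request', {
    kind: SpanKind.CLIENT,
    attributes: { 'http.url': url, 'http.method': init.method || 'GET' }
  });

  // Inject traceparent/tracestate into the outgoing headers (assumes plain-object headers)
  const headers: Record<string, string> = { ...(init.headers as Record<string, string> | undefined) };
  clientPropagator.inject(trace.setSpan(context.active(), span), headers, defaultTextMapSetter);

  try {
    const res = await fetch(url, { ...init, headers });
    span.setAttributes({ 'http.status_code': res.status });
    if (res.status >= 500) span.setStatus({ code: SpanStatusCode.ERROR });
    return res;
  } catch (error) {
    span.recordException(error as Error);
    span.setStatus({ code: SpanStatusCode.ERROR, message: (error as Error).message });
    throw error;
  } finally {
    span.end();
  }
}
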

SLO-based Alerts

Alert based on Service Level Objectives

Anomaly Detection

ML-powered anomaly detection

Multi-channel Alerts

Slack, PagerDuty, email integration

Alert Grouping

Intelligent alert correlation

# AI creates Prometheus alert rules
"Generate alert rules for:
- SLO violations
- Error budget consumption
- Resource exhaustion
- Security incidents
- Business metrics"

groups:
  - name: slo_alerts
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
          runbook_url: "https://wiki.company.com/runbooks/high-error-rate"

      - alert: SLOBudgetBurn
        expr: |
          (
            1 - (
              sum(rate(http_requests_total{status_code!~"5.."}[1h]))
              /
              sum(rate(http_requests_total[1h]))
            )
          ) > (1 - 0.999) * 14.4
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "SLO error budget burn rate too high"
          description: "At the current burn rate (>14.4x), the monthly error budget will be exhausted in roughly two days instead of a month"

      - alert: HighMemoryUsage
        expr: |
          (
            container_memory_usage_bytes{pod=~"api-.*"}
            /
            container_spec_memory_limit_bytes{pod=~"api-.*"}
          ) > 0.9
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Pod {{ $labels.pod }} memory usage above 90%"
          description: "Memory usage is {{ $value | humanizePercentage }}"

  - name: business_alerts
    rules:
      - alert: PaymentFailureRate
        expr: |
          (
            sum(rate(payment_failures_total[5m]))
            /
            sum(rate(payment_attempts_total[5m]))
          ) > 0.1
        for: 5m
        labels:
          severity: critical
          team: payments
        annotations:
          summary: "High payment failure rate"
          description: "Payment failure rate is {{ $value | humanizePercentage }}"
          impact: "Direct revenue impact: {{ $value | humanizePercentage }} of payment attempts are failing"
// AI creates incident response system
"Build incident response automation:
- Automatic incident creation
- Runbook execution
- Status page updates
- Post-mortem generation
- Root cause analysis"

// Shape of a single alert as delivered by Alertmanager's webhook
interface Alert {
  labels: Record<string, string>;       // alertname, severity, team, ...
  annotations: Record<string, string>;  // summary, runbook_url, ...
}

interface Incident {
  id: string;
  title: string;
  severity: string;
}

// createIncident, notifyTeam, updateStatusPage, etc. wrap whichever
// incident-management and status-page tooling is in use
export class IncidentManager {
  async handleAlert(alert: Alert) {
    // Create incident
    const incident = await this.createIncident({
      title: alert.annotations.summary,
      severity: alert.labels.severity,
      team: alert.labels.team,
      alertName: alert.labels.alertname,
      startTime: new Date()
    });

    // Execute runbook if available
    if (alert.annotations.runbook_url) {
      await this.executeRunbook(alert.annotations.runbook_url, incident);
    }

    // Notify team
    await this.notifyTeam(incident);

    // Update status page
    await this.updateStatusPage({
      component: this.getAffectedComponent(alert),
      status: 'degraded',
      message: alert.annotations.summary
    });

    // Start diagnostic collection
    await this.collectDiagnostics(incident);
  }

  private async executeRunbook(runbookUrl: string, incident: Incident) {
    const runbook = await this.fetchRunbook(runbookUrl);

    for (const step of runbook.steps) {
      try {
        const result = await this.executeStep(step);

        await this.addIncidentNote(incident, {
          type: 'runbook_step',
          step: step.name,
          result: result,
          timestamp: new Date()
        });

        if (result.resolved) {
          await this.resolveIncident(incident, {
            resolution: 'Automated runbook resolution',
            steps: runbook.steps
          });
          break;
        }
      } catch (error) {
        await this.addIncidentNote(incident, {
          type: 'runbook_error',
          step: step.name,
          error: (error as Error).message
        });
      }
    }
  }
}
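
To feed alerts into IncidentManager, Alertmanager can be pointed at a small webhook receiver. A minimal sketch, assuming an Express app; the route and port are illustrative, and the body shape follows Alertmanager's webhook format ({ status, alerts: [{ labels, annotations, ... }] }):

import express from 'express';

const webhookApp = express();
webhookApp.use(express.json());

const incidents = new IncidentManager();

// Alertmanager webhook receiver
webhookApp.post('/webhooks/alertmanager', async (req, res) => {
  const { status, alerts = [] } = req.body;

  if (status === 'firing') {
    // Open an incident for every firing alert in the group
    await Promise.all(alerts.map((alert: any) => incidents.handleAlert(alert)));
  }

  res.status(202).json({ received: alerts.length });
});

webhookApp.listen(8080, () => console.log('Alert webhook listening on :8080'));
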
// AI implements performance monitoring
"Create performance monitoring for:
- Response time tracking
- Database query performance
- Cache hit rates
- Resource utilization
- User experience metrics"

import { RequestHandler } from 'express';

interface QueryInfo {
  sql: string;
  duration: number;
  rows: number;
}

// MetricsCollector, db, and this.logger are app-specific facades
// (for example, built on prom-client and the structured logger above)
export class PerformanceMonitor {
  private metrics = new MetricsCollector();

  monitorEndpoint(handler: RequestHandler): RequestHandler {
    return async (req, res, next) => {
      const timer = this.metrics.startTimer('http_request_duration');
      const startMemory = process.memoryUsage();
      const startCpu = process.cpuUsage();

      // Track database queries
      const queryTracker = this.trackDatabaseQueries();

      try {
        await handler(req, res, next);
      } finally {
        const duration = timer.end();
        const endMemory = process.memoryUsage();
        const endCpu = process.cpuUsage();

        // Record metrics
        this.metrics.record({
          endpoint: req.path,
          method: req.method,
          statusCode: res.statusCode,
          duration,
          memoryDelta: endMemory.heapUsed - startMemory.heapUsed,
          cpuUser: endCpu.user - startCpu.user,
          cpuSystem: endCpu.system - startCpu.system,
          dbQueries: queryTracker.getQueries(),
          dbDuration: queryTracker.getTotalDuration()
        });

        // Check for performance issues
        if (duration > 1000) {
          this.logger.warn('Slow endpoint detected', {
            endpoint: req.path,
            duration,
            queries: queryTracker.getSlowQueries()
          });
        }
      }
    };
  }

  private trackDatabaseQueries() {
    const queries: QueryInfo[] = [];

    // Hook into the database driver (simplified: the original query function is not restored)
    const originalQuery = db.query;
    db.query = async function (...args) {
      const start = Date.now();
      const result = await originalQuery.apply(this, args);
      const duration = Date.now() - start;

      queries.push({
        sql: args[0],
        duration,
        rows: result.rowCount
      });

      return result;
    };

    return {
      getQueries: () => queries,
      getTotalDuration: () => queries.reduce((sum, q) => sum + q.duration, 0),
      getSlowQueries: () => queries.filter(q => q.duration > 100)
    };
  }
}
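
A short usage sketch for wrapping a route handler with the monitor, assuming an Express app; the route and handler are illustrative:

import express from 'express';

const app = express();
const monitor = new PerformanceMonitor();

const getOrderHandler: express.RequestHandler = async (req, res) => {
  // ... load the order from the database ...
  res.json({ id: req.params.id, status: 'shipped' });
};

// Every call through the wrapper records duration, CPU/memory deltas, and DB query stats
app.get('/orders/:id', monitor.monitorEndpoint(getOrderHandler));
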
  1. Set up Prometheus metrics collection
  2. Implement structured logging
  3. Add distributed tracing
  4. Create Grafana dashboards
  5. Configure alerting rules

  1. Identify slow endpoints
  2. Analyze database queries
  3. Find memory leaks
  4. Optimize resource usage
  5. Create performance dashboard

  1. Create alert rules for SLOs
  2. Set up alert routing
  3. Implement runbook automation
  4. Build status page integration
  5. Generate post-mortem reports

Instrument Early

Add monitoring from the start, not as an afterthought

Alert on Symptoms

Alert on user-facing issues, not just technical metrics

Context is King

Include relevant context in logs and traces

Automate Response

Automate common incident responses

Migration Strategies

Migrating legacy systems to modern monitoring

Architecture Patterns

Designing for observability from the start

Advanced DevOps

Integrating monitoring with CI/CD