
Monitoring & Logging Patterns

Master observability with Cursor IDE and Claude Code. This guide covers Prometheus and Grafana setup, ELK stack configuration, distributed tracing, alerting strategies, and production monitoring patterns with AI assistance.

  1. Initialize Monitoring Stack

    Terminal window
    # Generate monitoring configuration
    Agent: "Create monitoring stack with:
    - Prometheus for metrics
    - Grafana dashboards
    - AlertManager rules
    - Service discovery
    - Log aggregation"
  2. Install Monitoring MCP Servers (optional)

    Terminal window
    # Grafana
    claude mcp add grafana -- npx -y grafana-mcp
    # Sentry
    claude mcp add sentry -- npx -y sentry-mcp
    # Elasticsearch
    claude mcp add es -- npx -y elasticsearch-mcp
  3. Configure AI Rules

    # .cursorrules or CLAUDE.md
    Monitoring best practices:
    - Use structured logging
    - Define SLIs and SLOs
    - Implement proper cardinality control
    - Use meaningful metric names
    - Configure retention policies
    - Follow security best practices
    - Implement cost optimization
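
As a concrete illustration of the structured-logging and cardinality rules above, the sketch below configures a JSON logger for a Node.js service. It assumes the pino library; the field names, redaction paths, and environment variables are illustrative choices, not output generated by the agent.

structured-logging.js
// A minimal sketch of the "structured logging" rule above.
// Assumes a Node.js service and the pino logger; field names are illustrative.
const pino = require('pino');

const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  // Stable, low-cardinality base fields attached to every log line
  base: {
    service: process.env.SERVICE_NAME || 'api-service',
    environment: process.env.ENVIRONMENT || 'production',
    version: process.env.SERVICE_VERSION || '1.0.0',
  },
  // Redact obvious secrets before they reach the log pipeline
  redact: {
    paths: ['req.headers.authorization', 'password', 'creditCard'],
    censor: '[REDACTED]',
  },
  formatters: {
    level(label) {
      return { level: label }; // "info" instead of a numeric level
    },
  },
});

// Per-request child loggers carry correlation IDs without repeating them at every call site
function requestLogger(req) {
  return logger.child({
    request_id: req.headers['x-request-id'],
    trace_id: req.headers['traceparent'], // correlate with distributed traces
  });
}

module.exports = { logger, requestLogger };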
# AI Prompt
Agent: "Create Prometheus configuration with:
- Service discovery for Kubernetes
- Recording rules for performance
- Alert rules for SLOs
- Remote write configuration
- Federation setup"
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-east-1'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules
rule_files:
  - "rules/*.yml"
  - "alerts/*.yml"

# Remote write for long-term storage
remote_write:
  - url: "http://cortex:9009/api/v1/push"
    queue_config:
      max_samples_per_send: 10000
      batch_send_deadline: 5s
      min_backoff: 30ms
      max_backoff: 100ms
    write_relabel_configs:
      - source_labels: [__name__]
        regex: '(go_|process_).*'
        action: drop

# Service discovery configurations
scrape_configs:
  # Kubernetes service discovery
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  # Node metrics
  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  # Pod metrics
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

  # Service endpoints
  - job_name: 'kubernetes-service-endpoints'
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

  # Blackbox exporter for endpoint monitoring
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://api.example.com
          - https://app.example.com
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115
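
The scrape jobs above assume each pod exposes a /metrics endpoint. The sketch below shows one way to do that from a Node.js service with Express and prom-client; the metric names (http_requests_total, http_request_duration_seconds) are chosen to match the recording rules and alerts that follow, while the port and label set are assumptions.

metrics.js
// A sketch of the app-side /metrics endpoint the scrape configs above expect.
// Assumes Express and prom-client; metric names match the recording rules below.
const express = require('express');
const client = require('prom-client');

const register = new client.Registry();
client.collectDefaultMetrics({ register });

const httpRequests = new client.Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
  registers: [register],
});

const httpDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request latency in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
  registers: [register],
});

const app = express();

// Record one counter increment and one histogram observation per request
app.use((req, res, next) => {
  const end = httpDuration.startTimer();
  res.on('finish', () => {
    const labels = { method: req.method, route: req.route?.path || req.path, status: res.statusCode };
    httpRequests.inc(labels);
    end(labels);
  });
  next();
});

app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});

app.listen(8080);

Pods running this service would carry the prometheus.io/scrape: "true" annotation so the kubernetes-pods job above picks them up.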
rules/recording.yml
# AI Prompt: "Create Prometheus recording rules for performance"
groups:
- name: api_performance
interval: 30s
rules:
# Request rate
- record: instance:api_http_requests:rate5m
expr: |
sum by (instance, job, method, status) (
rate(http_requests_total[5m])
)
# Error rate
- record: job:api_http_errors:rate5m
expr: |
sum by (job) (
rate(http_requests_total{status=~"5.."}[5m])
)
# P95 latency
- record: job:api_http_latency_seconds:p95_5m
expr: |
histogram_quantile(0.95,
sum by (job, le) (
rate(http_request_duration_seconds_bucket[5m])
)
)
# P99 latency
- record: job:api_http_latency_seconds:p99_5m
expr: |
histogram_quantile(0.99,
sum by (job, le) (
rate(http_request_duration_seconds_bucket[5m])
)
)
- name: resource_utilization
interval: 30s
rules:
# CPU utilization
- record: instance:node_cpu:rate5m
expr: |
100 - (avg by (instance) (
irate(node_cpu_seconds_total{mode="idle"}[5m])
) * 100)
# Memory utilization
- record: instance:node_memory_utilization:ratio
expr: |
1 - (
node_memory_MemAvailable_bytes /
node_memory_MemTotal_bytes
)
# Disk utilization
- record: instance:node_filesystem_utilization:ratio
expr: |
1 - (
node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} /
node_filesystem_size_bytes{fstype!~"tmpfs|fuse.lxcfs"}
)
- name: business_metrics
interval: 1m
rules:
# Revenue per minute
- record: business:revenue_usd:rate1m
expr: |
sum(increase(order_total_usd[1m]))
# Active users
- record: business:active_users:gauge
expr: |
count(
sum by (user_id) (
rate(api_requests_by_user[5m])
) > 0
)
alerts/slo.yml
# AI Prompt: "Create comprehensive alert rules with SLO-based alerts"
groups:
- name: slo_alerts
interval: 30s
rules:
# API availability SLO
- alert: APIAvailabilitySLO
expr: |
(
sum(rate(http_requests_total{status!~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
) < 0.995
for: 5m
labels:
severity: critical
team: platform
slo: availability
annotations:
summary: "API availability below SLO ({{ $value | humanizePercentage }})"
description: "API availability is {{ $value | humanizePercentage }}, which is below the 99.5% SLO"
runbook: "https://runbooks.example.com/api-availability"
# Latency SLO
- alert: APILatencySLO
expr: |
job:api_http_latency_seconds:p95_5m > 0.5
for: 5m
labels:
severity: warning
team: platform
slo: latency
annotations:
summary: "API p95 latency exceeds SLO"
description: "95th percentile latency is {{ $value }}s (SLO: 500ms)"
# Error budget burn rate
- alert: ErrorBudgetBurnRate
expr: |
(
sum(rate(http_requests_total{status=~"5.."}[1h]))
/
sum(rate(http_requests_total[1h]))
) > 0.01
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "High error budget burn rate"
description: "Error rate over the last hour is {{ $value | humanizePercentage }}"
- name: infrastructure_alerts
rules:
# Node down
- alert: NodeDown
expr: up{job="kubernetes-nodes"} == 0
for: 2m
labels:
severity: critical
team: infrastructure
annotations:
summary: "Node {{ $labels.instance }} is down"
description: "Node has been unreachable for more than 2 minutes"
# High CPU usage
- alert: HighCPUUsage
expr: instance:node_cpu:rate5m > 80
for: 10m
labels:
severity: warning
team: infrastructure
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is {{ $value }}%"
# Disk space warning
- alert: DiskSpaceWarning
expr: instance:node_filesystem_utilization:ratio > 0.8
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: "Disk usage is {{ $value | humanizePercentage }}"
# Pod crash looping
- alert: PodCrashLooping
expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
description: "Pod has restarted {{ $value }} times in the last 15 minutes"
// AI Prompt
Agent: "Create Grafana dashboard for:
- Service overview with SLIs
- Detailed performance metrics
- Infrastructure monitoring
- Business metrics
- Alert status"
// dashboard.json
{
"dashboard": {
"title": "Service Overview",
"uid": "service-overview",
"tags": ["production", "sli"],
"timezone": "browser",
"schemaVersion": 27,
"version": 1,
"refresh": "30s",
"time": {
"from": "now-6h",
"to": "now"
},
"templating": {
"list": [
{
"name": "datasource",
"type": "datasource",
"query": "prometheus",
"current": {
"text": "Prometheus",
"value": "prometheus"
}
},
{
"name": "namespace",
"type": "query",
"datasource": "$datasource",
"query": "label_values(kube_namespace_created, namespace)",
"refresh": 1,
"multi": true,
"includeAll": true
},
{
"name": "service",
"type": "query",
"datasource": "$datasource",
"query": "label_values(up{namespace=~\"$namespace\"}, job)",
"refresh": 1,
"multi": true,
"includeAll": true
}
]
},
"panels": [
{
"title": "Service Level Indicators",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 0},
"type": "row",
"collapsed": false,
"panels": [
{
"title": "Availability",
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 1},
"type": "stat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status!~\"5..\",namespace=~\"$namespace\",job=~\"$service\"}[5m])) / sum(rate(http_requests_total{namespace=~\"$namespace\",job=~\"$service\"}[5m]))",
"format": "time_series",
"legendFormat": "Availability"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "red", "value": null},
{"color": "yellow", "value": 0.99},
{"color": "green", "value": 0.995}
]
}
}
}
},
{
"title": "Request Rate",
"gridPos": {"h": 8, "w": 6, "x": 6, "y": 1},
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{namespace=~\"$namespace\",job=~\"$service\"}[5m])) by (status)",
"legendFormat": "{{status}}"
}
],
"yaxes": [
{"format": "reqps", "label": "Requests/sec"}
]
},
{
"title": "P95 Latency",
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 1},
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{namespace=~\"$namespace\",job=~\"$service\"}[5m])) by (le))",
"legendFormat": "p95"
}
],
"yaxes": [
{"format": "s", "label": "Latency"}
],
"alert": {
"conditions": [
{
"evaluator": {
"params": [0.5],
"type": "gt"
},
"query": {
"params": ["A", "5m", "now"]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"name": "High P95 Latency",
"noDataState": "no_data",
"notifications": []
}
},
{
"title": "Error Rate",
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 1},
"type": "stat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\",namespace=~\"$namespace\",job=~\"$service\"}[5m])) / sum(rate(http_requests_total{namespace=~\"$namespace\",job=~\"$service\"}[5m]))",
"legendFormat": "Error Rate"
}
],
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 0.01},
{"color": "red", "value": 0.05}
]
}
}
}
}
]
},
{
"title": "Infrastructure Metrics",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 9},
"type": "row",
"panels": [
{
"title": "CPU Usage",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 10},
"type": "graph",
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",pod=~\".*$service.*\"}[5m])) by (pod)",
"legendFormat": "{{pod}}"
}
]
},
{
"title": "Memory Usage",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 10},
"type": "graph",
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",pod=~\".*$service.*\"}) by (pod)",
"legendFormat": "{{pod}}"
}
],
"yaxes": [
{"format": "bytes"}
]
},
{
"title": "Network I/O",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 10},
"type": "graph",
"targets": [
{
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\",pod=~\".*$service.*\"}[5m])) by (pod)",
"legendFormat": "RX {{pod}}"
},
{
"expr": "-sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\",pod=~\".*$service.*\"}[5m])) by (pod)",
"legendFormat": "TX {{pod}}"
}
],
"yaxes": [
{"format": "Bps"}
]
}
]
}
]
}
}
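
One way to ship the dashboard JSON above is through Grafana's dashboard HTTP API. The sketch below assumes a service-account token in GRAFANA_TOKEN and the dashboard saved locally as dashboard.json; error handling is kept minimal.

provision-dashboard.js
// A sketch of pushing the dashboard JSON above through Grafana's HTTP API.
// GRAFANA_URL and GRAFANA_TOKEN are assumptions.
const fs = require('fs');

const GRAFANA_URL = process.env.GRAFANA_URL || 'http://grafana:3000';
const GRAFANA_TOKEN = process.env.GRAFANA_TOKEN;

async function provisionDashboard(path) {
  const { dashboard } = JSON.parse(fs.readFileSync(path, 'utf8'));
  const response = await fetch(`${GRAFANA_URL}/api/dashboards/db`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${GRAFANA_TOKEN}`,
    },
    // overwrite: true lets repeated runs update the same uid instead of failing
    body: JSON.stringify({ dashboard, overwrite: true, message: 'Provisioned by script' }),
  });
  if (!response.ok) {
    throw new Error(`Grafana API returned ${response.status}: ${await response.text()}`);
  }
  const result = await response.json();
  console.log(`Dashboard available at ${GRAFANA_URL}${result.url}`);
}

provisionDashboard('dashboard.json').catch((err) => {
  console.error(err);
  process.exit(1);
});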

ELK Stack (Elasticsearch, Logstash, Kibana)

elasticsearch.yml
# AI Prompt: "Configure Elasticsearch cluster for production"
cluster.name: production-logs
node.name: ${HOSTNAME}
node.roles: [master, data, ingest]
# Network settings
network.host: 0.0.0.0
http.port: 9200
transport.port: 9300
# Discovery
discovery.seed_hosts:
- elasticsearch-master-0
- elasticsearch-master-1
- elasticsearch-master-2
cluster.initial_master_nodes:
- elasticsearch-master-0
- elasticsearch-master-1
- elasticsearch-master-2
# Memory
bootstrap.memory_lock: true
# Data paths
path.data: /usr/share/elasticsearch/data
path.logs: /usr/share/elasticsearch/logs
# Security
xpack.security.enabled: true
xpack.security.transport.ssl.enabled: true
xpack.security.transport.ssl.verification_mode: certificate
xpack.security.transport.ssl.client_authentication: required
xpack.security.transport.ssl.keystore.path: elastic-certificates.p12
xpack.security.transport.ssl.truststore.path: elastic-certificates.p12
# Monitoring
xpack.monitoring.enabled: true
xpack.monitoring.collection.enabled: true
# Index lifecycle management
xpack.ilm.enabled: true
# Machine learning
xpack.ml.enabled: true
# Performance tuning
indices.memory.index_buffer_size: 20%
indices.queries.cache.size: 15%
indices.fielddata.cache.size: 30%
# Thread pools
thread_pool:
write:
size: 8
queue_size: 1000
search:
size: 12
queue_size: 1000
# Circuit breakers
indices.breaker.total.limit: 85%
indices.breaker.fielddata.limit: 40%
indices.breaker.request.limit: 40%
# AI Prompt
Ask: "Create Logstash pipeline with:
- Multiple inputs (beats, syslog, kafka)
- Parsing and enrichment
- Error handling
- Performance optimization
- Multiple outputs"
# logstash.conf
input {
# Filebeat input
beats {
port => 5044
ssl => true
ssl_certificate_authorities => ["/etc/logstash/ca.crt"]
ssl_certificate => "/etc/logstash/server.crt"
ssl_key => "/etc/logstash/server.key"
ssl_verify_mode => "peer"
}
# Syslog input
syslog {
port => 5514
type => "syslog"
codec => plain {
charset => "UTF-8"
}
}
# Kafka input for high-volume logs
kafka {
bootstrap_servers => "kafka:9092"
topics => ["application-logs", "nginx-logs", "security-logs"]
group_id => "logstash-consumer"
codec => json
consumer_threads => 4
decorate_events => true
}
# HTTP input for direct log submission
http {
port => 8080
codec => json
ssl => true
ssl_certificate => "/etc/logstash/http.crt"
ssl_key => "/etc/logstash/http.key"
}
}
filter {
# Parse JSON logs
if [message] =~ /^\{.*\}$/ {
json {
source => "message"
target => "parsed"
}
mutate {
rename => {
"[parsed][timestamp]" => "@timestamp"
"[parsed][level]" => "log_level"
"[parsed][logger]" => "logger_name"
"[parsed][trace_id]" => "trace_id"
"[parsed][span_id]" => "span_id"
}
}
}
# Parse nginx logs
if [type] == "nginx" {
grok {
match => {
"message" => '%{IPORHOST:remote_addr} - %{DATA:remote_user} \[%{HTTPDATE:time_local}\] "%{WORD:method} %{DATA:request} HTTP/%{NUMBER:http_version}" %{NUMBER:status} %{NUMBER:body_bytes_sent} "%{DATA:http_referer}" "%{DATA:http_user_agent}" "%{DATA:http_x_forwarded_for}" %{NUMBER:request_time} %{NUMBER:upstream_response_time}'
}
}
date {
match => ["time_local", "dd/MMM/yyyy:HH:mm:ss Z"]
target => "@timestamp"
}
mutate {
convert => {
"status" => "integer"
"body_bytes_sent" => "integer"
"request_time" => "float"
"upstream_response_time" => "float"
}
}
# GeoIP enrichment
if [remote_addr] !~ /^(10\.|172\.(1[6-9]|2[0-9]|3[01])\.|192\.168\.)/ {
geoip {
source => "remote_addr"
target => "geoip"
}
}
}
# Parse application logs
if [type] == "application" {
# Extract stack traces
if [log_level] == "ERROR" {
multiline {
pattern => "^\s"
what => "previous"
max_age => 5
max_lines => 50
}
}
# Add environment metadata
mutate {
add_field => {
"environment" => "${ENVIRONMENT:production}"
"service_name" => "${SERVICE_NAME:unknown}"
"version" => "${VERSION:unknown}"
}
}
}
# Parse Kubernetes metadata
if [kubernetes] {
mutate {
add_field => {
"k8s_namespace" => "%{[kubernetes][namespace]}"
"k8s_pod" => "%{[kubernetes][pod][name]}"
"k8s_container" => "%{[kubernetes][container][name]}"
"k8s_node" => "%{[kubernetes][node][name]}"
}
}
}
# Security enrichment
if [type] == "security" {
# Threat intelligence lookup
translate {
field => "src_ip"
destination => "threat_intel"
dictionary_path => "/etc/logstash/threat_intel.yml"
fallback => "clean"
}
# Anonymize sensitive data
mutate {
gsub => [
"message", "\b(?:\d{4}[- ]?){3}\d{4}\b", "XXXX-XXXX-XXXX-XXXX",
"message", "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "REDACTED@EMAIL"
]
}
}
# Performance metrics
metrics {
meter => ["events"]
add_tag => ["metric"]
flush_interval => 60
}
# Drop debug logs in production
if [environment] == "production" and [log_level] == "DEBUG" {
drop { }
}
}
output {
# Primary Elasticsearch output
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "logs-%{type}-%{+YYYY.MM.dd}"
template_name => "logs"
template => "/etc/logstash/templates/logs.json"
template_overwrite => true
# Security
ssl => true
ssl_certificate_verification => true
cacert => "/etc/logstash/ca.crt"
user => "${ELASTICSEARCH_USER}"
password => "${ELASTICSEARCH_PASSWORD}"
# Performance
pipeline => "logs-pipeline"
pool_max => 50
pool_max_per_route => 25
# ILM
ilm_enabled => true
ilm_rollover_alias => "logs"
ilm_pattern => "000001"
ilm_policy => "logs-policy"
}
# Dead letter queue for failed events
if "_elasticsearch_failure" in [tags] {
file {
path => "/var/log/logstash/dead_letter_queue.log"
codec => json_lines
}
}
# Real-time alerts to Kafka
if [log_level] == "ERROR" or [status] >= 500 {
kafka {
bootstrap_servers => "kafka:9092"
topic_id => "alerts"
codec => json
compression_type => "snappy"
}
}
# Metrics to monitoring system
if "metric" in [tags] {
statsd {
host => "statsd"
port => 8125
namespace => "logstash"
gauge => {
"events.rate" => "%{[events][rate_1m]}"
}
}
}
# S3 backup for compliance
s3 {
region => "us-east-1"
bucket => "logs-backup"
prefix => "%{type}/%{+YYYY}/%{+MM}/%{+dd}"
time_file => 300
codec => json_lines
canned_acl => "private"
server_side_encryption => true
server_side_encryption_algorithm => "AES256"
}
}
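
For the JSON branch of the filter above to work, applications need to emit one JSON object per line containing the fields the mutate block renames (timestamp, level, logger, trace_id, span_id). A minimal Node.js sketch, assuming @opentelemetry/api is available for trace correlation; the logger name and extra fields are illustrative:

app-logger.js
// A sketch of the JSON log shape the Logstash json/mutate filters above expect.
const { trace } = require('@opentelemetry/api');

function log(level, logger, message, extra = {}) {
  const spanContext = trace.getActiveSpan()?.spanContext();
  const entry = {
    timestamp: new Date().toISOString(),
    level,
    logger,
    message,
    trace_id: spanContext?.traceId,
    span_id: spanContext?.spanId,
    ...extra,
  };
  // One JSON object per line, written to stdout for Filebeat/Kafka shipping
  process.stdout.write(JSON.stringify(entry) + '\n');
}

log('INFO', 'orders.service', 'order created', { order_id: 'ord_123', amount_usd: 42.5 });
log('ERROR', 'orders.service', 'payment declined', { order_id: 'ord_124' });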
kibana-dashboard.json
# AI Prompt: "Create Kibana dashboard configuration for log analysis"
{
"version": "8.0.0",
"objects": [
{
"id": "log-overview-dashboard",
"type": "dashboard",
"attributes": {
"title": "Log Analysis Overview",
"hits": 0,
"description": "Comprehensive log analysis dashboard",
"panelsJSON": "[{\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15},\"panelIndex\":\"1\",\"embeddableConfig\":{},\"panelRefName\":\"panel_1\"},{\"gridData\":{\"x\":24,\"y\":0,\"w\":24,\"h\":15},\"panelIndex\":\"2\",\"embeddableConfig\":{},\"panelRefName\":\"panel_2\"},{\"gridData\":{\"x\":0,\"y\":15,\"w\":48,\"h\":15},\"panelIndex\":\"3\",\"embeddableConfig\":{},\"panelRefName\":\"panel_3\"}]",
"timeRestore": true,
"timeTo": "now",
"timeFrom": "now-24h",
"refreshInterval": {
"pause": false,
"value": 10000
}
}
},
{
"id": "log-levels-visualization",
"type": "visualization",
"attributes": {
"title": "Log Levels Over Time",
"visState": {
"type": "line",
"params": {
"grid": {"categoryLines": false, "style": {"color": "#eee"}},
"categoryAxes": [{
"id": "CategoryAxis-1",
"type": "category",
"position": "bottom",
"show": true,
"style": {},
"scale": {"type": "linear"},
"labels": {"show": true, "truncate": 100},
"title": {}
}],
"valueAxes": [{
"id": "ValueAxis-1",
"name": "LeftAxis-1",
"type": "value",
"position": "left",
"show": true,
"style": {},
"scale": {"type": "linear", "mode": "normal"},
"labels": {"show": true, "rotate": 0, "filter": false, "truncate": 100},
"title": {"text": "Count"}
}],
"seriesParams": [{
"show": true,
"type": "line",
"mode": "normal",
"data": {"label": "Count", "id": "1"},
"valueAxis": "ValueAxis-1",
"drawLinesBetweenPoints": true,
"showCircles": true
}]
},
"aggs": [
{
"id": "1",
"enabled": true,
"type": "count",
"schema": "metric",
"params": {}
},
{
"id": "2",
"enabled": true,
"type": "date_histogram",
"schema": "segment",
"params": {
"field": "@timestamp",
"interval": "auto",
"min_doc_count": 1,
"extended_bounds": {}
}
},
{
"id": "3",
"enabled": true,
"type": "terms",
"schema": "group",
"params": {
"field": "log_level",
"size": 5,
"order": "desc",
"orderBy": "1"
}
}
]
}
}
}
]
}
# AI Prompt
Agent: "Set up Jaeger for distributed tracing with:
- Collector configuration
- Sampling strategies
- Storage backend
- Query service
- Integration with apps"
# jaeger-production.yml
apiVersion: v1
kind: ConfigMap
metadata:
  name: jaeger-configuration
  namespace: observability
data:
  sampling.json: |
    {
      "service_strategies": [
        {
          "service": "api-gateway",
          "type": "adaptive",
          "max_traces_per_second": 100
        },
        {
          "service": "payment-service",
          "type": "probabilistic",
          "param": 0.1
        },
        {
          "service": "user-service",
          "type": "ratelimiting",
          "param": 50
        }
      ],
      "default_strategy": {
        "type": "probabilistic",
        "param": 0.001,
        "operation_strategies": [
          {
            "operation": "GET /health",
            "type": "probabilistic",
            "param": 0.0001
          },
          {
            "operation": "POST /api/v1/orders",
            "type": "probabilistic",
            "param": 0.1
          }
        ]
      }
    }
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger-collector
  namespace: observability
spec:
  replicas: 3
  selector:
    matchLabels:
      app: jaeger-collector
  template:
    metadata:
      labels:
        app: jaeger-collector
    spec:
      containers:
        - name: jaeger-collector
          image: jaegertracing/jaeger-collector:1.50
          env:
            - name: SPAN_STORAGE_TYPE
              value: elasticsearch
            - name: ES_SERVER_URLS
              value: http://elasticsearch:9200
            - name: ES_USERNAME
              valueFrom:
                secretKeyRef:
                  name: elastic-credentials
                  key: username
            - name: ES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: elastic-credentials
                  key: password
            - name: ES_INDEX_PREFIX
              value: jaeger
            - name: ES_NUM_SHARDS
              value: "3"
            - name: ES_NUM_REPLICAS
              value: "1"
            - name: COLLECTOR_ZIPKIN_HOST_PORT
              value: ":9411"
            - name: COLLECTOR_NUM_WORKERS
              value: "50"
            - name: COLLECTOR_QUEUE_SIZE
              value: "2000"
            - name: SAMPLING_CONFIG_FILE
              value: /etc/jaeger/sampling.json
          ports:
            - containerPort: 14250 # gRPC
              name: grpc
            - containerPort: 14268 # HTTP
              name: http
            - containerPort: 14269 # Admin
              name: admin
            - containerPort: 9411 # Zipkin
              name: zipkin
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          volumeMounts:
            - name: sampling-config
              mountPath: /etc/jaeger
          livenessProbe:
            httpGet:
              path: /
              port: 14269
          readinessProbe:
            httpGet:
              path: /
              port: 14269
      volumes:
        - name: sampling-config
          configMap:
            name: jaeger-configuration
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jaeger-query
  namespace: observability
spec:
  replicas: 2
  selector:
    matchLabels:
      app: jaeger-query
  template:
    metadata:
      labels:
        app: jaeger-query
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "16687"
    spec:
      containers:
        - name: jaeger-query
          image: jaegertracing/jaeger-query:1.50
          env:
            - name: SPAN_STORAGE_TYPE
              value: elasticsearch
            - name: ES_SERVER_URLS
              value: http://elasticsearch:9200
            - name: ES_USERNAME
              valueFrom:
                secretKeyRef:
                  name: elastic-credentials
                  key: username
            - name: ES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: elastic-credentials
                  key: password
            - name: ES_INDEX_PREFIX
              value: jaeger
            - name: QUERY_BASE_PATH
              value: /jaeger
            - name: QUERY_STATIC_FILES
              value: /go/jaeger-ui/
            - name: QUERY_UI_CONFIG
              value: /etc/jaeger/ui-config.json
          ports:
            - containerPort: 16686
              name: query
            - containerPort: 16687
              name: metrics
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
tracing.js
// AI Prompt: "Instrument Node.js application with OpenTelemetry"
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus');
const { BatchSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics');
const os = require('os'); // needed for the service.instance.id fallback below
// Configure resource
const resource = Resource.default().merge(
new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: process.env.SERVICE_NAME || 'api-service',
[SemanticResourceAttributes.SERVICE_VERSION]: process.env.SERVICE_VERSION || '1.0.0',
[SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.ENVIRONMENT || 'production',
'service.namespace': process.env.NAMESPACE || 'default',
'service.instance.id': process.env.POD_NAME || `${os.hostname()}-${process.pid}`,
})
);
// Configure Jaeger exporter
const jaegerExporter = new JaegerExporter({
endpoint: process.env.JAEGER_ENDPOINT || 'http://jaeger-collector:14268/api/traces',
headers: {
'Authorization': `Bearer ${process.env.JAEGER_TOKEN || ''}`
},
});
// Configure Prometheus exporter
const prometheusExporter = new PrometheusExporter({
port: 9464,
endpoint: '/metrics',
}, () => {
console.log('Prometheus metrics server started on port 9464');
});
// Configure SDK
const sdk = new NodeSDK({
resource,
traceExporter: jaegerExporter,
metricReader: prometheusExporter,
instrumentations: [
getNodeAutoInstrumentations({
'@opentelemetry/instrumentation-fs': {
enabled: false, // Disable noisy fs instrumentation
},
'@opentelemetry/instrumentation-http': {
requestHook: (span, request) => {
span.setAttribute('http.request.body', JSON.stringify(request.body));
},
responseHook: (span, response) => {
span.setAttribute('http.response.size', response.headers['content-length']);
},
ignoreIncomingPaths: ['/health', '/metrics', '/ready'],
ignoreOutgoingUrls: ['https://api.segment.io'],
},
'@opentelemetry/instrumentation-express': {
requestHook: (span, info) => {
span.setAttribute('express.route', info.route);
span.setAttribute('express.params', JSON.stringify(info.req.params));
},
},
'@opentelemetry/instrumentation-mongodb': {
enhancedDatabaseReporting: true,
responseHook: (span, result) => {
span.setAttribute('db.response.count', Array.isArray(result) ? result.length : 1);
},
},
'@opentelemetry/instrumentation-redis': {
requireParentSpan: true,
dbStatementSerializer: (cmdName, cmdArgs) => {
// Redact sensitive information
if (['auth', 'set'].includes(cmdName.toLowerCase())) {
return `${cmdName} [REDACTED]`;
}
return `${cmdName} ${cmdArgs.join(' ')}`;
},
},
}),
],
spanProcessor: new BatchSpanProcessor(jaegerExporter, {
maxQueueSize: 2048,
maxExportBatchSize: 512,
scheduledDelayMillis: 5000,
exportTimeoutMillis: 30000,
}),
});
// Initialize the SDK
sdk.start();
// Graceful shutdown
process.on('SIGTERM', () => {
sdk.shutdown()
.then(() => console.log('Tracing terminated'))
.catch((error) => console.log('Error terminating tracing', error))
.finally(() => process.exit(0));
});
module.exports = { sdk };
// Custom instrumentation helper
const { trace, context, SpanStatusCode } = require('@opentelemetry/api');
class TracingHelper {
constructor() {
this.tracer = trace.getTracer('application', '1.0.0');
}
// Wrap async functions with tracing
traceAsync(name, fn, attributes = {}) {
return async (...args) => {
const span = this.tracer.startSpan(name, {
attributes: {
'code.function': fn.name,
'code.namespace': this.constructor.name,
...attributes,
},
});
return context.with(trace.setSpan(context.active(), span), async () => {
try {
const result = await fn(...args);
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (error) {
span.setStatus({
code: SpanStatusCode.ERROR,
message: error.message,
});
span.recordException(error);
throw error;
} finally {
span.end();
}
});
};
}
// Add custom attributes to current span
addAttributes(attributes) {
const span = trace.getActiveSpan();
if (span) {
Object.entries(attributes).forEach(([key, value]) => {
span.setAttribute(key, value);
});
}
}
// Create child span
createSpan(name, fn, options = {}) {
const span = this.tracer.startSpan(name, options);
return context.with(trace.setSpan(context.active(), span), () => {
try {
const result = fn(span);
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (error) {
span.setStatus({
code: SpanStatusCode.ERROR,
message: error.message,
});
span.recordException(error);
throw error;
} finally {
span.end();
}
});
}
}
module.exports.TracingHelper = TracingHelper;
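
A short usage sketch for TracingHelper; the order lookup is a placeholder for a real repository call:

tracing-usage.js
// Usage sketch for the TracingHelper above — names and attributes are illustrative.
const { TracingHelper } = require('./tracing');

const tracing = new TracingHelper();

// Wrap an async unit of work so it produces a span with status and exceptions recorded
const getOrder = tracing.traceAsync('orders.get', async (orderId) => {
  tracing.addAttributes({ 'app.order_id': orderId });
  return { id: orderId, status: 'shipped' }; // stand-in for a real repository call
}, { 'app.component': 'orders' });

getOrder('ord_123').then((order) => console.log(order));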
alertmanager.yml
# AI Prompt: "Configure AlertManager with complex routing rules"
global:
resolve_timeout: 5m
slack_api_url: '${SLACK_API_URL}'
pagerduty_url: '${PAGERDUTY_URL}'
opsgenie_api_key: '${OPSGENIE_API_KEY}'
templates:
- '/etc/alertmanager/templates/*.tmpl'
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 12h
receiver: 'default'
routes:
# Critical alerts - immediate page
- match:
severity: critical
receiver: 'pagerduty-critical'
group_wait: 0s
repeat_interval: 1h
continue: true
# Database alerts
- match_re:
service: '^(postgres|mysql|mongodb|redis).*'
receiver: 'database-team'
group_by: ['alertname', 'instance']
continue: true
# Security alerts
- match:
team: security
receiver: 'security-team'
group_wait: 0s
routes:
- match:
severity: critical
receiver: 'security-oncall'
# Business hours only
- match:
severity: warning
receiver: 'slack-warnings'
mute_time_intervals:
- weekends
- out-of-hours
# Dev environment - lower priority
- match:
environment: development
receiver: 'dev-slack'
repeat_interval: 24h
# Catch-all for unmatched alerts
- match_re:
alertname: '.*'
receiver: 'slack-general'
receivers:
- name: 'default'
# No-op receiver
- name: 'pagerduty-critical'
pagerduty_configs:
- service_key: '${PAGERDUTY_SERVICE_KEY}'
description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
client: 'AlertManager'
client_url: '{{ template "pagerduty.default.clientURL" . }}'
details:
alerts: '{{ template "pagerduty.default.instances" . }}'
num_alerts: '{{ .Alerts | len }}'
- name: 'database-team'
email_configs:
- to: 'database-team@example.com'
from: 'alerts@example.com'
smarthost: 'smtp.example.com:587'
auth_username: '${SMTP_USERNAME}'
auth_password: '${SMTP_PASSWORD}'
headers:
Subject: 'Database Alert: {{ .GroupLabels.alertname }}'
html: '{{ template "email.default.html" . }}'
slack_configs:
- channel: '#database-alerts'
title: 'Database Alert'
text: '{{ template "slack.default.text" . }}'
send_resolved: true
- name: 'security-team'
opsgenie_configs:
- api_key: '${OPSGENIE_API_KEY}'
message: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
description: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
source: 'AlertManager'
priority: 'P2'
tags: 'security,{{ .GroupLabels.cluster }}'
- name: 'security-oncall'
webhook_configs:
- url: 'http://security-bot:8080/alert'
send_resolved: false
http_config:
bearer_token: '${SECURITY_BOT_TOKEN}'
- name: 'slack-warnings'
slack_configs:
- channel: '#alerts-warning'
title: 'Warning: {{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
send_resolved: true
actions:
- type: 'button'
text: 'Dashboard'
url: 'https://grafana.example.com/d/{{ .GroupLabels.service }}'
- type: 'button'
text: 'Runbook'
url: '{{ (index .Alerts 0).Annotations.runbook }}'
- name: 'dev-slack'
slack_configs:
- channel: '#dev-alerts'
title: '[DEV] {{ .GroupLabels.alertname }}'
send_resolved: true
username: 'alertmanager-dev'
icon_emoji: ':construction:'
- name: 'slack-general'
slack_configs:
- channel: '#alerts-general'
send_resolved: true
inhibit_rules:
# Inhibit warnings when critical alerts are firing
- source_matchers:
- severity = 'critical'
target_matchers:
- severity = 'warning'
equal: ['alertname', 'cluster', 'service']
# Inhibit alerts when cluster is down
- source_matchers:
- alertname = 'ClusterDown'
target_matchers:
- alertname != 'ClusterDown'
equal: ['cluster']
mute_time_intervals:
- name: weekends
time_intervals:
- weekdays: ['saturday', 'sunday']
- name: out-of-hours
time_intervals:
- times:
- start_time: '00:00'
end_time: '09:00'
- start_time: '18:00'
end_time: '24:00'
weekdays: ['monday:friday']
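
The security-oncall receiver above posts to a webhook at http://security-bot:8080/alert. A minimal Express sketch of that endpoint, assuming the same SECURITY_BOT_TOKEN bearer token; the request body fields are Alertmanager's standard webhook payload:

security-bot.js
// A sketch of the webhook receiver behind the 'security-oncall' receiver above.
// The Express app and token handling are assumptions.
const express = require('express');

const app = express();
app.use(express.json());

app.post('/alert', (req, res) => {
  // Alertmanager sends the bearer token configured under http_config
  const auth = req.headers.authorization || '';
  if (auth !== `Bearer ${process.env.SECURITY_BOT_TOKEN}`) {
    return res.status(401).end();
  }

  const { status, groupLabels, alerts = [] } = req.body;
  for (const alert of alerts) {
    console.log(
      `[${status}] ${alert.labels.alertname}: ${alert.annotations.summary} ` +
      `(started ${alert.startsAt})`
    );
    // Hand off to paging / ticketing / auto-remediation here
  }
  res.status(200).json({ received: alerts.length, group: groupLabels });
});

app.listen(8080, () => console.log('Security webhook listening on :8080'));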
# AI Prompt
Agent: "Configure OpenTelemetry Collector for:
- Multi-pipeline processing
- Tail sampling
- Metric aggregation
- Log correlation
- Multi-backend export"
# otel-collector-config.yaml
receivers:
  # OTLP receiver
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
        cors:
          allowed_origins:
            - "http://localhost:3000"
            - "https://*.example.com"
  # Prometheus receiver for scraping metrics
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']
        - job_name: 'kubernetes-pods'
          kubernetes_sd_configs:
            - role: pod
          relabel_configs:
            - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
              action: keep
              regex: true
  # Jaeger receiver for legacy support
  jaeger:
    protocols:
      grpc:
        endpoint: 0.0.0.0:14250
      thrift_http:
        endpoint: 0.0.0.0:14268
      thrift_compact:
        endpoint: 0.0.0.0:6831
  # Zipkin receiver
  zipkin:
    endpoint: 0.0.0.0:9411
  # Filelog receiver for container logs
  filelog:
    include:
      - /var/log/pods/*/*/*.log
    exclude:
      - /var/log/pods/*/otel-collector-*/*.log
    start_at: end
    include_file_path: true
    include_file_name: false
    operators:
      - type: router
        id: get-format
        routes:
          - output: parser-docker
            expr: 'body matches "^\\{"'
          - output: parser-crio
            expr: 'body matches "^[^ Z]+ "'
          - output: parser-containerd
            expr: 'body matches "^[^ Z]+Z"'
      - type: regex_parser
        id: parser-crio
        regex: '^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
        timestamp:
          parse_from: attributes.time
          layout: '2006-01-02T15:04:05.999999999Z07:00'
      - type: json_parser
        id: parser-docker
        timestamp:
          parse_from: attributes.time
          layout: '%Y-%m-%dT%H:%M:%S.%LZ'
      - type: move
        from: attributes.log
        to: body
      - type: metadata
        resource:
          k8s.namespace.name: 'NAMESPACE_NAME'
          k8s.pod.name: 'POD_NAME'
          k8s.container.name: 'CONTAINER_NAME'

processors:
  # Batch processor
  batch:
    send_batch_size: 10000
    timeout: 10s
    send_batch_max_size: 11000
  # Memory limiter
  memory_limiter:
    check_interval: 1s
    limit_percentage: 75
    spike_limit_percentage: 20
  # Resource detection
  resourcedetection:
    detectors: [env, system, docker, ec2, ecs, gcp]
    timeout: 5s
    override: false
  # Attributes processor
  attributes:
    actions:
      - key: environment
        value: production
        action: upsert
      - key: db.statement
        action: delete
      - key: http.request.header.authorization
        action: delete
      - key: http.request.header.x-api-key
        action: delete
  # Tail sampling
  tail_sampling:
    decision_wait: 30s
    num_traces: 100000
    expected_new_traces_per_sec: 1000
    policies:
      - name: errors-policy
        type: status_code
        status_code: {status_codes: [ERROR]}
      - name: slow-traces-policy
        type: latency
        latency: {threshold_ms: 1000}
      - name: probabilistic-policy
        type: probabilistic
        probabilistic: {sampling_percentage: 10}
      - name: composite-policy
        type: composite
        composite:
          max_total_spans_per_second: 1000
          policy_order: [errors-policy, slow-traces-policy, probabilistic-policy]
          composite_sub_policy:
            - name: errors-policy
              type: status_code
              status_code: {status_codes: [ERROR]}
            - name: slow-traces-policy
              type: latency
              latency: {threshold_ms: 1000}
          rate_allocation:
            - policy: errors-policy
              percent: 50
            - policy: slow-traces-policy
              percent: 25
  # Metrics generation from spans
  spanmetrics:
    metrics_exporter: prometheus
    latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s]
    dimensions:
      - name: http.method
      - name: http.status_code
      - name: http.route
    dimensions_cache_size: 1000
    aggregation_temporality: "AGGREGATION_TEMPORALITY_CUMULATIVE"
  # Transform processor for logs
  transform:
    error_mode: ignore
    log_statements:
      - context: log
        statements:
          - set(severity_text, "ERROR") where body == "error"
          - set(attributes["team"], "platform") where resource.attributes["service.name"] == "api-gateway"
          - merge_maps(attributes, ParseJSON(body), "upsert") where IsMatch(body, "^\\{")

exporters:
  # Debug exporter
  debug:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200
  # Prometheus exporter
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: "otel"
    const_labels:
      environment: "production"
    resource_to_telemetry_conversion:
      enabled: true
  # OTLP exporters
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: false
      ca_file: /etc/ssl/certs/ca.crt
    headers:
      X-Scope-OrgID: "production"
    sending_queue:
      enabled: true
      num_consumers: 10
      queue_size: 5000
    retry_on_failure:
      enabled: true
      initial_interval: 5s
      max_interval: 30s
      max_elapsed_time: 300s
  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: false
    headers:
      X-Scope-OrgID: "production"
  # Elasticsearch exporter for logs
  elasticsearch/logs:
    endpoints:
      - http://elasticsearch:9200
    logs_index: logs-otel
    user: elastic
    password: ${ELASTIC_PASSWORD}
    discovery:
      on_start: true
    flush:
      bytes: 10485760
    retry:
      max_interval: 30s
    mapping:
      mode: ecs

# Extensions referenced by the service section below
extensions:
  health_check: {}
  pprof: {}
  zpages: {}

service:
  extensions: [health_check, pprof, zpages]
  pipelines:
    # Traces pipeline
    traces:
      receivers: [otlp, jaeger, zipkin]
      processors: [memory_limiter, resourcedetection, attributes, tail_sampling, spanmetrics, batch]
      exporters: [otlp/tempo, debug]
    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, spanmetrics]
      processors: [memory_limiter, resourcedetection, attributes, batch]
      exporters: [prometheus, otlp/mimir]
    # Logs pipeline
    logs:
      receivers: [otlp, filelog]
      processors: [memory_limiter, resourcedetection, attributes, transform, batch]
      exporters: [elasticsearch/logs]
  telemetry:
    logs:
      level: info
      initial_fields:
        service: "otel-collector"
    metrics:
      level: detailed
      address: 0.0.0.0:8888
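
With the collector in place, applications can export OTLP to it instead of talking to Jaeger directly, letting the collector handle tail sampling and fan-out. A minimal Node.js sketch, assuming the collector is reachable as otel-collector:4317 and @opentelemetry/exporter-trace-otlp-grpc is installed:

otlp-tracing.js
// A sketch of sending spans to the collector's OTLP gRPC receiver (0.0.0.0:4317 above)
// instead of exporting to Jaeger directly. The collector DNS name is an assumption.
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');

const sdk = new NodeSDK({
  // Everything funnels through the collector, which applies tail sampling,
  // generates span metrics, and fans out to Tempo/Mimir/Elasticsearch.
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://otel-collector:4317',
  }),
  instrumentations: [getNodeAutoInstrumentations()],
});

sdk.start();

process.on('SIGTERM', () => {
  sdk.shutdown().finally(() => process.exit(0));
});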
cloudwatch-config.json
# AI Prompt: "Configure CloudWatch monitoring for AWS infrastructure"
{
"agent": {
"metrics_collection_interval": 60,
"run_as_user": "cwagent"
},
"metrics": {
"namespace": "Production/Application",
"metrics_collected": {
"cpu": {
"measurement": [
{
"name": "cpu_usage_idle",
"rename": "CPU_USAGE_IDLE",
"unit": "Percent"
},
{
"name": "cpu_usage_iowait",
"rename": "CPU_USAGE_IOWAIT",
"unit": "Percent"
},
"cpu_time_guest"
],
"metrics_collection_interval": 60,
"resources": ["*"],
"totalcpu": false
},
"disk": {
"measurement": [
{
"name": "used_percent",
"rename": "DISK_USED_PERCENT",
"unit": "Percent"
},
"disk_inodes_free",
"disk_inodes_used"
],
"metrics_collection_interval": 60,
"resources": ["*"],
"ignore_file_system_types": ["sysfs", "devtmpfs", "tmpfs"]
},
"mem": {
"measurement": [
"mem_used_percent",
"mem_available",
"mem_used",
"mem_total"
],
"metrics_collection_interval": 60
},
"netstat": {
"measurement": [
"tcp_established",
"tcp_time_wait"
],
"metrics_collection_interval": 60
},
"statsd": {
"service_address": ":8125",
"metrics_collection_interval": 10,
"metrics_aggregation_interval": 60
}
},
"append_dimensions": {
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
"ImageId": "${aws:ImageId}",
"InstanceId": "${aws:InstanceId}",
"InstanceType": "${aws:InstanceType}"
},
"aggregation_dimensions": [
["AutoScalingGroupName"],
["InstanceId", "InstanceType"]
]
},
"logs": {
"logs_collected": {
"files": {
"collect_list": [
{
"file_path": "/var/log/application/*.log",
"log_group_name": "/aws/ec2/application",
"log_stream_name": "{instance_id}/{hostname}",
"timezone": "UTC",
"multi_line_start_pattern": "{timestamp_format}",
"timestamp_format": "%Y-%m-%d %H:%M:%S"
},
{
"file_path": "/var/log/nginx/access.log",
"log_group_name": "/aws/ec2/nginx",
"log_stream_name": "{instance_id}/access.log"
}
]
}
}
}
}
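
The statsd block above makes the CloudWatch agent listen for custom metrics on UDP :8125. A minimal sketch of emitting counters, timers, and gauges from Node.js using only the built-in dgram module; the metric names are illustrative:

statsd-client.js
// A sketch of emitting custom metrics to the CloudWatch agent's StatsD listener
// configured on :8125 above.
const dgram = require('dgram');

const socket = dgram.createSocket('udp4');
const STATSD_HOST = process.env.STATSD_HOST || '127.0.0.1';
const STATSD_PORT = 8125;

function send(metric) {
  const buf = Buffer.from(metric);
  socket.send(buf, 0, buf.length, STATSD_PORT, STATSD_HOST);
}

// Counter, timer, and gauge in the plain StatsD line protocol
send('orders.created:1|c');
send('orders.checkout_latency:187|ms');
send('orders.queue_depth:42|g');

socket.close();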

Monitoring & Logging Best Practices

  1. Define SLIs/SLOs - Set clear service level indicators and objectives
  2. Use Structured Logging - JSON format with consistent fields
  3. Implement Sampling - Balance visibility with cost
  4. Correlate Signals - Link metrics, logs, and traces
  5. Automate Responses - Create runbooks and auto-remediation
  6. Monitor the Monitors - Ensure observability stack health
# AI: "Optimize data retention policies"
# Elasticsearch ILM policy
PUT _ilm/policy/logs-policy
{
  "policy": {
    "phases": {
      "hot": {
        "actions": {
          "rollover": {
            "max_primary_shard_size": "50gb",
            "max_age": "7d"
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "shrink": {
            "number_of_shards": 1
          },
          "forcemerge": {
            "max_num_segments": 1
          }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "searchable_snapshot": {
            "snapshot_repository": "s3-repository"
          }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
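
The same policy can be applied from a script rather than the Kibana dev console. A sketch using the Elasticsearch REST API, assuming Node 18+, basic-auth credentials in environment variables, and the policy JSON saved locally as logs-policy.json:

apply-ilm-policy.js
// A sketch of applying the ILM policy above through the Elasticsearch REST API.
// The URL and credentials are assumptions.
const fs = require('fs');

const ES_URL = process.env.ELASTICSEARCH_URL || 'http://elasticsearch:9200';
const auth = Buffer.from(
  `${process.env.ELASTICSEARCH_USER}:${process.env.ELASTICSEARCH_PASSWORD}`
).toString('base64');

async function applyPolicy(name, path) {
  const policy = fs.readFileSync(path, 'utf8'); // the JSON document shown above
  const response = await fetch(`${ES_URL}/_ilm/policy/${name}`, {
    method: 'PUT',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Basic ${auth}`,
    },
    body: policy,
  });
  if (!response.ok) {
    throw new Error(`ILM policy update failed: ${response.status} ${await response.text()}`);
  }
  console.log(`Applied ILM policy "${name}"`);
}

applyPolicy('logs-policy', 'logs-policy.json').catch((err) => {
  console.error(err);
  process.exit(1);
});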
Terminal window
# AI Prompt: "Generate monitoring debugging commands"
# Prometheus queries
# Check scrape targets
curl http://prometheus:9090/api/v1/targets | jq '.data.activeTargets[] | select(.health != "up")'
# Check TSDB symbol table size (grows with series cardinality)
curl -G http://prometheus:9090/api/v1/query \
--data-urlencode 'query=prometheus_tsdb_symbol_table_size_bytes'
# Elasticsearch cluster health
curl -X GET "elasticsearch:9200/_cluster/health?pretty"
curl -X GET "elasticsearch:9200/_cat/indices?v&s=store.size:desc"
# Jaeger service dependencies
curl "http://jaeger-query:16686/api/dependencies?endTs=$(date +%s)000&lookback=3600000"
# OpenTelemetry Collector metrics
curl http://otel-collector:8888/metrics