
Kubernetes Monitoring

Set up comprehensive monitoring with Prometheus and Grafana, alerting with Alertmanager, logging with Loki, and observability best practices

Observability in Kubernetes

Monitoring Kubernetes clusters rests on three pillars: metrics (numerical data about system behavior), logs (discrete events from applications and system components), and traces (request flows across distributed services). Together, these provide the visibility needed to operate production clusters reliably.

The Three Pillars of Observability

  • Metrics: Time-series numerical data — CPU usage, request rates, error rates, latency (Prometheus)
  • Logs: Structured or unstructured text records from applications and infrastructure (EFK/Loki)
  • Traces: End-to-end request tracking across microservices (Jaeger, OpenTelemetry)

Prometheus + Grafana Stack

The Prometheus + Grafana stack is the industry standard for Kubernetes monitoring. Prometheus collects and stores metrics, Grafana provides visualization and dashboards, and Alertmanager handles alerting.
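
Once metrics are flowing, Grafana dashboards are built from PromQL queries against Prometheus. As a quick taste, this query charts the per-second request rate by path for the http_requests_total counter instrumented later in this lesson:

# PromQL: per-second HTTP request rate by path, averaged over 5 minutes
sum(rate(http_requests_total[5m])) by (path)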

Installing with Helm

# Install the kube-prometheus-stack (Prometheus + Grafana + Alertmanager)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update

helm install monitoring prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --create-namespace \
  --set grafana.adminPassword=mySecurePassword \
  --set prometheus.prometheusSpec.retention=30d \
  --set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi

# Verify installation
kubectl get pods -n monitoring

# Access Grafana dashboard
kubectl port-forward svc/monitoring-grafana 3000:80 -n monitoring
# Open http://localhost:3000 (admin / mySecurePassword)
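
Prometheus also ships with its own web UI, which is handy for checking scrape targets and trying out queries. The service name below assumes the "monitoring" release used above; it can vary between chart versions, so confirm it with kubectl get svc first.

# List services to find the Prometheus service created by the chart
kubectl get svc -n monitoring

# Forward the Prometheus UI (name assumes the "monitoring" release)
kubectl port-forward svc/monitoring-kube-prometheus-prometheus 9090:9090 -n monitoring
# Open http://localhost:9090/targets to verify scrape targets are healthy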

ServiceMonitor for Custom Applications

# service-monitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: api-server-monitor
  namespace: monitoring
  labels:
    release: monitoring   # Must match Prometheus Helm release
spec:
  namespaceSelector:
    matchNames:
    - production
  selector:
    matchLabels:
      app: api-server
  endpoints:
  - port: metrics
    path: /metrics
    interval: 15s
    scrapeTimeout: 10s
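
A ServiceMonitor selects Services, not Pods, so the application needs a Service whose labels and port name line up with the selector and endpoint above. A minimal sketch, assuming the app serves metrics on container port 3000 (the port number is illustrative):

# service.yaml - the Service the ServiceMonitor selects
apiVersion: v1
kind: Service
metadata:
  name: api-server
  namespace: production
  labels:
    app: api-server          # Matched by the ServiceMonitor's selector
spec:
  selector:
    app: api-server
  ports:
  - name: metrics            # Must match the endpoint port name above
    port: 3000
    targetPort: 3000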

Instrumenting a Node.js Application

// metrics.ts - Prometheus metrics for a Node.js app
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';
import type { Request, Response, NextFunction } from 'express';

const register = new Registry();

// Collect default Node.js metrics (CPU, memory, event loop)
collectDefaultMetrics({ register });

// Custom metrics
export const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'path', 'status_code'],
  registers: [register],
});

export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'path', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.3, 0.5, 1, 2, 5],
  registers: [register],
});

export const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register],
});

// Express middleware
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now();
  activeConnections.inc();

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const labels = { method: req.method, path: req.route?.path || req.path, status_code: res.statusCode };
    httpRequestsTotal.inc(labels);
    httpRequestDuration.observe(labels, duration);
    activeConnections.dec();
  });
  next();
}

// Metrics endpoint
export async function metricsHandler(_req: Request, res: Response) {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
}
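
To put these exports to work, register the middleware before your routes and expose the /metrics endpoint for Prometheus to scrape. A minimal wiring sketch (the file name and port are assumptions):

// server.ts - wiring the metrics into an Express app
import express from 'express';
import { metricsMiddleware, metricsHandler } from './metrics';

const app = express();

app.use(metricsMiddleware);           // Record metrics for every request
app.get('/metrics', metricsHandler);  // Scraped by Prometheus via the ServiceMonitor

app.get('/healthz', (_req, res) => { res.send('ok'); });

app.listen(3000, () => console.log('API listening on :3000'));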

Alerting with Alertmanager

# prometheus-rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: application-alerts
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
  - name: application
    rules:
    - alert: HighErrorRate
      expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "High error rate detected"
        description: "Error rate is above 5% for the last 5 minutes"

    - alert: HighLatency
      expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High p95 latency detected"
        description: "95th percentile latency is above 1 second"

    - alert: PodCrashLooping
      expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: "Pod is crash-looping"
        description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is restarting frequently"

Logging with Loki

# Install Loki stack (Loki + Promtail)
helm repo add grafana https://grafana.github.io/helm-charts
helm install loki grafana/loki-stack \
  --namespace monitoring \
  --set grafana.enabled=false \
  --set promtail.enabled=true \
  --set loki.persistence.enabled=true \
  --set loki.persistence.size=50Gi
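
For the logs to show up in Grafana, add Loki as a data source. With kube-prometheus-stack, one declarative option is the chart's grafana.additionalDataSources value; this sketch assumes the Loki service from the release above is reachable at loki:3100 inside the monitoring namespace:

# loki-datasource-values.yaml
# Apply with: helm upgrade monitoring prometheus-community/kube-prometheus-stack -n monitoring -f loki-datasource-values.yaml
grafana:
  additionalDataSources:
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy

You can then explore logs with LogQL in Grafana's Explore view, e.g. {namespace="production"} |= "error".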

Key Takeaways

  • Monitor metrics (Prometheus), logs (Loki/EFK), and traces (Jaeger) for full observability
  • kube-prometheus-stack provides a batteries-included monitoring setup
  • Instrument your applications with Prometheus client libraries and expose /metrics
  • Set up PrometheusRules for proactive alerting on error rates, latency, and pod health
  • Use Grafana dashboards for visualization and Alertmanager for notification routing
