TechLead
Lesson 20 of 22
5 min read
Performance Engineering

Monitoring & Alerting

Build real-time performance dashboards, set up alerting on regressions, and implement Real User Monitoring (RUM)

Why Performance Monitoring Matters

Lab testing catches issues during development, but only Real User Monitoring (RUM) captures what users actually experience in production. Network conditions, device capabilities, geographic distribution, and third-party scripts all affect real performance. Monitoring provides the continuous feedback loop needed to maintain and improve performance over time.

Monitoring Layers

  • Real User Monitoring (RUM): Collect actual user performance data from browsers
  • Synthetic Monitoring: Automated tests from fixed locations on a schedule
  • Server Monitoring: Track server response times, error rates, and resource usage
  • Application Performance Monitoring (APM): Trace individual requests through the entire stack
  • Alerting: Automated notifications when metrics cross defined thresholds

Building a RUM Pipeline

// lib/rum.ts — Real User Monitoring client
import { onLCP, onINP, onCLS, onFCP, onTTFB, type Metric } from 'web-vitals';

/**
 * JSON document the RUM client posts to the collection endpoint.
 */
interface RUMPayload {
  // --- Identity of the report ---
  /** Identifier generated once per collector instance. */
  sessionId: string;
  /** Page address, taken from `window.location.href`. */
  url: string;
  /** `Date.now()` value captured when the report is assembled. */
  timestamp: number;

  // --- Segmentation dimensions ---
  /** `navigator.connection.effectiveType`, or `'unknown'` when unavailable. */
  connection: string;
  /** `'mobile'`, `'tablet'` or `'desktop'`, derived from the viewport width. */
  deviceType: string;

  // --- Measurements ---
  /** Latest value recorded for each metric, keyed by metric name. */
  metrics: Record<string, number>;
}

/**
 * Narrow view of the Network Information API read by the collector.
 * `navigator.connection` is not implemented by every browser, so it is optional.
 */
type NavigatorWithConnection = Navigator & {
  connection?: { effectiveType?: string };
};

/**
 * Browser-side Real User Monitoring collector.
 *
 * Subscribes to the web-vitals callbacks (LCP, INP, CLS, FCP, TTFB), keeps the
 * latest value of each metric, and posts a single JSON report to `/api/rum`
 * with `navigator.sendBeacon` when the page is hidden or unloaded.
 *
 * NOTE: only one report is sent per collector instance. Metric updates that
 * arrive after that report has been queued are not transmitted.
 */
class RUMCollector {
  private readonly sessionId: string;
  /** Latest value seen for each metric, keyed by `metric.name`. */
  private readonly metrics: Record<string, number> = {};
  /** True once a beacon has been accepted by the browser; blocks duplicates. */
  private reported = false;

  constructor() {
    this.sessionId = RUMCollector.createSessionId();
    this.initObservers();
    this.reportOnUnload();
  }

  /**
   * Returns a random session identifier.
   *
   * `crypto.randomUUID` is only exposed in secure contexts, so fall back to a
   * time/random based id instead of throwing from the constructor.
   */
  private static createSessionId(): string {
    if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {
      return crypto.randomUUID();
    }
    return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2)}`;
  }

  /** Registers one web-vitals callback per tracked metric. */
  private initObservers(): void {
    onLCP((m) => this.collect(m));
    onINP((m) => this.collect(m));
    onCLS((m) => this.collect(m));
    onFCP((m) => this.collect(m));
    onTTFB((m) => this.collect(m));
  }

  /** Stores the metric value, overwriting any earlier value with the same name. */
  private collect(metric: Metric): void {
    this.metrics[metric.name] = metric.value;
  }

  /** Returns the effective connection type, or `'unknown'` when not exposed. */
  private getConnectionType(): string {
    const nav = navigator as NavigatorWithConnection;
    return nav.connection?.effectiveType || 'unknown';
  }

  /** Buckets the viewport width into `'mobile'` / `'tablet'` / `'desktop'`. */
  private getDeviceType(): string {
    const width = window.innerWidth;
    if (width < 768) return 'mobile';
    if (width < 1024) return 'tablet';
    return 'desktop';
  }

  /**
   * Sends the report when the document becomes hidden or on `pagehide`,
   * whichever successfully queues a beacon first.
   */
  private reportOnUnload(): void {
    const report = (): void => {
      if (this.reported) return;

      const payload: RUMPayload = {
        sessionId: this.sessionId,
        url: window.location.href,
        timestamp: Date.now(),
        connection: this.getConnectionType(),
        deviceType: this.getDeviceType(),
        metrics: this.metrics,
      };

      // sendBeacon returns false when the browser could not queue the request.
      // Only mark the report as done on success so the other lifecycle event
      // still gets a chance to deliver it.
      const queued =
        typeof navigator.sendBeacon === 'function' &&
        navigator.sendBeacon('/api/rum', JSON.stringify(payload));
      this.reported = queued;
    };

    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState === 'hidden') report();
    });
    window.addEventListener('pagehide', report);
  }
}

// Start collecting as soon as this module is evaluated, but only in an
// environment that exposes a global `window`.
const hasWindow = typeof window !== 'undefined';
if (hasWindow) {
  new RUMCollector();
}

Server-Side Monitoring

// middleware.ts — Server-side performance tracking
import { NextRequest, NextResponse } from 'next/server';

/**
 * Middleware that times its own execution and exposes the result through a
 * `Server-Timing` response header.
 *
 * The measured span covers only the code between the two `Date.now()` calls,
 * i.e. the synchronous `NextResponse.next()` call. Runs longer than 100 ms
 * are logged with `console.warn`.
 *
 * @param request - Incoming request; only `nextUrl.pathname` is read, for logging.
 * @returns The response produced by `NextResponse.next()` with the header added.
 */
export function middleware(request: NextRequest) {
  const startedAt = Date.now();
  const res = NextResponse.next();
  const duration = Date.now() - startedAt;

  const headerValue = `middleware;dur=${duration};desc="Edge Middleware"`;
  res.headers.set('Server-Timing', headerValue);

  // Surface unusually slow runs in the server logs.
  const exceededBudget = duration > 100;
  if (exceededBudget) {
    console.warn(`Slow middleware: ${request.nextUrl.pathname} took ${duration}ms`);
  }

  return res;
}

// API route with detailed timing
// app/api/products/route.ts (assumes `NextRequest` and a `prisma` client are imported in this file)
/**
 * Returns up to 20 products as JSON and reports how long each phase took in a
 * `Server-Timing` header with three entries: `db`, `serialize` and `total`.
 *
 * @param request - Incoming request (not read by this handler).
 * @returns A JSON `Response` carrying the serialized products.
 */
export async function GET(request: NextRequest) {
  const start = Date.now();

  // Phase 1: database query.
  const dbStart = Date.now();
  const products = await prisma.product.findMany({ take: 20 });
  const dbEntry = `db;dur=${Date.now() - dbStart};desc="Database Query"`;

  // Phase 2: JSON serialization.
  const serStart = Date.now();
  const json = JSON.stringify(products);
  const serializeEntry = `serialize;dur=${Date.now() - serStart};desc="Serialization"`;

  // Whole handler, measured from the first line.
  const totalEntry = `total;dur=${Date.now() - start};desc="Total"`;

  const headers = {
    'Content-Type': 'application/json',
    'Server-Timing': [dbEntry, serializeEntry, totalEntry].join(', '),
  };
  return new Response(json, { headers });
}

Setting Up Alerts

// alert-rules.ts — Define alerting thresholds
/**
 * One alerting threshold: which metric to watch, when it counts as breached,
 * and where the notification goes.
 */
interface AlertRule {
  /** Name of the metric being evaluated. */
  metric: string;
  /** Whether the alert fires when the value is above or below `threshold`. */
  condition: 'above' | 'below';
  /** Boundary value the metric is compared against. */
  threshold: number;
  /** Time window for evaluation. */
  window: string;
  /** How serious a breach of this rule is. */
  severity: 'critical' | 'warning' | 'info';
  /** Notification channel. */
  channel: string;
}

/**
 * Alerting thresholds, one entry per watched metric. Every rule fires when
 * the metric rises above its threshold within the given window.
 */
const alertRules: AlertRule[] = [
  // Web-vitals percentiles, reported to the performance Slack channel.
  { metric: 'lcp_p75', condition: 'above', threshold: 2500, window: '15m', severity: 'critical', channel: 'slack-performance' },
  { metric: 'cls_p75', condition: 'above', threshold: 0.1, window: '15m', severity: 'warning', channel: 'slack-performance' },

  // Error rate is the only rule routed to PagerDuty.
  { metric: 'error_rate', condition: 'above', threshold: 0.05, window: '5m', severity: 'critical', channel: 'pagerduty' },

  // Tail latency of server responses.
  { metric: 'p99_response_time', condition: 'above', threshold: 3000, window: '10m', severity: 'warning', channel: 'slack-performance' },
];

/**
 * Posts a performance alert to the Slack webhook configured in the
 * `SLACK_WEBHOOK_URL` environment variable.
 *
 * The message is sent both as plain `text` and as a single mrkdwn section
 * block listing metric, current value, threshold, window and severity.
 *
 * @param rule - The alert rule whose threshold was crossed.
 * @param currentValue - The metric value that triggered the alert.
 * @throws Error when `SLACK_WEBHOOK_URL` is not set, or when the webhook
 *   answers with a non-2xx status (so a dropped alert is never silent).
 */
async function sendSlackAlert(rule: AlertRule, currentValue: number): Promise<void> {
  const webhookUrl = process.env.SLACK_WEBHOOK_URL;
  if (!webhookUrl) {
    throw new Error('SLACK_WEBHOOK_URL is not set; cannot send Slack alert');
  }

  // One emoji per severity so an informational alert is not shown as a warning.
  const emojiBySeverity: Record<AlertRule['severity'], string> = {
    critical: ':rotating_light:',
    warning: ':warning:',
    info: ':information_source:',
  };
  const emoji = emojiBySeverity[rule.severity];

  const response = await fetch(webhookUrl, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      text: `${emoji} Performance Alert: ${rule.metric} is ${currentValue} (threshold: ${rule.threshold}) over the last ${rule.window}`,
      blocks: [
        {
          type: 'section',
          text: {
            type: 'mrkdwn',
            text: `*${emoji} Performance Alert*\n*Metric:* ${rule.metric}\n*Current:* ${currentValue}\n*Threshold:* ${rule.threshold}\n*Window:* ${rule.window}\n*Severity:* ${rule.severity}`,
          },
        },
      ],
    }),
  });

  // fetch only rejects on network failure; HTTP errors must be checked explicitly.
  if (!response.ok) {
    throw new Error(
      `Slack webhook responded with ${response.status} ${response.statusText}`,
    );
  }
}

Monitoring Best Practices

  • Collect RUM data: Lab tests miss real-world variability — always measure real users
  • Track percentiles, not averages: Averages hide the slow tail, while p75 and p95 show what your slower users actually experience
  • Use Server-Timing headers: Make server-side timing visible to client-side monitoring
  • Alert on regressions: Detect when metrics cross thresholds and notify immediately
  • Segment data: Analyze by device type, connection, geography, and page type

Continue Learning