
LLMOps Fundamentals

Learn the LLM lifecycle including monitoring, evaluation, deployment, and cost management

What is LLMOps?

LLMOps (Large Language Model Operations) is the set of practices, tools, and processes for managing LLM-powered applications in production. It extends traditional MLOps with LLM-specific concerns: prompt versioning, evaluation pipelines, cost tracking, latency optimization, and model lifecycle management.

LLMOps Pillars

  • Prompt Management: Version, test, and deploy prompts like code
  • Evaluation: Automated quality testing across model and prompt changes (see the sketch after this list)
  • Monitoring: Track latency, errors, costs, and quality in real-time
  • Cost Management: Optimize token usage, caching, and model selection
  • Deployment: Blue-green deployments, A/B testing, gradual rollouts
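
The sketch below illustrates the evaluation pillar: run a fixed suite of test cases through a prompt and average a score. runPrompt and scoreAnswer are hypothetical hooks you would wire to your actual LLM call and grading logic (exact match, embedding similarity, or an LLM judge).

// Minimal eval harness (sketch; runPrompt and scoreAnswer are hypothetical hooks)
interface EvalCase {
  input: string;
  expected: string;
}

async function runEvalSuite(
  cases: EvalCase[],
  runPrompt: (input: string) => Promise<string>,
  scoreAnswer: (output: string, expected: string) => number, // returns 0..1
): Promise<number> {
  let total = 0;
  for (const c of cases) {
    const output = await runPrompt(c.input);
    total += scoreAnswer(output, c.expected);
  }
  return total / cases.length; // gate deploys on this average (e.g., require >= 0.9)
}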

Prompt Management

// Prompt versioning system
interface PromptVersion {
  id: string;
  name: string;
  version: number;
  template: string;
  model: string;
  temperature: number;
  createdAt: Date;
  metrics?: { avgLatency: number; avgCost: number; avgQuality: number };
}

class PromptRegistry {
  private prompts = new Map<string, PromptVersion[]>();

  register(name: string, config: Omit<PromptVersion, "id" | "name" | "version" | "createdAt">): PromptVersion {
    const versions = this.prompts.get(name) || [];
    const version: PromptVersion = {
      ...config,
      id: `${name}_v${versions.length + 1}`,
      name,
      version: versions.length + 1,
      createdAt: new Date(),
    };
    versions.push(version);
    this.prompts.set(name, versions);
    return version;
  }

  getLatest(name: string): PromptVersion | undefined {
    const versions = this.prompts.get(name);
    return versions?.[versions.length - 1];
  }

  getVersion(name: string, version: number): PromptVersion | undefined {
    return this.prompts.get(name)?.[version - 1];
  }

  // A/B test between two versions
  getABVersion(name: string, vA: number, vB: number, ratio = 0.5): PromptVersion | undefined {
    return Math.random() < ratio ? this.getVersion(name, vA) : this.getVersion(name, vB);
  }
}

// Usage
const registry = new PromptRegistry();

registry.register("customer_support", {
  template: "You are a helpful support agent. Answer: {question}",
  model: "claude-sonnet-4-20250514",
  temperature: 0.3,
});

registry.register("customer_support", {
  template: "You are a concise support agent. Context: {context}\nQuestion: {question}",
  model: "claude-sonnet-4-20250514",
  temperature: 0.1,
});
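
The templates above use {placeholder} slots. A minimal renderer for that format (a sketch, not part of any particular library) could look like:

// Hypothetical renderer for the {placeholder} template format used above
function renderTemplate(version: PromptVersion, vars: Record<string, string>): string {
  return version.template.replace(/\{(\w+)\}/g, (_, key) => vars[key] ?? `{${key}}`);
}

const latest = registry.getLatest("customer_support")!;
const filled = renderTemplate(latest, {
  context: "Order #1234, shipped yesterday",
  question: "Where is my order?",
});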

Monitoring and Observability

# LLM monitoring with LangSmith / custom logging
import functools
import json
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta

@dataclass
class LLMTrace:
    trace_id: str
    timestamp: str
    model: str
    prompt_name: str
    prompt_version: int
    input_tokens: int
    output_tokens: int
    latency_ms: float
    cost_usd: float
    status: str  # "success", "error", "guardrail_triggered"
    error: str = ""
    metadata: dict | None = None

class LLMMonitor:
    def __init__(self):
        self.traces: list[LLMTrace] = []

    def track(self, trace: LLMTrace):
        self.traces.append(trace)
        # In production: send to logging service (Datadog, LangSmith, etc.)
        print(json.dumps(asdict(trace), indent=2))

    def get_metrics(self, prompt_name: str | None = None, hours: int = 24) -> dict:
        # Only consider traces within the requested time window
        cutoff = datetime.now() - timedelta(hours=hours)
        relevant = [t for t in self.traces if datetime.fromisoformat(t.timestamp) >= cutoff]
        if prompt_name:
            relevant = [t for t in relevant if t.prompt_name == prompt_name]

        if not relevant:
            return {}

        latencies = sorted(t.latency_ms for t in relevant)
        return {
            "total_requests": len(relevant),
            "avg_latency_ms": sum(t.latency_ms for t in relevant) / len(relevant),
            "total_cost_usd": sum(t.cost_usd for t in relevant),
            "error_rate": sum(1 for t in relevant if t.status == "error") / len(relevant),
            "avg_input_tokens": sum(t.input_tokens for t in relevant) / len(relevant),
            "avg_output_tokens": sum(t.output_tokens for t in relevant) / len(relevant),
            "p99_latency_ms": latencies[int(len(latencies) * 0.99)],
        }

monitor = LLMMonitor()

# Approximate per-request cost; rates here are Claude Sonnet 4 list prices
# ($3/M input, $15/M output) -- a real implementation would look up rates per model
def calculate_cost(usage) -> float:
    return (usage.input_tokens * 3 + usage.output_tokens * 15) / 1_000_000

# Decorator for automatic monitoring of async LLM calls
def monitored(prompt_name: str, prompt_version: int = 1):
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            trace_id = f"trace_{time.time_ns()}"
            start = time.time()
            try:
                result = await func(*args, **kwargs)
                latency = (time.time() - start) * 1000
                monitor.track(LLMTrace(
                    trace_id=trace_id,
                    timestamp=datetime.now().isoformat(),
                    model=kwargs.get("model", "unknown"),
                    prompt_name=prompt_name,
                    prompt_version=prompt_version,
                    input_tokens=result.usage.input_tokens,
                    output_tokens=result.usage.output_tokens,
                    latency_ms=latency,
                    cost_usd=calculate_cost(result.usage),
                    status="success",
                ))
                return result
            except Exception as e:
                monitor.track(LLMTrace(
                    trace_id=trace_id,
                    timestamp=datetime.now().isoformat(),
                    model=kwargs.get("model", "unknown"),
                    prompt_name=prompt_name,
                    prompt_version=prompt_version,
                    input_tokens=0, output_tokens=0,
                    latency_ms=(time.time() - start) * 1000,
                    cost_usd=0, status="error", error=str(e),
                ))
                raise
        return wrapper
    return decorator

# Usage: wrap any async function that returns an Anthropic-style response
# @monitored("customer_support", prompt_version=2)
# async def answer(question: str, model: str = "claude-sonnet-4-20250514"): ...

Cost Optimization

// Cost optimization strategies
import Anthropic from "@anthropic-ai/sdk";

const client = new Anthropic();

// 1. Prompt caching - reuse cached system prompts
const longSystemPrompt = "..."; // placeholder: a large, stable system prompt worth caching

async function cachedQuery(userMessage: string) {
  return client.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 1024,
    system: [
      {
        type: "text",
        text: longSystemPrompt, // This gets cached after first call
        cache_control: { type: "ephemeral" },
      },
    ],
    messages: [{ role: "user", content: userMessage }],
  });
}

// 2. Semantic caching - cache similar query results
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

class SemanticCache {
  private cache = new Map<string, { response: string; embedding: number[]; timestamp: number }>();
  private threshold = 0.95; // Similarity threshold

  async get(query: string, queryEmbedding: number[]): Promise<string | null> {
    for (const [, entry] of this.cache) {
      const similarity = cosineSimilarity(queryEmbedding, entry.embedding);
      if (similarity > this.threshold) {
        return entry.response;
      }
    }
    return null;
  }

  set(query: string, response: string, embedding: number[]): void {
    this.cache.set(query, { response, embedding, timestamp: Date.now() });
  }
}

// 3. Model routing - use cheaper models for simple tasks
// Hypothetical classifier: a real system might use a heuristic or a fast Haiku call
async function classifyComplexity(query: string): Promise<"simple" | "complex"> {
  return query.length < 200 ? "simple" : "complex"; // naive length-based stand-in
}

async function routeToModel(query: string): Promise<string> {
  // Classify query complexity
  const complexity = await classifyComplexity(query);

  const model = complexity === "simple"
    ? "claude-3-5-haiku-20241022"  // $0.80/$4 per M tokens
    : "claude-sonnet-4-20250514"; // $3/$15 per M tokens

  const response = await client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: "user", content: query }],
  });

  return response.content[0].type === "text" ? response.content[0].text : "";
}

LLMOps Tool Landscape

Category            Tools                            Purpose
Observability       LangSmith, Langfuse, Helicone    Tracing, logging, debugging
Evaluation          RAGAS, Braintrust, Promptfoo     Automated quality testing
Prompt Management   LangSmith Hub, PromptLayer       Version and manage prompts
Gateway             LiteLLM, Portkey, Helicone       Model routing, fallback, caching
Cost Tracking       Helicone, custom logging         Track spend per model/feature

LLMOps Checklist for Production

  • Logging: Log every LLM call with input, output, latency, tokens, and cost
  • Evaluation: Run automated evals on every prompt/model change before deploying
  • Alerting: Set alerts for error rate spikes, latency increases, and cost anomalies
  • Caching: Implement semantic caching for frequently asked questions
  • Fallbacks: Configure model fallbacks (e.g., Claude -> GPT-4) for resilience (see the sketch after this checklist)
  • Rate limiting: Implement per-user rate limits to control costs and prevent abuse
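
A minimal sketch of the fallback item above; the primary and fallback callables are stand-ins for whichever provider clients you actually use:

// Generic fallback wrapper: try the primary model, fall back on failure
async function withFallback<T>(
  primary: () => Promise<T>,
  fallback: () => Promise<T>,
): Promise<T> {
  try {
    return await primary();
  } catch (err) {
    console.warn("Primary model failed, using fallback:", err);
    return fallback(); // e.g., a different provider behind the same interface
  }
}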

Summary

LLMOps is the discipline that makes LLM applications reliable, cost-effective, and maintainable in production. Start with logging and monitoring from day one, add automated evaluation as a CI/CD step, implement caching for cost optimization, and use model routing to balance quality with cost. The tool ecosystem is maturing rapidly; LangSmith, Langfuse, and Helicone are good starting points.
