What is LLMOps?
LLMOps (Large Language Model Operations) is the set of practices, tools, and processes for managing LLM-powered applications in production. It extends traditional MLOps with LLM-specific concerns: prompt versioning, evaluation pipelines, cost tracking, latency optimization, and model lifecycle management.
LLMOps Pillars
- Prompt Management: Version, test, and deploy prompts like code
- Evaluation: Automated quality testing across model and prompt changes (a minimal sketch follows this list)
- Monitoring: Track latency, errors, costs, and quality in real-time
- Cost Management: Optimize token usage, caching, and model selection
- Deployment: Blue-green deployments, A/B testing, gradual rollouts
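Of these pillars, evaluation is usually the last one teams add, yet it is cheap to start. Below is a minimal TypeScript sketch of an automated eval gate, assuming a hypothetical runPrompt helper that calls your model and a hand-written list of graded cases; dedicated frameworks such as Promptfoo or Braintrust replace this once you outgrow it.
// Minimal eval harness (sketch). runPrompt is a hypothetical helper that
// renders a prompt version and calls the model.
interface EvalCase {
  input: string;
  check: (output: string) => boolean; // deterministic assertion or LLM-as-judge wrapper
}

async function runEvals(
  runPrompt: (input: string) => Promise<string>,
  cases: EvalCase[],
): Promise<{ passRate: number; failures: string[] }> {
  const failures: string[] = [];
  for (const c of cases) {
    const output = await runPrompt(c.input);
    if (!c.check(output)) failures.push(c.input);
  }
  const passRate = cases.length ? 1 - failures.length / cases.length : 1;
  return { passRate, failures };
}

// In CI: block the deploy if the pass rate drops below a threshold, e.g.
//   const { passRate } = await runEvals(callSupportPrompt, cases);
//   if (passRate < 0.95) process.exit(1);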
Prompt Management
// Prompt versioning system
interface PromptVersion {
  id: string;
  name: string;
  version: number;
  template: string;
  model: string;
  temperature: number;
  createdAt: Date;
  metrics?: { avgLatency: number; avgCost: number; avgQuality: number };
}

class PromptRegistry {
  private prompts = new Map<string, PromptVersion[]>();

  register(name: string, config: Omit<PromptVersion, "id" | "name" | "createdAt" | "version">): PromptVersion {
    const versions = this.prompts.get(name) || [];
    const version: PromptVersion = {
      ...config,
      id: `${name}_v${versions.length + 1}`,
      name,
      version: versions.length + 1,
      createdAt: new Date(),
    };
    versions.push(version);
    this.prompts.set(name, versions);
    return version;
  }

  getLatest(name: string): PromptVersion | undefined {
    const versions = this.prompts.get(name);
    return versions?.[versions.length - 1];
  }

  getVersion(name: string, version: number): PromptVersion | undefined {
    return this.prompts.get(name)?.[version - 1];
  }

  // A/B test between two versions
  getABVersion(name: string, vA: number, vB: number, ratio = 0.5): PromptVersion | undefined {
    return Math.random() < ratio ? this.getVersion(name, vA) : this.getVersion(name, vB);
  }
}
// Usage
const registry = new PromptRegistry();

registry.register("customer_support", {
  template: "You are a helpful support agent. Answer: {question}",
  model: "claude-sonnet-4-20250514",
  temperature: 0.3,
});

registry.register("customer_support", {
  template: "You are a concise support agent. Context: {context}\nQuestion: {question}",
  model: "claude-sonnet-4-20250514",
  temperature: 0.1,
});
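The templates above use {placeholder} tokens, but the registry does not render them; interpolation is left to the caller. A minimal sketch of request-time usage, with a hypothetical renderTemplate helper (not part of the registry) doing simple string substitution:
// Hypothetical helper: fill {placeholders} in a template from a values map.
function renderTemplate(template: string, values: Record<string, string>): string {
  return template.replace(/\{(\w+)\}/g, (_, key) => values[key] ?? `{${key}}`);
}

// Route 50% of traffic to each of versions 1 and 2, then render the chosen template.
const chosen = registry.getABVersion("customer_support", 1, 2, 0.5);
if (chosen) {
  const prompt = renderTemplate(chosen.template, {
    question: "How do I reset my password?",
    context: "Customer is on the Pro plan.",
  });
  // Send `prompt` to chosen.model at chosen.temperature and log chosen.version with the trace.
}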
Monitoring and Observability
# LLM call tracing with custom logging (swap in LangSmith, Langfuse, etc. in production)
import functools
import json
import time
from datetime import datetime, timedelta
from dataclasses import dataclass, asdict
@dataclass
class LLMTrace:
    trace_id: str
    timestamp: str
    model: str
    prompt_name: str
    prompt_version: int
    input_tokens: int
    output_tokens: int
    latency_ms: float
    cost_usd: float
    status: str  # "success", "error", "guardrail_triggered"
    error: str = ""
    metadata: dict | None = None
class LLMMonitor:
    def __init__(self):
        self.traces: list[LLMTrace] = []

    def track(self, trace: LLMTrace):
        self.traces.append(trace)
        # In production: send to a logging service (Datadog, LangSmith, etc.)
        print(json.dumps(asdict(trace), indent=2))

    def get_metrics(self, prompt_name: str | None = None, hours: int = 24) -> dict:
        cutoff = datetime.now() - timedelta(hours=hours)
        relevant = [t for t in self.traces if datetime.fromisoformat(t.timestamp) >= cutoff]
        if prompt_name:
            relevant = [t for t in relevant if t.prompt_name == prompt_name]
        if not relevant:
            return {}
        return {
            "total_requests": len(relevant),
            "avg_latency_ms": sum(t.latency_ms for t in relevant) / len(relevant),
            "total_cost_usd": sum(t.cost_usd for t in relevant),
            "error_rate": sum(1 for t in relevant if t.status == "error") / len(relevant),
            "avg_input_tokens": sum(t.input_tokens for t in relevant) / len(relevant),
            "avg_output_tokens": sum(t.output_tokens for t in relevant) / len(relevant),
            "p99_latency_ms": sorted(t.latency_ms for t in relevant)[int(len(relevant) * 0.99)],
        }
monitor = LLMMonitor()
def calculate_cost(usage) -> float:
    # Simplified: claude-sonnet-4 rates ($3/M input, $15/M output); use a per-model price table in practice.
    return (usage.input_tokens * 3 + usage.output_tokens * 15) / 1_000_000

# Decorator for automatic monitoring
def monitored(prompt_name: str, prompt_version: int = 1):
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            trace_id = f"trace_{time.time_ns()}"
            start = time.time()
            try:
                result = await func(*args, **kwargs)
                latency = (time.time() - start) * 1000
                monitor.track(LLMTrace(
                    trace_id=trace_id,
                    timestamp=datetime.now().isoformat(),
                    model=kwargs.get("model", "unknown"),
                    prompt_name=prompt_name,
                    prompt_version=prompt_version,
                    input_tokens=result.usage.input_tokens,
                    output_tokens=result.usage.output_tokens,
                    latency_ms=latency,
                    cost_usd=calculate_cost(result.usage),
                    status="success",
                ))
                return result
            except Exception as e:
                monitor.track(LLMTrace(
                    trace_id=trace_id,
                    timestamp=datetime.now().isoformat(),
                    model=kwargs.get("model", "unknown"),
                    prompt_name=prompt_name,
                    prompt_version=prompt_version,
                    input_tokens=0, output_tokens=0,
                    latency_ms=(time.time() - start) * 1000,
                    cost_usd=0.0, status="error", error=str(e),
                ))
                raise
        return wrapper
    return decorator
Cost Optimization
// Cost optimization strategies
import Anthropic from "@anthropic-ai/sdk";
const client = new Anthropic();
// 1. Prompt caching - reuse cached system prompts across calls
// longSystemPrompt is assumed to be your large, stable system prompt, defined elsewhere.
declare const longSystemPrompt: string;

async function cachedQuery(userMessage: string) {
  return client.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 1024,
    system: [
      {
        type: "text",
        text: longSystemPrompt, // cached after the first call; later calls read from the cache
        cache_control: { type: "ephemeral" },
      },
    ],
    messages: [{ role: "user", content: userMessage }],
  });
}
// 2. Semantic caching - cache and reuse results for semantically similar queries
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

class SemanticCache {
  private cache = new Map<string, { response: string; embedding: number[]; timestamp: number }>();
  private threshold = 0.95; // cosine similarity required for a cache hit

  async get(query: string, queryEmbedding: number[]): Promise<string | null> {
    for (const [, entry] of this.cache) {
      const similarity = cosineSimilarity(queryEmbedding, entry.embedding);
      if (similarity > this.threshold) {
        return entry.response;
      }
    }
    return null;
  }

  set(query: string, response: string, embedding: number[]): void {
    this.cache.set(query, { response, embedding, timestamp: Date.now() });
  }
}
// 3. Model routing - use cheaper models for simple tasks
// classifyComplexity is assumed to exist: an app-specific classifier (a fast Haiku call or a heuristic).
declare function classifyComplexity(query: string): Promise<"simple" | "complex">;

async function routeToModel(query: string): Promise<string> {
  // Classify query complexity, then pick the cheapest model that can handle it
  const complexity = await classifyComplexity(query);
  const model = complexity === "simple"
    ? "claude-3-5-haiku-latest"    // $0.80/$4 per M tokens
    : "claude-sonnet-4-20250514";  // $3/$15 per M tokens
  const response = await client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: "user", content: query }],
  });
  return response.content[0].type === "text" ? response.content[0].text : "";
}
LLMOps Tool Landscape
| Category | Tools | Purpose |
|---|---|---|
| Observability | LangSmith, Langfuse, Helicone | Tracing, logging, debugging |
| Evaluation | RAGAS, Braintrust, Promptfoo | Automated quality testing |
| Prompt Management | LangSmith Hub, PromptLayer | Version and manage prompts |
| Gateway | LiteLLM, Portkey, Helicone | Model routing, fallback, caching |
| Cost Tracking | Helicone, custom logging | Track spend per model/feature |
LLMOps Checklist for Production
- Logging: Log every LLM call with input, output, latency, tokens, and cost
- Evaluation: Run automated evals on every prompt/model change before deploying
- Alerting: Set alerts for error rate spikes, latency increases, and cost anomalies
- Caching: Implement semantic caching for frequently asked questions
- Fallbacks: Configure model fallbacks (e.g., Claude -> GPT-4) for resilience; a minimal sketch follows this list
- Rate limiting: Implement per-user rate limits to control costs and prevent abuse
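A fallback can be as simple as catching transient provider errors and retrying against a secondary model or provider. A minimal sketch, reusing the Anthropic client from the cost-optimization examples; callFallbackProvider is a hypothetical call to your secondary provider (gateways such as LiteLLM or Portkey can handle this routing for you):
// Hypothetical secondary-provider call (e.g., an OpenAI-backed equivalent), assumed defined elsewhere.
declare function callFallbackProvider(userMessage: string): Promise<string>;

// Try the primary model first; fall back only on transient failures (429 rate limits, 5xx overloads).
async function resilientQuery(userMessage: string): Promise<string> {
  try {
    const response = await client.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 1024,
      messages: [{ role: "user", content: userMessage }],
    });
    return response.content[0].type === "text" ? response.content[0].text : "";
  } catch (err) {
    if (err instanceof Anthropic.APIError && (err.status === 429 || (err.status ?? 0) >= 500)) {
      return callFallbackProvider(userMessage);
    }
    throw err; // non-transient errors (bad request, auth) should surface immediately
  }
}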
Summary
LLMOps is the discipline that makes LLM applications reliable, cost-effective, and maintainable in production. Start with logging and monitoring from day one, add automated evaluation as a CI/CD step, implement caching for cost optimization, and use model routing to balance quality with cost. The tooling ecosystem is maturing rapidly; LangSmith, Langfuse, and Helicone are good starting points.