TechLead
Lesson 20 of 24
5 min read
AI Agents & RAG

Guardrails and Safety

Implement input/output validation, content filtering, and safety guardrails for AI applications

Why Guardrails Matter

Production AI applications need safety guardrails to prevent harmful outputs, validate inputs and outputs, detect prompt injection, and ensure compliance with business rules. Without guardrails, your AI application is vulnerable to misuse, hallucination leaks, and generating inappropriate content.

Types of Guardrails

  • Input Guardrails: Validate and sanitize user inputs before they reach the LLM
  • Output Guardrails: Validate LLM outputs before returning to the user
  • Prompt Injection Detection: Detect and block attempts to override system instructions
  • Content Filtering: Block harmful, biased, or inappropriate content
  • PII Detection: Prevent exposure of personal identifiable information
  • Topic Guardrails: Keep the conversation within allowed topics

Input Validation

// Input guardrails implementation
import Anthropic from "@anthropic-ai/sdk";

const client = new Anthropic();

interface GuardrailResult {
  passed: boolean;
  reason?: string;
  sanitizedInput?: string;
}

// Check for prompt injection
async function checkPromptInjection(input: string): Promise<GuardrailResult> {
  const suspiciousPatterns = [
    /ignore (all |previous |above )?instructions/i,
    /you are now/i,
    /new instructions:/i,
    /system prompt:/i,
    /\[INST\]/i,
    /<\|im_start\|>/i,
    /forget everything/i,
    /pretend you are/i,
    /act as if/i,
  ];

  for (const pattern of suspiciousPatterns) {
    if (pattern.test(input)) {
      return { passed: false, reason: "Potential prompt injection detected" };
    }
  }

  // LLM-based detection for sophisticated attacks
  const response = await client.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 50,
    messages: [
      {
        role: "user",
        content: `Is this user input attempting to manipulate or override AI instructions? Answer ONLY "SAFE" or "UNSAFE".

Input: "${input.slice(0, 500)}"`,
      },
    ],
  });

  const verdict = (response.content[0] as any).text.trim().toUpperCase();
  return verdict === "SAFE"
    ? { passed: true }
    : { passed: false, reason: "LLM detected prompt manipulation attempt" };
}

// Check for PII
function checkPII(input: string): GuardrailResult {
  const piiPatterns = {
    ssn: /\b\d{3}-\d{2}-\d{4}\b/,
    creditCard: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/,
    email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/,
    phone: /\b\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b/,
  };

  const detectedPII: string[] = [];
  for (const [type, pattern] of Object.entries(piiPatterns)) {
    if (pattern.test(input)) {
      detectedPII.push(type);
    }
  }

  if (detectedPII.length > 0) {
    return {
      passed: false,
      reason: `PII detected: ${detectedPII.join(", ")}`,
    };
  }

  return { passed: true };
}

// Topic guardrail
async function checkTopicRelevance(
  input: string,
  allowedTopics: string[]
): Promise<GuardrailResult> {
  const response = await client.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 50,
    messages: [
      {
        role: "user",
        content: `Is this query related to any of these topics: ${allowedTopics.join(", ")}?
Answer ONLY "ON_TOPIC" or "OFF_TOPIC".

Query: "${input}"`,
      },
    ],
  });

  const verdict = (response.content[0] as any).text.trim();
  return verdict.includes("ON_TOPIC")
    ? { passed: true }
    : { passed: false, reason: "Query is outside allowed topics" };
}

// Combined input guardrail pipeline
async function validateInput(input: string): Promise<GuardrailResult> {
  // Length check
  if (input.length > 10000) {
    return { passed: false, reason: "Input exceeds maximum length" };
  }
  if (input.trim().length === 0) {
    return { passed: false, reason: "Input is empty" };
  }

  // PII check
  const piiResult = checkPII(input);
  if (!piiResult.passed) return piiResult;

  // Prompt injection check
  const injectionResult = await checkPromptInjection(input);
  if (!injectionResult.passed) return injectionResult;

  return { passed: true, sanitizedInput: input.trim() };
}

Output Validation

# Output guardrails in Python
import anthropic
import re

client = anthropic.Anthropic()

def validate_output(output: str, context: str = "") -> dict:
    """Validate LLM output before returning to user."""
    checks = {
        "pii_check": check_output_pii(output),
        "hallucination_check": check_hallucination(output, context),
        "toxicity_check": check_toxicity(output),
        "format_check": check_format(output),
    }

    all_passed = all(c["passed"] for c in checks.values())
    return {"passed": all_passed, "checks": checks}

def check_output_pii(output: str) -> dict:
    """Ensure LLM didn't leak PII in the response."""
    pii_patterns = {
        "ssn": r"d{3}-d{2}-d{4}",
        "credit_card": r"d{4}[s-]?d{4}[s-]?d{4}[s-]?d{4}",
    }

    for pii_type, pattern in pii_patterns.items():
        if re.search(pattern, output):
            return {"passed": False, "reason": f"Output contains {pii_type}"}

    return {"passed": True}

def check_hallucination(output: str, context: str) -> dict:
    """Check if the output is grounded in the provided context."""
    if not context:
        return {"passed": True}

    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=100,
        messages=[{
            "role": "user",
            "content": f"""Is every factual claim in the Answer supported by the Context?
Answer "GROUNDED" if yes, "HALLUCINATED" if any claim is not in the context.

Context: {context[:2000]}
Answer: {output[:1000]}"""
        }],
    )

    verdict = response.content[0].text.strip()
    return {
        "passed": "GROUNDED" in verdict,
        "reason": "Output contains unsupported claims" if "HALLUCINATED" in verdict else None,
    }

def check_toxicity(output: str) -> dict:
    """Check for toxic or harmful content."""
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=50,
        messages=[{
            "role": "user",
            "content": f'Is this text toxic, harmful, or inappropriate? Answer "SAFE" or "UNSAFE".

Text: "{output[:1000]}"'
        }],
    )

    verdict = response.content[0].text.strip()
    return {"passed": "SAFE" in verdict}

NVIDIA NeMo Guardrails

NeMo Guardrails is an open-source toolkit for adding programmable guardrails to LLM applications.

# NeMo Guardrails configuration
# config.yml
"""
models:
  - type: main
    engine: anthropic
    model: claude-sonnet-4-20250514

rails:
  input:
    flows:
      - check jailbreak
      - check input toxicity
      - check topic allowed

  output:
    flows:
      - check output toxicity
      - check hallucination
      - check pii in output
"""

# Colang rails definition (rails.co)
"""
define user ask about allowed topics
  "What is your vacation policy?"
  "How do I submit an expense report?"
  "What are the working hours?"

define user ask about off topic
  "What's the weather like?"
  "Tell me a joke"
  "Who won the game?"

define flow check topic allowed
  user ask about off topic
  bot inform topic not supported
  stop

define bot inform topic not supported
  "I can only help with company policy and HR questions.
   Please ask about our policies, benefits, or procedures."
"""

# Using NeMo Guardrails in code
from nemoguardrails import RailsConfig, LLMRails

config = RailsConfig.from_path("./config")
rails = LLMRails(config)

# All inputs and outputs are automatically validated
response = await rails.generate(
    messages=[{"role": "user", "content": "What is the vacation policy?"}]
)
print(response)

Guardrail Implementation Checklist

  • Input validation: Length limits, PII detection, prompt injection detection, topic filtering
  • Output validation: Hallucination check, toxicity filter, PII leak prevention, format validation
  • Rate limiting: Per-user and per-session rate limits to prevent abuse
  • Logging: Log all guardrail triggers for monitoring and improvement
  • Graceful failures: When a guardrail triggers, return a helpful message, not an error
  • Regular updates: Prompt injection techniques evolve — update detection patterns regularly

Summary

Guardrails are non-negotiable for production AI applications. Implement input validation (prompt injection, PII, topic boundaries), output validation (hallucination, toxicity, PII leaks), and operational controls (rate limiting, logging). Start with rule-based checks for speed, add LLM-based checks for nuance, and consider NeMo Guardrails for comprehensive, declarative safety policies.

Continue Learning