RAG-Powered Chatbots
A RAG-powered chatbot combines conversational AI with document retrieval, enabling users to have natural conversations about your specific data. Unlike basic RAG (single question-answer), a chatbot maintains conversation history, handles follow-up questions, and provides a seamless interactive experience.
Chatbot Architecture Components
- Conversation History: Track previous messages for context-aware responses
- Query Reformulation: Rewrite follow-up questions to be standalone for retrieval
- Context Retrieval: Fetch relevant documents for each turn
- Response Generation: Generate answers grounded in retrieved context
- Streaming: Stream responses for real-time user experience
The Query Reformulation Challenge
The biggest challenge in conversational RAG is handling follow-up questions. When a user asks "What about their pricing?", the retriever needs to know what "their" refers to. Query reformulation rewrites the follow-up into a standalone question.
// Query reformulation for conversational RAG
import Anthropic from "@anthropic-ai/sdk";
// Shared Anthropic client used by the query-reformulation helper below.
const client = new Anthropic();
// One conversational turn as stored in chat history.
interface Message {
role: "user" | "assistant";
content: string;
}
/**
 * Rewrite a follow-up question into a standalone question that carries its own
 * context, so the retriever can match it without seeing the conversation.
 *
 * @param currentQuestion - The user's latest (possibly elliptical) question.
 * @param chatHistory - Prior turns; only the last 6 messages are used.
 * @returns The reformulated standalone question, or the original question when
 *          there is no history or the model returns no usable text.
 */
async function reformulateQuery(
  currentQuestion: string,
  chatHistory: Message[]
): Promise<string> {
  // First turn: nothing to resolve, skip the extra model round-trip.
  if (chatHistory.length === 0) return currentQuestion;
  const historyText = chatHistory
    .slice(-6) // Last 3 exchanges — enough for pronoun resolution, keeps the prompt small
    .map(m => `${m.role}: ${m.content}`)
    .join("\n");
  const response = await client.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 256,
    messages: [
      {
        role: "user",
        content: `Given this conversation history and a follow-up question, rewrite the follow-up as a standalone question that includes all necessary context.
Chat History:
${historyText}
Follow-up Question: ${currentQuestion}
Standalone Question:`,
      },
    ],
  });
  // Guard against an empty or non-text first content block (indexing blindly
  // would throw); degrade gracefully to the unmodified question.
  const first = response.content[0];
  return first?.type === "text" ? first.text.trim() : currentQuestion;
}
// Example:
// History: user: "Tell me about Pinecone" / assistant: "Pinecone is a vector DB..."
// Follow-up: "What about their pricing?"
// Reformulated: "What is Pinecone's pricing?"
Full Chatbot Implementation
// Complete RAG chatbot with streaming
import Anthropic from "@anthropic-ai/sdk";
import { ChromaClient } from "chromadb";
// Module-level clients shared by the chatbot: Anthropic for generation,
// Chroma for vector retrieval.
const anthropic = new Anthropic();
const chroma = new ChromaClient();
/**
 * Conversational RAG chatbot: reformulates follow-up questions into standalone
 * queries, retrieves context from a Chroma collection, and generates grounded
 * answers while keeping conversation history bounded.
 */
class RAGChatbot {
  /** Alternating user/assistant turns; capped at MAX_HISTORY entries. */
  private history: Message[] = [];
  // Chroma collection handle. Left loosely typed as in the original;
  // NOTE(review): chromadb exports a Collection type — worth adopting.
  private collection: any;
  /** Max stored messages (10 exchanges) to avoid context-window overflow. */
  private static readonly MAX_HISTORY = 20;

  /** Open an existing Chroma collection. Must be called before chat/chatStream. */
  async init(collectionName: string) {
    this.collection = await chroma.getCollection({ name: collectionName });
  }

  /** Fetch top-5 documents for a standalone query; returns joined context and de-duplicated source names. */
  private async retrieve(
    standaloneQuery: string
  ): Promise<{ context: string; sources: string[] }> {
    const results = await this.collection.query({
      queryTexts: [standaloneQuery],
      nResults: 5,
    });
    const context = results.documents?.[0]?.join("\n\n") || "";
    const sources: string[] =
      results.metadatas?.[0]
        ?.map((m: any) => m.source)
        .filter((v: string, i: number, a: string[]) => a.indexOf(v) === i) || [];
    return { context, sources };
  }

  /** Build the API message list: prior history plus the new user turn. */
  private buildMessages(userMessage: string) {
    return [
      ...this.history.map(m => ({
        role: m.role as "user" | "assistant",
        content: m.content,
      })),
      { role: "user" as const, content: userMessage },
    ];
  }

  /** Record a completed exchange and trim history to the cap. */
  private pushExchange(userMessage: string, answer: string): void {
    this.history.push({ role: "user", content: userMessage });
    this.history.push({ role: "assistant", content: answer });
    // Trim history to prevent context overflow (slice keeps whole pairs
    // because entries are always pushed two at a time).
    if (this.history.length > RAGChatbot.MAX_HISTORY) {
      this.history = this.history.slice(-RAGChatbot.MAX_HISTORY);
    }
  }

  /**
   * Run one conversational turn: reformulate, retrieve, generate, update history.
   * @returns The answer, followed by a markdown source footer when sources exist.
   */
  async chat(userMessage: string): Promise<string> {
    // Step 1: Reformulate query using conversation history
    const standaloneQuery = await reformulateQuery(userMessage, this.history);
    // Step 2: Retrieve relevant documents
    const { context, sources } = await this.retrieve(standaloneQuery);
    // Step 3: Build the grounded system prompt (string preserved verbatim)
    const systemPrompt = `You are a helpful assistant that answers questions based on the provided knowledge base.
Rules:
- Only use information from the context provided
- If you don't know, say so — don't make things up
- Be conversational and helpful
- Reference sources when possible
- Keep answers concise but complete
Context from knowledge base:
${context}`;
    // Step 4: Generate response
    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 1024,
      system: systemPrompt,
      messages: this.buildMessages(userMessage),
    });
    // Guard: content may be empty or start with a non-text block.
    const first = response.content[0];
    const answer = first?.type === "text" ? first.text : "";
    // Step 5: Update (and trim) history
    this.pushExchange(userMessage, answer);
    return `${answer}${sources.length ? `\n\n_Sources: ${sources.join(", ")}_` : ""}`;
  }

  /** Streaming version for real-time UI; yields text deltas as they arrive. */
  async *chatStream(userMessage: string): AsyncGenerator<string> {
    const standaloneQuery = await reformulateQuery(userMessage, this.history);
    const { context } = await this.retrieve(standaloneQuery);
    const stream = anthropic.messages.stream({
      model: "claude-sonnet-4-20250514",
      max_tokens: 1024,
      system: `Answer from this context:\n${context}`,
      messages: this.buildMessages(userMessage),
    });
    let fullResponse = "";
    for await (const event of stream) {
      if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
        fullResponse += event.delta.text;
        yield event.delta.text;
      }
    }
    // Fix: the streaming path previously appended to history without ever
    // trimming it, so long streaming sessions grew history without bound.
    this.pushExchange(userMessage, fullResponse);
  }

  /** Reset the conversation (retrieval collection stays attached). */
  clearHistory(): void {
    this.history = [];
  }
}
// Usage: each question builds on the previous answers via conversation history.
const bot = new RAGChatbot();
await bot.init("knowledge_base");
const questions = [
  "What is your vacation policy?",
  "How does that compare to sick leave?",
  "Can I carry over unused days?",
];
for (const question of questions) {
  console.log(await bot.chat(question));
}
Python Implementation with LangChain
# Conversational RAG chatbot with LangChain
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
# Setup: LLM for generation + reformulation, embeddings for retrieval.
# NOTE(review): the persisted index must have been built with the same
# embedding model — confirm against the ingestion script.
llm = ChatAnthropic(model="claude-sonnet-4-20250514")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(persist_directory="./db", embedding_function=embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # top-5 chunks per query
# Step 1: Create history-aware retriever (query reformulation built-in).
# The wrapper runs the LLM over this prompt to rewrite the question before retrieval.
contextualize_prompt = ChatPromptTemplate.from_messages([
("system", """Given the chat history and latest question,
reformulate it as a standalone question. Do NOT answer it,
just reformulate if needed, otherwise return it as is."""),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
])
history_aware_retriever = create_history_aware_retriever(
llm, retriever, contextualize_prompt
)
# Step 2: Create QA chain ("stuff" = concatenate all retrieved docs into {context}).
qa_prompt = ChatPromptTemplate.from_messages([
("system", """Answer based on the context below. If you don't know, say so.
Be conversational and cite sources.
Context: {context}"""),
MessagesPlaceholder("chat_history"),
("human", "{input}"),
])
qa_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)
# Step 3: Manage conversation state manually (list of LangChain message objects).
chat_history = []
def chat(user_input: str) -> str:
    """Run one conversational turn through the RAG chain and record it in history."""
    response = rag_chain.invoke(
        {"input": user_input, "chat_history": chat_history}
    )
    answer = response["answer"]
    # Persist both sides of the turn so later questions can be reformulated.
    chat_history.extend(
        [HumanMessage(content=user_input), AIMessage(content=answer)]
    )
    return answer
# Conversation: the second and third questions rely on history-aware
# reformulation to resolve "those limits" and the implied product context.
print(chat("What are the API rate limits?"))
print(chat("Can I increase those limits?"))
print(chat("How much does the premium plan cost?"))
Chatbot Best Practices
- Always reformulate queries: Without reformulation, follow-up questions that use pronouns or implicit references ("their pricing", "those limits") typically retrieve irrelevant documents
- Limit history size: Keep the last 10-20 messages. Summarize older history to save context window space
- Stream responses: Users expect real-time feedback. Always stream in production chatbots
- Handle out-of-scope gracefully: Write the system prompt so the model acknowledges when a question falls outside the knowledge base instead of guessing
- Show sources: Display which documents were used to build trust and enable verification
Summary
Building a RAG chatbot requires solving the query reformulation problem for conversational context, managing conversation history efficiently, and providing streaming responses for good UX. Whether you build from scratch with Anthropic's API or use LangChain's pre-built chains, the core pattern is: reformulate, retrieve, generate, update history. Add source attribution and graceful out-of-scope handling for a production-ready experience.