Why Hybrid Search?
Neither keyword search nor semantic search is perfect on its own. Keyword search (BM25) excels at finding exact terms, product names, error codes, and technical identifiers. Semantic search excels at understanding meaning, synonyms, and intent. Hybrid search combines both to get the best of both worlds.
When Semantic Search Fails
- Exact names: An identifier like "ERROR_CODE_4032" carries little semantic meaning, so embeddings may miss it, but keyword search finds it instantly
- Rare terms: Domain jargon may not have good embedding representations
- Abbreviations: "K8s" and "Kubernetes" may be far apart in embedding space
- Code search: Function names and variable names need exact matching
BM25: The Keyword Search Baseline
# BM25 keyword search implementation
from rank_bm25 import BM25Okapi
import re
class BM25Search:
    """Keyword search over an in-memory list of documents using BM25.

    Documents are lowercased and split into runs of word characters once,
    at construction time; queries are tokenized the same way at search time.
    """

    def __init__(self, documents: list[str]):
        """Tokenize *documents* and build the BM25 index over them."""
        self.documents = documents
        # Tokenize documents
        self.tokenized = [self._tokenize(doc) for doc in documents]
        # BM25Okapi averages document length over the corpus size, so an
        # empty corpus cannot be indexed; keep None and let search() return
        # no hits instead of failing at construction time.
        self.bm25 = BM25Okapi(self.tokenized) if self.tokenized else None

    def _tokenize(self, text: str) -> list[str]:
        """Simple tokenization with lowercasing."""
        return re.findall(r'\w+', text.lower())

    def search(self, query: str, top_k: int = 5) -> list[tuple[int, float, str]]:
        """Search for documents matching the query.

        Args:
            query: Free-text query; tokenized exactly like the documents.
            top_k: Maximum number of hits to return. Values <= 0 yield no hits.

        Returns:
            Up to ``top_k`` ``(document_index, score, document_text)`` tuples,
            best score first. Documents scoring 0 or less are omitted.
        """
        tokenized_query = self._tokenize(query)
        # Guard first: a slice such as [-0:] would select *every* document.
        if top_k <= 0 or self.bm25 is None:
            return []
        scores = self.bm25.get_scores(tokenized_query)
        # Indices sorted by descending score, truncated to the best top_k.
        top_indices = scores.argsort()[::-1][:top_k]
        # Convert numpy scalars to plain int/float so the result matches the
        # annotated return type and is directly serializable.
        return [
            (int(idx), float(scores[idx]), self.documents[idx])
            for idx in top_indices
            if scores[idx] > 0
        ]
# Usage: index a small corpus, then look up an exact error code.
documents = [
    "The ERROR_CODE_4032 indicates an authentication failure.",
    "Authentication errors occur when credentials are invalid.",
    "Kubernetes (K8s) orchestrates container deployments.",
    "Container orchestration automates deployment scaling.",
]

bm25 = BM25Search(documents)
results = bm25.search("ERROR_CODE_4032")

# Each hit is an (index, score, text) tuple; show the score next to the text.
for hit in results:
    idx, score, doc = hit
    print(f"[{score:.2f}] {doc}")
Implementing Hybrid Search
Hybrid search runs both keyword and semantic searches, then combines the results using Reciprocal Rank Fusion (RRF) or weighted scoring.
// Hybrid search with Reciprocal Rank Fusion
/** A single retrieved document, as passed to and returned by reciprocalRankFusion. */
interface SearchResult {
// Identifier used as the fusion key: results that share an id are merged into one entry.
id: string;
// Text of the document.
content: string;
// Relevance score; on reciprocalRankFusion output this holds the fused RRF score.
score: number;
}
/**
 * Merge several ranked result lists into one list using Reciprocal Rank Fusion.
 *
 * Each result contributes `weight / (k + rank + 1)` for the list it appears in,
 * where `rank` is its zero-based position in that list. Contributions of
 * results sharing the same `id` are summed. The incoming `score` fields are
 * ignored; only list order matters.
 *
 * @param resultSets Ranked lists, best result first in each list.
 * @param k RRF smoothing constant (default 60).
 * @param weights Optional per-list weights, aligned with `resultSets`.
 *   A missing entry (or an omitted array) counts as 1.0.
 * @returns Fused results sorted by descending fused score. `content` is taken
 *   from the first occurrence of each id.
 */
function reciprocalRankFusion(
  resultSets: SearchResult[][],
  k: number = 60, // RRF constant
  weights?: number[]
): SearchResult[] {
  const fused = new Map<string, { score: number; content: string }>();

  resultSets.forEach((results, setIdx) => {
    // Fall back to 1.0 per list: a weights array shorter than resultSets would
    // otherwise give `undefined * x = NaN` scores and an undefined sort order.
    const weight = weights?.[setIdx] ?? 1.0;

    results.forEach((result, rank) => {
      const contribution = weight * (1 / (k + rank + 1));
      const entry = fused.get(result.id);
      if (entry) {
        entry.score += contribution;
      } else {
        fused.set(result.id, { score: contribution, content: result.content });
      }
    });
  });

  return Array.from(fused.entries())
    .map(([id, { score, content }]) => ({ id, content, score }))
    .sort((a, b) => b.score - a.score);
}
// Full hybrid search implementation
/**
 * Run a semantic and a keyword search in parallel and fuse them with RRF.
 *
 * @param query User query, sent unchanged to both retrievers.
 * @param vectorStore Object exposing `similaritySearchWithScore(query, n)`,
 *   resolving to `[doc, score]` pairs.
 * @param bm25Index Object exposing `search(query, n)`, resolving to
 *   `[idx, score, content]` tuples.
 * @param topK Number of fused results to return; values <= 0 return `[]`.
 * @param semanticWeight RRF weight applied to the semantic ranking.
 * @param keywordWeight RRF weight applied to the keyword ranking.
 * @returns At most `topK` fused results, best first.
 */
async function hybridSearch(
  query: string,
  vectorStore: any,
  bm25Index: any,
  topK: number = 5,
  semanticWeight: number = 0.7,
  keywordWeight: number = 0.3,
): Promise<SearchResult[]> {
  if (topK <= 0) {
    return [];
  }

  // Over-fetch from each retriever so fusion has candidates to promote.
  const candidateCount = topK * 2;

  // Run both searches in parallel
  const [semanticResults, keywordResults] = await Promise.all([
    vectorStore.similaritySearchWithScore(query, candidateCount),
    bm25Index.search(query, candidateCount),
  ]);

  // Remember which id each semantic hit got, keyed by its text, so the same
  // document coming back from the keyword side can share that id.
  const idByContent = new Map<string, string>();

  // Normalize results to common format
  const semantic: SearchResult[] = semanticResults.map(
    ([doc, score]: any, i: number) => {
      const rawId = doc.metadata?.id;
      // Only a missing/empty id falls back to the positional one; a
      // legitimate id of 0 is kept (a plain `||` would discard it).
      const id =
        rawId === undefined || rawId === null || rawId === ""
          ? `sem_${i}`
          : String(rawId);
      if (!idByContent.has(doc.pageContent)) {
        idByContent.set(doc.pageContent, id);
      }
      return {
        id,
        content: doc.pageContent,
        // Assumes the store reports a distance (smaller = closer) — confirm
        // for your vector store. Fusion uses list order, not this value.
        score: 1 - score,
      };
    }
  );

  const keyword: SearchResult[] = keywordResults.map(
    ([idx, score, content]: any) => ({
      // Reuse the semantic id when both retrievers returned the same text.
      // Without a shared id the two hits are never fused and the document
      // shows up twice in the output.
      id: idByContent.get(content) ?? `kw_${idx}`,
      content,
      score,
    })
  );

  // Fuse results
  return reciprocalRankFusion(
    [semantic, keyword],
    60,
    [semanticWeight, keywordWeight]
  ).slice(0, topK);
}
Hybrid Search with Weaviate
Weaviate has built-in hybrid search that combines BM25 and vector search natively.
# Weaviate native hybrid search
import weaviate
from weaviate.classes.query import HybridFusion
client = weaviate.connect_to_local()  # or connect_to_wcs()
try:
    # Create collection with hybrid search support
    collection = client.collections.create(
        name="Documents",
        vectorizer_config=weaviate.classes.config.Configure.Vectorizer.text2vec_openai(),
    )

    # Add documents
    collection.data.insert_many([
        {"content": "ERROR_CODE_4032 authentication failure", "source": "errors.md"},
        {"content": "Authentication requires valid JWT tokens", "source": "auth.md"},
        {"content": "Kubernetes cluster management guide", "source": "k8s.md"},
    ])

    # Hybrid search (combines BM25 + vector automatically)
    results = collection.query.hybrid(
        query="ERROR_CODE_4032",
        alpha=0.5,  # 0 = pure keyword, 1 = pure vector, 0.5 = balanced
        limit=5,
        fusion_type=HybridFusion.RELATIVE_SCORE,
        return_metadata=weaviate.classes.query.MetadataQuery(score=True),
    )

    for obj in results.objects:
        print(f"[{obj.metadata.score:.3f}] {obj.properties['content']}")
finally:
    # Release the connection even if creating, inserting or querying raised.
    client.close()
Hybrid Search with pgvector + Full-Text Search
# PostgreSQL hybrid search with pgvector + tsvector
import psycopg2
conn = psycopg2.connect("dbname=mydb user=myuser")
cur = conn.cursor()

# Create table with both vector and full-text search.
# The `vector` column type is provided by the pgvector extension, so the
# extension must exist before the table can be created.
# NOTE(review): pgvector's guidance is to build ivfflat indexes after the
# table has data; on an empty table the index clusters are not meaningful.
cur.execute("""
    CREATE EXTENSION IF NOT EXISTS vector;

    CREATE TABLE IF NOT EXISTS documents (
        id SERIAL PRIMARY KEY,
        content TEXT,
        source TEXT,
        embedding vector(1536),
        tsv tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED
    );

    CREATE INDEX IF NOT EXISTS idx_docs_embedding ON documents
        USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);

    CREATE INDEX IF NOT EXISTS idx_docs_tsv ON documents
        USING gin(tsv);
""")
# psycopg2 opens a transaction implicitly; without a commit the DDL above is
# rolled back when the connection closes.
conn.commit()
# Hybrid search query
def hybrid_search_pg(query: str, query_embedding: list, top_k: int = 5, alpha: float = 0.5):
    """Combine semantic and full-text search in PostgreSQL.

    Both rankings are computed in SQL and merged with Reciprocal Rank Fusion
    (constant 60): each document scores
    ``alpha / (60 + semantic_rank) + (1 - alpha) / (60 + keyword_rank)``,
    with a missing rank contributing 0.

    Args:
        query: Free-text query for the full-text (tsvector) side.
        query_embedding: Query vector for the pgvector side.
        top_k: Number of rows to return; values <= 0 return an empty list.
            Each side contributes up to ``top_k * 2`` candidates.
        alpha: Weight of the semantic ranking; the keyword ranking gets
            ``1 - alpha``.

    Returns:
        Rows of ``(id, content, source, hybrid_score)``, best score first.
    """
    if top_k <= 0:
        return []

    # Named parameters: every value is written once instead of being repeated
    # positionally in an order that must mirror the SQL text.
    params = {
        "embedding": query_embedding,
        "query": query,
        "candidates": top_k * 2,
        "semantic_weight": alpha,
        "keyword_weight": 1 - alpha,
        "top_k": top_k,
    }
    cur.execute("""
        WITH semantic AS (
            SELECT id, content, source,
                   ROW_NUMBER() OVER (ORDER BY embedding <=> %(embedding)s::vector) AS semantic_rank
            FROM documents
            ORDER BY embedding <=> %(embedding)s::vector
            LIMIT %(candidates)s
        ),
        keyword AS (
            SELECT id, content, source,
                   ROW_NUMBER() OVER (
                       ORDER BY ts_rank(tsv, plainto_tsquery('english', %(query)s)) DESC
                   ) AS keyword_rank
            FROM documents
            WHERE tsv @@ plainto_tsquery('english', %(query)s)
            -- Without this ORDER BY the LIMIT keeps an arbitrary subset of
            -- the matches, not the best-ranked ones.
            ORDER BY keyword_rank
            LIMIT %(candidates)s
        )
        SELECT COALESCE(s.id, k.id) AS id,
               COALESCE(s.content, k.content) AS content,
               COALESCE(s.source, k.source) AS source,
               (%(semantic_weight)s * COALESCE(1.0 / (60 + s.semantic_rank), 0)) +
               (%(keyword_weight)s * COALESCE(1.0 / (60 + k.keyword_rank), 0)) AS hybrid_score
        FROM semantic s
        FULL OUTER JOIN keyword k ON s.id = k.id
        -- The id tie-break keeps the order stable when scores are equal.
        ORDER BY hybrid_score DESC, COALESCE(s.id, k.id)
        LIMIT %(top_k)s
    """, params)
    return cur.fetchall()
Hybrid Search Tips
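- Tune the alpha parameter: Start at 0.5 and adjust based on your query types. Technical queries with exact identifiers need more keyword weight, i.e. a lower alpha (in the examples above, alpha = 1 is pure vector search and alpha = 0 is pure keyword search).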
- Use RRF over simple weighting: RRF is more robust because it works with ranks, not raw scores that may have different scales.
- Consider query routing: Detect if a query has specific terms (codes, names) and adjust alpha dynamically.
- Measure both components: Track how often keyword vs semantic search contributes the best result to identify the optimal balance.
Summary
Hybrid search is a significant upgrade over pure semantic search, especially for production applications dealing with technical content, product names, or error codes. Whether you use a database with built-in hybrid support (Weaviate), combine pgvector with PostgreSQL full-text search in SQL, or implement RRF fusion yourself, combining keyword and semantic approaches consistently produces better retrieval quality.