# Semantic Search

Semantic search retrieves results based on the meaning and intent of a query rather than on exact keyword matching. It does this with embeddings: vector representations of text that place semantically similar passages close together.
## Keyword Search vs Semantic Search

Query: "How to fix a flat tire"

**Keyword search:**

- ✓ "Fix your flat tire in 5 steps"
- ✓ "Flat tire repair guide"
- ✗ "Changing a punctured wheel" (no keyword match!)
- ✗ "Tire replacement tutorial" (different words)

**Semantic search:**

- ✓ "Fix your flat tire in 5 steps"
- ✓ "Flat tire repair guide"
- ✓ "Changing a punctured wheel" (same meaning!)
- ✓ "Tire replacement tutorial" (related concept)
## How It Works

```
┌─────────────────────────────────────────────────────────┐
│ Indexing Phase                                          │
├─────────────────────────────────────────────────────────┤
│ Documents → Encoder → Embeddings → Vector Database      │
│ "doc 1"               [0.2, 0.8, ...]                   │
│ "doc 2"               [0.5, 0.3, ...]                   │
│ "doc 3"               [0.1, 0.9, ...]                   │
└─────────────────────────────────────────────────────────┘

┌─────────────────────────────────────────────────────────┐
│ Query Phase                                             │
├─────────────────────────────────────────────────────────┤
│ Query → Encoder → Query Embedding → Similarity Search   │
│ "how to fix"          [0.2, 0.7, ...]       │           │
│                                             ↓           │
│                                    Top-k similar docs   │
└─────────────────────────────────────────────────────────┘
```
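In code, the two phases boil down to "embed every document once" and "embed the query, then find nearest neighbors". A minimal numpy-only sketch, where `encode`, `documents`, and `query` are placeholders (a full FAISS-backed version appears in the Complete Example below):

```python
import numpy as np

# Indexing phase: encode all documents once and keep the matrix around.
doc_embeddings = encode(documents)         # shape: (n_docs, dim), unit-normalized

# Query phase: encode the query and score it against every document.
query_embedding = encode([query])[0]       # shape: (dim,)
scores = doc_embeddings @ query_embedding  # dot product = cosine when normalized
top_5 = np.argsort(scores)[::-1][:5]       # indices of the 5 closest documents
```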
## Embedding Models

### Sentence Transformers

```python
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode text to embeddings
sentences = [
    "How to train a neural network",
    "Deep learning model training guide",
    "Best pizza recipes"
]
embeddings = model.encode(sentences)
# Shape: (3, 384) -- 384-dimensional vectors

# Query
query = "machine learning tutorial"
query_embedding = model.encode(query)
```
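With both sides encoded, scoring is one call. A quick follow-up using the library's cosine-similarity helper:

```python
from sentence_transformers import util

# Cosine similarity between the query and each sentence
scores = util.cos_sim(query_embedding, embeddings)[0]
for sentence, score in zip(sentences, scores):
    print(f"{score:.3f}  {sentence}")
# The two training-related sentences should score well above the pizza recipe.
```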
### OpenAI Embeddings

```python
from openai import OpenAI

client = OpenAI()

response = client.embeddings.create(
    model="text-embedding-3-small",
    input=["How to train a neural network"]
)
embedding = response.data[0].embedding
# 1536-dimensional vector
```
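OpenAI's embedding vectors come back normalized to unit length, so cosine similarity reduces to a plain dot product. For example, reusing the `client` above:

```python
import numpy as np

resp = client.embeddings.create(
    model="text-embedding-3-small",
    input=["How to train a neural network", "Deep learning model training guide"]
)
a = np.array(resp.data[0].embedding)
b = np.array(resp.data[1].embedding)
print(a @ b)  # cosine similarity, since both vectors already have length 1
```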
### Popular Models

| Model | Dimensions | Speed | Quality |
|---|---|---|---|
| all-MiniLM-L6-v2 | 384 | Fast | Good |
| all-mpnet-base-v2 | 768 | Medium | Better |
| text-embedding-3-small | 1536 | API call | Excellent |
| text-embedding-3-large | 3072 | API call | Best |
| BGE-large | 1024 | Medium | Excellent |
## Similarity Search

### Brute Force (Small Scale)

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def search(query_embedding, document_embeddings, top_k=5):
    # Cosine similarity between the query and every document
    similarities = cosine_similarity(
        [query_embedding],
        document_embeddings
    )[0]
    # Indices of the top-k most similar documents, best first
    top_indices = np.argsort(similarities)[::-1][:top_k]
    return [(idx, similarities[idx]) for idx in top_indices]
```
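Plugging in the encoder and embeddings from the Sentence Transformers example above:

```python
# Assumes `model`, `sentences`, and `embeddings` from the earlier example
query_embedding = model.encode("machine learning tutorial")
for idx, score in search(query_embedding, embeddings, top_k=2):
    print(f"{score:.3f}  {sentences[idx]}")
```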
### Vector Databases (Large Scale)

```python
# Using FAISS
import faiss

# Create an index over 384-dimensional vectors
dimension = 384
index = faiss.IndexFlatIP(dimension)  # inner product (= cosine for normalized vectors)

# FAISS expects float32; normalize so inner product equals cosine similarity
embeddings = embeddings.astype('float32')
faiss.normalize_L2(embeddings)
index.add(embeddings)

# Search: normalize the query the same way
query_vector = query_embedding.reshape(1, -1).astype('float32')
faiss.normalize_L2(query_vector)
distances, indices = index.search(query_vector, k=5)
```
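`IndexFlatIP` scans every vector, which is exact but linear in corpus size. Past a few million documents, an approximate index trades a little recall for large speedups; here is a sketch using FAISS's inverted-file index (the parameter values are illustrative and need tuning):

```python
# Approximate nearest-neighbor search with an IVF index
nlist = 100  # number of coarse clusters
quantizer = faiss.IndexFlatIP(dimension)
ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)

ivf_index.train(embeddings)  # learn cluster centroids from the data
ivf_index.add(embeddings)
ivf_index.nprobe = 10        # clusters searched per query; higher = better recall
distances, indices = ivf_index.search(query_vector, 5)
```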
```python
# Using ChromaDB
import chromadb

client = chromadb.Client()
collection = client.create_collection("documents")

# Add documents (Chroma embeds them with its default model)
collection.add(
    documents=["doc1 text", "doc2 text", "doc3 text"],
    ids=["id1", "id2", "id3"]
)

# Query (don't request more results than the collection holds)
results = collection.query(
    query_texts=["search query"],
    n_results=3
)
```
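Chroma can also store metadata next to each document, which combines semantic search with structured filtering (the field names here are made up for illustration):

```python
collection.add(
    documents=["intro to python", "advanced rust"],
    metadatas=[{"level": "beginner"}, {"level": "advanced"}],
    ids=["id4", "id5"]
)

# Restrict the semantic search to beginner-level documents
results = collection.query(
    query_texts=["programming tutorial"],
    n_results=2,
    where={"level": "beginner"}
)
```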
```python
# Using Pinecone (v3+ client; pinecone.init is deprecated)
from pinecone import Pinecone

pc = Pinecone(api_key="your-key")
index = pc.Index("semantic-search")

# Upsert vectors with metadata
index.upsert(vectors=[
    ("id1", embedding1, {"text": "doc1"}),
    ("id2", embedding2, {"text": "doc2"}),
])

# Query
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)
```
## Hybrid Search

Combine semantic and keyword search:

```python
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_search(query, documents, alpha=0.5):
    """Blend semantic and keyword scores.

    alpha: weight for the semantic score (1 - alpha for keyword).
    """
    # Semantic scores (uses the bi-encoder `model` from earlier)
    query_emb = model.encode(query)
    doc_embs = model.encode(documents)
    semantic_scores = cosine_similarity([query_emb], doc_embs)[0]

    # Keyword scores (BM25)
    tokenized_docs = [doc.lower().split() for doc in documents]
    bm25 = BM25Okapi(tokenized_docs)
    keyword_scores = bm25.get_scores(query.lower().split())

    # Min-max normalize both to [0, 1]; the epsilon guards against
    # division by zero when all scores are equal
    semantic_scores = (semantic_scores - semantic_scores.min()) / (semantic_scores.max() - semantic_scores.min() + 1e-6)
    keyword_scores = (keyword_scores - keyword_scores.min()) / (keyword_scores.max() - keyword_scores.min() + 1e-6)

    # Weighted combination, best match first
    hybrid_scores = alpha * semantic_scores + (1 - alpha) * keyword_scores
    return np.argsort(hybrid_scores)[::-1]
```
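A quick usage sketch with toy documents (assuming the bi-encoder `model` from earlier is loaded):

```python
docs = [
    "Fix your flat tire in 5 steps",
    "Changing a punctured wheel",
    "Best pizza recipes",
]
ranking = hybrid_search("flat tire repair", docs, alpha=0.5)
print([docs[i] for i in ranking])
# The exact-keyword doc should rank first; embeddings keep the
# punctured-wheel doc ahead of the unrelated pizza one.
```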
## Chunking Strategies

Long documents need to be split before embedding: embedding models have limited input lengths, and retrieval works best over short, focused passages.

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " "]
)
chunks = splitter.split_text(long_document)
# Each chunk is embedded separately
```
### Strategy Comparison

| Strategy | Pros | Cons |
|---|---|---|
| Fixed size | Simple, consistent | May split sentences |
| Sentence | Preserves meaning (see sketch below) | Variable sizes |
| Paragraph | Natural boundaries | May be too long |
| Recursive | Balanced | More complex |
| Semantic | Best coherence | Slow, complex |
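For contrast with the recursive splitter above, a sentence-based chunker can be sketched without any library. This naive version splits on end-of-sentence punctuation; a real sentence tokenizer handles abbreviations and edge cases better:

```python
import re

def sentence_chunks(text, max_chars=500):
    # Naive split on ., !, or ? followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk once adding this sentence would exceed the budget
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```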
## Reranking

Improve results with a second-stage ranker: a fast bi-encoder retrieves a broad candidate set, then a slower but more accurate cross-encoder, which reads each query-document pair jointly, reorders it.

```python
from sentence_transformers import SentenceTransformer, CrossEncoder

# First stage: fast bi-encoder retrieval
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
candidate_docs = fast_retrieve(query, top_k=100)  # any first-stage retriever; sketch below

# Second stage: accurate cross-encoder reranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
pairs = [[query, doc] for doc in candidate_docs]
scores = cross_encoder.predict(pairs)

reranked = sorted(zip(candidate_docs, scores), key=lambda x: x[1], reverse=True)
top_results = reranked[:10]
```
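`fast_retrieve` above stands for any first-stage retriever. One possible sketch, backed by the brute-force `search` function from earlier and assuming precomputed `doc_embeddings` for a `documents` list:

```python
# Hypothetical first stage: bi-encoder + exhaustive cosine search
def fast_retrieve(query, top_k=100):
    query_emb = bi_encoder.encode(query)
    hits = search(query_emb, doc_embeddings, top_k=top_k)  # from Brute Force section
    return [documents[idx] for idx, _score in hits]
```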
## Evaluation

### Metrics

```python
def evaluate_retrieval(queries, ground_truth, retrieve_fn, k=10):
    metrics = {'mrr': 0, 'recall@k': 0, 'precision@k': 0}
    for query, relevant_docs in zip(queries, ground_truth):
        retrieved = retrieve_fn(query, k=k)

        # MRR (Mean Reciprocal Rank): reciprocal rank of the first relevant hit
        for i, doc in enumerate(retrieved):
            if doc in relevant_docs:
                metrics['mrr'] += 1 / (i + 1)
                break

        # Recall@k: fraction of relevant docs that were retrieved
        retrieved_relevant = len(set(retrieved) & set(relevant_docs))
        metrics['recall@k'] += retrieved_relevant / len(relevant_docs)

        # Precision@k: fraction of retrieved docs that are relevant
        metrics['precision@k'] += retrieved_relevant / k

    # Average each metric over all queries
    n = len(queries)
    return {name: value / n for name, value in metrics.items()}
```
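A toy invocation with a stand-in retriever, just to show the bookkeeping:

```python
queries = ["flat tire repair"]
ground_truth = [["doc-tire-1", "doc-tire-2"]]

def retrieve_fn(query, k):
    # Stand-in retriever for illustration only
    return ["doc-tire-1", "doc-pizza-9", "doc-tire-2"][:k]

print(evaluate_retrieval(queries, ground_truth, retrieve_fn, k=3))
# -> mrr: 1.0, recall@k: 1.0, precision@k: ~0.67
```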
## Complete Example

```python
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

class SemanticSearchEngine:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []

    def index_documents(self, documents):
        self.documents = documents
        embeddings = self.model.encode(documents, show_progress_bar=True)
        embeddings = embeddings.astype('float32')
        # Normalize so inner product equals cosine similarity
        faiss.normalize_L2(embeddings)
        # Build an exact inner-product index
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        print(f"Indexed {len(documents)} documents")

    def search(self, query, top_k=5):
        # Never ask FAISS for more results than there are documents
        top_k = min(top_k, self.index.ntotal)
        query_embedding = self.model.encode([query]).astype('float32')
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, top_k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            results.append({
                'document': self.documents[idx],
                'score': float(score)
            })
        return results

# Usage
engine = SemanticSearchEngine()
engine.index_documents([
    "Python is a programming language",
    "Machine learning uses algorithms to learn from data",
    "Neural networks are inspired by the brain"
])

results = engine.search("How do AI models learn?")
for r in results:
    print(f"{r['score']:.3f}: {r['document']}")
```
## Key Takeaways

- Semantic search finds results by meaning, not exact keywords
- Embedding models convert text to vectors
- Similarity is measured by cosine similarity or dot product
- Vector databases (FAISS, Chroma, Pinecone) enable fast search at scale
- Hybrid search combines semantic and keyword scores for the best of both
- Reranking with cross-encoders improves precision