Transformers Architecture Explained
Deep dive into self-attention, multi-head attention, positional encoding, and the encoder-decoder transformer architecture
Why Transformers Replaced RNNs
Before transformers, RNNs and LSTMs processed sequences one token at a time, creating bottlenecks for long sequences. Transformers process all tokens in parallel using attention, making them dramatically faster to train and better at capturing long-range dependencies.
The "Attention Is All You Need" Breakthrough (2017)
Vaswani et al. showed that attention mechanisms alone, without any recurrence or convolution, achieve state-of-the-art results on translation tasks while training 10x faster.
Self-Attention: The Core Mechanism
Self-attention computes how much each token should "attend to" every other token in the sequence. It uses three learned projections: Query (Q), Key (K), and Value (V).
import numpy as np

def softmax(x):
    """Softmax over the last axis, shifted by the row max for numerical stability."""
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled Dot-Product Attention
    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V
    Q: queries (seq_len, d_k)
    K: keys    (seq_len, d_k)
    V: values  (seq_len, d_v)
    """
    d_k = K.shape[-1]

    # Step 1: Compute attention scores: how much each query matches each key
    scores = Q @ K.T / np.sqrt(d_k)  # (seq_len, seq_len)

    # Step 2: Optional masking (for decoder / causal attention)
    if mask is not None:
        scores = np.where(mask == 0, -1e9, scores)

    # Step 3: Softmax to turn scores into attention weights (probabilities)
    attention_weights = softmax(scores)  # (seq_len, seq_len)

    # Step 4: Weighted sum of values
    output = attention_weights @ V  # (seq_len, d_v)
    return output, attention_weights

# Example: 4 tokens, embedding dim = 8
seq_len, d_model = 4, 8
np.random.seed(42)

# Simulate embeddings for "The cat sat down"
X = np.random.randn(seq_len, d_model)

# In practice, Q/K/V come from learned linear projections
W_q = np.random.randn(d_model, d_model) * 0.1
W_k = np.random.randn(d_model, d_model) * 0.1
W_v = np.random.randn(d_model, d_model) * 0.1

Q = X @ W_q
K = X @ W_k
V = X @ W_v

output, weights = scaled_dot_product_attention(Q, K, V)
print("Attention weights (which tokens attend to which):")
print(np.round(weights, 3))
# Each row shows how much that token attends to each other token
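The mask argument above is not exercised in this example, so here is a minimal sketch, reusing the same toy Q, K, V, of causal masking: a lower-triangular mask lets each token attend only to itself and earlier tokens, which is how decoder attention works.

# Causal (look-ahead) mask: 1 = may attend, 0 = blocked
causal_mask = np.tril(np.ones((seq_len, seq_len)))

masked_output, masked_weights = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print("Causal attention weights (upper triangle is ~0):")
print(np.round(masked_weights, 3))
# Each row still sums to 1, but token i now attends only to tokens 0..i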
Multi-Head Attention
Instead of a single attention function, transformers run multiple attention heads in parallel. Each head learns different relationship patterns (syntax, semantics, coreference, etc.).
import numpy as np

class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # dimension per head

        # Learned projection matrices
        self.W_q = np.random.randn(d_model, d_model) * 0.02
        self.W_k = np.random.randn(d_model, d_model) * 0.02
        self.W_v = np.random.randn(d_model, d_model) * 0.02
        self.W_o = np.random.randn(d_model, d_model) * 0.02  # output projection

    def split_heads(self, x):
        """Reshape (seq_len, d_model) -> (num_heads, seq_len, d_k)"""
        seq_len = x.shape[0]
        x = x.reshape(seq_len, self.num_heads, self.d_k)
        return x.transpose(1, 0, 2)  # (num_heads, seq_len, d_k)

    def forward(self, X, mask=None):
        # Linear projections
        Q = X @ self.W_q
        K = X @ self.W_k
        V = X @ self.W_v

        # Split into multiple heads
        Q = self.split_heads(Q)  # (num_heads, seq_len, d_k)
        K = self.split_heads(K)
        V = self.split_heads(V)

        # Apply scaled dot-product attention to each head
        head_outputs = []
        for h in range(self.num_heads):
            scores = Q[h] @ K[h].T / np.sqrt(self.d_k)
            if mask is not None:
                scores = np.where(mask == 0, -1e9, scores)
            weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
            weights /= np.sum(weights, axis=-1, keepdims=True)
            head_outputs.append(weights @ V[h])

        # Concatenate heads and apply the output projection
        concat = np.concatenate(head_outputs, axis=-1)  # (seq_len, d_model)
        output = concat @ self.W_o
        return output

# GPT-3 uses 96 attention heads with d_model = 12288;
# here we use small values for illustration.
mha = MultiHeadAttention(d_model=64, num_heads=8)
X = np.random.randn(10, 64)  # 10 tokens, 64-dim embeddings
output = mha.forward(X)
print(f"Input shape: {X.shape}")        # (10, 64)
print(f"Output shape: {output.shape}")  # (10, 64)
Positional Encoding
Self-attention is permutation-invariant: it processes all tokens in parallel and has no built-in notion of order. Transformers therefore need positional encodings to tell the model where each token sits in the sequence. The original paper uses sinusoidal functions:
import numpy as np

def positional_encoding(seq_len, d_model):
    """
    Sinusoidal positional encoding from 'Attention Is All You Need'
    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    """
    PE = np.zeros((seq_len, d_model))
    position = np.arange(seq_len)[:, np.newaxis]  # (seq_len, 1)
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    PE[:, 0::2] = np.sin(position * div_term)  # even indices
    PE[:, 1::2] = np.cos(position * div_term)  # odd indices
    return PE

# Create positional encodings for 100 positions, 256 dimensions
pe = positional_encoding(100, 256)

# Add to token embeddings (the standard approach)
token_embeddings = np.random.randn(100, 256)  # from the embedding layer
input_to_transformer = token_embeddings + pe

print(f"PE shape: {pe.shape}")
print("Each position gets a unique encoding vector")
print("Similar positions have similar encodings (smooth)")

# Why sinusoidal?
# 1. Can extrapolate to longer sequences than seen during training
# 2. Relative positions are captured (PE[pos+k] is a linear
#    function of PE[pos] for any fixed offset k)
# 3. No learned parameters needed
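To make the "smooth" claim concrete, here is a quick check using the pe array computed above: the dot product between two encoding vectors depends only on the distance between their positions, so adjacent positions score higher than distant ones.

# Nearby positions have more similar encodings than distant ones
print(f"PE[10] . PE[11] = {np.dot(pe[10], pe[11]):.1f}  (adjacent positions)")
print(f"PE[10] . PE[50] = {np.dot(pe[10], pe[50]):.1f}  (distant positions)")
# PE[p] . PE[q] = sum_i cos((p - q) * w_i), so it depends only on the
# offset p - q and generally shrinks as that offset grows.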
Encoder-Decoder Architecture
Encoder Stack
A stack of N identical blocks (N = 6 in the original paper). Each block sees the full input bidirectionally, so every token can attend to every other token. BERT uses this encoder-only design.
Decoder Stack
Also a stack of N blocks, but with causal masking so each position can attend only to itself and earlier positions. In the full encoder-decoder model, decoder blocks additionally attend to the encoder's output (cross-attention). GPT uses a decoder-only stack.
Architecture variants: Encoder-only (BERT), Decoder-only (GPT), Encoder-Decoder (T5, original Transformer).
Transformer Block Implementation
import numpy as np

class TransformerBlock:
    """A single transformer encoder block (uses the MultiHeadAttention class defined above)."""

    def __init__(self, d_model, num_heads, d_ff):
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.d_model = d_model
        # Feed-forward network weights
        self.W1 = np.random.randn(d_model, d_ff) * 0.02
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.randn(d_ff, d_model) * 0.02
        self.b2 = np.zeros(d_model)

    def layer_norm(self, x, eps=1e-6):
        """Layer normalization (simplified: no learned scale/shift parameters)"""
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True)
        return (x - mean) / (std + eps)

    def feed_forward(self, x):
        """Position-wise feed-forward: two linear layers with a ReLU in between"""
        hidden = np.maximum(0, x @ self.W1 + self.b1)  # ReLU
        return hidden @ self.W2 + self.b2

    def forward(self, x, mask=None):
        # Sub-layer 1: multi-head attention + residual connection + layer norm
        attn_output = self.attention.forward(x, mask)
        x = self.layer_norm(x + attn_output)
        # Sub-layer 2: feed-forward + residual connection + layer norm
        ff_output = self.feed_forward(x)
        x = self.layer_norm(x + ff_output)
        return x

# Stack multiple blocks for a full transformer encoder
d_model, num_heads, d_ff, num_layers = 64, 8, 256, 6
blocks = [TransformerBlock(d_model, num_heads, d_ff) for _ in range(num_layers)]

x = np.random.randn(20, d_model)  # 20 tokens
for block in blocks:
    x = block.forward(x)
print(f"Final output shape: {x.shape}")  # (20, 64)
Key Takeaways
- Self-attention computes Query-Key-Value relationships between all tokens in parallel
- Multi-head attention lets the model learn different types of relationships simultaneously
- Positional encoding injects sequence order information since attention has no inherent position
- Residual connections and layer normalization stabilize training of deep transformer stacks
- Encoder-only (BERT), decoder-only (GPT), and encoder-decoder (T5) serve different tasks