
Transformers Architecture Explained

Deep dive into self-attention, multi-head attention, positional encoding, and the encoder-decoder transformer architecture

Why Transformers Replaced RNNs

Before transformers, RNNs and LSTMs processed sequences one token at a time, creating bottlenecks for long sequences. Transformers process all tokens in parallel using attention, making them dramatically faster to train and better at capturing long-range dependencies.

The "Attention Is All You Need" Breakthrough (2017)

Vaswani et al. showed that attention mechanisms alone, with no recurrence or convolution, achieve state-of-the-art results on machine translation benchmarks while training at a small fraction of the cost of the best prior recurrent and convolutional models.

Self-Attention: The Core Mechanism

Self-attention computes how much each token should "attend to" every other token in the sequence. It uses three learned projections: Query (Q), Key (K), and Value (V).

import numpy as np

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled Dot-Product Attention
    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V

    Q: queries (seq_len, d_k)
    K: keys    (seq_len, d_k)
    V: values  (seq_len, d_v)
    """
    d_k = K.shape[-1]

    # Step 1: Compute attention scores
    # How much each query matches each key
    scores = Q @ K.T / np.sqrt(d_k)  # (seq_len, seq_len)

    # Step 2: Optional masking (for decoder / causal attention)
    if mask is not None:
        scores = np.where(mask == 0, -1e9, scores)

    # Step 3: Softmax to get attention weights (probabilities)
    def softmax(x):
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    attention_weights = softmax(scores)  # (seq_len, seq_len)

    # Step 4: Weighted sum of values
    output = attention_weights @ V  # (seq_len, d_v)

    return output, attention_weights

# Example: 4 tokens, embedding dim = 8
seq_len, d_model = 4, 8
np.random.seed(42)

# Simulate embeddings for "The cat sat down"
X = np.random.randn(seq_len, d_model)

# In practice, Q/K/V come from learned linear projections
W_q = np.random.randn(d_model, d_model) * 0.1
W_k = np.random.randn(d_model, d_model) * 0.1
W_v = np.random.randn(d_model, d_model) * 0.1

Q = X @ W_q
K = X @ W_k
V = X @ W_v

output, weights = scaled_dot_product_attention(Q, K, V)
print("Attention weights (which tokens attend to which):")
print(np.round(weights, 3))
# Each row shows how much that token attends to each other token
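The mask argument above is how causal (autoregressive) attention is implemented: each position may attend only to itself and earlier positions. Below is a minimal sketch of one way to build and apply such a mask with the function above; the lower-triangular 0/1 construction and the variable names are illustrative, not part of the original example.

# Causal (decoder-style) attention with the same function
causal_mask = np.tril(np.ones((seq_len, seq_len)))  # 1 on/below the diagonal, 0 above
masked_output, masked_weights = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print("Causal attention weights (upper triangle is ~0):")
print(np.round(masked_weights, 3))
# Row i now places all of its weight on tokens 0..i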

Multi-Head Attention

Instead of a single attention function, transformers run multiple attention heads in parallel. Each head learns different relationship patterns (syntax, semantics, coreference, etc.).

import numpy as np

class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # dimension per head

        # Learned projection matrices
        self.W_q = np.random.randn(d_model, d_model) * 0.02
        self.W_k = np.random.randn(d_model, d_model) * 0.02
        self.W_v = np.random.randn(d_model, d_model) * 0.02
        self.W_o = np.random.randn(d_model, d_model) * 0.02  # output projection

    def split_heads(self, x):
        """Reshape (seq_len, d_model) -> (num_heads, seq_len, d_k)"""
        seq_len = x.shape[0]
        x = x.reshape(seq_len, self.num_heads, self.d_k)
        return x.transpose(1, 0, 2)  # (num_heads, seq_len, d_k)

    def forward(self, X, mask=None):
        # Linear projections
        Q = X @ self.W_q
        K = X @ self.W_k
        V = X @ self.W_v

        # Split into multiple heads
        Q = self.split_heads(Q)  # (num_heads, seq_len, d_k)
        K = self.split_heads(K)
        V = self.split_heads(V)

        # Apply attention to each head
        head_outputs = []
        for h in range(self.num_heads):
            scores = Q[h] @ K[h].T / np.sqrt(self.d_k)
            if mask is not None:
                scores = np.where(mask == 0, -1e9, scores)
            weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
            weights /= np.sum(weights, axis=-1, keepdims=True)
            head_outputs.append(weights @ V[h])

        # Concatenate heads and project
        concat = np.concatenate(head_outputs, axis=-1)  # (seq_len, d_model)
        output = concat @ self.W_o

        return output

# GPT-3 uses 96 attention heads with d_model = 12288
# Here we use small values for illustration
mha = MultiHeadAttention(d_model=64, num_heads=8)
X = np.random.randn(10, 64)  # 10 tokens, 64-dim embeddings
output = mha.forward(X)
print(f"Input shape:  {X.shape}")       # (10, 64)
print(f"Output shape: {output.shape}")  # (10, 64)

Positional Encoding

Since attention treats its input as an unordered set (all tokens are processed in parallel), transformers need positional encodings to inject information about token order. The original paper uses sinusoidal functions:

import numpy as np

def positional_encoding(seq_len, d_model):
    """
    Sinusoidal positional encoding from 'Attention Is All You Need'
    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    """
    PE = np.zeros((seq_len, d_model))
    position = np.arange(seq_len)[:, np.newaxis]  # (seq_len, 1)
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))

    PE[:, 0::2] = np.sin(position * div_term)  # even indices
    PE[:, 1::2] = np.cos(position * div_term)  # odd indices
    return PE

# Create positional encoding for 100 positions, 256 dimensions
pe = positional_encoding(100, 256)

# Add to token embeddings (the standard approach)
token_embeddings = np.random.randn(100, 256)  # from embedding layer
input_to_transformer = token_embeddings + pe

print(f"PE shape: {pe.shape}")
print(f"Each position gets a unique encoding vector")
print(f"Similar positions have similar encodings (smooth)")

# Why sinusoidal?
# 1. Can extrapolate to longer sequences than seen during training
# 2. Relative positions are captured (PE[pos+k] is a linear
#    function of PE[pos] for any fixed offset k)
# 3. No learned parameters needed
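As a quick, informal check of the "smooth" property, nearby positions should end up with more similar encoding vectors than distant ones. The snippet below reuses the pe array computed above; measuring similarity with cosine similarity is our choice here, not something the paper prescribes.

def cosine_similarity(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

# Adjacent positions are much more alike than far-apart positions
print(f"sim(pos 10, pos 11): {cosine_similarity(pe[10], pe[11]):.3f}")  # close to 1
print(f"sim(pos 10, pos 60): {cosine_similarity(pe[10], pe[60]):.3f}")  # noticeably lower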

Encoder-Decoder Architecture

Encoder Stack

  • Multi-Head Self-Attention + Add & LayerNorm
  • Feed-Forward Network + Add & LayerNorm

Repeated N times (N = 6 in the original paper). Processes the full input bidirectionally. Used by BERT.

Decoder Stack

  • Masked Multi-Head Self-Attention + Add & LayerNorm
  • Cross-Attention (to encoder output) + Add & LayerNorm
  • Feed-Forward Network + Add & LayerNorm

Causal masking prevents each position from attending to future tokens. GPT uses a decoder-only stack.
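Cross-attention reuses the same scaled dot-product machinery, but the queries come from the decoder's hidden states while the keys and values come from the encoder's output, so every target position can look at the whole source sequence. Here is a minimal sketch using the scaled_dot_product_attention function from earlier; the shapes, projection matrices, and variable names are purely illustrative.

# Cross-attention sketch: decoder queries attend over the encoder's output
encoder_output = np.random.randn(6, 8)   # 6 source tokens, d_model = 8
decoder_states = np.random.randn(4, 8)   # 4 target tokens, d_model = 8

W_cq = np.random.randn(8, 8) * 0.1  # illustrative learned projections
W_ck = np.random.randn(8, 8) * 0.1
W_cv = np.random.randn(8, 8) * 0.1

Q_cross = decoder_states @ W_cq   # queries from the decoder
K_cross = encoder_output @ W_ck   # keys from the encoder output
V_cross = encoder_output @ W_cv   # values from the encoder output

cross_out, cross_weights = scaled_dot_product_attention(Q_cross, K_cross, V_cross)
print(cross_weights.shape)  # (4, 6): each target position attends over all 6 source positions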

Architecture variants: Encoder-only (BERT), Decoder-only (GPT), Encoder-Decoder (T5, original Transformer).

Transformer Block Implementation

import numpy as np

class TransformerBlock:
    """A single transformer encoder block."""

    def __init__(self, d_model, num_heads, d_ff):
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.d_model = d_model

        # Feed-forward network weights
        self.W1 = np.random.randn(d_model, d_ff) * 0.02
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.randn(d_ff, d_model) * 0.02
        self.b2 = np.zeros(d_model)

    def layer_norm(self, x, eps=1e-6):
        """Layer normalization"""
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True)
        return (x - mean) / (std + eps)

    def feed_forward(self, x):
        """Position-wise feed-forward: two linear layers with ReLU"""
        hidden = np.maximum(0, x @ self.W1 + self.b1)  # ReLU
        return hidden @ self.W2 + self.b2

    def forward(self, x, mask=None):
        # Sub-layer 1: Multi-head attention + residual + layer norm
        attn_output = self.attention.forward(x, mask)
        x = self.layer_norm(x + attn_output)  # residual connection

        # Sub-layer 2: Feed-forward + residual + layer norm
        ff_output = self.feed_forward(x)
        x = self.layer_norm(x + ff_output)  # residual connection

        return x

# Stack multiple blocks for a full transformer encoder
d_model, num_heads, d_ff, num_layers = 64, 8, 256, 6
blocks = [TransformerBlock(d_model, num_heads, d_ff) for _ in range(num_layers)]

x = np.random.randn(20, d_model)  # 20 tokens
for block in blocks:
    x = block.forward(x)
print(f"Final output shape: {x.shape}")  # (20, 64)

Key Takeaways

  • Self-attention computes Query-Key-Value relationships between all tokens in parallel
  • Multi-head attention lets the model learn different types of relationships simultaneously
  • Positional encoding injects sequence order information since attention has no inherent position
  • Residual connections and layer normalization stabilize training of deep transformer stacks
  • Encoder-only (BERT), decoder-only (GPT), and encoder-decoder (T5) serve different tasks
