
Natural Language Processing (NLP) Engineering

Build NLP pipelines with tokenization, word embeddings, text classification, sentiment analysis, and named entity recognition

NLP Engineering: From Text to Features

NLP engineering is about building practical text processing pipelines. While the theory of NLP covers linguistics and algorithms, engineering NLP focuses on tokenization, embeddings, preprocessing, and connecting these into production-ready systems.

The NLP Pipeline:

Raw Text -> Preprocessing -> Tokenization -> Embeddings -> Model -> Predictions
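Before looking at each stage in detail, here is a toy sketch of how they compose end to end; every function below is a placeholder standing in for the real components covered in the rest of this guide.

# Toy sketch: the pipeline stages composed end to end (placeholders only)
def preprocess(text):
    return text.lower().strip()                 # Raw Text -> Preprocessing

def tokenize(text):
    return text.split()                         # Preprocessing -> Tokenization

def embed(tokens):
    return [[float(len(t))] for t in tokens]    # Tokenization -> Embeddings (stand-in vectors)

def predict(vectors):
    # Dummy rule standing in for a trained model
    return "POSITIVE" if len(vectors) > 3 else "NEGATIVE"

print(predict(embed(tokenize(preprocess("  Raw text flows through every stage  ")))))
# POSITIVE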

Tokenization Methods

Tokenization converts text into tokens (subwords, words, or characters) that models can process. Modern tokenizers use subword methods:

# Tokenization approaches compared

# 1. Word-level tokenization (simple but limited vocabulary)
text = "I love machine learning and deep learning"
word_tokens = text.lower().split()
print(f"Word tokens: {word_tokens}")
# ['i', 'love', 'machine', 'learning', 'and', 'deep', 'learning']

# 2. Character-level tokenization (small vocab, long sequences)
char_tokens = list(text.lower())
print(f"Char tokens: {char_tokens}")

# 3. BPE (Byte Pair Encoding) - used by GPT models
# Learns common subword patterns from data
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=["[PAD]", "[UNK]"])

# Train on corpus
corpus = [
    "machine learning is transforming the world",
    "deep learning uses neural networks",
    "natural language processing handles text data"
]
tokenizer.train_from_iterator(corpus, trainer)

encoded = tokenizer.encode("machine learning")
print(f"BPE tokens: {encoded.tokens}")
print(f"Token IDs:  {encoded.ids}")

# 4. Using Hugging Face tokenizers (production-ready)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer("I love machine learning!", return_tensors="pt")
print(f"BERT tokens: {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")
# ['[CLS]', 'i', 'love', 'machine', 'learning', '!', '[SEP]']
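A key payoff of subword tokenization is that words the tokenizer has never seen are split into known pieces instead of collapsing to [UNK]. A quick check with the BERT tokenizer loaded above (the example word is arbitrary, and the exact split depends on the learned vocabulary):

# Rare or unseen words decompose into known subword pieces instead of [UNK]
pieces = tokenizer.tokenize("electroencephalography")
print(f"Subword pieces: {pieces}")
# e.g. ['electro', '##ence', '##pha', '##logra', '##phy'] - exact split depends on the vocab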

Word Embeddings: Word2Vec and GloVe

Embeddings represent words as dense vectors where similar words have similar vectors. They capture semantic relationships, the classic example being "king" - "man" + "woman" ≈ "queen".

import numpy as np
from gensim.models import Word2Vec, KeyedVectors

# Train Word2Vec on custom corpus
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "ran", "in", "the", "park"],
    ["cats", "and", "dogs", "are", "pets"],
    ["machine", "learning", "is", "powerful"],
    ["deep", "learning", "uses", "neural", "networks"],
]

# CBOW (predict center word from context) vs Skip-gram (predict context from center)
model = Word2Vec(
    sentences,
    vector_size=100,  # embedding dimension
    window=5,         # context window
    min_count=1,      # minimum word frequency
    sg=1,             # 1 = skip-gram, 0 = CBOW
    epochs=100
)

# Find similar words
similar = model.wv.most_similar("cat", topn=3)
print(f"Similar to 'cat': {similar}")

# Word arithmetic (needs a model trained on a large corpus; the toy corpus
# above doesn't contain these words)
# result = model.wv.most_similar(positive=['king', 'woman'],
#                                 negative=['man'], topn=1)

# Using pre-trained GloVe embeddings (trained on billions of tokens, far better than a toy corpus)
# Download glove.6B.100d.txt from Stanford
# glove = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', no_header=True)

# Modern approach: use transformer embeddings instead
from transformers import AutoModel, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

text = "Machine learning is powerful"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# outputs.last_hidden_state: contextual embeddings for each token
embeddings = outputs.last_hidden_state
print(f"Contextual embeddings shape: {embeddings.shape}")
# (1, 6, 768) - 6 tokens (including [CLS] and [SEP]), 768-dimensional embeddings
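Because these vectors are contextual, the same word gets a different embedding in different sentences, which a static Word2Vec/GloVe lookup table cannot do. A small sketch reusing the model and tokenizer loaded above (the two example sentences are arbitrary):

# Sketch: the same word gets different contextual vectors in different sentences
import torch.nn.functional as F

def word_vector(sentence, word):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state[0]      # (seq_len, 768)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return hidden[tokens.index(word)]

river = word_vector("the boat drifted toward the river bank", "bank")
money = word_vector("she deposited the check at the bank", "bank")
print(f"Cosine similarity across contexts: {F.cosine_similarity(river, money, dim=0).item():.3f}")
# Below 1.0 because context shifts the vector; a static embedding would be identical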

Text Preprocessing Pipeline

import re
import string
from collections import Counter

class TextPreprocessor:
    """Production-ready text preprocessing pipeline."""

    def __init__(self):
        # Common stop words
        self.stop_words = {
            'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'shall',
            'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
            'it', 'this', 'that', 'these', 'those', 'i', 'you', 'he',
            'she', 'we', 'they', 'and', 'but', 'or', 'not', 'no',
        }

    def clean(self, text):
        """Full preprocessing pipeline."""
        # Lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # Remove special characters (keep alphanumeric and spaces)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def remove_stopwords(self, text):
        return ' '.join(w for w in text.split() if w not in self.stop_words)

    def process(self, text):
        text = self.clean(text)
        text = self.remove_stopwords(text)
        return text

# Usage
preprocessor = TextPreprocessor()
raw = "Check out https://example.com! This is an AMAZING product!!!"
clean = preprocessor.process(raw)
print(f"Raw:   {raw}")
print(f"Clean: {clean}")
# Output: "check out amazing product"
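The transformer pipelines below expect mostly raw text, so aggressive steps like stopword removal mainly matter when the output feeds a classical model. In that case the next step is usually turning cleaned text into sparse features; here is a minimal TF-IDF sketch with scikit-learn (the three example documents are made up):

# Sketch: bag-of-words / TF-IDF features from the preprocessed text
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    preprocessor.process("I love this amazing product!"),
    preprocessor.process("Terrible quality, would not recommend."),
    preprocessor.process("The product arrived quickly and works well."),
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)          # sparse (n_docs, n_terms) matrix
print(f"Feature matrix shape: {X.shape}")
print(f"Vocabulary: {list(vectorizer.get_feature_names_out())}")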

Sentiment Analysis with Transformers

from transformers import pipeline

# Off-the-shelf sentiment analysis (downloads a default pre-trained model on first use)
sentiment = pipeline("sentiment-analysis")
texts = [
    "I absolutely love this product!",
    "This is the worst experience ever.",
    "It's okay, nothing special.",
    "The quality exceeded my expectations!"
]
results = sentiment(texts)
for text, result in zip(texts, results):
    print(f"  '{text}' -> {result['label']}: {result['score']:.3f}")

# Named Entity Recognition
ner = pipeline("ner", aggregation_strategy="simple")  # groups subword pieces into whole entities
entities = ner("Apple CEO Tim Cook announced the new iPhone at their Cupertino headquarters.")
for entity in entities:
    print(f"  {entity['word']:20s} -> {entity['entity_group']:10s} (confidence: {entity['score']:.3f})")
# Apple                -> ORG        (confidence: 0.998)
# Tim Cook             -> PER        (confidence: 0.999)
# iPhone               -> MISC       (confidence: 0.987)
# Cupertino            -> LOC        (confidence: 0.997)

# Text classification
classifier = pipeline("zero-shot-classification")
result = classifier(
    "The stock market crashed after the Fed raised interest rates.",
    candidate_labels=["finance", "sports", "technology", "politics"]
)
print(f"\nTopic: {result['labels'][0]} ({result['scores'][0]:.3f})")

Key Takeaways

  • BPE tokenization (used by GPT) handles unknown words by breaking them into learned subwords
  • Word2Vec/GloVe provide static embeddings; transformer models provide contextual embeddings
  • Always preprocess text: lowercase, remove URLs/HTML, handle special characters
  • Hugging Face pipelines give production-ready NLP in 2 lines of code
  • Contextual embeddings from transformers have replaced static embeddings for most tasks
