Natural Language Processing (NLP) Engineering
Build NLP pipelines with tokenization, word embeddings, text classification, sentiment analysis, and named entity recognition
NLP Engineering: From Text to Features
NLP engineering is about building practical text processing pipelines. While the theory of NLP covers linguistics and algorithms, engineering NLP focuses on tokenization, embeddings, preprocessing, and connecting these into production-ready systems.
The NLP Pipeline:
Raw Text -> Preprocessing -> Tokenization -> Embeddings -> Model -> Predictions
Tokenization Methods
Tokenization converts text into tokens (subwords, words, or characters) that models can process. Modern tokenizers use subword methods:
# Tokenization approaches compared
# 1. Word-level tokenization (simple but limited vocabulary)
#    Splits on whitespace only; punctuation stays attached and every
#    unseen word is out-of-vocabulary.
text = "I love machine learning and deep learning"
word_tokens = text.lower().split()
print(f"Word tokens: {word_tokens}")
# ['i', 'love', 'machine', 'learning', 'and', 'deep', 'learning']
# 2. Character-level tokenization (small vocab, long sequences)
char_tokens = list(text.lower())
print(f"Char tokens: {char_tokens}")
# 3. BPE (Byte Pair Encoding) - used by GPT models
# Learns common subword patterns from data by repeatedly merging the
# most frequent adjacent symbol pairs up to vocab_size.
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
tokenizer = Tokenizer(models.BPE())
# Pre-tokenizer splits on whitespace before BPE merges are applied.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=["[PAD]", "[UNK]"])
# Train on corpus (tiny in-memory corpus, for demonstration only)
corpus = [
    "machine learning is transforming the world",
    "deep learning uses neural networks",
    "natural language processing handles text data"
]
tokenizer.train_from_iterator(corpus, trainer)
encoded = tokenizer.encode("machine learning")
print(f"BPE tokens: {encoded.tokens}")
print(f"Token IDs: {encoded.ids}")
# 4. Using Hugging Face tokenizers (production-ready)
# NOTE: rebinds `tokenizer`, replacing the BPE tokenizer trained above.
from transformers import AutoTokenizer
# NOTE(review): from_pretrained fetches files from the Hugging Face Hub
# on first use — requires network access or a populated local cache.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer("I love machine learning!", return_tensors="pt")
print(f"BERT tokens: {tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])}")
# ['[CLS]', 'i', 'love', 'machine', 'learning', '!', '[SEP]']
Word Embeddings: Word2Vec and GloVe
Embeddings represent words as dense vectors where similar words have similar vectors. They capture semantic relationships through vector arithmetic, approximately: "king - man + woman ≈ queen".
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
# Train Word2Vec on custom corpus
# (toy corpus — meaningful neighbors require far more training text)
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "ran", "in", "the", "park"],
    ["cats", "and", "dogs", "are", "pets"],
    ["machine", "learning", "is", "powerful"],
    ["deep", "learning", "uses", "neural", "networks"],
]
# CBOW (predict center word from context) vs Skip-gram (predict context from center)
model = Word2Vec(
    sentences,
    vector_size=100,  # embedding dimension
    window=5,         # context window
    min_count=1,      # minimum word frequency (keep everything in this tiny corpus)
    sg=1,             # 1 = skip-gram, 0 = CBOW
    epochs=100
)
# Find similar words (ranked by cosine similarity of the learned vectors)
similar = model.wv.most_similar("cat", topn=3)
print(f"Similar to 'cat': {similar}")
# Word arithmetic
# result = model.wv.most_similar(positive=['king', 'woman'],
#                                negative=['man'], topn=1)
# Using pre-trained GloVe embeddings (much better!)
# Download glove.6B.100d.txt from Stanford
# glove = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', no_header=True)
# Modern approach: use transformer embeddings instead
# NOTE: rebinds `model` and `tokenizer`, replacing the Word2Vec model above.
from transformers import AutoModel, AutoTokenizer
import torch
# NOTE(review): from_pretrained downloads weights on first use — needs network.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
text = "Machine learning is powerful"
inputs = tokenizer(text, return_tensors="pt")
# Inference only — no_grad() skips autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
# outputs.last_hidden_state: contextual embeddings for each token
embeddings = outputs.last_hidden_state
print(f"Contextual embeddings shape: {embeddings.shape}")
# (1, 6, 768) - 6 tokens, 768-dimensional embeddings
Text Preprocessing Pipeline
import re
import string
from collections import Counter
class TextPreprocessor:
    """Production-ready text preprocessing pipeline.

    Lowercases input, strips URLs, HTML tags, and punctuation,
    collapses whitespace runs, and removes a small built-in set of
    English stop words.
    """

    # Patterns are compiled once at class-definition time; clean() is
    # typically called per-document in a loop, so hoisting avoids the
    # per-call pattern lookup that inline re.sub(pattern_str, ...) pays.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _TAG_RE = re.compile(r'<[^>]+>')
    _NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
    _WS_RE = re.compile(r'\s+')

    def __init__(self):
        # Common stop words; set membership is O(1) per lookup.
        self.stop_words = {
            'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'shall',
            'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
            'it', 'this', 'that', 'these', 'those', 'i', 'you', 'he',
            'she', 'we', 'they', 'and', 'but', 'or', 'not', 'no',
        }

    def clean(self, text):
        """Normalize raw text and return the cleaned string.

        Order matters: URLs and HTML tags are removed before the
        punctuation pass so their symbols ('://', '<', '>') don't
        leave stray fragments behind.
        """
        text = text.lower()
        text = self._URL_RE.sub('', text)        # remove URLs
        text = self._TAG_RE.sub('', text)        # remove HTML tags
        text = self._NON_ALNUM_RE.sub('', text)  # keep alphanumerics + spaces
        # Collapse whitespace runs to single spaces, trim the ends.
        return self._WS_RE.sub(' ', text).strip()

    def remove_stopwords(self, text):
        """Return *text* (whitespace-tokenized) with stop words dropped."""
        return ' '.join(w for w in text.split() if w not in self.stop_words)

    def process(self, text):
        """Full pipeline: clean() followed by remove_stopwords()."""
        return self.remove_stopwords(self.clean(text))
# Usage
preprocessor = TextPreprocessor()
raw = "Check out https://example.com! This is an AMAZING product!!!"
clean = preprocessor.process(raw)
print(f"Raw: {raw}")
print(f"Clean: {clean}")
# Output: "check out amazing product" ('out' is not in the stop-word set)
Sentiment Analysis with Transformers
from transformers import pipeline
# Sentiment analysis via the high-level pipeline API
# NOTE(review): pipeline() with no model argument downloads a default
# checkpoint from the Hub — pin a model name for reproducible results.
sentiment = pipeline("sentiment-analysis")
results = sentiment([
    "I absolutely love this product!",
    "This is the worst experience ever.",
    "It's okay, nothing special.",
    "The quality exceeded my expectations!"
])
# Pair each result with a short tag for compact printing.
for text, result in zip(
    ["love", "worst", "okay", "exceeded"], results
):
    print(f" '{text}...' -> {result['label']}: {result['score']:.3f}")
# Named Entity Recognition
# grouped_entities=True merges subword pieces into whole entity spans.
ner = pipeline("ner", grouped_entities=True)
entities = ner("Apple CEO Tim Cook announced the new iPhone at their Cupertino headquarters.")
for entity in entities:
    print(f" {entity['word']:20s} -> {entity['entity_group']:10s} (confidence: {entity['score']:.3f})")
# Apple -> ORG (confidence: 0.998)
# Tim Cook -> PER (confidence: 0.999)
# iPhone -> MISC (confidence: 0.987)
# Cupertino -> LOC (confidence: 0.997)
# Text classification without task-specific training: zero-shot scores
# the text against arbitrary caller-supplied candidate labels.
classifier = pipeline("zero-shot-classification")
result = classifier(
    "The stock market crashed after the Fed raised interest rates.",
    candidate_labels=["finance", "sports", "technology", "politics"]
)
# labels/scores come back sorted by score, so index 0 is the top label.
print(f"\nTopic: {result['labels'][0]} ({result['scores'][0]:.3f})")
Key Takeaways
- BPE tokenization (used by GPT) handles unknown words by breaking them into learned subwords
- Word2Vec/GloVe provide static embeddings; transformer models provide contextual embeddings
- Always preprocess text: lowercase, remove URLs/HTML, handle special characters
- Hugging Face pipelines give production-ready NLP in 2 lines of code
- Contextual embeddings from transformers have replaced static embeddings for most tasks