Intermediate
20 min
Full Guide
Hugging Face Ecosystem
Master the Hugging Face Transformers library, model hub, tokenizers, datasets, pipelines API, and fine-tuning pretrained models
Why Hugging Face Matters
Hugging Face has become the GitHub of machine learning. It hosts 500K+ models, 100K+ datasets, and provides the most popular libraries for working with transformers. If you work with NLP, computer vision, or any modern AI, you will use Hugging Face.
The Hugging Face Stack:
Transformers: Model library (PyTorch/TF)
Datasets: Dataset loading and processing
Tokenizers: Fast tokenization (Rust)
Hub: Model and dataset repository
Accelerate: Multi-GPU/TPU training
PEFT: Parameter-efficient fine-tuning (see the sketch just below)
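As a quick taste of the stack beyond Transformers and Datasets, here is a minimal PEFT sketch that wraps a classifier with LoRA adapters so only a small fraction of the weights are trained. The target_modules names are an assumption for DistilBERT (its attention projections are called q_lin and v_lin); other architectures use different module names.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification
# Hypothetical sketch: add LoRA adapters to a DistilBERT classifier with PEFT
base = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
lora_config = LoraConfig(
    r=8,                                 # rank of the low-rank update matrices
    lora_alpha=16,                       # scaling factor for the updates
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"],   # assumed DistilBERT attention projections
    task_type="SEQ_CLS",
)
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()  # only a small percentage of weights is trainable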
Pipelines: Zero-Code ML
Pipelines are the simplest way to use pretrained models. One line of code for inference:
from transformers import pipeline
# Text Generation
generator = pipeline("text-generation", model="gpt2")
result = generator("The future of AI is", max_length=50, num_return_sequences=2)
for r in result:
    print(f"Generated: {r['generated_text'][:100]}...")
# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
article = """
Artificial intelligence has made remarkable progress in recent years.
Large language models can now write code, answer questions, and engage
in complex reasoning. However, challenges remain in areas like
hallucination, bias, and energy consumption. Researchers are working
on making AI more reliable and efficient.
"""
summary = summarizer(article, max_length=50, min_length=20)
print(f"Summary: {summary[0]['summary_text']}")
# Question Answering
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
result = qa(
    question="What are the challenges of AI?",
    context=article
)
print(f"Answer: {result['answer']} (confidence: {result['score']:.3f})")
# Translation
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Machine learning is transforming every industry.")
print(f"French: {result[0]['translation_text']}")
# Image Classification
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
# result = classifier("cat.jpg")
# print(f"This is a: {result[0]['label']}")
# Available tasks: text-classification, token-classification,
# fill-mask, text2text-generation, image-segmentation,
# audio-classification, automatic-speech-recognition, and more!
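Pipelines also handle device placement and batching. A rough sketch (device=0 assumes a CUDA GPU; drop the argument or pass device=-1 to stay on CPU):
# Run a pipeline on GPU and batch over a list of inputs
sentiment = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0  # assumes a CUDA GPU is available
)
reviews = ["Loved it", "Waste of money", "Decent, not great"]
for out in sentiment(reviews, batch_size=8):
    print(f"{out['label']}: {out['score']:.3f}")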
Loading Models and Tokenizers
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
# AutoModel automatically detects the right model class
model_name = "bert-base-uncased"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Tokenize text
text = "Hugging Face makes NLP accessible to everyone"
inputs = tokenizer(
    text,
    return_tensors="pt",  # return PyTorch tensors
    padding=True,         # pad to the longest sequence in the batch
    truncation=True,      # truncate if too long
    max_length=128
)
print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])}")
print(f"Vocabulary size: {tokenizer.vocab_size}")
# Get model outputs
import torch
with torch.no_grad():
    outputs = model(**inputs)
# outputs.last_hidden_state: (batch, seq_len, hidden_dim)
print(f"Output shape: {outputs.last_hidden_state.shape}")
# For classification tasks, use the task-specific model
classifier = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
cls_tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
inputs = cls_tokenizer("I love this product!", return_tensors="pt")
with torch.no_grad():
    logits = classifier(**inputs).logits
prediction = torch.argmax(logits, dim=-1).item()
print(f"Sentiment: {'POSITIVE' if prediction == 1 else 'NEGATIVE'}")
Working with Datasets
from datasets import load_dataset, Dataset
# Load popular datasets
imdb = load_dataset("imdb")
print(f"IMDB train: {len(imdb['train'])} examples")
print(f"Sample: {imdb['train'][0]['text'][:100]}...")
print(f"Label: {imdb['train'][0]['label']}") # 0=neg, 1=pos
# Load specific splits and subsets
squad = load_dataset("squad", split="train[:1000]") # first 1000
# Apply transformations efficiently
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )
tokenized = imdb.map(tokenize_function, batched=True, num_proc=4)
# Filter, sort, select
filtered = imdb["train"].filter(lambda x: len(x["text"]) > 100)
shuffled = imdb["train"].shuffle(seed=42)
subset = imdb["train"].select(range(100))
# Create custom datasets
my_data = Dataset.from_dict({
    "text": ["Great product!", "Terrible service", "It was okay"],
    "label": [1, 0, 1]
})
# Stream large datasets (read examples on the fly without downloading the full corpus)
large_dataset = load_dataset("wikipedia", "20220301.en", streaming=True)
for i, example in enumerate(large_dataset["train"]):
    if i >= 5:
        break
    print(f"Article: {example['title']}")
print("Streaming allows processing datasets larger than RAM")
Fine-Tuning a Pretrained Model
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    TrainingArguments, Trainer
)
from datasets import load_dataset
import numpy as np
# 1. Load dataset and model
dataset = load_dataset("imdb")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)
# 2. Tokenize dataset
def tokenize(examples):
    return tokenizer(examples["text"], padding="max_length",
                     truncation=True, max_length=256)
tokenized = dataset.map(tokenize, batched=True)
small_train = tokenized["train"].shuffle(seed=42).select(range(2000))
small_eval = tokenized["test"].shuffle(seed=42).select(range(500))
# 3. Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}
# 4. Configure training
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
    fp16=True,  # mixed precision training (requires a CUDA GPU)
)
# 5. Train!
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
    compute_metrics=compute_metrics,
)
trainer.train()
# 6. Push to Hub (share with the world)
# trainer.push_to_hub("my-sentiment-model")
print("Fine-tuning complete! Model saved to ./results")
Key Takeaways
- Hugging Face Pipelines give you production-ready ML in one line of code
- AutoModel and AutoTokenizer automatically detect the right classes for any model
- The Datasets library handles loading, streaming, and preprocessing at scale
- Fine-tuning with Trainer takes roughly 20 lines of configuration and typically gives strong results
- Always start with a pretrained model and fine-tune rather than training from scratch