TechLead
Intermediate
20 min
Full Guide

Hugging Face Ecosystem

Master the Hugging Face Transformers library, model hub, tokenizers, datasets, pipelines API, and fine-tuning pretrained models

Why Hugging Face Matters

Hugging Face has become the GitHub of machine learning. It hosts 500K+ models, 100K+ datasets, and provides the most popular libraries for working with transformers. If you work with NLP, computer vision, or any modern AI, you will use Hugging Face.

The Hugging Face Stack:

Transformers: Model library (PyTorch/TF)
Datasets: Dataset loading and processing
Tokenizers: Fast tokenization (Rust)
Hub: Model and dataset repository
Accelerate: Multi-GPU/TPU training
PEFT: Parameter-efficient fine-tuning

Pipelines: One-Line ML

Pipelines are the simplest way to use pretrained models. One line of code for inference:

# pipeline() wires together a tokenizer, a model, and pre/post-processing for a
# named task; passing model= pins an exact Hub checkpoint instead of the task default.
from transformers import pipeline

# Text Generation: continue a prompt with GPT-2.
generator = pipeline("text-generation", model="gpt2")
# num_return_sequences=2 asks for two independent completions of the same prompt;
# max_length bounds the total token count (prompt plus generated text).
result = generator("The future of AI is", max_length=50, num_return_sequences=2)
for r in result:
    # Truncate each sample to 100 characters to keep the demo output short.
    print(f"Generated: {r['generated_text'][:100]}...")

# Summarization: condense a paragraph with BART fine-tuned for summarization.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
article = """
Artificial intelligence has made remarkable progress in recent years.
Large language models can now write code, answer questions, and engage
in complex reasoning. However, challenges remain in areas like
hallucination, bias, and energy consumption. Researchers are working
on making AI more reliable and efficient.
"""
# min_length / max_length constrain the summary length (in tokens).
summary = summarizer(article, max_length=50, min_length=20)
print(f"Summary: {summary[0]['summary_text']}")

# Question Answering: extractive QA — the answer is a span taken from `context`.
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
result = qa(
    question="What are the challenges of AI?",
    context=article
)
# `score` is the model's confidence in the extracted answer span.
print(f"Answer: {result['answer']} (confidence: {result['score']:.3f})")

# Translation: English -> French with a Helsinki-NLP MarianMT checkpoint.
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
result = translator("Machine learning is transforming every industry.")
print(f"French: {result[0]['translation_text']}")

# Image Classification: Vision Transformer checkpoint; the call itself is
# commented out because it needs a local image file to run.
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
# result = classifier("cat.jpg")
# print(f"This is a: {result[0]['label']}")

# Available tasks: text-classification, token-classification,
# fill-mask, text2text-generation, image-segmentation,
# audio-classification, automatic-speech-recognition, and more!

Loading Models and Tokenizers

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# AutoModel automatically detects the right model class:
# the checkpoint's config tells the Auto* classes which architecture to build.
model_name = "bert-base-uncased"

# Load tokenizer and model from the same checkpoint so vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize text
text = "Hugging Face makes NLP accessible to everyone"
inputs = tokenizer(
    text,
    return_tensors="pt",       # return PyTorch tensors
    padding=True,              # pad to the longest sequence in the batch
    truncation=True,           # truncate if too long
    max_length=128
)

print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])}")
print(f"Vocabulary size: {tokenizer.vocab_size}")

# Get model outputs; no_grad() skips autograd bookkeeping for inference.
import torch
with torch.no_grad():
    outputs = model(**inputs)

# outputs.last_hidden_state: (batch, seq_len, hidden_dim)
print(f"Output shape: {outputs.last_hidden_state.shape}")

# For classification tasks, use the task-specific model
# (adds a classification head on top of the base encoder).
classifier = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
cls_tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

inputs = cls_tokenizer("I love this product!", return_tensors="pt")
with torch.no_grad():
    logits = classifier(**inputs).logits
    # argmax over the label dimension; for this SST-2 head, index 1 is positive.
    prediction = torch.argmax(logits, dim=-1)
    print(f"Sentiment: {'POSITIVE' if prediction == 1 else 'NEGATIVE'}")

Working with Datasets

from datasets import load_dataset, Dataset

# Load popular datasets
imdb = load_dataset("imdb")
print(f"IMDB train: {len(imdb['train'])} examples")
print(f"Sample: {imdb['train'][0]['text'][:100]}...")
print(f"Label: {imdb['train'][0]['label']}")  # 0=neg, 1=pos

# Load specific splits and subsets
squad = load_dataset("squad", split="train[:1000]")  # first 1000

# Apply transformations efficiently
def tokenize_function(examples):
    # `examples` is a batch (dict of lists) because map() is called with batched=True.
    # NOTE(review): relies on the `tokenizer` created earlier in this guide.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

# batched=True tokenizes many rows per call; num_proc=4 uses 4 worker processes.
tokenized = imdb.map(tokenize_function, batched=True, num_proc=4)

# Filter, sort, select
filtered = imdb["train"].filter(lambda x: len(x["text"]) > 100)
shuffled = imdb["train"].shuffle(seed=42)
subset = imdb["train"].select(range(100))

# Create custom datasets
my_data = Dataset.from_dict({
    "text": ["Great product!", "Terrible service", "It was okay"],
    # NOTE(review): "It was okay" is labeled 1 (positive) — confirm intended.
    "label": [1, 0, 1]
})

# Stream large datasets (no download needed)
large_dataset = load_dataset("wikipedia", "20220301.en", streaming=True)
for i, example in enumerate(large_dataset["train"]):
    if i >= 5: break
    print(f"  Article: {example['title']}")
print("Streaming allows processing datasets larger than RAM")

Fine-Tuning a Pretrained Model

from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    TrainingArguments, Trainer
)
from datasets import load_dataset
import numpy as np

# 1. Load dataset and model
dataset = load_dataset("imdb")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 attaches a fresh binary classification head to the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

# 2. Tokenize dataset
def tokenize(examples):
    # `examples` is a batch (dict of lists) because map() runs with batched=True below.
    return tokenizer(examples["text"], padding="max_length",
                    truncation=True, max_length=256)

tokenized = dataset.map(tokenize, batched=True)
# Shuffle with a fixed seed, then take small slices so the demo trains quickly.
small_train = tokenized["train"].shuffle(seed=42).select(range(2000))
small_eval = tokenized["test"].shuffle(seed=42).select(range(500))

# 3. Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy for a Trainer evaluation step.

    Args:
        eval_pred: pair of (logits, labels), where logits has shape
            (num_examples, num_labels) and labels has shape (num_examples,).

    Returns:
        dict mapping "accuracy" to the fraction of correct predictions.
    """
    model_logits, gold = eval_pred
    predicted = np.argmax(model_logits, axis=-1)
    return {"accuracy": (predicted == gold).mean()}

# 4. Configure training
training_args = TrainingArguments(
    output_dir="./results",            # checkpoints and the final model land here
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,     # eval can use a larger batch (no gradients)
    warmup_steps=100,                  # linear LR warmup before decay
    weight_decay=0.01,
    learning_rate=2e-5,                # common fine-tuning LR for BERT-family models
    # NOTE(review): recent transformers releases renamed `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed library version.
    evaluation_strategy="epoch",       # evaluate at the end of every epoch
    save_strategy="epoch",             # must match eval cadence for best-model tracking
    load_best_model_at_end=True,
    logging_steps=50,
    fp16=True,  # mixed precision training
)

# 5. Train!
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

# 6. Push to Hub (share with the world)
# trainer.push_to_hub("my-sentiment-model")
print("Fine-tuning complete! Model saved to ./results")

Key Takeaways

  • Hugging Face Pipelines give you production-ready ML in one line of code
  • AutoModel and AutoTokenizer automatically detect the right classes for any model
  • The Datasets library handles loading, streaming, and preprocessing at scale
  • Fine-tuning with Trainer takes ~20 lines of config code for excellent results
  • Always start with a pretrained model and fine-tune rather than training from scratch

Continue Learning