Computer Vision with Deep Learning
Build computer vision systems with CNNs, image classification, object detection, and transfer learning using PyTorch and pretrained models
Convolutional Neural Networks (CNNs)
CNNs are the backbone of computer vision. Unlike fully connected networks that treat each pixel independently, CNNs use convolutional filters that slide across the image, detecting local patterns like edges, textures, and shapes. Deeper layers combine these into complex features.
CNN Hierarchy:
Early layers detect edges and colors -> Middle layers detect textures and parts -> Deep layers detect objects and scenes
CNN Architecture Layers
Convolution Layer
Applies learnable filters (kernels) to extract features. A 3x3 filter slides across the image computing dot products. Output: feature maps.
Pooling Layer
Reduces spatial dimensions. Max pooling takes the max value in each window. Reduces computation and adds translation invariance.
Batch Normalization
Normalizes layer outputs to zero mean and unit variance. Stabilizes training, allows higher learning rates, acts as regularization.
Fully Connected Layer
After feature extraction, flatten and feed to dense layers for classification. The last layer has num_classes neurons producing logits; softmax (usually folded into the cross-entropy loss during training) converts them to class probabilities.
Building a CNN in PyTorch
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Image preprocessing pipeline: geometric/color augmentation first, then
# tensor conversion and normalization with ImageNet channel statistics.
_pipeline = [
    transforms.Resize((224, 224)),
    # Randomized augmentations (applied per sample)
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2),
    # PIL image -> float tensor scaled to [0, 1]
    transforms.ToTensor(),
    # Standardize each channel with ImageNet mean/std
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_pipeline)
# Custom CNN for image classification
class CNN(nn.Module):
    """Small image classifier: three conv blocks, then a dense head.

    Thanks to the adaptive pooling at the end of the feature extractor,
    any input spatial size is accepted; outputs are raw logits
    (shape: batch x num_classes).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_bn_relu(c_in, c_out):
            # One conv block: 3x3 conv -> batchnorm -> ReLU.
            # Pooling is appended separately by the caller.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]

        # Layer order matches a hand-written Sequential, so module indices
        # (and therefore state_dict keys) stay stable.
        layers = []
        layers += conv_bn_relu(3, 32) + [nn.MaxPool2d(2, 2)]    # 224 -> 112
        layers += conv_bn_relu(32, 64) + [nn.MaxPool2d(2, 2)]   # 112 -> 56
        layers += conv_bn_relu(64, 128) + [nn.AdaptiveAvgPool2d((4, 4))]  # any size -> 4x4
        self.features = nn.Sequential(*layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Extract features, then classify; returns unnormalized logits."""
        return self.classifier(self.features(x))
# Instantiate the model and report its size.
model = CNN(num_classes=10)
param_count = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {param_count:,}")
# ~620K parameters - small and fast to train
Transfer Learning with Pretrained Models
Transfer learning uses models pretrained on ImageNet (the 1000-class ILSVRC subset, ~1.28M training images) and fine-tunes them on your specific task. This is the standard approach in practice because training from scratch requires massive datasets.
import torch
import torch.nn as nn
import torchvision.models as models

# ===== Method 1: Fine-tune a pretrained ResNet =====
model = models.resnet50(weights='IMAGENET1K_V2')

# Freeze the pretrained backbone so gradients flow only through the new head.
for backbone_param in model.parameters():
    backbone_param.requires_grad_(False)

# Swap in a task-specific head (e.g., 5 classes). Freshly created layers
# default to requires_grad=True, so they are the only trainable part.
num_classes = 5
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, num_classes)
)

# Only the new layers will be trained
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Training {trainable:,} of {total:,} parameters ({100*trainable/total:.1f}%)")

# ===== Method 2: Use EfficientNet (modern, efficient) =====
model = models.efficientnet_b0(weights='IMAGENET1K_V1')
# Read the input width of the original head before replacing it.
head_in_features = model.classifier[1].in_features
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(head_in_features, num_classes)
)
# ===== Method 3: YOLO for Object Detection =====
# pip install ultralytics
from ultralytics import YOLO

# Load a pretrained YOLOv8; the nano variant trades accuracy for speed.
yolo_model = YOLO('yolov8n.pt')

# Run inference, keeping only detections with confidence >= 0.5.
results = yolo_model.predict('image.jpg', conf=0.5)
for result in results:
    # result.names maps class index -> human-readable label
    label_map = result.names
    for box in result.boxes:
        class_name = label_map[int(box.cls)]
        confidence = float(box.conf)
        coords = box.xyxy[0].tolist()  # [x1, y1, x2, y2]
        print(f" {class_name}: {confidence:.2f} at {coords}")
Image Preprocessing Best Practices
import torchvision.transforms as T
from PIL import Image

# ImageNet channel statistics, shared by both pipelines below.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Training transforms (with augmentation)
train_transform = T.Compose([
    T.Resize(256),                  # resize shorter side to 256
    T.RandomCrop(224),              # random 224x224 crop
    T.RandomHorizontalFlip(p=0.5),  # 50% chance flip
    T.RandomVerticalFlip(p=0.1),    # 10% chance flip
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    T.RandomRotation(15),           # random rotation up to 15 degrees
    T.ToTensor(),                   # convert to tensor [0, 1]
    T.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),  # ImageNet normalization
])

# Validation/test transforms (NO augmentation)
val_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),              # deterministic center crop
    T.ToTensor(),
    T.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
])

# Apply transforms
# img = Image.open("photo.jpg")
# train_tensor = train_transform(img)  # shape: (3, 224, 224)
# val_tensor = val_transform(img)
print("Always use augmentation for training, not for validation!")
print("Always normalize with ImageNet stats when using pretrained models.")
Key Takeaways
- CNNs learn hierarchical features: edges -> textures -> parts -> objects
- Transfer learning with pretrained models (ResNet, EfficientNet) is the standard approach
- Data augmentation (flips, rotations, color jitter) prevents overfitting on small datasets
- YOLO is the go-to for real-time object detection in production
- Always normalize images with ImageNet statistics when using pretrained models