TechLead
Intermediate
20 min
Full Guide

Computer Vision with Deep Learning

Build computer vision systems with CNNs, image classification, object detection, and transfer learning using PyTorch and pretrained models

Convolutional Neural Networks (CNNs)

CNNs are the backbone of computer vision. Unlike fully connected networks that treat each pixel independently, CNNs use convolutional filters that slide across the image, detecting local patterns like edges, textures, and shapes. Deeper layers combine these into complex features.

CNN Hierarchy:

Early layers detect edges and colors -> Middle layers detect textures and parts -> Deep layers detect objects and scenes

CNN Architecture Layers

Convolution Layer

Applies learnable filters (kernels) to extract features. A 3x3 filter slides across the image computing dot products. Output: feature maps.

Pooling Layer

Reduces spatial dimensions. Max pooling takes the max value in each window. Reduces computation and adds translation invariance.

Batch Normalization

Normalizes layer outputs to zero mean and unit variance. Stabilizes training, allows higher learning rates, acts as regularization.

Fully Connected Layer

After feature extraction, flatten and feed to dense layers for classification. The last layer has num_classes neurons producing raw logits; softmax converts them to class probabilities (in PyTorch this is typically folded into the loss via CrossEntropyLoss rather than added as a layer).

Building a CNN in PyTorch

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Image preprocessing pipeline
# Image preprocessing pipeline: resize, augment, then convert + normalize.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    # Train-time augmentations: random flips/rotations/color shifts expose
    # the model to small variations of each image and reduce overfitting.
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2),
    # PIL image -> float tensor in [0, 1], then standardize per channel
    # with the ImageNet statistics (required when using pretrained models).
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Custom CNN for image classification
class CNN(nn.Module):
    """Compact three-stage convolutional classifier.

    Three Conv-BN-ReLU stages (3 -> 32 -> 64 -> 128 channels) extract
    features; AdaptiveAvgPool2d squeezes any spatial size down to 4x4 so
    the dense head works for arbitrary input resolutions.

    Args:
        num_classes: number of output logits (default 10).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        def conv_bn_relu(c_in, c_out):
            # One convolutional stage: 3x3 conv (padding=1 keeps H/W),
            # batch norm for training stability, ReLU nonlinearity.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]

        # Unpacking keeps the Sequential flat, so submodule indices (and
        # therefore state_dict keys) match a hand-written layer list.
        self.features = nn.Sequential(
            *conv_bn_relu(3, 32),
            nn.MaxPool2d(2, 2),            # halves H and W: 224 -> 112
            *conv_bn_relu(32, 64),
            nn.MaxPool2d(2, 2),            # 112 -> 56
            *conv_bn_relu(64, 128),
            nn.AdaptiveAvgPool2d((4, 4)),  # any spatial size -> 4x4
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),               # regularize the dense head
            nn.Linear(256, num_classes),   # raw logits; pair with CrossEntropyLoss
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        return self.classifier(self.features(x))

# Instantiate the model and report its size.
model = CNN(num_classes=10)
n_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {n_params:,}")
# ~620K parameters - small and fast to train

Transfer Learning with Pretrained Models

Transfer learning uses models pretrained on ImageNet (the 1000-class ILSVRC subset of roughly 1.28M images, drawn from the full ~14M-image ImageNet collection) and fine-tunes them on your specific task. This is the standard approach in practice because training from scratch requires massive datasets.

import torch
import torch.nn as nn
import torchvision.models as models

# ===== Method 1: Fine-tune a pretrained ResNet =====
model = models.resnet50(weights='IMAGENET1K_V2')

# Freeze the pretrained backbone so only the replacement head learns.
for p in model.parameters():
    p.requires_grad = False

# Swap in a fresh classification head sized for our task (5 classes).
# Newly constructed modules default to requires_grad=True, so the head
# is the only part that will be updated by the optimizer.
num_classes = 5
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, num_classes)
)

# Sanity check: how much of the network is actually trainable?
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(n for n, _ in param_info)
trainable = sum(n for n, grad in param_info if grad)
print(f"Training {trainable:,} of {total:,} parameters ({100*trainable/total:.1f}%)")

# ===== Method 2: Use EfficientNet (modern, efficient) =====
model = models.efficientnet_b0(weights='IMAGENET1K_V1')
# EfficientNet's stock classifier is Sequential(Dropout, Linear);
# rebuild it with the same shape but our own number of classes.
in_features = model.classifier[1].in_features
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(in_features, num_classes)
)

# ===== Method 3: YOLO for Object Detection =====
# pip install ultralytics
from ultralytics import YOLO

# Pretrained YOLOv8 nano: the smallest, fastest variant of the family.
yolo_model = YOLO('yolov8n.pt')

# Run inference; conf=0.5 discards detections below 50% confidence.
results = yolo_model.predict('image.jpg', conf=0.5)
for result in results:
    for box in result.boxes:
        # Each box carries a class id, a confidence score, and pixel
        # coordinates of the bounding rectangle.
        class_name = result.names[int(box.cls)]
        confidence = float(box.conf)
        coords = box.xyxy[0].tolist()  # [x1, y1, x2, y2]
        print(f"  {class_name}: {confidence:.2f} at {coords}")

Image Preprocessing Best Practices

import torchvision.transforms as T
from PIL import Image

# Training transforms (with augmentation)
# Training transforms (with augmentation).
# Geometric jitter (crop/flip/rotate) plus photometric jitter (color)
# shows the model a slightly different view of each image every epoch.
train_transform = T.Compose([
    T.Resize(256),                   # shorter side -> 256 px
    T.RandomCrop(224),               # random 224x224 window
    T.RandomHorizontalFlip(p=0.5),   # mirror half the time
    T.RandomVerticalFlip(p=0.1),     # rarely flip upside down
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    T.RandomRotation(15),            # up to +/-15 degrees
    T.ToTensor(),                    # PIL -> float tensor in [0, 1]
    # ImageNet channel statistics — mandatory with pretrained backbones.
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation/test transforms (NO augmentation)
# Validation/test transforms: deterministic only — no random augmentation,
# so evaluation metrics are reproducible from run to run.
val_transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),  # fixed center window instead of a random crop
    T.ToTensor(),
    # Same ImageNet normalization as training — must match exactly.
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Apply transforms
# img = Image.open("photo.jpg")
# train_tensor = train_transform(img)  # shape: (3, 224, 224)
# val_tensor = val_transform(img)

print("Always use augmentation for training, not for validation!")
print("Always normalize with ImageNet stats when using pretrained models.")

Key Takeaways

  • CNNs learn hierarchical features: edges -> textures -> parts -> objects
  • Transfer learning with pretrained models (ResNet, EfficientNet) is the standard approach
  • Data augmentation (flips, rotations, color jitter) prevents overfitting on small datasets
  • YOLO is the go-to for real-time object detection in production
  • Always normalize images with ImageNet statistics when using pretrained models

Continue Learning