
Performance Optimization in Rust

Optimize Rust performance with zero-cost abstractions, criterion benchmarks, profiling, SIMD, cache-friendly structures, and flamegraphs.

Rust Performance Philosophy

Rust's core promise is zero-cost abstractions: high-level code compiles to the same machine code as carefully hand-written low-level code. Iterators, generics, and statically dispatched trait calls all compile down to efficient machine instructions with no runtime overhead (dynamic dispatch through dyn Trait is the exception and carries a small cost).
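
For example, an iterator chain and a hand-written loop typically compile to the same tight loop in release builds. A minimal sketch (the function names are illustrative):

// Both versions typically optimize to the same machine code in release builds.
fn sum_of_squares_iter(data: &[i64]) -> i64 {
    data.iter().map(|&x| x * x).sum()
}

fn sum_of_squares_loop(data: &[i64]) -> i64 {
    let mut total = 0;
    for &x in data {
        total += x * x;
    }
    total
}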

Performance Checklist

  • 1. Build in release mode: cargo build --release enables optimizations
  • 2. Measure first: Profile before optimizing — don't guess where bottlenecks are
  • 3. Avoid allocations: Reuse buffers, use references, prefer stack over heap
  • 4. Use iterators: Iterator chains often optimize better than manual loops

Benchmarking with Criterion

// Cargo.toml:
// [dev-dependencies]
// criterion = { version = "0.5", features = ["html_reports"] }
//
// [[bench]]
// name = "my_benchmark"
// harness = false

// benches/my_benchmark.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn fibonacci_recursive(n: u64) -> u64 {
    match n {
        0 | 1 => n,
        _ => fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2),
    }
}

fn fibonacci_iterative(n: u64) -> u64 {
    let (mut a, mut b) = (0u64, 1u64);
    for _ in 0..n {
        let temp = a + b;
        a = b;
        b = temp;
    }
    a
}

fn benchmark_fibonacci(c: &mut Criterion) {
    let mut group = c.benchmark_group("fibonacci");

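    // black_box keeps the compiler from constant-folding the input
    // and optimizing the benchmarked call away.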
    group.bench_function("recursive_20", |b| {
        b.iter(|| fibonacci_recursive(black_box(20)))
    });

    group.bench_function("iterative_20", |b| {
        b.iter(|| fibonacci_iterative(black_box(20)))
    });

    group.finish();
}

criterion_group!(benches, benchmark_fibonacci);
criterion_main!(benches);

// Run: cargo bench
// Results appear in target/criterion/report/index.html
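// Criterion can also compare runs against a saved baseline for regression
// detection (standard Criterion CLI flags):
//   cargo bench -- --save-baseline main
//   cargo bench -- --baseline main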

Avoiding Allocations

use std::fmt::Write;

fn main() {
    // BAD: Allocates a new String each iteration
    let mut results_bad = Vec::new();
    for i in 0..1000 {
        results_bad.push(format!("item_{i}"));
    }

    // GOOD: Pre-allocate with known capacity
    let mut results_good = Vec::with_capacity(1000);
    for i in 0..1000 {
        results_good.push(format!("item_{i}"));
    }

    // GOOD: Reuse a buffer
    let mut buffer = String::with_capacity(256);
    for i in 0..1000 {
        buffer.clear();
        write!(buffer, "item_{i}").unwrap();
        // Use buffer...
    }

    // Prefer &str parameters over String where possible
    fn process(_data: &str) { /* borrows, no allocation */ }
    // Instead of:
    fn process_owned(_data: String) { /* takes ownership, forcing callers to allocate or clone */ }

    // SmallVec for small collections (avoids heap for small sizes)
    // use smallvec::SmallVec;
    // let mut v: SmallVec<[i32; 8]> = SmallVec::new();
    // v.push(1); // On stack if <= 8 elements

    // Cow: Clone-on-Write — avoid cloning until mutation
    use std::borrow::Cow;
    fn maybe_modify(input: &str) -> Cow<'_, str> {
        if input.contains("bad") {
            Cow::Owned(input.replace("bad", "good"))
        } else {
            Cow::Borrowed(input) // No allocation!
        }
    }
}

Release Profile Optimizations

# Cargo.toml
[profile.release]
opt-level = 3           # Maximum optimization (the default for release)
lto = "fat"             # Link-time optimization (slower build, faster binary)
codegen-units = 1       # Single codegen unit (slower build, better optimization)
strip = true            # Strip debug symbols
panic = "abort"         # Smaller binary (no unwinding)

# For maximum speed:
# [profile.release]
# opt-level = 3
# lto = "fat"
# codegen-units = 1

# For minimum binary size:
# [profile.release]
# opt-level = "z"
# lto = true
# strip = true
# codegen-units = 1
# panic = "abort"
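
Note that target-cpu is a compiler codegen flag, not a Cargo profile key, so it is set through RUSTFLAGS or a .cargo/config.toml. For example:

# .cargo/config.toml
[build]
rustflags = ["-C", "target-cpu=native"]

# Or as a one-off environment variable:
# RUSTFLAGS="-C target-cpu=native" cargo build --release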

Profiling and Flamegraphs

# Install flamegraph tool
cargo install flamegraph

# Generate a flamegraph
cargo flamegraph --bin my-app

# Profile with perf (Linux)
perf record --call-graph dwarf ./target/release/my-app
perf report

# Profile with Instruments (macOS; requires: cargo install cargo-instruments)
cargo instruments -t "Time Profiler" --release

# Memory profiling with DHAT
# Cargo.toml: dhat = "0.3"
# #[global_allocator]
# static ALLOC: dhat::Alloc = dhat::Alloc;
# fn main() {
#     let _profiler = dhat::Profiler::new_heap();
#     // ... your code ...
# }

# Compile-time analysis
cargo build --release --timings   # Build timing report
cargo bloat --release             # Analyze binary size

Cache-Friendly Data Structures

// Struct of Arrays (SoA) vs Array of Structs (AoS)

// AoS — poor cache locality for single-field iteration
struct ParticleAoS {
    x: f32, y: f32, z: f32,
    vx: f32, vy: f32, vz: f32,
    mass: f32,
}
// In a Vec<ParticleAoS>, iterating over just positions also drags velocity and mass into cache

// SoA — great cache locality for single-field iteration
struct Particles {
    x: Vec<f32>,
    y: Vec<f32>,
    z: Vec<f32>,
    vx: Vec<f32>,
    vy: Vec<f32>,
    vz: Vec<f32>,
    mass: Vec<f32>,
}
// Iterating over positions only touches position data

impl Particles {
    fn update_positions(&mut self, dt: f32) {
        // Tight loop over contiguous memory — very cache-friendly
        for i in 0..self.x.len() {
            self.x[i] += self.vx[i] * dt;
            self.y[i] += self.vy[i] * dt;
            self.z[i] += self.vz[i] * dt;
        }
    }
}
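
The contiguous SoA layout also sets these loops up for SIMD: LLVM can often auto-vectorize them. A minimal sketch, reusing the Particles struct above and written with zipped iterators to drop per-index bounds checks (the method name is illustrative):

impl Particles {
    fn update_positions_vectorized(&mut self, dt: f32) {
        // Zipped iterators over contiguous f32 storage avoid bounds checks
        // and give the optimizer a clean loop to auto-vectorize into SIMD.
        for (x, vx) in self.x.iter_mut().zip(&self.vx) {
            *x += *vx * dt;
        }
        for (y, vy) in self.y.iter_mut().zip(&self.vy) {
            *y += *vy * dt;
        }
        for (z, vz) in self.z.iter_mut().zip(&self.vz) {
            *z += *vz * dt;
        }
    }
}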

Key Takeaways

  • ✅ Always benchmark in release mode — debug builds are often 10-100x slower
  • ✅ Use Criterion for statistical benchmarking and performance regression detection
  • ✅ Pre-allocate collections, reuse buffers, and prefer references to avoid allocations
  • ✅ LTO, codegen-units = 1, and target-cpu=native (via RUSTFLAGS) squeeze out maximum performance
  • ✅ Flamegraphs and profilers show where time is actually spent — measure, don't guess
