Rust By Example

Rust AI Inference Benchmarking

How to benchmark AI inference performance in Rust using Criterion. Measure throughput, latency distribution, batch efficiency, and compare implementations with reproducible results.

Topic: AI Inference

Search intent: High-intent search: "rust ai inference benchmark criterion"

Rust AI Inference Benchmarking

Setup

toml
# Cargo.toml
[dev-dependencies]
# Criterion is only needed for benchmarks, so it is declared as a dev-dependency.
# The "html_reports" feature enables HTML report output.
criterion = { version = "0.5", features = ["html_reports"] }

[[bench]]
# Must match the benchmark file name: benches/inference.rs
name = "inference"
# Disable the default libtest harness so that criterion_main! can supply main().
harness = false

Runnable example — Criterion benchmark

rust
// benches/inference.rs
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::time::Duration;

/// Toy stand-in for a linear layer with a single output neuron.
///
/// Computes the dot product of `input` and `weight` and returns it wrapped in
/// a one-element vector.
///
/// # Panics
///
/// Panics if `input` and `weight` have different lengths.
fn linear_layer(input: &[f32], weight: &[f32]) -> Vec<f32> {
    assert_eq!(input.len(), weight.len());
    let dot: f32 = input
        .iter()
        .zip(weight.iter())
        .map(|(&x, &w)| x * w)
        .sum();
    vec![dot]
}

/// Simulates batch inference: applies `linear_layer` to every sample in
/// `batch`, in order, using the same `weight`, and returns one output vector
/// per sample.
fn batch_infer(batch: &[Vec<f32>], weight: &[f32]) -> Vec<Vec<f32>> {
    let mut outputs = Vec::with_capacity(batch.len());
    for sample in batch {
        outputs.push(linear_layer(sample, weight));
    }
    outputs
}

/// Benchmarks a single `linear_layer` call on 256-element input and weight
/// vectors under the id `single_inference_256d`.
fn benchmark_single(c: &mut Criterion) {
    let input = vec![1.0f32; 256];
    let weight = vec![0.5f32; 256];

    c.bench_function("single_inference_256d", |bencher| {
        bencher.iter(|| {
            // black_box on both the arguments and the result keeps the
            // optimizer from constant-folding or discarding the computation.
            let output = linear_layer(black_box(&input), black_box(&weight));
            black_box(output)
        })
    });
}

/// Benchmarks `batch_infer` for batch sizes 1, 8, 32 and 128 in the
/// `batch_inference` group, reporting throughput in elements (samples) per
/// iteration so the sizes can be compared.
fn benchmark_batches(c: &mut Criterion) {
    let weight = vec![0.5f32; 256];

    let mut group = c.benchmark_group("batch_inference");
    group.measurement_time(Duration::from_secs(10));

    for batch_size in [1, 8, 32, 128] {
        // Every sample is an identical 256-element vector of ones.
        let batch: Vec<Vec<f32>> = std::iter::repeat(vec![1.0f32; 256])
            .take(batch_size)
            .collect();

        // Tell Criterion how many samples one iteration processes.
        group.throughput(Throughput::Elements(batch_size as u64));

        let id = BenchmarkId::from_parameter(batch_size);
        group.bench_with_input(id, &batch, |bencher, batch| {
            bencher.iter(|| {
                let outputs = batch_infer(black_box(batch), black_box(&weight));
                black_box(outputs)
            })
        });
    }

    group.finish();
}

// Bundle both benchmark functions into a group named `benches`, then hand that
// group to `criterion_main!`, which takes the place of the default bench
// harness (disabled via `harness = false` in Cargo.toml).
criterion_group!(benches, benchmark_single, benchmark_batches);
criterion_main!(benches);

Run with:

bash
cargo bench
# Results in target/criterion/

Throughput measurement pattern

rust
use std::time::Instant;

/// Measures how many times per second `infer_fn` can be called on `input`.
///
/// Runs 100 untimed warm-up calls, then calls `infer_fn` repeatedly for about
/// `duration_secs` seconds and returns `calls / elapsed_seconds`.
///
/// The measurement window starts *after* the warm-up, and the rate is computed
/// from the actually elapsed time rather than the nominal duration, so neither
/// the warm-up nor an overrunning final call skews the result.
///
/// Returns `0.0` if no timed call was made (e.g. `duration_secs == 0`).
fn measure_throughput(
    infer_fn: impl Fn(&[f32]) -> Vec<f32>,
    input: &[f32],
    duration_secs: u64,
) -> f64 {
    // Warm up; black_box keeps the optimizer from discarding the unused result.
    for _ in 0..100 {
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
    }

    let window = std::time::Duration::from_secs(duration_secs);
    let mut count = 0u64;

    // Start the clock only once the warm-up has finished.
    let start = Instant::now();
    while start.elapsed() < window {
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
        count += 1;
    }
    let elapsed = start.elapsed().as_secs_f64();

    if count == 0 {
        // Nothing was timed; avoid a 0/0 NaN.
        return 0.0;
    }
    count as f64 / elapsed
}

/// Demo: measures the throughput of a 128-element element-wise product over a
/// 3-second window and prints it.
fn main() {
    let input = vec![0.5f32; 128];
    let weight = vec![1.0f32; 128];

    // The "model" multiplies each input element by its matching weight.
    let infer = |inp: &[f32]| -> Vec<f32> {
        inp.iter().zip(&weight).map(|(x, w)| x * w).collect()
    };

    let rps = measure_throughput(infer, &input, 3);
    println!("Throughput: {:.0} inference/sec", rps);
}

Latency percentile tracker

rust
use std::time::Instant;

/// Collects wall-clock latencies of individual calls and prints a percentile
/// summary.
struct LatencyRecorder {
    samples: Vec<f64>, // milliseconds
}

impl LatencyRecorder {
    /// Creates a recorder with no samples.
    fn new() -> Self {
        Self { samples: Vec::new() }
    }

    /// Runs `f` once and stores its elapsed wall-clock time in milliseconds.
    fn record<F: FnOnce()>(&mut self, f: F) {
        let started = Instant::now();
        f();
        self.samples.push(started.elapsed().as_secs_f64() * 1000.0);
    }

    /// Sorts the samples in place (ascending) and prints the sample count,
    /// p50, p95, p99 and maximum latency.
    ///
    /// If nothing has been recorded, prints an `n=0` line and returns instead
    /// of panicking on the empty sample list.
    fn report(&mut self) {
        let n = self.samples.len();
        if n == 0 {
            println!("n=0 (no samples recorded)");
            return;
        }

        // total_cmp is a total order over f64, so sorting cannot panic.
        self.samples.sort_unstable_by(f64::total_cmp);

        // Index of the pct-th percentile, clamped to the last sample so the
        // lookup stays in bounds for any pct up to and beyond 100.
        let p = |pct: f64| {
            let idx = ((n as f64 * pct / 100.0) as usize).min(n - 1);
            self.samples[idx]
        };
        println!("n={} p50={:.3}ms p95={:.3}ms p99={:.3}ms max={:.3}ms",
            n, p(50.0), p(95.0), p(99.0), self.samples[n - 1]);
    }
}

/// Demo: times 10,000 element-wise products of two 64-element vectors and
/// prints the latency percentiles.
fn main() {
    let input = vec![0.5f32; 64];
    let weight = vec![1.0f32; 64];
    let mut rec = LatencyRecorder::new();

    (0..10_000).for_each(|_| {
        rec.record(|| {
            // The product is discarded right away; only the time spent matters.
            let _: Vec<f32> = input
                .iter()
                .zip(&weight)
                .map(|(x, w)| x * w)
                .collect();
        });
    });

    rec.report();
}

Benchmarking checklist

  • [ ] Use the --release flag when running hand-rolled timers with cargo run (cargo bench already builds with optimizations); debug builds are misleading.
  • [ ] Warm up for at least 1 second before measuring.
  • [ ] Use black_box() to prevent compiler from optimizing away benchmarks.
  • [ ] Measure both single-request and batch scenarios.
  • [ ] Report p50, p95, p99 — never just mean or min.
  • [ ] Run on idle hardware with CPU frequency scaling disabled.
  • [ ] Store baseline results to detect regressions in CI.

CI regression detection

bash
# Store baseline
cargo bench -- --save-baseline main

# Compare after changes
cargo bench -- --baseline main

Related reading

Related Guides

Continue in This Topic

More Rust Guides