Rust AI Inference Benchmarking

Setup

toml

# Cargo.toml
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }

[[bench]]
name = "inference"
harness = false

Runnable example — Criterion benchmark

rust

// benches/inference.rs
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::time::Duration;

/// Toy linear layer with a single output unit.
///
/// Multiplies `input` and `weight` element-wise, sums the products (a dot
/// product), and returns that one value wrapped in a one-element vector.
///
/// # Panics
///
/// Panics if `input` and `weight` have different lengths.
fn linear_layer(input: &[f32], weight: &[f32]) -> Vec<f32> {
    assert_eq!(input.len(), weight.len());
    let dot: f32 = input
        .iter()
        .zip(weight.iter())
        .map(|(&x, &w)| x * w)
        .sum();
    vec![dot]
}

/// Runs `linear_layer` over every sample in `batch` with the shared `weight`.
///
/// Returns one output vector per input sample, in the same order as `batch`.
fn batch_infer(batch: &[Vec<f32>], weight: &[f32]) -> Vec<Vec<f32>> {
    let mut outputs = Vec::with_capacity(batch.len());
    for sample in batch {
        outputs.push(linear_layer(sample, weight));
    }
    outputs
}

/// Benchmarks one `linear_layer` call on 256-element input and weight vectors.
fn benchmark_single(c: &mut Criterion) {
    const DIM: usize = 256;
    let input = vec![1.0f32; DIM];
    let weight = vec![0.5f32; DIM];

    c.bench_function("single_inference_256d", |bencher| {
        bencher.iter(|| {
            // black_box on both the arguments and the result keeps the
            // optimizer from folding the call away.
            let output = linear_layer(black_box(&input), black_box(&weight));
            black_box(output)
        })
    });
}

/// Benchmarks `batch_infer` for batch sizes 1, 8, 32 and 128.
///
/// Each batch size is registered as its own benchmark inside the
/// "batch_inference" group, with the throughput declared as the number of
/// samples in the batch.
fn benchmark_batches(c: &mut Criterion) {
    let weight = vec![0.5f32; 256];

    let mut group = c.benchmark_group("batch_inference");
    group.measurement_time(Duration::from_secs(10));

    for batch_size in [1, 8, 32, 128] {
        // One 256-element sample per batch slot.
        let batch: Vec<Vec<f32>> = vec![vec![1.0f32; 256]; batch_size as usize];

        group.throughput(Throughput::Elements(batch_size as u64));

        let id = BenchmarkId::from_parameter(batch_size);
        group.bench_with_input(id, &batch, |bencher, samples| {
            bencher.iter(|| {
                let outputs = batch_infer(black_box(samples), black_box(&weight));
                black_box(outputs)
            })
        });
    }

    group.finish();
}

// Register both benchmark functions under the group name `benches`.
criterion_group!(benches, benchmark_single, benchmark_batches);
// Generate the `main` entry point for the `benches` group; the Cargo.toml
// entry sets `harness = false` for this bench target.
criterion_main!(benches);

Run with:

bash

cargo bench
# Results in target/criterion/

Throughput measurement pattern

rust

use std::time::Instant;

/// Measures how many times per second `infer_fn` can be called on `input`.
///
/// Runs 100 untimed warm-up calls, then calls `infer_fn` repeatedly for
/// roughly `duration_secs` seconds and returns `calls / elapsed_seconds`.
///
/// The measurement window starts *after* the warm-up, and the rate is computed
/// from the time actually elapsed rather than the nominal duration, so neither
/// the warm-up nor an overrunning final call skews the result.
///
/// Returns `0.0` when no timed call was made (for example when
/// `duration_secs` is 0).
fn measure_throughput(
    infer_fn: impl Fn(&[f32]) -> Vec<f32>,
    input: &[f32],
    duration_secs: u64,
) -> f64 {
    let window = std::time::Duration::from_secs(duration_secs);

    // Warm up (untimed). black_box keeps the optimizer from discarding the
    // call because its result is unused.
    for _ in 0..100 {
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
    }

    // Start the clock only once the warm-up is done.
    let start = Instant::now();
    let mut count = 0u64;
    while start.elapsed() < window {
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
        count += 1;
    }

    let elapsed = start.elapsed().as_secs_f64();
    if count == 0 || elapsed <= 0.0 {
        // Avoid 0/0 = NaN when the window is empty.
        return 0.0;
    }
    count as f64 / elapsed
}

/// Demo: measures the throughput of a 128-element element-wise multiply
/// over a 3-second window and prints the rate.
fn main() {
    let input = vec![0.5f32; 128];
    let weight = vec![1.0f32; 128];

    // The "model" under test: element-wise product of the input and weights.
    let elementwise = |inp: &[f32]| -> Vec<f32> {
        inp.iter().zip(&weight).map(|(x, w)| x * w).collect()
    };

    let rps = measure_throughput(elementwise, &input, 3);
    println!("Throughput: {:.0} inference/sec", rps);
}

Latency percentile tracker

rust

use std::time::Instant;

/// Collects wall-clock latencies of closures and prints percentile summaries.
struct LatencyRecorder {
    samples: Vec<f64>, // milliseconds
}

impl LatencyRecorder {
    /// Creates a recorder with no samples.
    fn new() -> Self {
        Self { samples: Vec::new() }
    }

    /// Runs `f` once and stores how long it took, in milliseconds.
    fn record<F: FnOnce()>(&mut self, f: F) {
        let t = Instant::now();
        f();
        self.samples.push(t.elapsed().as_secs_f64() * 1000.0);
    }

    /// Nearest-rank percentile of an ascending-sorted slice.
    ///
    /// Returns the smallest sample such that at least `pct` percent of the
    /// samples are less than or equal to it, or `None` if `sorted` is empty.
    /// The rank is clamped to the valid range, so `pct <= 0` yields the
    /// minimum and `pct >= 100` yields the maximum.
    fn percentile(sorted: &[f64], pct: f64) -> Option<f64> {
        let n = sorted.len();
        if n == 0 {
            return None;
        }
        // Nearest-rank: 1-based rank = ceil(pct/100 * n), then convert to a
        // 0-based index. Truncating instead of ceil-minus-one lands one
        // sample too high (e.g. p99 of 100 samples would be the maximum).
        let rank = (n as f64 * pct / 100.0).ceil() as usize;
        Some(sorted[rank.clamp(1, n) - 1])
    }

    /// Sorts the recorded samples and prints n, p50, p95, p99 and max.
    ///
    /// Prints a short notice instead of panicking when nothing was recorded.
    fn report(&mut self) {
        // total_cmp gives a total order over f64, so no unwrap is needed.
        self.samples.sort_by(f64::total_cmp);
        let n = self.samples.len();
        let pct = |p: f64| Self::percentile(&self.samples, p);
        match (pct(50.0), pct(95.0), pct(99.0), self.samples.last()) {
            (Some(p50), Some(p95), Some(p99), Some(max)) => {
                println!(
                    "n={} p50={:.3}ms p95={:.3}ms p99={:.3}ms max={:.3}ms",
                    n, p50, p95, p99, max
                );
            }
            _ => println!("n=0 (no samples recorded)"),
        }
    }
}

/// Demo: times 10,000 runs of a 64-element element-wise multiply and prints
/// the latency percentiles.
fn main() {
    let input = vec![0.5f32; 64];
    let weight = vec![1.0f32; 64];
    let mut rec = LatencyRecorder::new();

    // The operation being timed; its result is discarded.
    let run_once = || {
        let _: Vec<f32> = input.iter().zip(&weight).map(|(x, w)| x * w).collect();
    };

    for _ in 0..10_000 {
        rec.record(&run_once);
    }

    rec.report();
}

Benchmarking checklist

[ ] Use the --release flag for hand-rolled timing binaries (cargo bench already builds with optimizations); debug-build numbers are misleading.
[ ] Warm up for at least 1 second before measuring.
[ ] Use black_box() on inputs and results to prevent the compiler from optimizing away the work being benchmarked.
[ ] Measure both single-request and batch scenarios.
[ ] Report p50, p95, p99 — never just mean or min.
[ ] Run on idle hardware with CPU frequency scaling disabled.
[ ] Store baseline results to detect regressions in CI.

CI regression detection

bash

# Store baseline
cargo bench -- --save-baseline main

# Compare after changes
cargo bench -- --baseline main

Rust AI Inference Benchmarking

Rust AI Inference Benchmarking

Setup

Runnable example — Criterion benchmark

Throughput measurement pattern

Latency percentile tracker

Benchmarking checklist

CI regression detection

Related reading

Related Guides

Rust AI Inference Performance Tuning

Rust AI Inference Decision Matrix

Continue in This Topic

Rust AI Inference Architecture

Rust AI Inference Best Practices

More Rust Guides

Building LLM Applications with Rust

LLM API Gateway in Rust

LLM Rust Anti-Patterns

LLM Rust Benchmarking

LLM Rust Decision Matrix

LLM Rust Interview Q&A