Rust AI Inference Benchmarking
How to benchmark AI inference performance in Rust using Criterion. Measure throughput, latency distribution, batch efficiency, and compare implementations with reproducible results.
Topic: AI Inference
Search intent: High-intent search: "rust ai inference benchmark criterion"
Rust AI Inference Benchmarking
Setup
# Cargo.toml
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
[[bench]]
name = "inference"
harness = false

Runnable example — Criterion benchmark
// benches/inference.rs
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::time::Duration;
/// Stand-in for a linear layer with a single output unit.
///
/// Computes the dot product of `input` and `weight` and returns it as a
/// one-element vector.
///
/// # Panics
///
/// Panics if `input` and `weight` have different lengths.
fn linear_layer(input: &[f32], weight: &[f32]) -> Vec<f32> {
    assert_eq!(input.len(), weight.len());
    let dot: f32 = input
        .iter()
        .zip(weight.iter())
        .map(|(&x, &w)| x * w)
        .sum();
    vec![dot]
}
/// Stand-in for batch inference: applies `linear_layer` to each sample in
/// `batch` with the shared `weight`, returning one output vector per sample
/// in input order.
fn batch_infer(batch: &[Vec<f32>], weight: &[f32]) -> Vec<Vec<f32>> {
    let mut outputs = Vec::with_capacity(batch.len());
    for sample in batch {
        outputs.push(linear_layer(sample, weight));
    }
    outputs
}
/// Benchmarks a single `linear_layer` call on 256-element input and weight
/// vectors under the name `single_inference_256d`.
fn benchmark_single(c: &mut Criterion) {
    let input = vec![1.0f32; 256];
    let weight = vec![0.5f32; 256];

    c.bench_function("single_inference_256d", |bencher| {
        bencher.iter(|| {
            // Both the arguments and the result go through `black_box` so the
            // call cannot be folded away by the optimizer.
            let output = linear_layer(black_box(&input), black_box(&weight));
            black_box(output)
        });
    });
}
/// Benchmarks `batch_infer` at batch sizes 1, 8, 32 and 128 inside the
/// `batch_inference` group, using 256-element samples and a 10-second
/// measurement time per benchmark.
fn benchmark_batches(c: &mut Criterion) {
    let weight = vec![0.5f32; 256];

    let mut group = c.benchmark_group("batch_inference");
    group.measurement_time(Duration::from_secs(10));

    for batch_size in [1, 8, 32, 128] {
        let batch: Vec<Vec<f32>> = vec![vec![1.0f32; 256]; batch_size as usize];
        let id = BenchmarkId::from_parameter(batch_size);

        // Declare how many samples one iteration processes so results for
        // different batch sizes can be compared per sample.
        group.throughput(Throughput::Elements(batch_size as u64));
        group.bench_with_input(id, &batch, |bencher, samples| {
            bencher.iter(|| {
                let outputs = batch_infer(black_box(samples), black_box(&weight));
                black_box(outputs)
            });
        });
    }

    group.finish();
}
criterion_group!(benches, benchmark_single, benchmark_batches);
criterion_main!(benches);

Run with:
cargo bench
# Results in target/criterion/

Throughput measurement pattern
use std::time::Instant;
/// Measures how many times per second `infer_fn` can be called on `input`.
///
/// Runs 100 untimed warm-up calls, then calls `infer_fn` repeatedly until
/// `duration_secs` seconds have passed, and returns the number of timed calls
/// divided by the time actually spent in the timed loop.
///
/// Returns `0.0` when no timed call completed (for example when
/// `duration_secs` is 0) instead of producing `NaN`.
fn measure_throughput(
    infer_fn: impl Fn(&[f32]) -> Vec<f32>,
    input: &[f32],
    duration_secs: u64,
) -> f64 {
    // Warm up before the clock starts so warm-up time is not charged against
    // the measurement window.
    for _ in 0..100 {
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
    }

    let window = std::time::Duration::from_secs(duration_secs);
    let start = Instant::now();
    let mut count = 0u64;
    // Comparing elapsed time against the window (rather than computing an
    // absolute deadline) cannot overflow for very large `duration_secs`.
    while start.elapsed() < window {
        // `black_box` keeps the optimizer from discarding the unused result.
        std::hint::black_box(infer_fn(std::hint::black_box(input)));
        count += 1;
    }

    // The last call may overshoot the window, so divide by the real elapsed
    // time rather than the nominal duration.
    let elapsed = start.elapsed().as_secs_f64();
    if count == 0 || elapsed <= 0.0 {
        0.0
    } else {
        count as f64 / elapsed
    }
}
/// Demo driver: measures an element-wise multiply over 128-element vectors
/// for 3 seconds and prints the resulting rate.
fn main() {
    let input = vec![0.5f32; 128];
    let weight = vec![1.0f32; 128];

    let elementwise = |inp: &[f32]| -> Vec<f32> {
        inp.iter().zip(&weight).map(|(x, w)| x * w).collect()
    };

    let rps = measure_throughput(elementwise, &input, 3);
    println!("Throughput: {:.0} inference/sec", rps);
}

Latency percentile tracker
use std::time::Instant;
/// Collects wall-clock latency samples and prints a percentile summary.
struct LatencyRecorder {
    samples: Vec<f64>, // milliseconds
}

impl LatencyRecorder {
    /// Creates a recorder with no samples.
    fn new() -> Self {
        Self { samples: Vec::new() }
    }

    /// Runs `f` once and stores how long it took, in milliseconds.
    fn record<F: FnOnce()>(&mut self, f: F) {
        let started = Instant::now();
        f();
        self.samples.push(started.elapsed().as_secs_f64() * 1000.0);
    }

    /// Sorts the samples in place and prints the count, p50, p95, p99 and max.
    ///
    /// Prints a short notice instead of panicking when nothing was recorded.
    fn report(&mut self) {
        let n = self.samples.len();
        if n == 0 {
            // Without this guard the percentile lookup would index an empty
            // vector and `n - 1` would underflow.
            println!("n=0 (no samples recorded)");
            return;
        }

        // `total_cmp` is a total order over f64, so sorting cannot panic even
        // if a NaN ends up in the samples.
        self.samples.sort_unstable_by(f64::total_cmp);

        let samples = &self.samples;
        // Clamp so a percentile of 100 maps to the last sample instead of
        // indexing one past the end.
        let p = |pct: f64| samples[((n as f64 * pct / 100.0) as usize).min(n - 1)];
        println!("n={} p50={:.3}ms p95={:.3}ms p99={:.3}ms max={:.3}ms",
            n, p(50.0), p(95.0), p(99.0), samples[n - 1]);
    }
}
/// Demo driver: records the latency of 10,000 element-wise multiplies over
/// 64-element vectors and prints the percentile report.
fn main() {
    let weight = vec![1.0f32; 64];
    let input = vec![0.5f32; 64];
    let mut rec = LatencyRecorder::new();
    for _ in 0..10_000 {
        rec.record(|| {
            // Route the inputs and the result through `black_box`; otherwise
            // the optimizer may remove the unused computation and the
            // recorded latency would measure nothing.
            let output: Vec<f32> = std::hint::black_box(&input)
                .iter()
                .zip(std::hint::black_box(&weight))
                .map(|(x, w)| x * w)
                .collect();
            std::hint::black_box(output);
        });
    }
    rec.report();
}

Benchmarking checklist
- [ ] Use the `--release` flag; debug builds are misleading.
- [ ] Warm up for at least 1 second before measuring.
- [ ] Use `black_box()` to prevent the compiler from optimizing away benchmarks.
- [ ] Measure both single-request and batch scenarios.
- [ ] Report p50, p95, p99 — never just mean or min.
- [ ] Run on idle hardware with CPU frequency scaling disabled.
- [ ] Store baseline results to detect regressions in CI.
CI regression detection
# Store baseline
cargo bench -- --save-baseline main
# Compare after changes
cargo bench -- --baseline main