LLM Rust Benchmarking
Benchmark LLM application performance in Rust: measure TTFT, throughput, cache hit rates, and compare provider latency distributions using Criterion and custom instrumentation.
Topic: LLM Rust
Search intent: High-intent search: "rust llm benchmark latency throughput"
LLM Rust Benchmarking
Key metrics to benchmark
| Metric | Description | Measurement |
|---|---|---|
| TTFT | Time to first token | `Instant` started when the request is sent, read when the first streamed chunk arrives |
| Total latency | Full response time | `Instant` started when the request is sent, read when the response completes |
| Throughput | Requests/second | Completed request count / wall time |
| Token throughput | Tokens/second | Output tokens / total time |
| Cache hit rate | Fraction served from cache | hits / (hits + misses) |
| Cost per request | USD per completion | input tokens × input price + output tokens × output price |
Runnable example — comprehensive LLM benchmarker
use std::time::{Duration, Instant};
use std::sync::Arc;
use tokio::sync::Semaphore;
/// Outcome of a single benchmarked request.
///
/// One value is produced per request; `analyze_results` treats a record as
/// successful when `error` is `None`.
#[derive(Debug, Clone)]
struct BenchResult {
    /// Identifier passed to the call function for this request.
    request_id: u64,
    /// Time to first token, in milliseconds.
    ttft_ms: f64,
    /// Total response time, in milliseconds.
    total_ms: f64,
    /// Token count recorded for the request input.
    input_tokens: u32,
    /// Token count recorded for the response output.
    output_tokens: u32,
    /// `true` when the response counts as a cache hit.
    cached: bool,
    /// Failure description; `None` when the request succeeded.
    error: Option<String>,
}
struct LlmBenchmark {
max_concurrent: usize,
total_requests: usize,
}
impl LlmBenchmark {
fn new(concurrent: usize, total: usize) -> Self {
Self { max_concurrent: concurrent, total_requests: total }
}
async fn run<F, Fut>(&self, call_fn: F) -> Vec<BenchResult>
where
F: Fn(u64) -> Fut + Send + Sync + Clone + 'static,
Fut: std::future::Future<Output = BenchResult> + Send,
{
let sem = Arc::new(Semaphore::new(self.max_concurrent));
let handles: Vec<_> = (0..self.total_requests as u64).map(|id| {
let sem = sem.clone();
let f = call_fn.clone();
tokio::spawn(async move {
let _permit = sem.acquire().await.unwrap();
f(id).await
})
}).collect();
let mut results = Vec::with_capacity(self.total_requests);
for h in handles { results.push(h.await.unwrap()); }
results
}
}
/// Compute summary statistics from benchmark results.
///
/// Only results whose `error` is `None` contribute to the latency, token and
/// cache figures; every other result is counted in `errors`.
///
/// * Percentiles use the sorted samples at index `floor(n * pct / 100)`,
///   clamped to the last element.
/// * `tokens_per_sec` is the total number of output tokens divided by the
///   summed per-request latency (a serial-equivalent generation rate). It is
///   `0.0` when the summed latency is not positive.
/// * When no request succeeded, all metrics are zero and `errors` equals
///   `results.len()`.
fn analyze_results(results: &[BenchResult]) -> BenchStats {
    let successful: Vec<&BenchResult> = results
        .iter()
        .filter(|r| r.error.is_none())
        .collect();
    let n = successful.len();
    if n == 0 {
        // Still report the failures instead of a fully zeroed summary.
        return BenchStats {
            errors: results.len(),
            ..BenchStats::default()
        };
    }

    let mut ttfts: Vec<f64> = successful.iter().map(|r| r.ttft_ms).collect();
    let mut totals: Vec<f64> = successful.iter().map(|r| r.total_ms).collect();
    // `total_cmp` is a total order, so a stray NaN sample cannot panic the sort.
    ttfts.sort_unstable_by(f64::total_cmp);
    totals.sort_unstable_by(f64::total_cmp);

    let percentile = |sorted: &[f64], pct: f64| -> f64 {
        // The cast must be parenthesised before calling `.min`.
        let idx = ((n as f64 * pct / 100.0) as usize).min(n - 1);
        sorted[idx]
    };

    let total_input_tokens: u64 = successful.iter().map(|r| u64::from(r.input_tokens)).sum();
    let total_output_tokens: u64 = successful.iter().map(|r| u64::from(r.output_tokens)).sum();
    let cache_hits = successful.iter().filter(|r| r.cached).count();

    // Summed request latency in seconds (as if the requests had run serially).
    let serial_secs = totals.iter().sum::<f64>() / 1000.0;
    let tokens_per_sec = if serial_secs > 0.0 {
        total_output_tokens as f64 / serial_secs
    } else {
        0.0
    };

    BenchStats {
        n,
        errors: results.len() - n,
        ttft_p50: percentile(&ttfts, 50.0),
        ttft_p95: percentile(&ttfts, 95.0),
        ttft_p99: percentile(&ttfts, 99.0),
        total_p50: percentile(&totals, 50.0),
        total_p95: percentile(&totals, 95.0),
        total_p99: percentile(&totals, 99.0),
        tokens_per_sec,
        cache_hit_rate: cache_hits as f64 / n as f64,
        avg_input_tokens: total_input_tokens as f64 / n as f64,
        avg_output_tokens: total_output_tokens as f64 / n as f64,
    }
}
/// Aggregated figures for one benchmark run.
///
/// Latency fields are in milliseconds; `cache_hit_rate` is a fraction in
/// `0.0..=1.0` and is scaled to a percentage only when displayed.
#[derive(Debug, Default)]
struct BenchStats {
    /// Number of successful requests the statistics are based on.
    n: usize,
    /// Number of requests that reported an error.
    errors: usize,
    /// Time-to-first-token percentiles.
    ttft_p50: f64,
    ttft_p95: f64,
    ttft_p99: f64,
    /// Total-latency percentiles.
    total_p50: f64,
    total_p95: f64,
    total_p99: f64,
    /// Output-token rate.
    tokens_per_sec: f64,
    /// Fraction of successful requests flagged as cached.
    cache_hit_rate: f64,
    /// Mean input / output token counts per successful request.
    avg_input_tokens: f64,
    avg_output_tokens: f64,
}

impl std::fmt::Display for BenchStats {
    /// Renders a five-line report: counts, TTFT percentiles, total-latency
    /// percentiles, token rate with cache-hit percentage, and average token
    /// counts. The last line carries no trailing newline.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "n={} errors={}", self.n, self.errors)?;
        writeln!(
            f,
            "TTFT: p50={:.1}ms p95={:.1}ms p99={:.1}ms",
            self.ttft_p50, self.ttft_p95, self.ttft_p99
        )?;
        writeln!(
            f,
            "Total: p50={:.1}ms p95={:.1}ms p99={:.1}ms",
            self.total_p50, self.total_p95, self.total_p99
        )?;
        let cache_hit_pct = self.cache_hit_rate * 100.0;
        writeln!(
            f,
            "Tokens/sec: {:.0} | Cache hit: {:.1}%",
            self.tokens_per_sec, cache_hit_pct
        )?;
        write!(
            f,
            "Avg tokens: {:.0} in / {:.0} out",
            self.avg_input_tokens, self.avg_output_tokens
        )
    }
}
/// Demo entry point: runs 100 simulated requests with at most 10 in flight and
/// prints the wall time, request throughput and the aggregated statistics.
#[tokio::main]
async fn main() {
    /// Stand-in for a real provider call. It sleeps for 100–300 ms depending
    /// on `id` and fabricates token counts; every fifth id is marked cached.
    async fn simulated_call(id: u64) -> BenchResult {
        let began = Instant::now();

        // Simulate variable latency LLM call
        let delay_ms = 100u64 + (id % 5) * 50;
        tokio::time::sleep(Duration::from_millis(delay_ms)).await;

        BenchResult {
            request_id: id,
            // First token is modelled at one third of the nominal delay.
            ttft_ms: delay_ms as f64 / 3.0,
            total_ms: began.elapsed().as_secs_f64() * 1000.0,
            input_tokens: 100 + (id % 50) as u32,
            output_tokens: 150 + (id % 100) as u32,
            cached: id % 5 == 0, // 20% cache hit rate
            error: None,
        }
    }

    let bench = LlmBenchmark::new(10, 100);
    println!("Running benchmark: 100 requests, 10 concurrent...\n");

    let started = Instant::now();
    let results = bench.run(simulated_call).await;
    let elapsed_secs = started.elapsed().as_secs_f64();

    let stats = analyze_results(&results);
    println!("Wall time: {:.2}s", elapsed_secs);
    println!("Throughput: {:.1} req/s\n", results.len() as f64 / elapsed_secs);
    println!("{}", stats);
}

Cost estimation in benchmarks
/// Estimates the USD cost of processing the given token volumes with `model`.
///
/// Prices come from a table hard-coded in this function, expressed in USD per
/// one million tokens as `(input, output)`. A model name that is not in the
/// table is priced at `(5.0, 15.0)`, the same as the `"gpt-4o"` entry.
fn estimate_cost_usd(
    model: &str,
    input_tokens: u64,
    output_tokens: u64,
) -> f64 {
    // (model, input price, output price) — per 1M tokens
    const PRICES: [(&str, f64, f64); 5] = [
        ("gpt-4o", 5.0, 15.0),
        ("gpt-4o-mini", 0.15, 0.60),
        ("claude-3-5-sonnet", 3.0, 15.0),
        ("claude-3-haiku", 0.25, 1.25),
        ("llama3-70b-groq", 0.59, 0.79),
    ];
    const FALLBACK: (f64, f64) = (5.0, 15.0);

    let (input_price, output_price) = PRICES
        .iter()
        .find(|(name, _, _)| *name == model)
        .map(|&(_, inp, out)| (inp, out))
        .unwrap_or(FALLBACK);

    // Multiply first, then scale down, for each side of the bill.
    let usd = |tokens: u64, price_per_million: f64| tokens as f64 * price_per_million / 1_000_000.0;
    usd(input_tokens, input_price) + usd(output_tokens, output_price)
}
/// Prints the estimated daily cost of three traffic scenarios for `gpt-4o`
/// and `gpt-4o-mini`.
fn main() {
    // (label, requests per day, input tokens per request, output tokens per request)
    let scenarios: [(&str, u64, u64, u64); 3] = [
        ("100 req/day (1k tokens each)", 100, 1000, 500),
        ("10k req/day (500 tokens each)", 10_000, 500, 200),
        ("1M req/day (100 tokens each)", 1_000_000, 100, 50),
    ];

    for &(label, requests, in_per_req, out_per_req) in &scenarios {
        // Scale per-request token counts up to a full day of traffic.
        let (daily_in, daily_out) = (requests * in_per_req, requests * out_per_req);
        let gpt_4o = estimate_cost_usd("gpt-4o", daily_in, daily_out);
        let gpt_4o_mini = estimate_cost_usd("gpt-4o-mini", daily_in, daily_out);

        println!("{}", label);
        println!(" gpt-4o: ${:.2}/day", gpt_4o);
        println!(" gpt-4o-mini: ${:.2}/day\n", gpt_4o_mini);
    }
}