Rust By Example

LLM Rust Benchmarking

Benchmark LLM application performance in Rust: measure TTFT, throughput, cache hit rates, and compare provider latency distributions using Criterion and custom instrumentation.

Topic: LLM Rust

Search intent: High-intent search: "rust llm benchmark latency throughput"

LLM Rust Benchmarking

Key metrics to benchmark

| Metric | Description | Measurement |

|---|---|---|

| TTFT | Time to first token | `Instant` elapsed from request start until the first streamed chunk arrives |

| Total latency | Full response time | Instant end-to-end |

| Throughput | Requests/second | Completed request count / wall time |

| Token throughput | Tokens/second | Output tokens / total time |

| Cache hit rate | Fraction served from cache | hits / (hits + misses) |

| Cost per request | USD per completion | token count × price |

Runnable example — comprehensive LLM benchmarker

rust
use std::time::{Duration, Instant};
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Outcome of a single benchmarked request.
#[derive(Debug, Clone)]
struct BenchResult {
    /// Identifier of the request this result belongs to.
    request_id: u64,
    /// Time to first token, in milliseconds.
    ttft_ms: f64,
    /// Total response time, in milliseconds.
    total_ms: f64,
    /// Number of input tokens recorded for the request.
    input_tokens: u32,
    /// Number of output tokens recorded for the request.
    output_tokens: u32,
    /// Whether the request is counted as a cache hit.
    cached: bool,
    /// `None` for a successful request; `Some(message)` marks the request as failed.
    error: Option<String>,
}

/// Configuration for a concurrent benchmark run.
struct LlmBenchmark {
    /// Number of permits in the semaphore that gates in-flight requests.
    max_concurrent: usize,
    /// Total number of requests issued by one run.
    total_requests: usize,
}

impl LlmBenchmark {
    fn new(concurrent: usize, total: usize) -> Self {
        Self { max_concurrent: concurrent, total_requests: total }
    }

    async fn run<F, Fut>(&self, call_fn: F) -> Vec<BenchResult>
    where
        F: Fn(u64) -> Fut + Send + Sync + Clone + 'static,
        Fut: std::future::Future<Output = BenchResult> + Send,
    {
        let sem = Arc::new(Semaphore::new(self.max_concurrent));
        let handles: Vec<_> = (0..self.total_requests as u64).map(|id| {
            let sem = sem.clone();
            let f = call_fn.clone();
            tokio::spawn(async move {
                let _permit = sem.acquire().await.unwrap();
                f(id).await
            })
        }).collect();

        let mut results = Vec::with_capacity(self.total_requests);
        for h in handles { results.push(h.await.unwrap()); }
        results
    }
}

/// Computes summary statistics from benchmark results.
///
/// Only results with `error == None` contribute to the latency, token and
/// cache figures; every other result is counted in `errors`. When no request
/// succeeded, all figures are left at their defaults and `errors` holds the
/// total number of results.
///
/// Percentiles use the index `floor(n * pct / 100)` into the ascending-sorted
/// samples, clamped to the last element.
///
/// `tokens_per_sec` is the total number of output tokens divided by the
/// *summed* request latency (the serial-equivalent time). It describes the
/// average per-request generation rate, not aggregate wall-clock throughput,
/// because this function does not know the wall time of the run.
fn analyze_results(results: &[BenchResult]) -> BenchStats {
    let successful: Vec<&BenchResult> = results
        .iter()
        .filter(|r| r.error.is_none())
        .collect();

    let n = successful.len();
    if n == 0 {
        // Keep the failure count visible even when nothing succeeded.
        return BenchStats {
            errors: results.len(),
            ..BenchStats::default()
        };
    }

    let mut ttfts: Vec<f64> = successful.iter().map(|r| r.ttft_ms).collect();
    let mut totals: Vec<f64> = successful.iter().map(|r| r.total_ms).collect();
    // `total_cmp` is a total order, so a stray NaN sample cannot panic the sort.
    ttfts.sort_unstable_by(f64::total_cmp);
    totals.sort_unstable_by(f64::total_cmp);

    let percentile = |sorted: &[f64], pct: f64| {
        // The cast must be parenthesised: `x as usize.min(..)` does not parse.
        let idx = ((n as f64 * pct / 100.0) as usize).min(n - 1);
        sorted[idx]
    };

    let total_input_tokens: u64 = successful.iter().map(|r| u64::from(r.input_tokens)).sum();
    let total_output_tokens: u64 = successful.iter().map(|r| u64::from(r.output_tokens)).sum();
    let cache_hits = successful.iter().filter(|r| r.cached).count();

    // Sum of per-request latencies in seconds (serial-equivalent time).
    let serial_secs = totals.iter().sum::<f64>() / 1000.0;
    let tokens_per_sec = if serial_secs > 0.0 {
        total_output_tokens as f64 / serial_secs
    } else {
        0.0
    };

    BenchStats {
        n,
        errors: results.len() - n,
        ttft_p50: percentile(&ttfts, 50.0),
        ttft_p95: percentile(&ttfts, 95.0),
        ttft_p99: percentile(&ttfts, 99.0),
        total_p50: percentile(&totals, 50.0),
        total_p95: percentile(&totals, 95.0),
        total_p99: percentile(&totals, 99.0),
        tokens_per_sec,
        cache_hit_rate: cache_hits as f64 / n as f64,
        avg_input_tokens: total_input_tokens as f64 / n as f64,
        avg_output_tokens: total_output_tokens as f64 / n as f64,
    }
}

/// Aggregate statistics produced by `analyze_results`.
#[derive(Debug, Default)]
struct BenchStats {
    /// Number of successful requests (those without an error).
    n: usize,
    /// Number of requests that reported an error.
    errors: usize,
    /// 50th-percentile time to first token, in milliseconds.
    ttft_p50: f64,
    /// 95th-percentile time to first token, in milliseconds.
    ttft_p95: f64,
    /// 99th-percentile time to first token, in milliseconds.
    ttft_p99: f64,
    /// 50th-percentile total response time, in milliseconds.
    total_p50: f64,
    /// 95th-percentile total response time, in milliseconds.
    total_p95: f64,
    /// 99th-percentile total response time, in milliseconds.
    total_p99: f64,
    /// Output-token rate as computed by `analyze_results`.
    tokens_per_sec: f64,
    /// Fraction (0.0 to 1.0) of successful requests flagged as cached.
    cache_hit_rate: f64,
    /// Mean input tokens per successful request.
    avg_input_tokens: f64,
    /// Mean output tokens per successful request.
    avg_output_tokens: f64,
}

impl std::fmt::Display for BenchStats {
    /// Renders the statistics as a five-line, human-readable report with no
    /// trailing newline. The cache hit rate is shown as a percentage.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Request counts.
        write!(f, "n={} errors={}\n", self.n, self.errors)?;
        // Time-to-first-token percentiles.
        write!(
            f,
            "TTFT:  p50={:.1}ms p95={:.1}ms p99={:.1}ms\n",
            self.ttft_p50, self.ttft_p95, self.ttft_p99,
        )?;
        // End-to-end latency percentiles.
        write!(
            f,
            "Total: p50={:.1}ms p95={:.1}ms p99={:.1}ms\n",
            self.total_p50, self.total_p95, self.total_p99,
        )?;
        // Token rate and cache effectiveness.
        let cache_hit_pct = self.cache_hit_rate * 100.0;
        write!(
            f,
            "Tokens/sec: {:.0} | Cache hit: {:.1}%\n",
            self.tokens_per_sec, cache_hit_pct,
        )?;
        // Average token counts; last line, so no newline.
        write!(
            f,
            "Avg tokens: {:.0} in / {:.0} out",
            self.avg_input_tokens, self.avg_output_tokens,
        )
    }
}

#[tokio::main]
async fn main() {
    let bench = LlmBenchmark::new(10, 100);

    println!("Running benchmark: 100 requests, 10 concurrent...\n");
    let start = Instant::now();

    let results = bench.run(|id| async move {
        let t0 = Instant::now();
        // Simulate variable latency LLM call
        let base_ms = 100u64 + (id % 5) * 50;
        tokio::time::sleep(Duration::from_millis(base_ms)).await;
        let ttft = base_ms as f64 / 3.0; // First token at 1/3 of total
        let total = t0.elapsed().as_secs_f64() * 1000.0;

        BenchResult {
            request_id: id,
            ttft_ms: ttft,
            total_ms: total,
            input_tokens: 100 + (id % 50) as u32,
            output_tokens: 150 + (id % 100) as u32,
            cached: id % 5 == 0, // 20% cache hit rate
            error: None,
        }
    }).await;

    let wall_time = start.elapsed();
    let stats = analyze_results(&results);

    println!("Wall time: {:.2}s", wall_time.as_secs_f64());
    println!("Throughput: {:.1} req/s\n", results.len() as f64 / wall_time.as_secs_f64());
    println!("{}", stats);
}

Cost estimation in benchmarks

rust
/// Estimates the USD cost of processing the given token volumes with `model`.
///
/// Prices are hard-coded `(input, output)` rates in USD per one million
/// tokens. A model name that is not in the table is priced at the fallback
/// rate of `(5.0, 15.0)`.
fn estimate_cost_usd(
    model: &str,
    input_tokens: u64,
    output_tokens: u64,
) -> f64 {
    const TOKENS_PER_PRICE_UNIT: f64 = 1_000_000.0;
    const FALLBACK_PRICE: (f64, f64) = (5.0, 15.0);
    // (model name, input price, output price), all per 1M tokens.
    const PRICE_TABLE: [(&str, f64, f64); 5] = [
        ("gpt-4o", 5.0, 15.0),
        ("gpt-4o-mini", 0.15, 0.60),
        ("claude-3-5-sonnet", 3.0, 15.0),
        ("claude-3-haiku", 0.25, 1.25),
        ("llama3-70b-groq", 0.59, 0.79),
    ];

    let (input_price, output_price) = PRICE_TABLE
        .iter()
        .find(|entry| entry.0 == model)
        .map_or(FALLBACK_PRICE, |entry| (entry.1, entry.2));

    let input_cost = input_tokens as f64 * input_price / TOKENS_PER_PRICE_UNIT;
    let output_cost = output_tokens as f64 * output_price / TOKENS_PER_PRICE_UNIT;
    input_cost + output_cost
}

fn main() {
    // Each row: (label, requests per day, input tokens per request,
    // output tokens per request).
    let scenarios: [(&str, u64, u64, u64); 3] = [
        ("100 req/day (1k tokens each)", 100, 1000, 500),
        ("10k req/day (500 tokens each)", 10_000, 500, 200),
        ("1M req/day (100 tokens each)", 1_000_000, 100, 50),
    ];

    for &(label, reqs, inp, out) in scenarios.iter() {
        // Daily token volume is per-request tokens times request count.
        let (daily_in, daily_out) = (reqs * inp, reqs * out);
        let gpt4o_cost = estimate_cost_usd("gpt-4o", daily_in, daily_out);
        let mini_cost = estimate_cost_usd("gpt-4o-mini", daily_in, daily_out);

        println!("{}", label);
        println!("  gpt-4o:      ${:.2}/day", gpt4o_cost);
        println!("  gpt-4o-mini: ${:.2}/day\n", mini_cost);
    }
}

Related reading

Related Guides

Continue in This Topic

More Rust Guides