Rust By Example

LLM Rust Performance Tuning

Optimize LLM application performance in Rust: connection pooling, request pipelining, streaming deserialization, prompt caching, and KV-cache-aware batching.

Topic: LLM Rust

Search intent: High-intent search: "rust llm performance optimization"

LLM Rust Performance Tuning

Performance bottlenecks in LLM apps

LLM apps are fundamentally I/O-bound on the provider call. Optimization focuses on:

1. Reduce round-trips — cache identical prompts.

2. Minimize token count — cheaper + faster.

3. Parallelize — send independent requests concurrently.

4. Stream — return first tokens immediately, don't wait for full response.

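5. Connection reuse — share one HTTP client so connections stay alive (keep-alive, HTTP/2 multiplexing) instead of paying a new TCP/TLS handshake per request.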

Runnable example — parallel batch processing

rust
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Semaphore;

/// Process multiple independent LLM requests in parallel
/// with a concurrency limit to avoid rate limiting
async fn parallel_infer(
    prompts: Vec<String>,
    max_concurrent: usize,
) -> Vec<Result<String, String>> {
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let handles: Vec<_> = prompts.into_iter().map(|prompt| {
        let sem = semaphore.clone();
        tokio::spawn(async move {
            let _permit = sem.acquire().await.unwrap();
            simulate_llm_call(&prompt).await
        })
    }).collect();

    let mut results = Vec::with_capacity(handles.len());
    for handle in handles {
        results.push(handle.await.unwrap());
    }
    results
}

/// Stand-in for a provider call.
///
/// Sleeps for 50 ms plus 2 ms per whitespace-separated word in `prompt`, then
/// returns `Ok` with a response that quotes a prefix of the prompt. The prefix
/// is at most 30 bytes long and always ends on a UTF-8 character boundary.
async fn simulate_llm_call(prompt: &str) -> Result<String, String> {
    // Simulate variable latency
    let words = prompt.split_whitespace().count();
    tokio::time::sleep(Duration::from_millis(50 + words as u64 * 2)).await;

    // Slicing at a fixed byte offset panics inside a multi-byte character, so
    // back off to the nearest boundary (offset 0 is always a boundary).
    let mut cut = prompt.len().min(30);
    while !prompt.is_char_boundary(cut) {
        cut -= 1;
    }
    Ok(format!("Response to: '{}...'", &prompt[..cut]))
}

/// Demo driver: builds ten prompts, runs them through `parallel_infer` with a
/// concurrency limit of 4, and prints the elapsed time and success/error counts.
#[tokio::main]
async fn main() {
    let mut prompts = Vec::with_capacity(10);
    for i in 0..10 {
        prompts.push(format!("Prompt {} about Rust and AI programming techniques", i));
    }

    let started = Instant::now();
    let results = parallel_infer(prompts, 4).await;
    let elapsed_secs = started.elapsed().as_secs_f64();
    println!("Processed {} prompts in {:.2}s", results.len(), elapsed_secs);

    // Count successes directly; errors are whatever remains.
    let successes = results.iter().filter(|r| r.is_ok()).count();
    println!("Successes: {} | Errors: {}", successes, results.len() - successes);
}

Prompt compression techniques

rust
/// Reduce prompt tokens without losing semantic content
#[derive(Debug, Clone)]
struct PromptCompressor {
    /// Token budget; `compress` truncates prompts whose estimate exceeds it.
    max_tokens: usize,
}

impl PromptCompressor {
    /// Creates a compressor with the given token budget.
    fn new(max_tokens: usize) -> Self { Self { max_tokens } }

    /// Rough token estimate: one token per four bytes of text, plus one.
    fn estimate_tokens(text: &str) -> usize { text.len() / 4 + 1 }

    /// Remove redundant whitespace: every run of whitespace becomes a single
    /// space, and leading/trailing whitespace is dropped.
    fn normalize_whitespace(text: &str) -> String {
        text.split_whitespace().collect::<Vec<_>>().join(" ")
    }

    /// Truncate from the middle, keeping start and end context.
    ///
    /// `max_chars` is measured in bytes, like `str::len`. Text that already
    /// fits is returned unchanged. Otherwise up to `max_chars / 2` bytes are
    /// kept from each end, joined by a `...[truncated]...` marker, so the
    /// result may be longer than `max_chars` by the length of the marker.
    /// Cut points are moved inward to the nearest UTF-8 character boundary,
    /// so non-ASCII text never causes a slicing panic.
    fn truncate_middle(text: &str, max_chars: usize) -> String {
        if text.len() <= max_chars {
            return text.to_string();
        }
        let half = max_chars / 2;

        // Shrink the head until it ends on a boundary (0 always is one).
        let mut head_end = half;
        while !text.is_char_boundary(head_end) {
            head_end -= 1;
        }

        // Shrink the tail until it starts on a boundary (text.len() always is one).
        let mut tail_start = text.len() - half;
        while !text.is_char_boundary(tail_start) {
            tail_start += 1;
        }

        format!("{}...[truncated]...{}", &text[..head_end], &text[tail_start..])
    }

    /// Remove boilerplate from code snippets: drops blank lines and lines
    /// whose first non-whitespace characters are `//`.
    fn compress_code(code: &str) -> String {
        code.lines()
            .filter(|line| {
                let trimmed = line.trim();
                // Remove empty lines and pure comment lines
                !trimmed.is_empty() && !trimmed.starts_with("//")
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    /// Normalizes whitespace, then truncates the middle if the estimated
    /// token count still exceeds `max_tokens`.
    fn compress(&self, prompt: &str) -> String {
        let normalized = Self::normalize_whitespace(prompt);
        if Self::estimate_tokens(&normalized) <= self.max_tokens {
            return normalized;
        }
        // Inverse of the 4-bytes-per-token estimate; saturate rather than overflow.
        let max_chars = self.max_tokens.saturating_mul(4);
        Self::truncate_middle(&normalized, max_chars)
    }
}

/// Demo driver: compresses a long repeated prompt and a small code snippet,
/// printing sizes before and after.
fn main() {
    let code = "// This is a comment\nfn main() {\n// Another comment\n    println!(\"hello\");\n\n\n}";
    let long_prompt = "This is a very detailed explanation of Rust programming. ".repeat(20);

    // Budget of 100 tokens for the prompt demo.
    let compressor = PromptCompressor::new(100);
    let compressed = compressor.compress(&long_prompt);
    println!("Original: {} chars", long_prompt.len());
    println!("Compressed: {} chars", compressed.len());

    let lines_before = code.lines().count();
    let clean_code = PromptCompressor::compress_code(code);
    let lines_after = clean_code.lines().count();
    println!("\nCode before: {} lines", lines_before);
    println!("Code after: {} lines\n{}", lines_after, clean_code);
}

Connection pool metrics

rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;

/// Atomic counters describing how an HTTP connection pool is being used.
struct ConnectionPoolMetrics {
    /// Number of requests recorded.
    requests_total: AtomicU64,
    /// Number of recorded requests flagged as reusing a connection.
    requests_reused_connection: AtomicU64,
    /// Sum of all recorded latencies, in milliseconds.
    total_latency_ms: AtomicU64,
}

impl ConnectionPoolMetrics {
    /// Builds a zeroed metrics object wrapped in an `Arc`.
    fn new() -> Arc<Self> {
        let zeroed = Self {
            requests_total: AtomicU64::default(),
            requests_reused_connection: AtomicU64::default(),
            total_latency_ms: AtomicU64::default(),
        };
        Arc::new(zeroed)
    }

    /// Records one request: bumps the total, adds its latency, and bumps the
    /// reuse counter when `reused` is true.
    fn record_request(&self, reused: bool, latency_ms: u64) {
        self.requests_total.fetch_add(1, Ordering::Relaxed);
        self.requests_reused_connection
            .fetch_add(u64::from(reused), Ordering::Relaxed);
        self.total_latency_ms.fetch_add(latency_ms, Ordering::Relaxed);
    }

    /// Prints the request count, the connection-reuse percentage and the
    /// integer average latency. With no requests recorded, both derived
    /// figures print as zero.
    fn report(&self) {
        let total = self.requests_total.load(Ordering::Relaxed);
        let reused = self.requests_reused_connection.load(Ordering::Relaxed);
        let latency_sum = self.total_latency_ms.load(Ordering::Relaxed);

        // checked_div yields None for a zero divisor, which maps to 0 ms.
        let avg_ms = latency_sum.checked_div(total).unwrap_or(0);
        // Clamp the denominator to 1 so an empty pool reports 0.0%.
        let denominator = total.max(1) as f64;
        let reuse_pct = reused as f64 / denominator * 100.0;

        println!(
            "Requests: {} | Connection reuse: {:.1}% | Avg latency: {}ms",
            total, reuse_pct, avg_ms
        );
    }
}

/// Demo driver: records ten simulated requests and prints the summary.
fn main() {
    let metrics = ConnectionPoolMetrics::new();

    // Simulate requests
    for request_index in 0..10 {
        // First request creates the connection (slower); the rest reuse it.
        let (reused, latency) = match request_index {
            0 => (false, 120),
            _ => (true, 45),
        };
        metrics.record_request(reused, latency);
    }

    metrics.report();
}

Tuning checklist

  • [ ] Use reqwest with connection_verbose(false) and HTTP/2 enabled.
  • [ ] Set appropriate timeout — p99 of your target model's latency.
  • [ ] Cache deterministic prompts (temperature=0) with Redis or in-memory LRU.
  • [ ] Parallelize independent sub-prompts using join_all.
  • [ ] Compress prompts: remove redundant whitespace, truncate middle of long docs.
  • [ ] Stream responses for user-facing features; buffer for background processing.
  • [ ] Monitor TTFT (time to first token) separately from total response time.

Related reading

Related Guides

Continue in This Topic

More Rust Guides