LLM Rust Performance Tuning
Optimize LLM application performance in Rust: connection pooling, request pipelining, streaming deserialization, prompt caching, and KV-cache-aware batching.
Topic: LLM Rust
Search intent: High-intent search: "rust llm performance optimization"
LLM Rust Performance Tuning
Performance bottlenecks in LLM apps
LLM apps are fundamentally I/O-bound on the provider call. Optimization focuses on:
1. Reduce round-trips — cache identical prompts.
2. Minimize token count — cheaper + faster.
3. Parallelize — send independent requests concurrently.
4. Stream — return the first tokens immediately instead of waiting for the full response.
5. Connection reuse — keep-alive and HTTP/2 multiplexing avoid a new TCP/TLS handshake per request.
Runnable example — parallel batch processing
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Semaphore;
/// Process multiple independent LLM requests in parallel
/// with a concurrency limit to avoid rate limiting
async fn parallel_infer(
prompts: Vec<String>,
max_concurrent: usize,
) -> Vec<Result<String, String>> {
let semaphore = Arc::new(Semaphore::new(max_concurrent));
let handles: Vec<_> = prompts.into_iter().map(|prompt| {
let sem = semaphore.clone();
tokio::spawn(async move {
let _permit = sem.acquire().await.unwrap();
simulate_llm_call(&prompt).await
})
}).collect();
let mut results = Vec::with_capacity(handles.len());
for handle in handles {
results.push(handle.await.unwrap());
}
results
}
/// Stand-in for a real provider call.
///
/// Sleeps for 50 ms plus 2 ms per whitespace-separated word in `prompt`, then
/// returns `Ok` with a message echoing a prefix of the prompt (at most 30
/// bytes). This function never returns `Err`.
async fn simulate_llm_call(prompt: &str) -> Result<String, String> {
    // Simulate variable latency that grows with prompt length.
    let words = prompt.split_whitespace().count();
    tokio::time::sleep(Duration::from_millis(50 + words as u64 * 2)).await;

    // Slicing a `str` at a byte offset inside a multi-byte character panics,
    // so back the cut point up to the nearest character boundary.
    let mut cut = prompt.len().min(30);
    while !prompt.is_char_boundary(cut) {
        cut -= 1;
    }
    Ok(format!("Response to: '{}...'", &prompt[..cut]))
}
/// Demo: push ten prompts through `parallel_infer` with a concurrency limit
/// of four, then print the elapsed time and the success/error counts.
#[tokio::main]
async fn main() {
    let mut prompts: Vec<String> = Vec::with_capacity(10);
    for i in 0..10 {
        prompts.push(format!("Prompt {} about Rust and AI programming techniques", i));
    }

    let timer = Instant::now();
    let results = parallel_infer(prompts, 4).await;
    println!(
        "Processed {} prompts in {:.2}s",
        results.len(),
        timer.elapsed().as_secs_f64()
    );

    // Count successes directly; errors are whatever is left over.
    let successes = results.iter().filter(|outcome| outcome.is_ok()).count();
    let errors = results.len() - successes;
    println!("Successes: {} | Errors: {}", successes, errors);
}
Prompt compression techniques
/// Reduces prompt size by normalizing whitespace and, if the prompt is still
/// over budget, cutting out its middle.
#[derive(Debug, Clone)]
struct PromptCompressor {
    /// Token budget used by [`PromptCompressor::compress`].
    max_tokens: usize,
}

impl PromptCompressor {
    /// Create a compressor with the given token budget.
    fn new(max_tokens: usize) -> Self {
        Self { max_tokens }
    }

    /// Rough token estimate: one token per four bytes of text, plus one.
    fn estimate_tokens(text: &str) -> usize {
        text.len() / 4 + 1
    }

    /// Collapse every run of whitespace into a single space and trim both ends.
    fn normalize_whitespace(text: &str) -> String {
        text.split_whitespace().collect::<Vec<_>>().join(" ")
    }

    /// Truncate from the middle, keeping start and end context.
    ///
    /// `max_chars` is measured in bytes (it is compared against `str::len`).
    /// Text that already fits is returned unchanged. Otherwise up to
    /// `max_chars / 2` bytes are kept from each end and joined with
    /// `...[truncated]...`, so the result is longer than `max_chars` by the
    /// length of that marker.
    ///
    /// Cut points are moved to the nearest character boundary (shrinking the
    /// kept text), so non-ASCII input never causes a slicing panic.
    fn truncate_middle(text: &str, max_chars: usize) -> String {
        if text.len() <= max_chars {
            return text.to_string();
        }
        let half = max_chars / 2;

        // Back the head cut up to a boundary; index 0 is always one.
        let mut head_end = half;
        while !text.is_char_boundary(head_end) {
            head_end -= 1;
        }

        // Push the tail cut forward to a boundary; `text.len()` is always one.
        let mut tail_start = text.len() - half;
        while !text.is_char_boundary(tail_start) {
            tail_start += 1;
        }

        format!(
            "{}...[truncated]...{}",
            &text[..head_end],
            &text[tail_start..]
        )
    }

    /// Remove boilerplate from code snippets: drops blank lines and lines
    /// whose first non-whitespace characters are `//`.
    fn compress_code(code: &str) -> String {
        code.lines()
            .filter(|line| {
                let trimmed = line.trim();
                !trimmed.is_empty() && !trimmed.starts_with("//")
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    /// Normalize whitespace, then middle-truncate if the estimated token
    /// count still exceeds `max_tokens`.
    ///
    /// The truncation target is `max_tokens * 4` bytes, mirroring the
    /// four-bytes-per-token estimate; the inserted marker comes on top of that.
    fn compress(&self, prompt: &str) -> String {
        let normalized = Self::normalize_whitespace(prompt);
        if Self::estimate_tokens(&normalized) <= self.max_tokens {
            return normalized;
        }
        Self::truncate_middle(&normalized, self.max_tokens * 4)
    }
}
/// Demo: compress a long repeated prompt and strip comments and blank lines
/// from a small code snippet, printing sizes before and after.
fn main() {
    let compressor = PromptCompressor::new(100);

    // Prose prompt: report byte length before and after compression.
    let verbose_prompt = "This is a very detailed explanation of Rust programming. ".repeat(20);
    println!("Original: {} chars", verbose_prompt.len());
    let squeezed = compressor.compress(&verbose_prompt);
    println!("Compressed: {} chars", squeezed.len());

    // Code snippet: report line count before and after stripping.
    let snippet = "// This is a comment\nfn main() {\n// Another comment\n println!(\"hello\");\n\n\n}";
    let lines_before = snippet.lines().count();
    println!("\nCode before: {} lines", lines_before);
    let stripped = PromptCompressor::compress_code(snippet);
    let lines_after = stripped.lines().count();
    println!("Code after: {} lines\n{}", lines_after, stripped);
}
Connection pool metrics
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
/// Atomic counters describing how requests used the HTTP connection pool.
struct ConnectionPoolMetrics {
    /// Number of requests recorded.
    requests_total: AtomicU64,
    /// Number of recorded requests that reused an existing connection.
    requests_reused_connection: AtomicU64,
    /// Sum of the latencies of all recorded requests, in milliseconds.
    total_latency_ms: AtomicU64,
}

impl ConnectionPoolMetrics {
    /// Build a zeroed set of counters, already wrapped in an `Arc`.
    fn new() -> Arc<Self> {
        let zeroed = ConnectionPoolMetrics {
            requests_total: AtomicU64::new(0),
            requests_reused_connection: AtomicU64::new(0),
            total_latency_ms: AtomicU64::new(0),
        };
        Arc::new(zeroed)
    }

    /// Record one finished request: whether it reused a connection and how
    /// long it took in milliseconds.
    fn record_request(&self, reused: bool, latency_ms: u64) {
        let order = Ordering::Relaxed;
        self.requests_total.fetch_add(1, order);
        if reused {
            self.requests_reused_connection.fetch_add(1, order);
        }
        self.total_latency_ms.fetch_add(latency_ms, order);
    }

    /// Print request count, connection-reuse percentage and average latency.
    ///
    /// The average uses integer division and is reported as 0 when no
    /// requests have been recorded.
    fn report(&self) {
        let request_count = self.requests_total.load(Ordering::Relaxed);
        let reuse_count = self.requests_reused_connection.load(Ordering::Relaxed);

        let mean_latency_ms = match request_count {
            0 => 0,
            n => self.total_latency_ms.load(Ordering::Relaxed) / n,
        };
        // `max(1)` keeps the percentage at 0.0 instead of NaN when empty.
        let reuse_pct = reuse_count as f64 / request_count.max(1) as f64 * 100.0;

        println!(
            "Requests: {} | Connection reuse: {:.1}% | Avg latency: {}ms",
            request_count, reuse_pct, mean_latency_ms
        );
    }
}
/// Demo: record ten simulated requests and print the resulting report.
fn main() {
    let metrics = ConnectionPoolMetrics::new();

    // The first request opens the connection (slower); the rest reuse it.
    (0..10).map(|i| i > 0).for_each(|reused| {
        let latency = match reused {
            true => 45,
            false => 120,
        };
        metrics.record_request(reused, latency);
    });

    metrics.report();
}
Tuning checklist
- [ ] Use `reqwest` with `connection_verbose(false)` and HTTP/2 enabled.
- [ ] Set an appropriate `timeout` — the p99 of your target model's latency.
- [ ] Cache deterministic prompts (temperature=0) with Redis or in-memory LRU.
- [ ] Parallelize independent sub-prompts using `join_all`.
- [ ] Compress prompts: remove redundant whitespace, truncate middle of long docs.
- [ ] Stream responses for user-facing features; buffer for background processing.
- [ ] Monitor TTFT (time to first token) separately from total response time.