LLM Rust Scaling
Scale LLM-powered Rust services from single instance to high-availability deployments. Covers request queuing, provider load balancing, semantic caching, and horizontal scaling strategies.
Topic: LLM Rust
Search intent: High-intent search: "rust llm scaling high availability"
LLM Rust Scaling
Scaling bottlenecks unique to LLM services
Unlike traditional APIs, LLM calls have:
- Variable latency — 0.5s to 60s depending on output length.
- External rate limits — OpenAI: 500 RPM / 800k TPM on Tier 1.
- High per-request cost — can't just scale out to handle more load.
- Streaming — connections stay open for seconds, impacting concurrency counts.
Runnable example — multi-provider load balancer
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Health and routing record for a single upstream LLM provider.
///
/// The error counter and last-error timestamp sit behind `Arc`s, so clones of a
/// `ProviderState` observe and update the same health data.
#[derive(Debug, Clone)]
struct ProviderState {
    /// Provider identifier, e.g. "openai".
    name: String,
    /// Relative traffic share; higher = more traffic.
    weight: u32,
    /// Errors recorded since the last success.
    error_count: Arc<AtomicUsize>,
    /// When the most recent error was recorded, if any.
    last_error: Arc<std::sync::Mutex<Option<Instant>>>,
}

impl ProviderState {
    /// Creates a provider with no recorded errors.
    fn new(name: &str, weight: u32) -> Self {
        let error_count = Arc::new(AtomicUsize::new(0));
        let last_error = Arc::new(std::sync::Mutex::new(None));
        Self {
            name: name.to_owned(),
            weight,
            error_count,
            last_error,
        }
    }

    /// Bumps the error counter and stamps the current time as the last error.
    fn record_error(&self) {
        self.error_count.fetch_add(1, Ordering::Relaxed);
        let mut stamp = self.last_error.lock().unwrap();
        *stamp = Some(Instant::now());
    }

    /// Resets the error counter; the last-error timestamp is left untouched.
    fn record_success(&self) {
        self.error_count.store(0, Ordering::Relaxed);
    }

    /// Reports whether the provider may receive traffic.
    ///
    /// A provider is healthy when its error counter is zero, or when its most
    /// recent error is more than 30 seconds old.
    fn is_healthy(&self) -> bool {
        if self.error_count.load(Ordering::Relaxed) == 0 {
            return true;
        }
        match *self.last_error.lock().unwrap() {
            // Recover once 30 seconds have passed without a new error.
            Some(when) => when.elapsed() > Duration::from_secs(30),
            None => true,
        }
    }
}
/// Weighted round-robin balancer over a fixed list of LLM providers.
///
/// Providers that report themselves unhealthy are skipped at selection time.
struct LlmLoadBalancer {
    /// Candidate providers, in the order used for weighted selection.
    providers: Vec<ProviderState>,
    /// Request counter that drives the rotation.
    round_robin: AtomicUsize,
}

impl LlmLoadBalancer {
    /// Builds a balancer over `providers` with the rotation counter at zero.
    fn new(providers: Vec<ProviderState>) -> Arc<Self> {
        Arc::new(Self {
            providers,
            round_robin: AtomicUsize::new(0),
        })
    }

    /// Select best available provider using weighted round-robin.
    ///
    /// Returns `None` when no provider is currently healthy. If every healthy
    /// provider has weight 0, falls back to an unweighted rotation instead of
    /// dividing by zero.
    fn select(&self) -> Option<&ProviderState> {
        let healthy: Vec<&ProviderState> = self
            .providers
            .iter()
            .filter(|p| p.is_healthy())
            .collect();
        if healthy.is_empty() {
            return None;
        }

        // Work in u64 so summing u32 weights cannot overflow and the counter
        // is not truncated before the modulo.
        let ticket = self.round_robin.fetch_add(1, Ordering::Relaxed) as u64;
        let total_weight: u64 = healthy.iter().map(|p| u64::from(p.weight)).sum();

        if total_weight == 0 {
            // The remainder is below `healthy.len()`, so it fits in usize.
            let slot = (ticket % healthy.len() as u64) as usize;
            return healthy.get(slot).copied();
        }

        // `pick` is in 1..=total_weight; walk the providers, consuming each
        // one's weight until the pick falls inside a provider's share.
        let mut pick = ticket % total_weight + 1;
        for provider in healthy.iter().copied() {
            let share = u64::from(provider.weight);
            if pick <= share {
                return Some(provider);
            }
            pick -= share;
        }
        healthy.first().copied()
    }

    /// Sends `prompt` to the selected provider and records the outcome on it.
    ///
    /// # Errors
    /// Returns `"No healthy providers"` when selection fails, or a
    /// `"Provider <name> failed: <cause>"` message when the call fails. A
    /// failed call is not retried on another provider.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        let provider = self.select().ok_or("No healthy providers")?;
        match self.call_provider(&provider.name, prompt).await {
            Ok(response) => {
                provider.record_success();
                Ok(response)
            }
            Err(e) => {
                provider.record_error();
                Err(format!("Provider {} failed: {}", provider.name, e))
            }
        }
    }

    /// Simulated provider call: waits 50 ms, then fails for `"anthropic"` and
    /// otherwise echoes a preview of the prompt (at most 20 bytes).
    async fn call_provider(&self, provider: &str, prompt: &str) -> Result<String, String> {
        // Simulate provider call
        tokio::time::sleep(Duration::from_millis(50)).await;
        if provider == "anthropic" {
            // Simulated outage: this stub fails on every call for this provider.
            return Err("temporary error".to_string());
        }

        // Slicing a str off a char boundary panics, so back the cut up to the
        // nearest boundary at or below 20 bytes. Offset 0 is always a boundary.
        let mut end = prompt.len().min(20);
        while !prompt.is_char_boundary(end) {
            end -= 1;
        }
        Ok(format!("{} response: processed '{}'", provider, &prompt[..end]))
    }
}
/// Demo driver: sends ten prompts through a three-provider balancer and prints
/// each outcome.
#[tokio::main]
async fn main() {
    let providers = vec![
        ProviderState::new("openai", 3),    // 60% traffic
        ProviderState::new("anthropic", 1), // 20% traffic (currently failing)
        ProviderState::new("groq", 1),      // 20% traffic
    ];
    let balancer = LlmLoadBalancer::new(providers);

    for prompt in (0..10).map(|i| format!("Question {}: explain Rust", i)) {
        let outcome = balancer.complete(&prompt).await;
        match outcome {
            Err(e) => println!("❌ {}", e),
            Ok(resp) => println!("✅ {}", resp),
        }
    }
}
Semantic caching
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
/// Semantic cache: cache by prompt similarity, not just exact match.
///
/// Lookups have two tiers: an exact tier keyed by a hash of the normalized
/// prompt, and a fuzzy tier that compares embeddings by cosine similarity.
struct SemanticCache {
    /// Exact-match cache (fast path), keyed by `hash_prompt` of the prompt.
    /// Only the 64-bit hash is stored, so prompts whose hashes collide share
    /// one entry.
    exact: HashMap<u64, String>,
    /// Embedding-based fuzzy cache (slow path — requires embed call)
    fuzzy: Vec<(Vec<f32>, String)>,
    /// Minimum cosine similarity for a fuzzy entry to count as a hit.
    similarity_threshold: f32,
}

impl SemanticCache {
    /// Upper bound on stored fuzzy entries; inserts beyond it are dropped.
    const MAX_FUZZY_ENTRIES: usize = 10_000;

    /// Creates an empty cache with a similarity threshold of 0.97.
    fn new() -> Self {
        Self {
            exact: HashMap::new(),
            fuzzy: Vec::new(),
            similarity_threshold: 0.97, // Very high threshold for LLM caching
        }
    }

    /// Hashes the prompt after trimming surrounding whitespace and
    /// lowercasing, so lookups ignore case and outer whitespace.
    fn hash_prompt(prompt: &str) -> u64 {
        let mut h = DefaultHasher::new();
        prompt.trim().to_lowercase().hash(&mut h);
        h.finish()
    }

    /// Returns the cached response for `prompt` from the exact tier, if any.
    fn get_exact(&self, prompt: &str) -> Option<&str> {
        self.exact
            .get(&Self::hash_prompt(prompt))
            .map(String::as_str)
    }

    /// Stores `response` for `prompt` in the exact tier, replacing any entry
    /// with the same normalized hash.
    fn set_exact(&mut self, prompt: &str, response: String) {
        self.exact.insert(Self::hash_prompt(prompt), response);
    }

    /// Cosine similarity of `a` and `b`.
    ///
    /// Returns 0.0 when the vectors differ in length (embeddings of different
    /// dimensionality are not comparable; zipping them would silently score
    /// only the shared prefix) or when either norm is below 1e-8.
    fn cosine(a: &[f32], b: &[f32]) -> f32 {
        if a.len() != b.len() {
            return 0.0;
        }
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm_a < 1e-8 || norm_b < 1e-8 {
            0.0
        } else {
            dot / (norm_a * norm_b)
        }
    }

    /// Returns the response whose stored embedding is most similar to
    /// `query_embedding`, provided the similarity reaches the threshold.
    fn get_fuzzy(&self, query_embedding: &[f32]) -> Option<&str> {
        self.fuzzy
            .iter()
            .filter_map(|(emb, resp)| {
                let sim = Self::cosine(query_embedding, emb);
                // A NaN similarity fails this comparison and is dropped here.
                (sim >= self.similarity_threshold).then(|| (sim, resp.as_str()))
            })
            .max_by(|a, b| {
                a.0.partial_cmp(&b.0)
                    .expect("NaN similarities are filtered out by the threshold check")
            })
            .map(|(_, resp)| resp)
    }

    /// Adds an embedding/response pair to the fuzzy tier. Once the tier holds
    /// `MAX_FUZZY_ENTRIES` entries, further inserts are silently ignored.
    fn set_fuzzy(&mut self, embedding: Vec<f32>, response: String) {
        if self.fuzzy.len() < Self::MAX_FUZZY_ENTRIES {
            self.fuzzy.push((embedding, response));
        }
    }
}
/// Demo driver for the exact-match tier: stores one response, reads it back,
/// then looks it up again with different casing.
fn main() {
    let mut cache = SemanticCache::new();
    let prompt1 = "What is Rust ownership?";
    let response1 = "Ownership ensures each value has one owner...".to_string();

    // Store
    cache.set_exact(prompt1, response1);
    println!("Cached response for: '{}'", prompt1);

    // Exact hit
    if let Some(resp) = cache.get_exact(prompt1) {
        // Preview by chars so a short or non-ASCII response cannot panic.
        let preview: String = resp.chars().take(30).collect();
        println!("✅ Cache hit: {}...", preview);
    }

    // Different casing still hits: hash_prompt trims and lowercases the
    // prompt before hashing, so both spellings map to the same key.
    match cache.get_exact("what is rust ownership?") {
        Some(_) => println!("✅ Case-insensitive hit: prompts are normalized before hashing"),
        None => println!("⚠️ Unexpected miss for a case-variant prompt"),
    }
}
Scaling checklist
- [ ] Implement semantic caching — LLM calls are expensive; cache identical/similar prompts.
- [ ] Set per-tenant token quotas to prevent a single tenant from consuming the entire rate limit.
- [ ] Use multi-provider routing; never depend on a single LLM provider.
- [ ] Monitor time to first token (TTFT) separately from total latency for user-facing features.
- [ ] Queue requests during rate-limit windows rather than failing immediately.
- [ ] Use streaming for all user-facing endpoints; buffer-then-return only for background jobs.