LLM Rust Scaling

Scaling bottlenecks unique to LLM services

Unlike traditional APIs, LLM calls have:

Variable latency — 0.5s to 60s depending on output length.
External rate limits — for example, OpenAI: 500 RPM / 800k TPM on Tier 1 (exact limits vary by model and tier and change over time).
High per-request cost — you can't just scale out to handle more load, because every additional request is billed by the provider.
Streaming — connections stay open for seconds, impacting concurrency counts.

Runnable example — multi-provider load balancer

rust

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Health-tracking record for one upstream LLM provider.
///
/// `error_count` and `last_error` live behind `Arc`, so values produced by the
/// derived `Clone` share the same counters as the value they were cloned from.
#[derive(Debug, Clone)]
struct ProviderState {
    /// Provider identifier (for example "openai").
    name: String,
    weight: u32,          // Higher = more traffic
    /// Number of errors recorded since the last recorded success.
    error_count: Arc<AtomicUsize>,
    /// Time of the most recently recorded error; `None` until the first error.
    last_error: Arc<std::sync::Mutex<Option<Instant>>>,
}

impl ProviderState {
    /// Builds a provider record with the given name and routing weight and no
    /// recorded errors.
    fn new(name: &str, weight: u32) -> Self {
        let error_count = Arc::new(AtomicUsize::new(0));
        let last_error = Arc::new(std::sync::Mutex::new(None));
        Self {
            name: name.to_owned(),
            weight,
            error_count,
            last_error,
        }
    }

    /// Counts one more failure and stamps the moment it was recorded.
    fn record_error(&self) {
        self.error_count.fetch_add(1, Ordering::Relaxed);
        let failed_at = Instant::now();
        let mut stamp = self.last_error.lock().unwrap();
        *stamp = Some(failed_at);
    }

    /// Resets the failure counter after a successful call.
    fn record_success(&self) {
        self.error_count.store(0, Ordering::Relaxed);
    }

    /// Reports whether this provider may currently receive traffic.
    ///
    /// A provider with a zero error count is healthy. Otherwise it is healthy
    /// only once more than 30 seconds have passed since the last recorded error.
    fn is_healthy(&self) -> bool {
        if self.error_count.load(Ordering::Relaxed) == 0 {
            return true;
        }

        // Cool-down window: an error older than this no longer blocks the provider.
        let cooldown = Duration::from_secs(30);
        let stamp = self.last_error.lock().unwrap();
        match *stamp {
            Some(when) => when.elapsed() > cooldown,
            None => true,
        }
    }
}

/// Distributes requests across a fixed list of LLM providers.
struct LlmLoadBalancer {
    /// Candidate providers, walked in this order during selection.
    providers: Vec<ProviderState>,
    /// Counter incremented on each selection; drives the weighted rotation.
    round_robin: AtomicUsize,
}

impl LlmLoadBalancer {
    /// Wraps `providers` in a shareable balancer whose rotation counter starts at zero.
    fn new(providers: Vec<ProviderState>) -> Arc<Self> {
        Arc::new(Self {
            providers,
            round_robin: AtomicUsize::new(0),
        })
    }

    /// Select best available provider using weighted round-robin.
    ///
    /// Only providers whose `is_healthy()` returns `true` are considered. Over
    /// one full rotation each healthy provider is picked `weight` times, so a
    /// zero-weight provider receives no traffic while any healthy provider has
    /// a non-zero weight. If every healthy provider has weight zero, the
    /// healthy providers are rotated evenly instead.
    ///
    /// Returns `None` when no provider is healthy.
    fn select(&self) -> Option<&ProviderState> {
        let healthy: Vec<&ProviderState> = self.providers.iter()
            .filter(|p| p.is_healthy())
            .collect();

        if healthy.is_empty() { return None; }

        let idx = self.round_robin.fetch_add(1, Ordering::Relaxed);

        // Sum in u64 so that many large u32 weights cannot overflow.
        let total_weight: u64 = healthy.iter().map(|p| u64::from(p.weight)).sum();
        if total_weight == 0 {
            // All weights are zero: a modulo by the total would divide by zero,
            // so fall back to an unweighted rotation.
            return healthy.get(idx % healthy.len()).copied();
        }

        // Widening usize to u64 keeps every counter bit, unlike a cast to u32.
        let mut pick = (idx as u64 % total_weight) + 1;

        for &provider in &healthy {
            let weight = u64::from(provider.weight);
            if pick <= weight {
                return Some(provider);
            }
            pick -= weight;
        }

        // Not reachable while `pick <= total_weight`; kept as a defensive fallback.
        healthy.first().copied()
    }

    /// Sends `prompt` to the selected provider and records the outcome on it.
    ///
    /// A success resets the provider's error count; a failure records an error,
    /// which takes the provider out of rotation until it is healthy again.
    /// The request is not retried on another provider.
    ///
    /// # Errors
    ///
    /// Returns `"No healthy providers"` when selection yields nothing, or
    /// `"Provider <name> failed: <reason>"` when the provider call fails.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        let provider = self.select().ok_or("No healthy providers")?;

        match self.call_provider(&provider.name, prompt).await {
            Ok(response) => {
                provider.record_success();
                Ok(response)
            }
            Err(e) => {
                provider.record_error();
                Err(format!("Provider {} failed: {}", provider.name, e))
            }
        }
    }

    /// Simulated provider call: waits 50 ms, then fails for "anthropic" and
    /// succeeds for every other provider name.
    async fn call_provider(&self, provider: &str, prompt: &str) -> Result<String, String> {
        // Simulate provider call
        tokio::time::sleep(Duration::from_millis(50)).await;
        if provider == "anthropic" {
            // This demo provider fails on every call
            return Err("temporary error".to_string());
        }
        // Truncate by characters: slicing at byte 20 panics when that byte falls
        // inside a multi-byte UTF-8 character.
        let preview: String = prompt.chars().take(20).collect();
        Ok(format!("{} response: processed '{}'", provider, preview))
    }
}

#[tokio::main]
async fn main() {
    // Weights 3:1:1 give a 60% / 20% / 20% split while every provider is healthy.
    let providers = vec![
        ProviderState::new("openai", 3),
        ProviderState::new("anthropic", 1), // currently failing
        ProviderState::new("groq", 1),
    ];
    let balancer = LlmLoadBalancer::new(providers);

    // Issue ten sequential requests and print one line per outcome.
    for question in 0..10 {
        let prompt = format!("Question {}: explain Rust", question);
        let outcome = balancer.complete(&prompt).await;
        match outcome {
            Err(failure) => println!("❌ {}", failure),
            Ok(answer) => println!("✅ {}", answer),
        }
    }
}

Semantic caching

rust

use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;

/// Semantic cache: cache by prompt similarity, not just exact match
///
/// Two independent stores are kept: a hash-keyed map for exact lookups and a
/// list of (embedding, response) pairs that is scanned for fuzzy lookups.
struct SemanticCache {
    /// Exact-match cache (fast path)
    ///
    /// Keyed by the `u64` from `hash_prompt`, not by the prompt text, so two
    /// different prompts that hash to the same value share one entry.
    exact: HashMap<u64, String>,
    /// Embedding-based fuzzy cache (slow path — requires embed call)
    fuzzy: Vec<(Vec<f32>, String)>,
    /// Minimum cosine similarity a stored embedding must reach to count as a hit.
    similarity_threshold: f32,
}

impl SemanticCache {
    /// Upper bound on stored fuzzy entries; `get_fuzzy` scans all of them linearly.
    const MAX_FUZZY_ENTRIES: usize = 10_000;

    /// Creates an empty cache with a similarity threshold of 0.97.
    fn new() -> Self {
        Self {
            exact: HashMap::new(),
            fuzzy: Vec::new(),
            similarity_threshold: 0.97, // Very high threshold for LLM caching
        }
    }

    /// Hashes the prompt after trimming surrounding whitespace and lowercasing,
    /// so prompts that differ only in case or padding map to the same key.
    fn hash_prompt(prompt: &str) -> u64 {
        let mut h = DefaultHasher::new();
        prompt.trim().to_lowercase().hash(&mut h);
        h.finish()
    }

    /// Returns the response stored for the normalized form of `prompt`, if any.
    fn get_exact(&self, prompt: &str) -> Option<&str> {
        self.exact.get(&Self::hash_prompt(prompt)).map(String::as_str)
    }

    /// Stores `response` under the normalized form of `prompt`, replacing any
    /// previous entry with the same hash.
    fn set_exact(&mut self, prompt: &str, response: String) {
        self.exact.insert(Self::hash_prompt(prompt), response);
    }

    /// Cosine similarity of `a` and `b`.
    ///
    /// Returns 0.0 when the vectors differ in length (they cannot come from the
    /// same embedding space) or when either has a near-zero norm.
    fn cosine(a: &[f32], b: &[f32]) -> f32 {
        // `zip` would silently stop at the shorter vector and produce a
        // meaningless score, so reject mismatched dimensions up front.
        if a.len() != b.len() {
            return 0.0;
        }
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let na = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let nb = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if na < 1e-8 || nb < 1e-8 { 0.0 } else { dot / (na * nb) }
    }

    /// Returns the response whose stored embedding is most similar to
    /// `query_embedding`, provided that similarity reaches the threshold.
    fn get_fuzzy(&self, query_embedding: &[f32]) -> Option<&str> {
        self.fuzzy.iter()
            .filter_map(|(emb, resp)| {
                let sim = Self::cosine(query_embedding, emb);
                // A NaN similarity fails this comparison and is dropped here.
                if sim >= self.similarity_threshold { Some((sim, resp.as_str())) } else { None }
            })
            // `total_cmp` is a total order on f32, so no panicking unwrap is needed.
            .max_by(|a, b| a.0.total_cmp(&b.0))
            .map(|(_, resp)| resp)
    }

    /// Appends an (embedding, response) pair. Once `MAX_FUZZY_ENTRIES` pairs
    /// are stored, further entries are silently dropped.
    fn set_fuzzy(&mut self, embedding: Vec<f32>, response: String) {
        if self.fuzzy.len() < Self::MAX_FUZZY_ENTRIES {
            self.fuzzy.push((embedding, response));
        }
    }
}

/// Demonstrates the exact-match path: store one response, read it back, then
/// look it up again with different casing.
fn main() {
    let mut cache = SemanticCache::new();

    let prompt1 = "What is Rust ownership?";
    let response1 = "Ownership ensures each value has one owner...".to_string();

    // Store (the response is moved in; it is not used again afterwards)
    cache.set_exact(prompt1, response1);
    println!("Cached response for: '{}'", prompt1);

    // Exact hit
    if let Some(resp) = cache.get_exact(prompt1) {
        // Truncate by characters: `&resp[..30]` would panic on a response shorter
        // than 30 bytes or with a multi-byte character at that offset.
        let preview: String = resp.chars().take(30).collect();
        println!("✅ Cache hit: {}...", preview);
    }

    // Different casing: `hash_prompt` trims and lowercases before hashing, so
    // this lookup is expected to hit as well.
    match cache.get_exact("what is rust ownership?") {
        Some(_) => println!("✅ Case-insensitive hit: prompts are normalized before hashing"),
        None => println!("⚠️  Case mismatch misses exact cache (normalize before hashing in prod)"),
    }
}

Scaling checklist

[ ] Implement semantic caching — LLM calls are expensive; cache identical/similar prompts.
[ ] Set per-tenant token quotas to prevent a single tenant from consuming the entire rate limit.
[ ] Use multi-provider routing; never depend on a single LLM provider.
[ ] Monitor time to first token (TTFT) separately from total latency for user-facing features.
[ ] Queue requests during rate-limit windows rather than failing immediately.
[ ] Use streaming for all user-facing endpoints; buffer-then-return only for background jobs.

LLM Rust Scaling

LLM Rust Scaling

Scaling bottlenecks unique to LLM services

Runnable example — multi-provider load balancer

Semantic caching

Scaling checklist

Related reading

Related Guides

LLM API Gateway in Rust

LLM Rust Production Guide

Continue in This Topic

LLM Rust Review Checklist

LLM Rust Security

More Rust Guides

Building LLM Applications with Rust

LLM API Gateway in Rust

LLM Rust Anti-Patterns

LLM Rust Benchmarking

LLM Rust Decision Matrix

LLM Rust Interview Q&A