Rust By Example

LLM Rust Scaling

Scale LLM-powered Rust services from single instance to high-availability deployments. Covers request queuing, provider load balancing, semantic caching, and horizontal scaling strategies.

Topic: LLM Rust

Search intent: High-intent search: "rust llm scaling high availability"

LLM Rust Scaling

Scaling bottlenecks unique to LLM services

Unlike traditional APIs, LLM calls have:

  • Variable latency — 0.5s to 60s depending on output length.
  • External rate limits — OpenAI: 500 RPM / 800k TPM on Tier 1.
  • High per-request cost — can't just scale out to handle more load.
  • Streaming — connections stay open for seconds, impacting concurrency counts.

Runnable example — multi-provider load balancer

rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Health-tracking record for one upstream LLM provider.
///
/// The error counter and last-error timestamp live behind `Arc`, so a clone
/// of a `ProviderState` observes and updates the same health data as the
/// value it was cloned from.
#[derive(Debug, Clone)]
struct ProviderState {
    name: String,
    /// Relative traffic share; a larger weight receives more requests.
    weight: u32,
    /// Number of errors recorded since the last recorded success.
    error_count: Arc<AtomicUsize>,
    /// Time of the most recently recorded error, if there has been one.
    last_error: Arc<std::sync::Mutex<Option<Instant>>>,
}

impl ProviderState {
    /// Creates a provider with no recorded errors.
    fn new(name: &str, weight: u32) -> Self {
        let error_count = Arc::new(AtomicUsize::new(0));
        let last_error = Arc::new(std::sync::Mutex::new(None));
        Self {
            name: name.to_owned(),
            weight,
            error_count,
            last_error,
        }
    }

    /// Counts one more failure and stamps the current time as the last error.
    ///
    /// # Panics
    /// Panics if the timestamp mutex has been poisoned.
    fn record_error(&self) {
        self.error_count.fetch_add(1, Ordering::Relaxed);
        let mut stamp = self.last_error.lock().unwrap();
        *stamp = Some(Instant::now());
    }

    /// Clears the failure counter; the last-error timestamp is left untouched.
    fn record_success(&self) {
        self.error_count.store(0, Ordering::Relaxed);
    }

    /// Reports whether this provider may currently receive traffic.
    ///
    /// A provider with a zero error count is always healthy. Otherwise it is
    /// healthy only once more than 30 seconds have passed since the last
    /// recorded error.
    ///
    /// # Panics
    /// Panics if the timestamp mutex has been poisoned.
    fn is_healthy(&self) -> bool {
        if self.error_count.load(Ordering::Relaxed) == 0 {
            return true;
        }

        // Cool-down window: errors older than this no longer count.
        match *self.last_error.lock().unwrap() {
            Some(at) => at.elapsed() > Duration::from_secs(30),
            None => true,
        }
    }
}

/// Distributes prompts across several LLM providers, skipping unhealthy ones.
struct LlmLoadBalancer {
    providers: Vec<ProviderState>,
    /// Monotonic ticket counter that drives the weighted rotation.
    round_robin: AtomicUsize,
}

impl LlmLoadBalancer {
    /// Maximum number of prompt bytes echoed back by the simulated provider.
    const PREVIEW_BYTES: usize = 20;

    /// Wraps the given providers in a shareable balancer.
    fn new(providers: Vec<ProviderState>) -> Arc<Self> {
        Arc::new(Self {
            providers,
            round_robin: AtomicUsize::new(0),
        })
    }

    /// Selects a healthy provider using weighted round-robin.
    ///
    /// Each call takes the next ticket from the shared counter and maps it
    /// onto the cumulative weights of the currently healthy providers, so a
    /// provider with weight `w` receives `w` out of every `total_weight`
    /// consecutive tickets.
    ///
    /// Returns `None` when no provider is healthy, or when every healthy
    /// provider has weight 0 (a zero-weight provider is never eligible).
    fn select(&self) -> Option<&ProviderState> {
        // Snapshot health once so the weight total and the walk below agree.
        let healthy: Vec<&ProviderState> = self
            .providers
            .iter()
            .filter(|p| p.is_healthy())
            .collect();

        // Sum in u64 so many large u32 weights cannot overflow.
        let total_weight: u64 = healthy.iter().map(|p| u64::from(p.weight)).sum();
        if total_weight == 0 {
            // Covers both "no healthy providers" and "all weights are zero";
            // the latter would otherwise be a remainder-by-zero panic.
            return None;
        }

        let ticket = self.round_robin.fetch_add(1, Ordering::Relaxed);
        // usize is at most 64 bits on supported targets, so this widening
        // cast keeps the full counter instead of truncating it to 32 bits.
        let mut pick = (ticket as u64 % total_weight) + 1;

        for provider in healthy {
            let weight = u64::from(provider.weight);
            if pick <= weight {
                return Some(provider);
            }
            pick -= weight;
        }

        // Not reachable: `pick` starts in 1..=total_weight, so the walk above
        // always lands on a provider.
        None
    }

    /// Sends `prompt` to one selected provider and records the outcome.
    ///
    /// A success resets the provider's error count; a failure records an
    /// error against it. No second provider is tried when the call fails.
    ///
    /// # Errors
    /// Returns `"No healthy providers"` when `select` yields nothing, or
    /// `"Provider <name> failed: <cause>"` when the provider call fails.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        let provider = self.select().ok_or("No healthy providers")?;

        match self.call_provider(&provider.name, prompt).await {
            Ok(response) => {
                provider.record_success();
                Ok(response)
            }
            Err(e) => {
                provider.record_error();
                Err(format!("Provider {} failed: {}", provider.name, e))
            }
        }
    }

    /// Simulated provider call: waits 50 ms, then fails for `"anthropic"` and
    /// echoes a short prefix of the prompt for every other provider.
    async fn call_provider(&self, provider: &str, prompt: &str) -> Result<String, String> {
        // Simulate provider latency
        tokio::time::sleep(Duration::from_millis(50)).await;
        if provider == "anthropic" {
            // Simulate a provider that is currently down (fails on every call)
            return Err("temporary error".to_string());
        }

        // Back off to a char boundary so multi-byte UTF-8 prompts cannot
        // cause a slicing panic. Index 0 is always a boundary, so this ends.
        let mut end = prompt.len().min(Self::PREVIEW_BYTES);
        while !prompt.is_char_boundary(end) {
            end -= 1;
        }
        Ok(format!("{} response: processed '{}'", provider, &prompt[..end]))
    }
}

/// Demo driver: pushes ten prompts through a three-provider balancer and
/// prints one line per outcome.
#[tokio::main]
async fn main() {
    // Weights 3:1:1 give a 60% / 20% / 20% split while all three are healthy.
    let providers = vec![
        ProviderState::new("openai", 3),
        ProviderState::new("anthropic", 1), // currently failing
        ProviderState::new("groq", 1),
    ];
    let balancer = LlmLoadBalancer::new(providers);

    for i in 0..10 {
        let prompt = format!("Question {}: explain Rust", i);
        let line = match balancer.complete(&prompt).await {
            Ok(resp) => format!("✅ {}", resp),
            Err(e) => format!("❌ {}", e),
        };
        println!("{}", line);
    }
}

Semantic caching

rust
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;

/// Semantic cache: cache by prompt similarity, not just exact match.
///
/// Lookups come in two tiers: an exact-match map keyed by a hash of the
/// normalized prompt, and a list of `(embedding, response)` pairs searched by
/// cosine similarity.
struct SemanticCache {
    /// Exact-match cache (fast path), keyed by `hash_prompt`.
    ///
    /// NOTE: only the 64-bit hash is stored, so two prompts whose hashes
    /// collide would share one entry.
    exact: HashMap<u64, String>,
    /// Embedding-based fuzzy cache (slow path — requires embed call)
    fuzzy: Vec<(Vec<f32>, String)>,
    /// Minimum cosine similarity for `get_fuzzy` to report a hit.
    similarity_threshold: f32,
}

impl SemanticCache {
    /// Upper bound on stored fuzzy entries; inserts beyond it are dropped.
    const MAX_FUZZY_ENTRIES: usize = 10_000;

    /// Creates an empty cache with a 0.97 similarity threshold.
    fn new() -> Self {
        Self {
            exact: HashMap::new(),
            fuzzy: Vec::new(),
            similarity_threshold: 0.97, // Very high threshold for LLM caching
        }
    }

    /// Hashes the prompt after trimming surrounding whitespace and
    /// lowercasing, so prompts differing only in case or padding share a key.
    fn hash_prompt(prompt: &str) -> u64 {
        let mut h = DefaultHasher::new();
        prompt.trim().to_lowercase().hash(&mut h);
        h.finish()
    }

    /// Returns the cached response for the normalized prompt, if present.
    fn get_exact(&self, prompt: &str) -> Option<&str> {
        self.exact.get(&Self::hash_prompt(prompt)).map(String::as_str)
    }

    /// Stores `response` under the normalized prompt, replacing any entry
    /// already stored under the same key.
    fn set_exact(&mut self, prompt: &str, response: String) {
        self.exact.insert(Self::hash_prompt(prompt), response);
    }

    /// Cosine similarity of two embeddings.
    ///
    /// Returns 0.0 when the vectors have different lengths or when either
    /// norm is below 1e-8. Without the length check, `zip` would compute the
    /// dot product over the shorter prefix only while the norms cover the
    /// full vectors, so `[1, 0]` and `[1]` would score a perfect 1.0.
    fn cosine(a: &[f32], b: &[f32]) -> f32 {
        if a.len() != b.len() {
            return 0.0;
        }
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let na = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let nb = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if na < 1e-8 || nb < 1e-8 { 0.0 } else { dot / (na * nb) }
    }

    /// Returns the response whose stored embedding is most similar to
    /// `query_embedding`, provided that similarity reaches the threshold.
    fn get_fuzzy(&self, query_embedding: &[f32]) -> Option<&str> {
        self.fuzzy
            .iter()
            .filter_map(|(emb, resp)| {
                let sim = Self::cosine(query_embedding, emb);
                // A NaN similarity fails this comparison and is dropped here.
                (sim >= self.similarity_threshold).then(|| (sim, resp.as_str()))
            })
            // Only non-NaN values reach this point, so the comparison is
            // always defined; the fallback merely removes the panic path.
            .max_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(_, resp)| resp)
    }

    /// Appends an `(embedding, response)` pair. Once the cache already holds
    /// `MAX_FUZZY_ENTRIES` pairs the new pair is silently discarded.
    fn set_fuzzy(&mut self, embedding: Vec<f32>, response: String) {
        if self.fuzzy.len() < Self::MAX_FUZZY_ENTRIES {
            self.fuzzy.push((embedding, response));
        }
    }
}

/// Demo driver for the exact-match tier of `SemanticCache`: stores one
/// response, then looks it up with the original prompt and with a
/// differently-cased variant.
fn main() {
    let mut cache = SemanticCache::new();

    let prompt1 = "What is Rust ownership?";
    let response1 = "Ownership ensures each value has one owner...".to_string();

    // Store
    cache.set_exact(prompt1, response1);
    println!("Cached response for: '{}'", prompt1);

    // Exact hit
    if let Some(resp) = cache.get_exact(prompt1) {
        // Take characters rather than a byte slice so a short or non-ASCII
        // response cannot cause a slicing panic.
        let preview: String = resp.chars().take(30).collect();
        println!("✅ Cache hit: {}...", preview);
    }

    // Different casing still hits: the cache trims and lowercases the prompt
    // before hashing, so this lookup resolves to the same key.
    match cache.get_exact("what is rust ownership?") {
        Some(_) => println!("✅ Case-insensitive hit: prompts are normalized before hashing"),
        None => println!("⚠️  Unexpected miss for a prompt that differs only by case"),
    }
}

Scaling checklist

  • [ ] Implement semantic caching — LLM calls are expensive; cache identical/similar prompts.
  • [ ] Set per-tenant token quotas to prevent a single tenant from consuming the entire rate limit.
  • [ ] Use multi-provider routing; never depend on a single LLM provider.
  • [ ] Monitor time to first token (TTFT) separately from total latency for user-facing features.
  • [ ] Queue requests during rate-limit windows rather than failing immediately.
  • [ ] Use streaming for all user-facing endpoints; buffer-then-return only for background jobs.

Related reading

Related Guides

Continue in This Topic

More Rust Guides