Building LLM Applications with Rust

Why Rust for LLM Applications?

Rust excels at the infrastructure layer around LLMs: API gateways, caching layers, token counting, response streaming, and serving local models. While Python dominates model training, Rust serves as the performance-critical glue.

Architecture patterns

text

┌─────────────────────────────────────────────────────┐
│                Application Layer                     │
│          Business logic, prompt templates            │
└──────────────────────┬──────────────────────────────┘
                       │
┌──────────────────────▼──────────────────────────────┐
│               LLM Gateway Layer (Rust)              │
│  Rate limiting │ Caching │ Routing │ Auth            │
└──────────────────────┬──────────────────────────────┘
                       │
        ┌──────────────┼──────────────┐
        ▼              ▼              ▼
   OpenAI API    Anthropic API   Local Model
   (reqwest)     (reqwest)       (candle/ort)

Runnable example — OpenAI-compatible API client

rust

use serde::{Deserialize, Serialize};
use std::time::Duration;

/// OpenAI-compatible chat request
///
/// Only `Serialize` is derived, so this type is written out as a payload and
/// never parsed back in this file. Fields are serialized in declaration order.
#[derive(Debug, Serialize)]
struct ChatRequest {
    /// Model identifier (`LlmClient::build_request` fills in `"gpt-4o"`).
    model: String,
    /// Conversation to send, in order.
    messages: Vec<ChatMessage>,
    /// Optional `max_tokens` value; the key is omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    max_tokens: Option<u32>,
    /// Optional `temperature` value; the key is omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    temperature: Option<f32>,
    /// Streaming flag; always serialized, even when `false`.
    // NOTE(review): `default` is a deserialization-side serde attribute and this
    // struct derives only `Serialize`, so it looks like a no-op here — confirm
    // whether `Deserialize` was meant to be derived or the attribute can go.
    #[serde(default)]
    stream: bool,
}

/// A single conversation turn: who spoke (`role`) and what was said (`content`).
///
/// Derives both `Serialize` and `Deserialize` because it is embedded in the
/// outgoing `ChatRequest` and in the incoming `ChatChoice`. `Clone` allows a
/// message list to be duplicated.
#[derive(Debug, Serialize, Deserialize, Clone)]
struct ChatMessage {
    /// Speaker role; `PromptBuilder::build` emits `"system"`, `"user"` and `"assistant"`.
    role: String,
    /// Text of the turn.
    content: String,
}

/// Deserialized chat completion response: an identifier, the returned
/// choices, and token accounting.
// NOTE(review): `usage` is not an `Option`, so presumably a response body
// without a `usage` object fails to deserialize — confirm that every backend
// this is pointed at always includes it.
#[derive(Debug, Deserialize)]
struct ChatResponse {
    /// Identifier carried in the response's `id` field.
    id: String,
    /// Candidate completions contained in the response.
    choices: Vec<ChatChoice>,
    /// Token counts reported alongside the choices.
    usage: TokenUsage,
}

/// One candidate completion inside a `ChatResponse`.
#[derive(Debug, Deserialize)]
struct ChatChoice {
    /// The message produced for this choice.
    message: ChatMessage,
    /// Optional `finish_reason` string; `None` when the field is absent or null.
    finish_reason: Option<String>,
}

/// Token counters deserialized from the `usage` object of a `ChatResponse`.
// NOTE(review): presumably `total_tokens == prompt_tokens + completion_tokens`;
// nothing in this type enforces that — treat the three values as reported.
#[derive(Debug, Deserialize)]
struct TokenUsage {
    /// Value of the `prompt_tokens` field.
    prompt_tokens: u32,
    /// Value of the `completion_tokens` field.
    completion_tokens: u32,
    /// Value of the `total_tokens` field.
    total_tokens: u32,
}

/// Connection settings for an OpenAI-compatible endpoint.
///
/// This example only assembles request payloads: no HTTP call is made here,
/// so the stored URL, key and timeout are carried but not consumed yet.
struct LlmClient {
    base_url: String,
    api_key: String,
    timeout: Duration,
}

impl LlmClient {
    /// Creates a client for `base_url` authenticated with `api_key`.
    ///
    /// The timeout is fixed at 30 seconds.
    fn new(base_url: &str, api_key: &str) -> Self {
        let timeout = Duration::from_secs(30);
        Self {
            base_url: String::from(base_url),
            api_key: String::from(api_key),
            timeout,
        }
    }

    /// Wraps `messages` in a non-streaming `gpt-4o` chat request with
    /// `max_tokens` of 1024 and `temperature` of 0.7.
    ///
    /// Nothing is sent; the caller decides what to do with the request.
    fn build_request(&self, messages: Vec<ChatMessage>) -> ChatRequest {
        let model = String::from("gpt-4o");
        let max_tokens = Some(1024);
        let temperature = Some(0.7);
        ChatRequest {
            model,
            messages,
            max_tokens,
            temperature,
            stream: false,
        }
    }
}

/// Builds a chat message list from a system prompt and few-shot examples.
struct PromptBuilder {
    system: String,
    examples: Vec<(String, String)>,
}

impl PromptBuilder {
    /// Starts a builder with the given system prompt and no examples.
    fn new(system: &str) -> Self {
        Self {
            system: system.to_owned(),
            examples: vec![],
        }
    }

    /// Appends one `(user, assistant)` example pair and returns the builder
    /// so calls can be chained.
    fn add_example(mut self, user: &str, assistant: &str) -> Self {
        let pair = (user.to_owned(), assistant.to_owned());
        self.examples.push(pair);
        self
    }

    /// Produces the full message list for `user_input`.
    ///
    /// Order: the system message, then each example as a `user` message
    /// followed by its `assistant` reply, then `user_input` as the final
    /// `user` message.
    fn build(&self, user_input: &str) -> Vec<ChatMessage> {
        // Small constructor so each message is spelled out only once.
        fn message(role: &str, content: &str) -> ChatMessage {
            ChatMessage {
                role: role.to_owned(),
                content: content.to_owned(),
            }
        }

        let opening = std::iter::once(message("system", &self.system));
        let few_shot = self
            .examples
            .iter()
            .flat_map(|(question, answer)| [message("user", question), message("assistant", answer)]);
        let closing = std::iter::once(message("user", user_input));

        opening.chain(few_shot).chain(closing).collect()
    }
}

/// Demo entry point: builds a few-shot prompt, wraps it in a chat request,
/// and prints the JSON payload followed by the number of messages.
fn main() {
    let client = LlmClient::new("https://api.openai.com/v1", "sk-...");

    let messages = PromptBuilder::new(
        "You are an expert Rust programmer. Give concise, accurate answers."
    )
    .add_example(
        "What is ownership in Rust?",
        "Ownership is Rust's memory management system. Each value has one owner; when the owner goes out of scope, the value is dropped."
    )
    .build("How do I share data between threads in Rust?");

    // Capture the count before `messages` is moved into the request, so the
    // whole vector is not cloned just to call `len()` afterwards.
    let message_count = messages.len();

    let request = client.build_request(messages);
    // The request holds only strings, numbers and a bool, so a failure here
    // would indicate a programming error rather than bad input.
    let json = serde_json::to_string_pretty(&request)
        .expect("ChatRequest should always serialize to JSON");
    println!("Request payload:\n{}", json);
    println!("\nMessage count: {}", message_count);
}

Token counting (without API call)

rust

/// Approximate token count using GPT-4 tokenization heuristics
/// For production use the `tiktoken-rs` crate
///
/// The estimate is the mean of two heuristics: one token per four UTF-8
/// bytes of `text`, and one token per whitespace-separated word.
///
/// Returns 0 only for an empty string. Any non-empty input counts as at
/// least one token, so short inputs such as `"hi"` are not rounded down to
/// zero by the integer divisions.
fn estimate_tokens(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }
    // GPT-4 averages ~4 characters per token for English text.
    // `len()` is the byte length, which equals the character count for ASCII.
    let byte_estimate = text.len() / 4;
    // Adjust for whitespace (each word boundary ≈ 1 token)
    let word_count = text.split_whitespace().count();
    // Both divisions floor; clamp so non-empty text never costs nothing.
    ((byte_estimate + word_count) / 2).max(1)
}

/// Estimates the token cost of a conversation given as `(role, content)` pairs.
///
/// Each message contributes a fixed overhead of 4 plus the estimates for its
/// role and content; a further 2 is added once for reply priming.
fn count_message_tokens(messages: &[(&str, &str)]) -> usize {
    const PER_MESSAGE_OVERHEAD: usize = 4;
    const REPLY_PRIMING: usize = 2;

    let mut conversation_total = 0;
    for &(role, content) in messages {
        conversation_total += PER_MESSAGE_OVERHEAD + estimate_tokens(role) + estimate_tokens(content);
    }
    conversation_total + REPLY_PRIMING
}

/// Demo entry point: estimates the tokens of a two-message conversation and
/// prints the count together with a cost figure at $5 per million tokens.
fn main() {
    let conversation = [
        ("system", "You are a helpful Rust programming assistant."),
        ("user", "How do I implement a vector store in Rust for semantic search?"),
    ];

    let estimated = count_message_tokens(&conversation);
    let input_cost_usd = estimated as f64 * 5.0 / 1_000_000.0;

    println!("Estimated tokens: ~{}", estimated);
    println!("Cost estimate (GPT-4o at $5/1M input): ${:.6}", input_cost_usd);
}

Response caching

rust

use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::time::{Duration, Instant};

/// In-memory cache of LLM responses with a fixed time-to-live.
///
/// Entries are keyed by the 64-bit value returned from
/// [`LlmCache::cache_key`] and stored with the `Instant` of insertion.
/// Only the hash is kept, so two inputs that hash to the same value share
/// one entry.
struct LlmCache {
    cache: HashMap<u64, (String, Instant)>,
    ttl: Duration,
}

impl LlmCache {
    /// Creates an empty cache whose entries stay valid for `ttl_seconds`.
    fn new(ttl_seconds: u64) -> Self {
        Self {
            cache: HashMap::new(),
            ttl: Duration::from_secs(ttl_seconds),
        }
    }

    /// Derives a cache key by hashing `model` followed by `messages_json`.
    fn cache_key(model: &str, messages_json: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        model.hash(&mut hasher);
        messages_json.hash(&mut hasher);
        hasher.finish()
    }

    /// Returns the cached response for `key` while it is younger than the
    /// TTL, or `None` when the key is absent or the entry has expired.
    fn get(&self, key: u64) -> Option<&str> {
        self.cache
            .get(&key)
            .filter(|(_, stored_at)| stored_at.elapsed() < self.ttl)
            .map(|(response, _)| response.as_str())
    }

    /// Stores `response` under `key`, stamped with the current time.
    ///
    /// Entries that have outlived the TTL are dropped first. `get` cannot
    /// remove them because it takes `&self`, so without this sweep stale
    /// responses would accumulate for as long as the cache lives.
    fn set(&mut self, key: u64, response: String) {
        // Copy the TTL out so the closure does not borrow `self` while
        // `self.cache` is mutably borrowed by `retain`.
        let ttl = self.ttl;
        self.cache.retain(|_, (_, stored_at)| stored_at.elapsed() < ttl);
        self.cache.insert(key, (response, Instant::now()));
    }
}

/// Demo entry point: shows a cache miss, stores a response, then shows the
/// hit and prints the cached text.
fn main() {
    let mut cache = LlmCache::new(3600); // 1h TTL
    let key = LlmCache::cache_key("gpt-4o", r#"[{"role":"user","content":"hi"}]"#);

    // Nothing stored yet, so this lookup reports a miss.
    let hit_before_store = cache.get(key).is_some();
    println!("Cache hit: {}", hit_before_store);

    // After storing, the same key reports a hit and yields the text.
    cache.set(key, String::from("Hello! How can I help?"));
    let cached = cache.get(key);
    println!("Cache hit: {}", cached.is_some());
    println!("Cached: {}", cached.unwrap());
}

Building LLM Applications with Rust

Building LLM Applications with Rust

Why Rust for LLM Applications?

Architecture patterns

Runnable example — OpenAI-compatible API client

Token counting (without API call)

Response caching

Related reading

Related Guides

LLM Rust Production Guide

Building LLM Applications with Rust

Continue in This Topic

LLM API Gateway in Rust

More Rust Guides

LLM API Gateway in Rust

LLM Rust Anti-Patterns

LLM Rust Benchmarking

LLM Rust Decision Matrix

LLM Rust Interview Q&A

LLM Rust Maintainability