Rust By Example

Building LLM Applications with Rust

Comprehensive guide to building Large Language Model (LLM) applications in Rust. Covers OpenAI API integration, local model inference, prompt engineering, and streaming responses.

Topic: LLM Rust

Search intent: High-intent search: "rust llm large language model"

Building LLM Applications with Rust

Why Rust for LLM Applications?

Rust excels at the infrastructure layer around LLMs: API gateways, caching layers, token counting, response streaming, and serving local models. While Python dominates model training, Rust serves as the performance-critical glue.

Architecture patterns

text
┌─────────────────────────────────────────────────────┐
│                Application Layer                     │
│          Business logic, prompt templates            │
└──────────────────────┬──────────────────────────────┘
                       │
┌──────────────────────▼──────────────────────────────┐
│               LLM Gateway Layer (Rust)              │
│  Rate limiting │ Caching │ Routing │ Auth           │
└──────────────────────┬──────────────────────────────┘
                       │
        ┌──────────────┼──────────────┐
        ▼              ▼              ▼
   OpenAI API    Anthropic API   Local Model
   (reqwest)     (reqwest)       (candle/ort)

Runnable example — OpenAI-compatible API client

rust
use serde::{Deserialize, Serialize};
use std::time::Duration;

/// Request body for an OpenAI-compatible chat completion call.
///
/// This type only derives `Serialize`; it is turned into JSON and never
/// parsed back, so deserialize-only serde attributes would have no effect.
#[derive(Debug, Serialize)]
struct ChatRequest {
    /// Model identifier (the example client sends `"gpt-4o"`).
    model: String,
    /// Conversation messages, in the order they should be sent.
    messages: Vec<ChatMessage>,
    /// Optional token limit; the key is left out of the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    max_tokens: Option<u32>,
    /// Optional sampling temperature; the key is left out of the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    temperature: Option<f32>,
    /// Streaming flag. Always written to the JSON, including when `false`.
    stream: bool,
}

/// One message of a chat conversation.
///
/// Used on both sides of the exchange: inside `ChatRequest::messages`
/// (serialized) and inside `ChatChoice::message` (deserialized).
#[derive(Debug, Serialize, Deserialize, Clone)]
struct ChatMessage {
    /// Who is speaking; the code in this example uses "system", "user" and "assistant".
    role: String,
    /// Text of the message.
    content: String,
}

/// Deserialized body of a chat completion response.
///
/// NOTE(review): nothing in the code shown here constructs or reads this
/// type; it presumably mirrors the OpenAI response schema — confirm against
/// the API documentation before relying on it.
#[derive(Debug, Deserialize)]
struct ChatResponse {
    /// Identifier carried by the response.
    id: String,
    /// Completion choices carried by the response.
    choices: Vec<ChatChoice>,
    /// Token accounting. Not an `Option`, so a response without a `usage`
    /// key will fail to deserialize.
    usage: TokenUsage,
}

/// One entry of `ChatResponse::choices`.
#[derive(Debug, Deserialize)]
struct ChatChoice {
    /// Message carried by this choice.
    message: ChatMessage,
    /// Optional field; presumably the reason generation stopped — TODO confirm
    /// the possible values against the API documentation.
    finish_reason: Option<String>,
}

/// Token counts reported in `ChatResponse::usage`.
///
/// All three fields are required when deserializing.
#[derive(Debug, Deserialize)]
struct TokenUsage {
    /// Tokens attributed to the prompt.
    prompt_tokens: u32,
    /// Tokens attributed to the completion.
    completion_tokens: u32,
    /// Reported total; presumably prompt + completion, but nothing here
    /// checks that — verify against the API documentation.
    total_tokens: u32,
}

/// Connection settings for an OpenAI-compatible endpoint.
///
/// The example only stores these values and builds request payloads from
/// them; it performs no HTTP itself.
struct LlmClient {
    /// Base URL of the API.
    base_url: String,
    /// Credential for the API.
    api_key: String,
    /// Request timeout.
    timeout: Duration,
}

impl LlmClient {
    /// Creates a client for `base_url` using `api_key`, with a 30-second timeout.
    fn new(base_url: &str, api_key: &str) -> Self {
        let timeout = Duration::from_secs(30);
        Self {
            base_url: String::from(base_url),
            api_key: String::from(api_key),
            timeout,
        }
    }

    /// Wraps `messages` in a non-streaming `ChatRequest` with fixed settings
    /// (model "gpt-4o", at most 1024 tokens, temperature 0.7).
    ///
    /// Nothing is sent over the network; the caller gets the payload back.
    fn build_request(&self, messages: Vec<ChatMessage>) -> ChatRequest {
        let model = String::from("gpt-4o");
        let max_tokens = Some(1024);
        let temperature = Some(0.7);
        ChatRequest {
            model,
            messages,
            max_tokens,
            temperature,
            stream: false,
        }
    }
}

/// Assembles a chat prompt from a system instruction plus few-shot examples.
struct PromptBuilder {
    /// Content of the leading "system" message.
    system: String,
    /// Example exchanges as (user text, assistant text) pairs, in insertion order.
    examples: Vec<(String, String)>,
}

impl PromptBuilder {
    /// Starts a builder with the given system instruction and no examples.
    fn new(system: &str) -> Self {
        Self {
            system: system.to_owned(),
            examples: Vec::new(),
        }
    }

    /// Appends one example exchange and hands the builder back for chaining.
    fn add_example(mut self, user: &str, assistant: &str) -> Self {
        let exchange = (user.to_owned(), assistant.to_owned());
        self.examples.push(exchange);
        self
    }

    /// Produces the message list: the system message, then each example as a
    /// "user"/"assistant" pair, then `user_input` as the final "user" message.
    fn build(&self, user_input: &str) -> Vec<ChatMessage> {
        // Small local constructor so each message is spelled out only once.
        fn message(role: &str, content: &str) -> ChatMessage {
            ChatMessage {
                role: role.to_owned(),
                content: content.to_owned(),
            }
        }

        let mut conversation = Vec::with_capacity(2 * self.examples.len() + 2);
        conversation.push(message("system", &self.system));
        conversation.extend(self.examples.iter().flat_map(|(question, answer)| {
            [message("user", question), message("assistant", answer)]
        }));
        conversation.push(message("user", user_input));
        conversation
    }
}

/// Builds a few-shot prompt, wraps it in a chat request and prints the JSON
/// payload together with the number of messages. No network call is made.
fn main() {
    let client = LlmClient::new("https://api.openai.com/v1", "sk-...");

    let messages = PromptBuilder::new(
        "You are an expert Rust programmer. Give concise, accurate answers."
    )
    .add_example(
        "What is ownership in Rust?",
        "Ownership is Rust's memory management system. Each value has one owner; when the owner goes out of scope, the value is dropped."
    )
    .build("How do I share data between threads in Rust?");

    // Read the length before the vector is moved into the request, so the
    // whole message list does not have to be cloned just to count it later.
    let message_count = messages.len();

    let request = client.build_request(messages);
    let json = serde_json::to_string_pretty(&request)
        .expect("serializing a ChatRequest to JSON should not fail");
    println!("Request payload:\n{}", json);
    println!("\nMessage count: {}", message_count);
}

Token counting (without API call)

rust
/// Rough token estimate for `text`, computed without calling any API.
///
/// Averages two heuristics: one token per four bytes of input, and one token
/// per whitespace-separated word. `str::len` counts UTF-8 bytes, which equals
/// the character count only for ASCII text.
///
/// For production use the `tiktoken-rs` crate.
fn estimate_tokens(text: &str) -> usize {
    const BYTES_PER_TOKEN: usize = 4;

    let length_based = text.len() / BYTES_PER_TOKEN;
    let word_based = text.split_whitespace().fold(0usize, |seen, _| seen + 1);

    // Integer mean of the two estimates (rounds down).
    (length_based + word_based) / 2
}

/// Estimates the token count of a conversation given as `(role, content)` pairs.
///
/// Each message contributes a fixed overhead of 4 plus the estimates for its
/// role and content; 2 more are added once for reply priming.
fn count_message_tokens(messages: &[(&str, &str)]) -> usize {
    const PER_MESSAGE_OVERHEAD: usize = 4;
    const REPLY_PRIMING: usize = 2;

    let mut total = REPLY_PRIMING;
    for &(role, content) in messages {
        total += PER_MESSAGE_OVERHEAD + estimate_tokens(role) + estimate_tokens(content);
    }
    total
}

/// Prints the estimated token count and input cost for a two-message conversation.
fn main() {
    let conversation = [
        ("system", "You are a helpful Rust programming assistant."),
        ("user", "How do I implement a vector store in Rust for semantic search?"),
    ];

    let estimated = count_message_tokens(&conversation);
    // Priced at $5 per one million input tokens.
    let cost = estimated as f64 * 5.0 / 1_000_000.0;

    println!("Estimated tokens: ~{}", estimated);
    println!("Cost estimate (GPT-4o at $5/1M input): ${:.6}", cost);
}

Response caching

rust
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::time::{Duration, Instant};

/// In-memory response cache whose entries stop being served after a fixed
/// time-to-live.
struct LlmCache {
    /// Cached response and the instant it was stored, keyed by request hash.
    cache: HashMap<u64, (String, Instant)>,
    /// Age below which an entry is still returned by `get`.
    ttl: Duration,
}

impl LlmCache {
    /// Creates an empty cache whose entries live for `ttl_seconds` seconds.
    fn new(ttl_seconds: u64) -> Self {
        let ttl = Duration::from_secs(ttl_seconds);
        Self {
            cache: HashMap::new(),
            ttl,
        }
    }

    /// Derives a cache key by hashing the model name and then the serialized
    /// messages with the standard library's default hasher.
    fn cache_key(model: &str, messages_json: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        Hash::hash(model, &mut hasher);
        Hash::hash(messages_json, &mut hasher);
        hasher.finish()
    }

    /// Returns the cached response for `key` if one exists and is younger
    /// than the TTL. Expired entries are ignored here, not removed.
    fn get(&self, key: u64) -> Option<&str> {
        let (response, stored_at) = self.cache.get(&key)?;
        if stored_at.elapsed() >= self.ttl {
            return None;
        }
        Some(response.as_str())
    }

    /// Stores `response` under `key`, stamped with the current time; any
    /// previous entry for the same key is replaced.
    fn set(&mut self, key: u64, response: String) {
        let entry = (response, Instant::now());
        self.cache.insert(key, entry);
    }
}

/// Demonstrates a cache miss, a store, and the subsequent hit.
fn main() {
    let key = LlmCache::cache_key("gpt-4o", r#"[{"role":"user","content":"hi"}]"#);
    let mut cache = LlmCache::new(3600); // one hour

    // Nothing has been stored yet, so this lookup misses.
    let found_before = cache.get(key).is_some();
    println!("Cache hit: {}", found_before);

    // After storing a response the same key hits.
    cache.set(key, String::from("Hello! How can I help?"));
    let found_after = cache.get(key).is_some();
    println!("Cache hit: {}", found_after);
    println!("Cached: {}", cache.get(key).unwrap());
}

Related reading

Related Guides

Continue in This Topic

Previous

No previous guide in this topic.

Next

LLM API Gateway in Rust

More Rust Guides