Building LLM Applications with Rust
Comprehensive guide to building Large Language Model (LLM) applications in Rust. Covers OpenAI API integration, local model inference, prompt engineering, and streaming responses.
Topic: LLM Rust
Search intent: High-intent search: "rust llm large language model"
Building LLM Applications with Rust
Why Rust for LLM Applications?
Rust excels at the infrastructure layer around LLMs: API gateways, caching layers, token counting, response streaming, and serving local models. While Python dominates model training, Rust serves as the performance-critical glue.
Architecture patterns
┌─────────────────────────────────────────────────────┐
│                  Application Layer                  │
│          Business logic, prompt templates           │
└──────────────────────┬──────────────────────────────┘
                       │
┌──────────────────────▼──────────────────────────────┐
│              LLM Gateway Layer (Rust)               │
│      Rate limiting │ Caching │ Routing │ Auth       │
└──────────────────────┬──────────────────────────────┘
                       │
        ┌──────────────┼──────────────┐
        ▼              ▼              ▼
   OpenAI API    Anthropic API   Local Model
    (reqwest)      (reqwest)     (candle/ort)

Runnable example — OpenAI-compatible API client
use serde::{Deserialize, Serialize};
use std::time::Duration;
/// OpenAI-compatible chat request body.
///
/// Only `Serialize` is derived, so this type is written out (e.g. as JSON)
/// but never parsed back in.
#[derive(Debug, Serialize)]
struct ChatRequest {
/// Model identifier, e.g. "gpt-4o".
model: String,
/// Conversation so far, in the order it should be presented to the model.
messages: Vec<ChatMessage>,
/// Optional cap on generated tokens; the key is omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
max_tokens: Option<u32>,
/// Optional sampling temperature; the key is omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
temperature: Option<f32>,
/// Whether a streamed response is requested; always serialized, even when `false`.
// NOTE(review): `default` only influences deserialization, and this struct
// derives `Serialize` only, so the attribute appears to have no effect here
// — confirm whether `Deserialize` was meant to be derived as well.
#[serde(default)]
stream: bool,
}
/// One turn of a chat conversation.
///
/// Derives both `Serialize` and `Deserialize` because it appears in outgoing
/// requests (`ChatRequest::messages`) and in incoming responses
/// (`ChatChoice::message`).
#[derive(Debug, Serialize, Deserialize, Clone)]
struct ChatMessage {
/// Speaker of the turn; this file uses "system", "user" and "assistant".
role: String,
/// Text of the turn.
content: String,
}
/// Chat completion response as parsed from the API.
///
/// No field is optional or defaulted, so deserialization fails if any of
/// `id`, `choices` or `usage` is missing from the payload.
#[derive(Debug, Deserialize)]
struct ChatResponse {
/// Identifier of the response as reported by the server.
id: String,
/// Candidate completions returned for the request.
choices: Vec<ChatChoice>,
/// Token accounting reported alongside the completions.
usage: TokenUsage,
}
/// A single candidate completion inside a `ChatResponse`.
#[derive(Debug, Deserialize)]
struct ChatChoice {
/// The generated message.
message: ChatMessage,
/// Why generation ended, if reported; an absent or `null` value becomes `None`.
// NOTE(review): the set of possible reason strings is defined by the API,
// not by this file — verify against the provider's documentation.
finish_reason: Option<String>,
}
/// Token counts reported in a `ChatResponse`.
///
/// All three fields are required when deserializing.
#[derive(Debug, Deserialize)]
struct TokenUsage {
/// Tokens consumed by the request's messages.
prompt_tokens: u32,
/// Tokens produced in the completion.
completion_tokens: u32,
/// Total as reported by the server.
// NOTE(review): presumably prompt_tokens + completion_tokens; this file
// never checks that relationship.
total_tokens: u32,
}
/// Configuration for talking to an OpenAI-compatible endpoint.
///
/// NOTE(review): the original summary advertised "retry and timeout", but the
/// methods shown in this file perform no HTTP, implement no retry, and never
/// read `timeout` — the fields are stored only.
struct LlmClient {
/// Base URL of the API, e.g. "https://api.openai.com/v1".
base_url: String,
/// Credential for the API; stored as given.
api_key: String,
/// Intended request timeout; `LlmClient::new` sets it to 30 seconds.
timeout: Duration,
}
impl LlmClient {
    /// Creates a client for the endpoint at `base_url`, authenticated with `api_key`.
    ///
    /// Both strings are copied into the client. The timeout is fixed at 30 seconds.
    fn new(base_url: &str, api_key: &str) -> Self {
        let timeout = Duration::from_secs(30);
        Self {
            base_url: String::from(base_url),
            api_key: String::from(api_key),
            timeout,
        }
    }

    /// Wraps `messages` in a non-streaming chat request without sending anything.
    ///
    /// The request always targets the "gpt-4o" model with a 1024-token cap and
    /// a temperature of 0.7.
    fn build_request(&self, messages: Vec<ChatMessage>) -> ChatRequest {
        let model = String::from("gpt-4o");
        ChatRequest {
            model,
            messages,
            max_tokens: Some(1024),
            temperature: Some(0.7),
            stream: false,
        }
    }
}
/// Builder for few-shot chat prompts: a system instruction plus example exchanges.
struct PromptBuilder {
/// Text of the leading "system" message.
system: String,
/// Example exchanges as (user text, assistant text) pairs, in insertion order.
examples: Vec<(String, String)>,
}
impl PromptBuilder {
    /// Starts a builder whose prompts open with `system` as the system message.
    fn new(system: &str) -> Self {
        Self {
            system: system.to_owned(),
            examples: Vec::new(),
        }
    }

    /// Appends one example exchange and hands the builder back for chaining.
    fn add_example(mut self, user: &str, assistant: &str) -> Self {
        let exchange = (user.to_owned(), assistant.to_owned());
        self.examples.push(exchange);
        self
    }

    /// Produces the full message list for `user_input`.
    ///
    /// Order: the system message, then a "user"/"assistant" pair for every
    /// stored example (in insertion order), then `user_input` as the final
    /// "user" message. The builder itself is left untouched.
    fn build(&self, user_input: &str) -> Vec<ChatMessage> {
        let message = |role: &str, content: &str| ChatMessage {
            role: role.to_owned(),
            content: content.to_owned(),
        };

        std::iter::once(message("system", &self.system))
            .chain(self.examples.iter().flat_map(|(question, answer)| {
                [message("user", question), message("assistant", answer)]
            }))
            .chain(std::iter::once(message("user", user_input)))
            .collect()
    }
}
/// Demo entry point: assembles a few-shot prompt, wraps it in a chat request,
/// and prints the request's JSON payload followed by the number of messages.
fn main() {
    let client = LlmClient::new("https://api.openai.com/v1", "sk-...");
    let prompt = PromptBuilder::new(
        "You are an expert Rust programmer. Give concise, accurate answers.",
    )
    .add_example(
        "What is ownership in Rust?",
        "Ownership is Rust's memory management system. Each value has one owner; when the owner goes out of scope, the value is dropped.",
    )
    .build("How do I share data between threads in Rust?");

    // Read the length before `prompt` is moved into the request, so the whole
    // message list does not have to be cloned just to report its size.
    let message_count = prompt.len();
    let request = client.build_request(prompt);

    // The request holds only strings, numbers, a bool and a list of string
    // pairs, none of which serde_json can fail to encode.
    let json = serde_json::to_string_pretty(&request)
        .expect("serializing a ChatRequest to JSON should never fail");
    println!("Request payload:\n{}", json);
    println!("\nMessage count: {}", message_count);
}

Token counting (without API call)
/// Rough token estimate for `text`, computed without calling any API.
///
/// Averages two heuristics: one token per four bytes of UTF-8 (`str::len`
/// counts bytes, so multi-byte characters weigh more than ASCII ones) and one
/// token per whitespace-separated word. Both divisions round down.
/// For production use the `tiktoken-rs` crate.
fn estimate_tokens(text: &str) -> usize {
    const BYTES_PER_TOKEN: usize = 4;

    let length_based = text.len() / BYTES_PER_TOKEN;
    let word_based = text.split_whitespace().count();

    let estimates = [length_based, word_based];
    estimates.iter().sum::<usize>() / estimates.len()
}
/// Estimates the token cost of a conversation given as `(role, content)` pairs.
///
/// Each message contributes a fixed overhead of 4 tokens plus the estimates
/// for its role and its content; 2 more tokens are added once for reply
/// priming. An empty slice therefore yields 2.
fn count_message_tokens(messages: &[(&str, &str)]) -> usize {
    const PER_MESSAGE_OVERHEAD: usize = 4;
    const REPLY_PRIMING: usize = 2;

    let mut total = REPLY_PRIMING;
    for &(role, content) in messages {
        total += PER_MESSAGE_OVERHEAD + estimate_tokens(role) + estimate_tokens(content);
    }
    total
}
/// Demo entry point: estimates the token count of a two-message conversation
/// and prints it together with a cost figure at $5 per million input tokens.
fn main() {
    let conversation = [
        ("system", "You are a helpful Rust programming assistant."),
        ("user", "How do I implement a vector store in Rust for semantic search?"),
    ];

    let total = count_message_tokens(&conversation);
    let cost = total as f64 * 5.0 / 1_000_000.0;

    println!("Estimated tokens: ~{}", total);
    println!("Cost estimate (GPT-4o at $5/1M input): ${:.6}", cost);
}

Response caching
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::collections::hash_map::DefaultHasher;
use std::time::{Duration, Instant};
/// In-memory response cache with a fixed time-to-live per entry.
struct LlmCache {
    /// Cached responses keyed by request hash, each with its insertion time.
    cache: HashMap<u64, (String, Instant)>,
    /// How long an entry stays valid after it was stored.
    ttl: Duration,
}

impl LlmCache {
    /// Creates an empty cache whose entries expire `ttl_seconds` after insertion.
    ///
    /// A TTL of 0 means every entry is already stale when it is looked up.
    fn new(ttl_seconds: u64) -> Self {
        Self {
            cache: HashMap::new(),
            ttl: Duration::from_secs(ttl_seconds),
        }
    }

    /// Derives a cache key from the model name and the serialized messages.
    ///
    /// The key is a 64-bit `DefaultHasher` digest. It is only meaningful
    /// within one process, because the standard library does not guarantee the
    /// hashing algorithm stays the same across releases — do not persist keys.
    /// Distinct requests that collide on the digest share a cache slot.
    fn cache_key(model: &str, messages_json: &str) -> u64 {
        let mut hasher = DefaultHasher::new();
        model.hash(&mut hasher);
        messages_json.hash(&mut hasher);
        hasher.finish()
    }

    /// Returns the cached response for `key` if one exists and has not expired.
    fn get(&self, key: u64) -> Option<&str> {
        let (response, stored_at) = self.cache.get(&key)?;
        if stored_at.elapsed() < self.ttl {
            Some(response.as_str())
        } else {
            None
        }
    }

    /// Stores `response` under `key`, replacing any previous entry.
    ///
    /// Expired entries are purged first. `get` takes `&self` and can only hide
    /// stale entries, so without this sweep they would stay in the map forever
    /// and the cache would grow without bound. The sweep visits every entry,
    /// so each insertion costs time proportional to the cache size.
    fn set(&mut self, key: u64, response: String) {
        let ttl = self.ttl;
        self.cache
            .retain(|_, (_, stored_at)| stored_at.elapsed() < ttl);
        self.cache.insert(key, (response, Instant::now()));
    }
}
/// Demo entry point: shows a cache miss, stores a response, then shows the hit.
fn main() {
    let mut cache = LlmCache::new(3600); // 1h TTL
    let key = LlmCache::cache_key("gpt-4o", r#"[{"role":"user","content":"hi"}]"#);

    // Nothing has been stored yet, so this lookup misses.
    let hit_before = cache.get(key).is_some();
    println!("Cache hit: {}", hit_before);

    // After storing a response the same key hits.
    cache.set(key, String::from("Hello! How can I help?"));
    let hit_after = cache.get(key).is_some();
    println!("Cache hit: {}", hit_after);

    let cached = cache.get(key).unwrap();
    println!("Cached: {}", cached);
}