LLM Rust Production Guide
Deploy LLM-powered Rust services to production: API key management, streaming proxies, context window management, fallback chains, and cost controls.
Topic: LLM Rust
Search intent: High-intent search: "rust llm production deployment api"
LLM Rust Production Guide
Pre-production checklist
- [ ] API keys stored in environment variables or secret manager, never in code.
- [ ] Token budget limits set per tenant/user to prevent runaway costs.
- [ ] Request timeouts configured (p99 LLM latency can be 10–60s).
- [ ] Streaming proxy tested under client disconnection scenarios.
- [ ] Fallback chain defined: primary → secondary provider → graceful error.
- [ ] Token usage logged for cost attribution.
- [ ] Output moderation/filtering applied before returning to users.
Runnable example — production-hardened LLM caller
use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
/// One chat turn sent to the model.
///
/// Derives `Serialize` so it can be embedded in a request body, and `Clone`
/// so conversation history can be copied when a message list is assembled.
#[derive(Debug, Serialize, Clone)]
struct ChatMessage {
/// Speaker of the message; the examples in this file use "system", "user", and "assistant".
role: String,
/// Text of the message.
content: String,
}
/// Serializable request payload handed to `call_api`.
///
/// NOTE(review): field names look like an OpenAI-style chat completion request —
/// confirm against the provider's API reference before relying on them.
#[derive(Debug, Serialize)]
struct OpenAiRequest {
/// Model identifier, e.g. "gpt-4o".
model: String,
/// Conversation to send, in order.
messages: Vec<ChatMessage>,
/// Token limit passed through from the caller; presumably caps the completion length — verify.
max_tokens: u32,
/// Sampling temperature; `complete` sets this to 0.7.
temperature: f32,
/// Streaming flag; `complete` sets this to `false`.
stream: bool,
}
/// Deserialized response returned by `call_api`.
#[derive(Debug, Deserialize)]
struct OpenAiResponse {
/// Candidate completions; `complete` reads the first entry.
choices: Vec<OpenAiChoice>,
/// Token counts reported for the request; used for cost logging.
usage: OpenAiUsage,
}
/// One candidate completion inside an `OpenAiResponse`.
#[derive(Debug, Deserialize)]
struct OpenAiChoice {
/// The generated message.
message: OpenAiMessage,
/// Optional reason generation ended; the simulated response uses "stop".
finish_reason: Option<String>,
}
/// Message body of a completion; only the generated text (`content`) is deserialized.
#[derive(Debug, Deserialize)]
struct OpenAiMessage { content: String }
/// Token counts reported alongside a response.
#[derive(Debug, Deserialize)]
struct OpenAiUsage {
/// Tokens attributed to the prompt.
prompt_tokens: u32,
/// Tokens attributed to the generated completion.
completion_tokens: u32,
/// Total tokens for the request; this is the figure fed into cost estimation.
total_tokens: u32,
}
/// Production caller with retry, timeout, and cost guards
///
/// Built via `ProductionLlmCaller::new`, which fills in the default limits.
struct ProductionLlmCaller {
/// Credential for the provider API (used as a bearer token in the reqwest sketch inside `call_api`).
api_key: String,
/// Number of retries after the first attempt; `new` sets this to 3.
max_retries: u32,
/// Time limit intended for a single API request; `new` sets this to 30 seconds.
base_timeout: Duration,
/// Ceiling on the estimated USD cost of one request; `new` sets this to 0.10.
max_cost_per_request_usd: f64,
}
impl ProductionLlmCaller {
    /// Creates a caller with the default limits: 3 retries, a 30-second
    /// per-attempt timeout, and a $0.10 per-request cost ceiling.
    fn new(api_key: String) -> Self {
        Self {
            api_key,
            max_retries: 3,
            base_timeout: Duration::from_secs(30),
            max_cost_per_request_usd: 0.10, // $0.10 per request limit
        }
    }

    /// Estimates the USD cost of `total_tokens` tokens for `model`.
    ///
    /// Each known model maps to one flat USD-per-million-tokens rate; any
    /// other model name falls back to 5.0 USD per million tokens.
    fn estimate_cost(&self, model: &str, total_tokens: u32) -> f64 {
        let usd_per_million = match model {
            "gpt-4o" => 7.5,
            "gpt-4o-mini" => 0.3,
            "claude-3-5-sonnet" => 4.5,
            _ => 5.0,
        };
        f64::from(total_tokens) * usd_per_million / 1_000_000.0
    }

    /// Rejects a request whose estimated cost exceeds the per-request ceiling.
    ///
    /// # Errors
    /// Returns a message with both the estimate and the limit when
    /// `estimate_cost(model, estimated_tokens)` is above
    /// `max_cost_per_request_usd`.
    fn check_cost_guard(&self, model: &str, estimated_tokens: u32) -> Result<(), String> {
        let estimated_cost = self.estimate_cost(model, estimated_tokens);
        if estimated_cost > self.max_cost_per_request_usd {
            return Err(format!(
                "Estimated cost ${:.4} exceeds per-request limit ${:.4}",
                estimated_cost, self.max_cost_per_request_usd
            ));
        }
        Ok(())
    }

    /// Simulate making an LLM API call
    ///
    /// Sleeps for 50 ms and returns a fixed response; `request` is not sent
    /// anywhere by this simulated version.
    async fn call_api(&self, request: &OpenAiRequest) -> Result<OpenAiResponse, String> {
        // In production, use reqwest:
        // let client = reqwest::Client::new();
        // let resp = client.post(url)
        //     .bearer_auth(&self.api_key)
        //     .json(request)
        //     .timeout(self.base_timeout)
        //     .send().await?;

        // Simulated response
        tokio::time::sleep(Duration::from_millis(50)).await;
        Ok(OpenAiResponse {
            choices: vec![OpenAiChoice {
                message: OpenAiMessage {
                    content: "Rust is excellent for AI workloads due to zero-cost abstractions and predictable performance.".to_string(),
                },
                finish_reason: Some("stop".to_string()),
            }],
            usage: OpenAiUsage {
                prompt_tokens: 50,
                completion_tokens: 25,
                total_tokens: 75,
            },
        })
    }

    /// Sends `messages` to `model` and returns the first completion's text
    /// together with the reported token usage.
    ///
    /// The request is first checked against the cost guard using a rough
    /// estimate (4 bytes per token plus 4 tokens per message, plus
    /// `max_tokens`). Each attempt is bounded by `base_timeout`; failed
    /// attempts are retried up to `max_retries` times with exponential
    /// backoff starting at 500 ms.
    ///
    /// # Errors
    /// Returns an error string when the cost guard rejects the request, or
    /// when every attempt fails (API error, timeout, or a response that
    /// contains no choices).
    async fn complete(
        &self,
        messages: Vec<ChatMessage>,
        model: &str,
        max_tokens: u32,
    ) -> Result<(String, OpenAiUsage), String> {
        // Pre-flight cost check
        let prompt_estimate: usize = messages.iter().map(|m| m.content.len() / 4 + 4).sum();
        // Clamp instead of truncating with `as`: an over-large prompt must
        // trip the cost guard, not wrap around to a small number.
        let estimated_tokens =
            (prompt_estimate.min(u32::MAX as usize) as u32).saturating_add(max_tokens);
        self.check_cost_guard(model, estimated_tokens)?;

        let request = OpenAiRequest {
            model: model.to_string(),
            messages,
            max_tokens,
            temperature: 0.7,
            stream: false,
        };

        // Retry with exponential backoff
        let mut last_error = String::new();
        for attempt in 0..=self.max_retries {
            if attempt > 0 {
                // Saturating math keeps a large `max_retries` from overflowing.
                let backoff_ms = 500u64.saturating_mul(2u64.saturating_pow(attempt - 1));
                let delay = Duration::from_millis(backoff_ms);
                println!("[llm] Retry {} after {:?}", attempt, delay);
                tokio::time::sleep(delay).await;
            }

            let start = Instant::now();
            // Enforce the per-attempt timeout here so a hung call cannot stall
            // the caller, then reduce the response to (text, usage). An empty
            // `choices` list is treated as a failed attempt rather than
            // indexing into it and panicking.
            let outcome = tokio::time::timeout(self.base_timeout, self.call_api(&request))
                .await
                .unwrap_or_else(|_| {
                    Err(format!("request timed out after {:?}", self.base_timeout))
                })
                .and_then(|OpenAiResponse { choices, usage }| {
                    choices
                        .into_iter()
                        .next()
                        .map(|choice| (choice.message.content, usage))
                        .ok_or_else(|| "response contained no choices".to_string())
                });

            match outcome {
                Ok((content, usage)) => {
                    let elapsed = start.elapsed();
                    let actual_cost = self.estimate_cost(model, usage.total_tokens);
                    println!(
                        "[llm] OK in {:.1}s | {} tokens | ${:.6}",
                        elapsed.as_secs_f64(),
                        usage.total_tokens,
                        actual_cost
                    );
                    return Ok((content, usage));
                }
                Err(e) => {
                    eprintln!("[llm] Attempt {} failed: {}", attempt, e);
                    last_error = e;
                }
            }
        }
        Err(format!("All {} retries failed: {}", self.max_retries, last_error))
    }
}
/// Demo entry point: builds a caller from `OPENAI_API_KEY` (falling back to a
/// placeholder key), sends a two-message conversation to "gpt-4o", and prints
/// either the reply with its token usage or the error.
#[tokio::main]
async fn main() {
    let key = match std::env::var("OPENAI_API_KEY") {
        Ok(value) => value,
        Err(_) => String::from("sk-test"),
    };
    let caller = ProductionLlmCaller::new(key);

    // Small local constructor to avoid repeating the `to_string` calls.
    let msg = |role: &str, content: &str| ChatMessage {
        role: role.to_string(),
        content: content.to_string(),
    };
    let conversation = vec![
        msg("system", "You are an expert Rust programmer."),
        msg("user", "Why is Rust good for AI inference?"),
    ];

    let outcome = caller.complete(conversation, "gpt-4o", 256).await;
    match outcome {
        Err(e) => eprintln!("Error: {}", e),
        Ok((content, usage)) => {
            println!("\nResponse: {}", content);
            println!(
                "Tokens: {} prompt + {} completion = {} total",
                usage.prompt_tokens, usage.completion_tokens, usage.total_tokens
            );
        }
    }
}
Context window management
/// Manage conversation history within token limits
///
/// The system prompt is stored separately from the history and is prepended
/// when the message list is built.
struct ConversationManager {
/// Conversation history, oldest first; excludes the system prompt.
messages: Vec<ChatMessage>,
/// Overall token limit from which the history budget is derived.
max_tokens: usize,
/// System prompt emitted as the first message by `build_messages`.
system_prompt: String,
}
impl ConversationManager {
    /// Creates an empty conversation with the given system prompt and
    /// overall token limit.
    fn new(system: &str, max_tokens: usize) -> Self {
        Self {
            messages: Vec::new(),
            max_tokens,
            system_prompt: system.to_string(),
        }
    }

    /// Appends a message to the history, then trims the oldest entries so
    /// the estimated size fits the budget again.
    fn add_message(&mut self, role: &str, content: &str) {
        self.messages.push(ChatMessage {
            role: role.to_string(),
            content: content.to_string(),
        });
        self.trim_to_fit();
    }

    /// Rough token estimate: one token per 4 bytes of text, plus 4.
    fn estimate_tokens(s: &str) -> usize {
        s.len() / 4 + 4
    }

    /// Drops the oldest messages until the history fits the budget.
    ///
    /// The budget is `max_tokens` minus the system prompt estimate and 500
    /// tokens reserved for the completion. Trimming only starts a new step
    /// while more than two messages remain. After dropping a message, an
    /// "assistant" message left at the front is dropped with it so the
    /// history does not begin with an orphaned reply.
    fn trim_to_fit(&mut self) {
        let system_tokens = Self::estimate_tokens(&self.system_prompt);
        let budget = self.max_tokens.saturating_sub(system_tokens + 500); // 500 for completion

        // Walk forward once with a running total and remove the prefix in a
        // single `drain`, instead of repeated `remove(0)` plus a full re-sum
        // per step (quadratic in the history length).
        let len = self.messages.len();
        let mut remaining = self.total_tokens();
        let mut dropped = 0;
        while remaining > budget && len - dropped > 2 {
            remaining -= Self::estimate_tokens(&self.messages[dropped].content);
            dropped += 1;
            if let Some(next) = self.messages.get(dropped) {
                if next.role == "assistant" {
                    remaining -= Self::estimate_tokens(&next.content);
                    dropped += 1;
                }
            }
        }
        self.messages.drain(..dropped);
    }

    /// Sum of the per-message token estimates for the history (the system
    /// prompt is not included).
    fn total_tokens(&self) -> usize {
        self.messages
            .iter()
            .map(|m| Self::estimate_tokens(&m.content))
            .sum()
    }

    /// Returns the system prompt followed by a copy of the history.
    fn build_messages(&self) -> Vec<ChatMessage> {
        let mut msgs = Vec::with_capacity(self.messages.len() + 1);
        msgs.push(ChatMessage {
            role: "system".to_string(),
            content: self.system_prompt.clone(),
        });
        // Clone element-wise straight into the output; no temporary Vec.
        msgs.extend(self.messages.iter().cloned());
        msgs
    }
}
/// Demo: feeds three turns into a `ConversationManager` with a 4096-token
/// limit and prints the size of the resulting context.
fn main() {
    let mut conv = ConversationManager::new("You are a Rust expert.", 4096);

    let turns = [
        ("user", "How does ownership work?"),
        ("assistant", "Ownership ensures each value has one owner..."),
        ("user", "What about borrowing?"),
    ];
    turns
        .iter()
        .for_each(|&(role, content)| conv.add_message(role, content));

    let context_len = conv.build_messages().len();
    println!("Messages in context: {}", context_len);

    let token_estimate = conv.total_tokens();
    println!("Estimated tokens: ~{}", token_estimate);
}