LLM Rust Migration Guide
Migrate your LLM application to Rust from Python (LangChain, LlamaIndex) or Node.js. Step-by-step guide with API compatibility, performance comparison, and migration strategies.
Topic: LLM Rust
Search intent: High-intent search: "migrate python langchain to rust llm"
LLM Rust Migration Guide
Why migrate from Python LangChain to Rust?
| Aspect | Python LangChain | Rust Custom |
|---|---|---|
| Memory per instance | 200–500MB | 20–50MB |
| Cold start | 3–8s | 0.1–0.5s |
| Concurrent requests (single machine) | 50–200 | 5,000–50,000 |
| Type safety | Runtime (Pydantic) | Compile-time |
| Deployment complexity | Python env + deps | Single binary |
| LangChain abstraction | High (magic) | Low (explicit) |
Best candidates for migration: high-throughput LLM gateways, latency-sensitive features, and services where the LLM call is one step in a larger pipeline.
Keep Python for: research prototyping, complex chain orchestration, and rapid experimentation.
Step 1: Map LangChain concepts to Rust
// LangChain (Python) → Rust equivalent
// ChatOpenAI(model="gpt-4o") → LlmClient struct with reqwest
// PromptTemplate(...) → String formatting functions
// LLMChain(prompt, llm) → Function composition
// ConversationBufferMemory() → Vec<Message> + context window management
// VectorStore(embeddings) → Vec<Document> (each with its embedding) + cosine search
// RetrievalQA → retrieve() + build_prompt() + complete()
use serde::{Deserialize, Serialize};
/// One chat message: who is speaking (`role`) and what they say (`content`).
///
/// Derives `Serialize`/`Deserialize` so it can be converted to and from a
/// serde-supported format.
#[derive(Clone, Serialize, Deserialize)]
struct Message { role: String, content: String }
/// Rust equivalent of LangChain's PromptTemplate.
///
/// Placeholders are written as `{{name}}` inside the template text.
struct PromptTemplate {
    template: String,
}

impl PromptTemplate {
    /// Creates a template from text containing `{{name}}` placeholders.
    fn new(template: &str) -> Self {
        Self { template: template.to_string() }
    }

    /// Fill template variables — Rust version of `.format_messages()`.
    ///
    /// Every `{{key}}` whose `key` appears in `vars` is replaced by the paired
    /// value; when a key is listed more than once the first entry wins.
    /// Placeholders with no matching key are left in the output untouched.
    ///
    /// The template is scanned once, left to right, and substituted values are
    /// never rescanned. A value that itself contains `{{other}}` is therefore
    /// inserted literally instead of being expanded by another variable, which
    /// keeps user-supplied text from injecting placeholders into the prompt.
    fn format(&self, vars: &[(&str, &str)]) -> String {
        let mut out = String::with_capacity(self.template.len());
        let mut rest = self.template.as_str();

        while let Some(open) = rest.find("{{") {
            out.push_str(&rest[..open]);
            let after_open = &rest[open + 2..];

            // Look for the first variable whose `key}}` starts right here.
            let hit = vars.iter().find_map(|(key, value)| {
                after_open
                    .strip_prefix(*key)
                    .and_then(|tail| tail.strip_prefix("}}"))
                    .map(|tail| (*value, tail))
            });

            match hit {
                Some((value, tail)) => {
                    out.push_str(value);
                    rest = tail;
                }
                None => {
                    // Not a known placeholder: emit a single brace and retry
                    // one byte later, so `{{{key}}}` still resolves its inner
                    // `{{key}}`. `{` is ASCII, so `open + 1` is a char boundary.
                    out.push('{');
                    rest = &rest[open + 1..];
                }
            }
        }

        out.push_str(rest);
        out
    }
}
/// Demo: fill a two-variable prompt template and print the result.
fn main() {
    let vars = [
        ("role", "Rust expert"),
        ("question", "How do I use async/await?"),
    ];
    let prompt = PromptTemplate::new("You are a {{role}}. Answer the following: {{question}}")
        .format(&vars);
    println!("{}", prompt);
}
Step 2: Port RAG pipeline
/// A stored text chunk together with its embedding vector.
struct Document {
    id: String,
    content: String,
    embedding: Vec<f32>,
}

/// Rust equivalent of LangChain's RetrievalQAChain.
///
/// Holds documents in memory, ranks them by cosine similarity against a query
/// embedding and turns the best `top_k` into a chat prompt.
struct RagChain {
    documents: Vec<Document>,
    top_k: usize,
}

impl RagChain {
    /// Creates an empty chain that retrieves at most `top_k` documents per query.
    fn new(top_k: usize) -> Self {
        Self { documents: Vec::new(), top_k }
    }

    /// Stores a document with a precomputed embedding.
    fn add_document(&mut self, id: &str, content: &str, embed: Vec<f32>) {
        self.documents.push(Document {
            id: id.to_string(),
            content: content.to_string(),
            embedding: embed,
        });
    }

    /// Cosine similarity of `a` and `b`.
    ///
    /// Returns `0.0` when either vector has (near-)zero norm. A NaN result
    /// (NaN or infinite components) is mapped to negative infinity so that such
    /// a document ranks last instead of poisoning the sort. If the lengths
    /// differ, the dot product covers only the common prefix while each norm
    /// covers its whole vector.
    fn cosine(a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let na = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let nb = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if na < 1e-8 || nb < 1e-8 {
            return 0.0;
        }
        let score = dot / (na * nb);
        if score.is_nan() { f32::NEG_INFINITY } else { score }
    }

    /// Returns up to `top_k` documents, most similar to `query_embed` first.
    ///
    /// Documents with equal scores keep their insertion order (stable sort).
    fn retrieve(&self, query_embed: &[f32]) -> Vec<&Document> {
        let mut scored: Vec<(f32, &Document)> = self
            .documents
            .iter()
            .map(|doc| (Self::cosine(query_embed, &doc.embedding), doc))
            .collect();
        // `total_cmp` is a total order, so this can never panic the way
        // `partial_cmp(..).unwrap()` does on NaN.
        scored.sort_by(|a, b| b.0.total_cmp(&a.0));
        scored
            .into_iter()
            .take(self.top_k)
            .map(|(_, doc)| doc)
            .collect()
    }

    /// Builds the `[system, user]` message pair for the LLM.
    ///
    /// The user message lists each document as `[id]: content`, separated by
    /// blank lines, followed by the question.
    fn build_prompt(&self, question: &str, docs: &[&Document]) -> Vec<Message> {
        let context = docs
            .iter()
            .map(|d| format!("[{}]: {}", d.id, d.content))
            .collect::<Vec<_>>()
            .join("\n\n");
        vec![
            Message {
                role: "system".to_string(),
                content: "Answer based on context only. If not in context, say you don't know.".to_string(),
            },
            Message {
                role: "user".to_string(),
                content: format!("Context:\n{}\n\nQuestion: {}", context, question),
            },
        ]
    }

    /// Retrieves context for `query_embed` and builds the prompt for `question`.
    fn run(&self, question: &str, query_embed: &[f32]) -> Vec<Message> {
        let docs = self.retrieve(query_embed);
        self.build_prompt(question, &docs)
    }
}
/// Deterministic stand-in for a real embedding model.
///
/// Folds the characters of `text` into a 4-dimensional vector (character `i`
/// adds its code point / 1000 to component `i % 4`) and scales the result to
/// unit length. Empty input yields the all-zero vector.
fn mock_embed(text: &str) -> Vec<f32> {
    let mut v = vec![0.0f32; 4];
    for (i, c) in text.chars().enumerate() {
        // `char` cannot be cast straight to a float; go through its code point.
        v[i % 4] += c as u32 as f32 / 1000.0;
    }
    // Floor the norm so an all-zero vector does not divide by zero.
    let n = v.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-8);
    v.iter().map(|x| x / n).collect()
}
/// Demo: index three documents, run one query through the chain and print a
/// short preview of the generated user message.
fn main() {
    let mut rag = RagChain::new(2);
    rag.add_document("doc1", "Rust ownership ensures memory safety without GC.", mock_embed("ownership memory safety"));
    rag.add_document("doc2", "Tokio enables async I/O in Rust applications.", mock_embed("tokio async io"));
    rag.add_document("doc3", "Candle is a Rust ML framework from Hugging Face.", mock_embed("candle ml framework"));

    let query = "How does Rust handle memory?";
    let query_emb = mock_embed("memory management rust");
    let messages = rag.run(query, &query_emb);
    println!("Generated {} messages for LLM", messages.len());

    // `build_prompt` always returns [system, user]; index 1 is the user message.
    // Truncate by characters: a byte slice such as `[..100]` panics when the
    // text is shorter than 100 bytes or the cut lands inside a multi-byte char.
    let preview: String = messages[1].content.chars().take(100).collect();
    println!("Context preview: {}...", preview);
}
Step 3: Incremental migration approach
/// Use feature flags to migrate gradually: the backend chosen for a request.
#[derive(Debug, Clone, Copy)]
enum LlmBackend {
    /// Old Python service.
    Python,
    /// New Rust implementation.
    Rust,
}

/// Splits requests between the two backends by a fixed percentage.
struct FeatureRouter {
    /// Percentage of traffic to route to Rust (0-100)
    rust_percentage: u8,
    /// Number of routing decisions handed out so far.
    counter: std::sync::atomic::AtomicU64,
}

impl FeatureRouter {
    /// Creates a router that sends `rust_percentage` percent of calls to Rust.
    fn new(rust_percentage: u8) -> Self {
        let counter = std::sync::atomic::AtomicU64::new(0);
        Self { rust_percentage, counter }
    }

    /// Picks the backend for the next request.
    ///
    /// Calls are numbered from zero. Within each run of 100 consecutive calls
    /// the first `rust_percentage` go to Rust and the remainder to Python.
    fn select_backend(&self) -> LlmBackend {
        use std::sync::atomic::Ordering;

        let ticket = self.counter.fetch_add(1, Ordering::Relaxed);
        let slot = ticket % 100;
        let rust_slots = u64::from(self.rust_percentage);
        if slot >= rust_slots {
            return LlmBackend::Python;
        }
        LlmBackend::Rust
    }
}
/// Demo: for each rollout phase, route 100 simulated requests and report how
/// many were sent to the Rust backend.
fn main() {
    // Rollout schedule: (label, percentage of traffic sent to Rust).
    let phases: [(&str, u8); 4] = [
        // Shadow mode: no live Rust traffic; only log discrepancies.
        ("Phase 1: Shadow mode", 0),
        // Canary: a small slice of traffic.
        ("Phase 2: Canary", 5),
        // Ramp up: half of the traffic.
        ("Phase 3: Ramp up", 50),
        // Full migration: everything on Rust.
        ("Phase 4: Full migration", 100),
    ];

    for (phase_name, percentage) in phases {
        let router = FeatureRouter::new(percentage);
        let mut rust_count = 0usize;
        for _ in 0..100 {
            if let LlmBackend::Rust = router.select_backend() {
                rust_count += 1;
            }
        }
        println!("{}: {}% Rust ({}/100 requests)", phase_name, percentage, rust_count);
    }
}