LLM Rust Team Workflow
Team practices for building and maintaining LLM applications in Rust: prompt engineering workflow, model evaluation, deployment pipelines, incident response, and cost governance.
Topic: LLM Rust
Search intent: High-intent search: "rust llm team practices mlops"
LLM Rust Team Workflow
Prompt development lifecycle
1. Draft prompt in dev environment
↓
2. Test against golden eval dataset (offline)
↓
3. A/B test in staging (5% traffic)
↓
4. Review metrics: quality score, latency, cost
↓
5. Promote to production (gradual rollout)
↓
6. Monitor production metrics
↓
7. Version and tag in git
Runnable example — evaluation harness
use std::time::Instant;
/// One test case for prompt evaluation.
///
/// A case pairs a prompt with substring checks that the evaluation harness
/// applies to the model's output.
#[derive(Debug, Clone)]
struct EvalCase {
/// Identifier copied into the matching `EvalResult::case_id`.
id: String,
/// Prompt text handed to the model function.
input: String,
/// Substrings that must appear in the output; `EvalHarness::run` compares
/// them case-insensitively.
expected_output_contains: Vec<String>,
/// Substrings that must NOT appear in the output; also compared
/// case-insensitively.
expected_output_not_contains: Vec<String>,
// NOTE(review): `quality_rubric` is not read by the harness code shown in
// this file; presumably it is meant for a separate (human or LLM) grader.
quality_rubric: Vec<String>, // Instructions for scoring
}
/// Outcome of running a single evaluation case through the model.
#[derive(Debug)]
struct EvalResult {
    /// Identifier of the case this result belongs to.
    case_id: String,
    /// Model output recorded for the report.
    output: String,
    /// Wall-clock time of the model call, in milliseconds.
    latency_ms: f64,
    /// Token count recorded for the prompt.
    input_tokens: u32,
    /// Token count recorded for the output.
    output_tokens: u32,
    /// Number of checks that succeeded.
    passed_checks: usize,
    /// Number of checks that were applied.
    total_checks: usize,
    /// Fraction of checks that succeeded.
    score: f64, // 0.0 to 1.0
}

impl EvalResult {
    /// Minimum score a result needs in order to count as passing.
    const PASS_THRESHOLD: f64 = 0.8;

    /// Returns `true` when the score reaches [`Self::PASS_THRESHOLD`].
    fn passed(&self) -> bool {
        self.score >= Self::PASS_THRESHOLD
    }
}
/// Runs a fixed set of evaluation cases against a model function and
/// summarises the results.
struct EvalHarness {
    cases: Vec<EvalCase>,
}

impl EvalHarness {
    /// Upper bound, in bytes, on the output excerpt stored in each result.
    const MAX_OUTPUT_PREVIEW_BYTES: usize = 100;

    /// Creates a harness that will evaluate `cases` in the given order.
    fn new(cases: Vec<EvalCase>) -> Self {
        Self { cases }
    }

    /// Evaluates every case sequentially with `model_fn` and returns one
    /// [`EvalResult`] per case, in case order.
    ///
    /// Each entry in `expected_output_contains` / `expected_output_not_contains`
    /// is one case-insensitive substring check; the score is the fraction of
    /// checks that hold (1.0 when a case has no checks). Token counts are a
    /// rough estimate of one token per four bytes of text.
    async fn run<F, Fut>(&self, model_fn: F) -> Vec<EvalResult>
    where
        F: Fn(&str) -> Fut,
        Fut: std::future::Future<Output = String>,
    {
        let mut results = Vec::with_capacity(self.cases.len());
        for case in &self.cases {
            let started = Instant::now();
            let output = model_fn(&case.input).await;
            let latency_ms = started.elapsed().as_secs_f64() * 1000.0;

            // Lowercase the output once instead of once per check.
            let haystack = output.to_lowercase();
            let present = case
                .expected_output_contains
                .iter()
                .filter(|needle| haystack.contains(needle.to_lowercase().as_str()))
                .count();
            let absent = case
                .expected_output_not_contains
                .iter()
                .filter(|needle| !haystack.contains(needle.to_lowercase().as_str()))
                .count();

            let passed = present + absent;
            let total =
                case.expected_output_contains.len() + case.expected_output_not_contains.len();
            let score = if total == 0 {
                1.0
            } else {
                passed as f64 / total as f64
            };

            results.push(EvalResult {
                case_id: case.id.clone(),
                output: Self::preview(&output).to_string(),
                latency_ms,
                input_tokens: Self::estimate_tokens(&case.input),
                output_tokens: Self::estimate_tokens(&output),
                passed_checks: passed,
                total_checks: total,
                score,
            });
        }
        results
    }

    /// Prints pass rate, average score, average latency, and the failed cases.
    ///
    /// An empty result set prints a short notice instead of averages, which
    /// would otherwise be `NaN` from dividing by zero.
    fn print_report(results: &[EvalResult]) {
        println!("=== Eval Report ===");
        if results.is_empty() {
            println!("No eval cases were run.");
            return;
        }

        let n = results.len();
        let passed = results.iter().filter(|r| r.passed()).count();
        let avg_score = results.iter().map(|r| r.score).sum::<f64>() / n as f64;
        let avg_latency = results.iter().map(|r| r.latency_ms).sum::<f64>() / n as f64;

        println!("Passed: {}/{} ({:.0}%)", passed, n, passed as f64 / n as f64 * 100.0);
        println!("Avg score: {:.2} | Avg latency: {:.0}ms", avg_score, avg_latency);
        println!("\nFailed cases:");
        for result in results.iter().filter(|r| !r.passed()) {
            println!(" ❌ [{}] score={:.2} output='{}'", result.case_id, result.score, result.output);
        }
    }

    /// Returns at most `MAX_OUTPUT_PREVIEW_BYTES` bytes of `output`, cut on a
    /// character boundary so multi-byte UTF-8 text cannot cause a slicing panic.
    fn preview(output: &str) -> &str {
        let mut end = output.len().min(Self::MAX_OUTPUT_PREVIEW_BYTES);
        // Index 0 is always a boundary, so this loop terminates.
        while !output.is_char_boundary(end) {
            end -= 1;
        }
        &output[..end]
    }

    /// Estimates the token count as bytes / 4, saturating at `u32::MAX`
    /// rather than silently wrapping for very large inputs.
    fn estimate_tokens(text: &str) -> u32 {
        u32::try_from(text.len() / 4).unwrap_or(u32::MAX)
    }
}
/// Runs the evaluation harness against a simulated model and prints the report.
#[tokio::main]
async fn main() {
    let harness = EvalHarness::new(vec![
        EvalCase {
            id: "ownership_1".to_string(),
            input: "What is Rust ownership?".to_string(),
            expected_output_contains: vec!["owner".to_string(), "memory".to_string()],
            expected_output_not_contains: vec!["garbage collector".to_string()],
            quality_rubric: vec!["Accurate".to_string(), "Concise".to_string()],
        },
        EvalCase {
            id: "async_1".to_string(),
            input: "How does async work in Rust?".to_string(),
            expected_output_contains: vec!["future".to_string(), "await".to_string()],
            expected_output_not_contains: vec!["threading".to_string()],
            quality_rubric: vec!["Technically correct".to_string()],
        },
    ]);

    let results = harness
        .run(|prompt| {
            // Simulate model output.
            // The reply is built before the async block so the returned future
            // owns its data. A future that borrowed `prompt` would not satisfy
            // `F: Fn(&str) -> Fut`, which needs one `Fut` type for every
            // borrow lifetime.
            let reply = if prompt.contains("ownership") {
                "Rust ownership: each value has one owner; memory is freed when owner is dropped."
            } else {
                "Async in Rust uses Future trait and await syntax for non-blocking I/O."
            }
            .to_string();
            async move { reply }
        })
        .await;

    EvalHarness::print_report(&results);
}
Cost governance
use std::sync::atomic::{AtomicU64, Ordering};
/// Track and enforce token budgets per team/feature.
///
/// The budget is stored in whole cents. Spend is accumulated in micro-cents
/// (millionths of a cent) so that requests costing less than one cent are
/// still counted instead of being rounded down to zero.
struct CostGovernor {
    daily_budget_cents: u64,
    spent_today: AtomicU64, // in micro-cents (avoid floating point)
}

impl CostGovernor {
    /// Micro-cents in one cent.
    const MICRO_CENTS_PER_CENT: u64 = 1_000_000;
    /// Micro-cents in one US dollar, as a float for display conversions.
    const MICRO_CENTS_PER_USD: f64 = 100_000_000.0;

    /// Creates a governor with the given daily budget in US dollars.
    ///
    /// The budget is rounded to the nearest cent. The float-to-integer cast
    /// saturates, so a negative or NaN budget becomes 0.
    fn new(daily_budget_usd: f64) -> Self {
        Self {
            daily_budget_cents: (daily_budget_usd * 100.0).round() as u64,
            spent_today: AtomicU64::new(0),
        }
    }

    /// Records the cost of one request against today's spend.
    ///
    /// # Errors
    ///
    /// Returns a message when adding this request would push the spend above
    /// the daily budget; in that case nothing is recorded.
    fn record_usage(&self, model: &str, input_tokens: u32, output_tokens: u32) -> Result<(), String> {
        let cost = self.calculate_cost_micro_cents(model, input_tokens, output_tokens);
        let budget = self.budget_micro_cents();

        // One atomic check-and-add. The closure returns `None` (leaving the
        // counter untouched) when the new total would overflow or exceed the
        // budget, so other threads never observe a temporarily inflated value.
        // Relaxed ordering is enough: the counter guards no other memory.
        self.spent_today
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |spent| {
                spent.checked_add(cost).filter(|&total| total <= budget)
            })
            .map(|_| ())
            .map_err(|spent| {
                format!(
                    "Daily budget exceeded: spent ${:.2} of ${:.2}",
                    spent as f64 / Self::MICRO_CENTS_PER_USD,
                    self.daily_budget_cents as f64 / 100.0
                )
            })
    }

    /// Returns the request cost in whole cents, truncating any sub-cent part.
    ///
    /// Budget enforcement uses the exact micro-cent cost instead; this helper
    /// is kept for callers that only need a coarse figure.
    fn calculate_cost_cents(&self, model: &str, input: u32, output: u32) -> u64 {
        self.calculate_cost_micro_cents(model, input, output) / Self::MICRO_CENTS_PER_CENT
    }

    /// Returns the exact request cost in micro-cents.
    ///
    /// Rates are in cents per 1M tokens, which is the same number as
    /// micro-cents per token, so no division is needed.
    fn calculate_cost_micro_cents(&self, model: &str, input: u32, output: u32) -> u64 {
        let (input_rate, output_rate): (u64, u64) = match model {
            "gpt-4o" => (500, 1500), // $5/$15 per 1M tokens
            "gpt-4o-mini" => (15, 60),
            // Unknown models are billed at the "gpt-4o" rate.
            _ => (500, 1500),
        };
        // u32::MAX * 1500 * 2 is far below u64::MAX, so this cannot overflow.
        u64::from(input) * input_rate + u64::from(output) * output_rate
    }

    /// Returns the unspent part of today's budget in US dollars.
    fn remaining_budget_usd(&self) -> f64 {
        let spent = self.spent_today.load(Ordering::Relaxed);
        self.budget_micro_cents().saturating_sub(spent) as f64 / Self::MICRO_CENTS_PER_USD
    }

    /// Returns the daily budget in micro-cents, saturating on overflow.
    fn budget_micro_cents(&self) -> u64 {
        self.daily_budget_cents.saturating_mul(Self::MICRO_CENTS_PER_CENT)
    }
}
/// Demo: sends five identical requests through a $50/day governor and prints
/// whether each one was accepted or blocked.
fn main() {
    // $50/day budget
    let governor = CostGovernor::new(50.0);

    (0..5).for_each(|request_no| {
        let outcome = governor.record_usage("gpt-4o", 5000, 2000);
        let line = match outcome {
            Err(reason) => format!("Request {}: BLOCKED — {}", request_no, reason),
            Ok(()) => format!(
                "Request {}: OK | Remaining: ${:.2}",
                request_no,
                governor.remaining_budget_usd()
            ),
        };
        println!("{}", line);
    });
}