Rust By Example

LLM Rust Testing Strategy

Testing strategies for LLM applications in Rust: mocking LLM APIs, golden output tests, prompt regression testing, integration testing streaming responses, and evaluating model outputs.

Topic: LLM Rust

Search intent: High-intent search: "rust llm testing mock api"

LLM Rust Testing Strategy

Testing pyramid for LLM apps

rust
         ┌─────────────────────┐
         │  Eval tests (few)   │ — Real LLM calls, measure output quality
         ├─────────────────────┤
         │  Integration tests  │ — Mock LLM API, test full pipeline
         ├─────────────────────┤
         │  Unit tests (many)  │ — Prompt builders, parsers, validators
         └─────────────────────┘

Unit tests: prompt builders and parsers

rust
/// Builds the two-message chat prompt for a question-answering request.
///
/// The result is a list of `(role, content)` pairs: first a `"system"` message
/// carrying `system` verbatim, then a `"user"` message that embeds `context`
/// and `question` under `Context:` and `Question:` labels.
fn build_qa_prompt(system: &str, context: &str, question: &str) -> Vec<(String, String)> {
    let user_content = format!("Context: {}\n\nQuestion: {}", context, question);

    let mut messages = Vec::with_capacity(2);
    messages.push((String::from("system"), String::from(system)));
    messages.push((String::from("user"), user_content));
    messages
}

/// Parses a JSON value out of raw LLM output.
///
/// Surrounding whitespace and markdown code-fence markers (a leading
/// "```json" or "```" and a trailing "```") are removed before parsing.
///
/// # Errors
///
/// Returns `Err` with a message prefixed by `Parse error: ` when the remaining
/// text is not valid JSON.
fn parse_json_from_llm(raw: &str) -> Result<serde_json::Value, String> {
    // LLMs sometimes wrap JSON in markdown code blocks, so peel the fence
    // markers off one step at a time before handing the payload to serde.
    let without_lang_fence = raw.trim().trim_start_matches("```json");
    let without_open_fence = without_lang_fence.trim_start_matches("```");
    let payload = without_open_fence.trim_end_matches("```").trim();

    match serde_json::from_str(payload) {
        Ok(value) => Ok(value),
        Err(err) => Err(format!("Parse error: {}", err)),
    }
}

#[cfg(test)]
mod unit_tests {
    use super::*;

    /// The prompt has exactly a system message followed by a labelled user message.
    #[test]
    fn test_prompt_structure() {
        let messages = build_qa_prompt("You are helpful.", "Rust is fast.", "Why is Rust fast?");
        assert_eq!(messages.len(), 2);

        let (system_role, _) = &messages[0];
        let (user_role, user_content) = &messages[1];
        assert_eq!(system_role.as_str(), "system");
        assert_eq!(user_role.as_str(), "user");
        assert!(user_content.contains("Context:"));
        assert!(user_content.contains("Question:"));
    }

    /// Plain JSON with no fence is parsed as-is.
    #[test]
    fn test_parse_clean_json() {
        let raw = r#"{"answer": "Rust is fast due to zero-cost abstractions."}"#;
        let parsed = parse_json_from_llm(raw).unwrap();
        let answer = parsed.get("answer").and_then(|value| value.as_str());
        assert_eq!(answer, Some("Rust is fast due to zero-cost abstractions."));
    }

    /// JSON wrapped in a markdown code fence is unwrapped before parsing.
    #[test]
    fn test_parse_markdown_wrapped_json() {
        let fenced = "```json\n{\"score\": 95}\n```";
        let parsed = parse_json_from_llm(fenced).unwrap();
        let score = parsed.get("score").and_then(|value| value.as_i64());
        assert_eq!(score, Some(95));
    }

    /// Text that is not JSON yields an error instead of a value.
    #[test]
    fn test_parse_invalid_json_returns_error() {
        let result = parse_json_from_llm("This is not JSON at all");
        assert!(result.is_err());
    }
}

/// Demo entry point: builds one sample prompt and prints how many messages it has.
fn main() {
    // Run: cargo test
    let messages = build_qa_prompt(
        "Be concise.",
        "Rust has a borrow checker.",
        "What does Rust have?",
    );
    let message_count = messages.len();
    println!("Prompt messages: {}", message_count);
}

Mock LLM client for integration tests

rust
use std::collections::HashMap;
use async_trait::async_trait;

/// Abstraction over a text-completion backend.
///
/// Implemented in this file by both the production client and the mock, so
/// application code can hold a `Box<dyn LlmClient>` and be tested without
/// the real backend.
#[async_trait]
trait LlmClient: Send + Sync {
    /// Asynchronously completes `prompt`, yielding the response text on
    /// success or an `Err(String)` on failure.
    async fn complete(&self, prompt: &str) -> Result<String, String>;
}

/// Production client (uses real API)
///
/// NOTE(review): in this example the `LlmClient` implementation only returns
/// a placeholder string; the real HTTP call is left as a comment.
// `api_key` is stored but not read by any code shown in this example.
struct OpenAiClient { api_key: String }

#[async_trait]
impl LlmClient for OpenAiClient {
    /// Returns a placeholder completion that echoes the beginning of `prompt`.
    ///
    /// The echoed preview is at most 20 bytes long and is shortened further
    /// when necessary so that it always ends on a UTF-8 character boundary.
    /// This stub always returns `Ok`.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        // Real HTTP call in production

        // Slicing a `&str` at a byte offset that lands inside a multi-byte
        // character panics, so back up to the nearest character boundary.
        // Offset 0 is always a boundary, which guarantees termination.
        let mut end = prompt.len().min(20);
        while !prompt.is_char_boundary(end) {
            end -= 1;
        }
        Ok(format!("OpenAI response to: {}", &prompt[..end]))
    }
}

/// Test double for an LLM client that serves canned responses.
///
/// `responses` maps a prompt fragment to the reply registered for it, and
/// `default_response` is the fallback reply supplied at construction.
struct MockLlmClient {
    responses: HashMap<String, String>,
    default_response: String,
}

impl MockLlmClient {
    /// Creates a mock with no registered responses and `default` as its
    /// fallback reply.
    fn new(default: &str) -> Self {
        let default_response = default.to_owned();
        let responses = HashMap::new();
        Self { responses, default_response }
    }

    /// Builder-style registration: stores `response` under the key
    /// `prompt_contains` and hands the mock back for further chaining.
    ///
    /// Registering the same key twice replaces the earlier response.
    fn with_response(self, prompt_contains: &str, response: &str) -> Self {
        let Self { mut responses, default_response } = self;
        responses.insert(String::from(prompt_contains), String::from(response));
        Self { responses, default_response }
    }
}

#[async_trait]
impl LlmClient for MockLlmClient {
    /// Returns the canned response whose pattern occurs in `prompt`, or the
    /// default response when no pattern matches. Always returns `Ok`.
    ///
    /// When several patterns match, the longest pattern wins and ties are
    /// broken by taking the lexicographically smallest pattern.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        // `HashMap` iteration order is unspecified, so returning the first
        // match would make the reply depend on hashing whenever more than one
        // pattern matches. Choosing by a total order on the pattern keeps the
        // mock deterministic.
        let best_match = self
            .responses
            .iter()
            .filter(|(pattern, _)| prompt.contains(pattern.as_str()))
            .max_by(|(a, _), (b, _)| {
                a.len()
                    .cmp(&b.len())
                    .then_with(|| b.as_str().cmp(a.as_str()))
            });

        Ok(match best_match {
            Some((_, response)) => response.clone(),
            None => self.default_response.clone(),
        })
    }
}

/// Question-answering front end that talks to whichever `LlmClient` it is given.
///
/// Holding the client as a boxed trait object lets tests pass in a mock in
/// place of the production client.
struct QaBot {
    client: Box<dyn LlmClient>,
}

impl QaBot {
    /// Wraps `client` in a new bot.
    fn new(client: Box<dyn LlmClient>) -> Self {
        QaBot { client }
    }

    /// Prefixes `question` with the instruction `Answer concisely: ` and
    /// forwards it to the client, returning the client's result unchanged.
    async fn answer(&self, question: &str) -> Result<String, String> {
        let prompt = ["Answer concisely: ", question].concat();
        let reply = self.client.complete(prompt.as_str()).await;
        reply
    }
}

/// Demo entry point: asks a mock-backed bot three questions and prints each
/// question with its answer.
#[tokio::main]
async fn main() {
    // Mock client with two canned answers and a fallback for everything else
    let mut mock = MockLlmClient::new("I don't know");
    mock = mock.with_response("ownership", "Ownership ensures each value has one owner in Rust.");
    mock = mock.with_response("async", "Rust async uses Futures and the .await syntax.");

    let bot = QaBot::new(Box::new(mock));

    let questions = [
        "What is ownership in Rust?",
        "How does async work?",
        "What is 42?",
    ];

    for question in questions.iter() {
        let answer = bot.answer(question).await.unwrap();
        println!("Q: {}\nA: {}\n", question, answer);
    }
}

Golden output regression tests

rust
/// A suite of substring-based expectations to check against generated output.
struct GoldenTests {
    tests: Vec<GoldenTestCase>,
}

/// One expectation: the prompt to generate from, plus the phrases the output
/// must and must not contain. Phrases are compared case-insensitively.
struct GoldenTestCase {
    name: String,
    prompt: String,
    expected_contains: Vec<String>,
    expected_not_contains: Vec<String>,
}

impl GoldenTests {
    /// Runs every case through `generate` and checks its phrase expectations.
    ///
    /// For each case, `generate` is called once with the case's prompt. The
    /// case passes when the output contains every `expected_contains` phrase
    /// and none of the `expected_not_contains` phrases, ignoring case. One
    /// line is printed to stdout per violated expectation and per passing case.
    ///
    /// Returns `(passed, failed)` case counts.
    fn run(&self, generate: impl Fn(&str) -> String) -> (usize, usize) {
        let mut passed = 0;
        let mut failed = 0;

        for test in &self.tests {
            // Lowercase the output once per case instead of once per phrase.
            let output = generate(&test.prompt).to_lowercase();
            let mut ok = true;

            for must_contain in &test.expected_contains {
                if !output.contains(must_contain.to_lowercase().as_str()) {
                    println!("❌ [{}] Missing: '{}'", test.name, must_contain);
                    ok = false;
                }
            }
            for must_not in &test.expected_not_contains {
                if output.contains(must_not.to_lowercase().as_str()) {
                    println!("❌ [{}] Should not contain: '{}'", test.name, must_not);
                    ok = false;
                }
            }

            if ok {
                println!("✅ [{}] passed", test.name);
                passed += 1;
            } else {
                failed += 1;
            }
        }
        (passed, failed)
    }
}

/// Demo entry point: runs one golden case against a fixed, simulated LLM
/// output and prints the pass/fail totals.
fn main() {
    let golden = GoldenTests {
        tests: vec![
            GoldenTestCase {
                name: "ownership_explanation".to_string(),
                prompt: "Explain Rust ownership".to_string(),
                expected_contains: vec!["owner".to_string(), "memory".to_string()],
                expected_not_contains: vec!["garbage collector".to_string()],
            },
        ],
    };

    // The simulated generator ignores its prompt, hence the `_prompt` name.
    let (passed, failed) = golden.run(|_prompt| {
        // Simulate LLM output
        "Rust ownership means each value has exactly one owner and memory is freed when owner drops."
            .to_string()
    });
    println!("\nResults: {} passed, {} failed", passed, failed);
}

Related reading

Related Guides

Continue in This Topic

More Rust Guides