LLM Rust Testing Strategy

Testing pyramid for LLM apps

rust

         ┌─────────────────────┐
         │  Eval tests (few)   │ — Real LLM calls, measure output quality
         ├─────────────────────┤
         │  Integration tests  │ — Mock LLM API, test full pipeline
         ├─────────────────────┤
         │  Unit tests (many)  │ — Prompt builders, parsers, validators
         └─────────────────────┘

Unit tests: prompt builders and parsers

rust

/// Builds the two-message chat prompt for a question-answering call.
///
/// Returns `(role, content)` pairs in order: a `"system"` message carrying
/// `system` verbatim, then a `"user"` message that embeds `context` and
/// `question` under `Context:` / `Question:` labels.
fn build_qa_prompt(system: &str, context: &str, question: &str) -> Vec<(String, String)> {
    let user_content = format!("Context: {}\n\nQuestion: {}", context, question);

    let mut messages = Vec::with_capacity(2);
    messages.push((String::from("system"), String::from(system)));
    messages.push((String::from("user"), user_content));
    messages
}

/// Parses a JSON value out of raw LLM output.
///
/// Surrounding whitespace and a wrapping markdown code fence (```` ```json ````
/// or a bare ```` ``` ````) are stripped before parsing.
///
/// # Errors
///
/// Returns `Err("Parse error: …")` when the remaining text is not valid JSON.
fn parse_json_from_llm(raw: &str) -> Result<serde_json::Value, String> {
    // LLMs sometimes wrap JSON in markdown code blocks
    let without_opening_fence = raw
        .trim()
        .trim_start_matches("```json")
        .trim_start_matches("```");
    let payload = without_opening_fence.trim_end_matches("```").trim();

    match serde_json::from_str(payload) {
        Ok(value) => Ok(value),
        Err(e) => Err(format!("Parse error: {}", e)),
    }
}

#[cfg(test)]
mod unit_tests {
    use super::*;

    /// The QA prompt is a system message followed by a user message that
    /// carries both the context and the question labels.
    #[test]
    fn test_prompt_structure() {
        let messages = build_qa_prompt("You are helpful.", "Rust is fast.", "Why is Rust fast?");
        assert_eq!(messages.len(), 2);

        let (system_role, _) = &messages[0];
        let (user_role, user_content) = &messages[1];
        assert_eq!(system_role, "system");
        assert_eq!(user_role, "user");
        assert!(user_content.contains("Context:"));
        assert!(user_content.contains("Question:"));
    }

    /// Plain JSON with no fence parses as-is.
    #[test]
    fn test_parse_clean_json() {
        let raw = r#"{"answer": "Rust is fast due to zero-cost abstractions."}"#;
        let parsed = parse_json_from_llm(raw).unwrap();
        assert_eq!(
            parsed["answer"].as_str(),
            Some("Rust is fast due to zero-cost abstractions.")
        );
    }

    /// JSON wrapped in a ```json fence is unwrapped before parsing.
    #[test]
    fn test_parse_markdown_wrapped_json() {
        let fenced = "```json\n{\"score\": 95}\n```";
        let parsed = parse_json_from_llm(fenced).unwrap();
        assert_eq!(parsed["score"].as_i64(), Some(95));
    }

    /// Non-JSON text is reported as an error rather than panicking.
    #[test]
    fn test_parse_invalid_json_returns_error() {
        let result = parse_json_from_llm("This is not JSON at all");
        assert!(result.is_err());
    }
}

/// Demo entry point: builds one sample prompt and prints how many messages it has.
fn main() {
    // Run: cargo test
    let system = "Be concise.";
    let context = "Rust has a borrow checker.";
    let question = "What does Rust have?";

    let message_count = build_qa_prompt(system, context, question).len();
    println!("Prompt messages: {}", message_count);
}

Mock LLM client for integration tests

rust

use std::collections::HashMap;
use async_trait::async_trait;

/// Abstraction over a text-completion backend.
///
/// Implementors must be `Send + Sync`. Both a production-style client and a
/// mock implement this trait, so code holding a `Box<dyn LlmClient>` can be
/// exercised without a real backend.
#[async_trait]
trait LlmClient: Send + Sync {
    /// Sends `prompt` to the backend and returns the completion text, or an
    /// error message as a `String` on failure.
    async fn complete(&self, prompt: &str) -> Result<String, String>;
}

/// Production client (uses real API)
struct OpenAiClient {
    /// API key; stored on the client but not read by the stubbed `complete` below.
    api_key: String,
}

#[async_trait]
impl LlmClient for OpenAiClient {
    /// Stub completion: echoes a preview of at most the first 20 characters
    /// of `prompt`. Always returns `Ok`.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        // Real HTTP call in production
        // Cut the preview on a char boundary: slicing at a fixed byte offset
        // panics when that offset lands inside a multi-byte UTF-8 character.
        let preview_end = prompt
            .char_indices()
            .nth(20)
            .map_or(prompt.len(), |(idx, _)| idx);
        Ok(format!("OpenAI response to: {}", &prompt[..preview_end]))
    }
}

/// Mock client for tests — returns pre-programmed responses
struct MockLlmClient {
    /// Canned responses keyed by a substring to look for in the prompt.
    responses: HashMap<String, String>,
    /// Response used when no registered substring matches.
    default_response: String,
}

impl MockLlmClient {
    /// Creates a mock with no canned responses that answers with `default`.
    fn new(default: &str) -> Self {
        let default_response = String::from(default);
        let responses = HashMap::new();
        MockLlmClient {
            responses,
            default_response,
        }
    }

    /// Builder-style registration: prompts containing `prompt_contains` get
    /// `response`. Registering the same substring again replaces the earlier
    /// response.
    fn with_response(mut self, prompt_contains: &str, response: &str) -> Self {
        let key = prompt_contains.to_owned();
        let value = response.to_owned();
        self.responses.insert(key, value);
        self
    }
}

#[async_trait]
impl LlmClient for MockLlmClient {
    /// Returns the canned response whose pattern occurs in `prompt`, or the
    /// default response when none matches. Always returns `Ok`.
    ///
    /// When several patterns match, the longest pattern wins (ties go to the
    /// lexicographically smallest). `HashMap` iteration order is arbitrary,
    /// so returning the first hit would make the mock's answer vary between
    /// runs and tests using it flaky.
    async fn complete(&self, prompt: &str) -> Result<String, String> {
        let best_match = self
            .responses
            .iter()
            .filter(|(pattern, _)| prompt.contains(pattern.as_str()))
            .max_by(|(a, _), (b, _)| {
                // Keys are unique, so this ordering never reports two
                // distinct patterns as equal.
                a.len()
                    .cmp(&b.len())
                    .then_with(|| b.as_str().cmp(a.as_str()))
            });

        Ok(match best_match {
            Some((_, response)) => response.clone(),
            None => self.default_response.clone(),
        })
    }
}

/// Application logic that uses LlmClient — testable via trait object
struct QaBot {
    /// Backend used to answer questions; any `LlmClient` implementation.
    client: Box<dyn LlmClient>,
}

impl QaBot {
    /// Wraps the given client.
    fn new(client: Box<dyn LlmClient>) -> Self {
        QaBot { client }
    }

    /// Prefixes `question` with a "be concise" instruction and forwards it to
    /// the client, returning the client's result unchanged.
    async fn answer(&self, question: &str) -> Result<String, String> {
        let mut prompt = String::from("Answer concisely: ");
        prompt.push_str(question);
        self.client.complete(prompt.as_str()).await
    }
}

/// Demo entry point: runs a `QaBot` against a mock client and prints each
/// question with its answer.
#[tokio::main]
async fn main() {
    // Test with mock client
    let canned = MockLlmClient::new("I don't know")
        .with_response("ownership", "Ownership ensures each value has one owner in Rust.")
        .with_response("async", "Rust async uses Futures and the .await syntax.");
    let bot = QaBot::new(Box::new(canned));

    // The last question matches no registered pattern, so it exercises the
    // mock's default response.
    let questions = [
        "What is ownership in Rust?",
        "How does async work?",
        "What is 42?",
    ];

    for question in questions.iter() {
        let reply = bot.answer(question).await.unwrap();
        println!("Q: {}\nA: {}\n", question, reply);
    }
}

Golden output regression tests

rust

/// Record expected outputs and alert on changes
struct GoldenTests {
    /// Cases executed, in order, by [`GoldenTests::run`].
    tests: Vec<GoldenTestCase>,
}

/// One regression case: a prompt plus substrings the output must and must not contain.
struct GoldenTestCase {
    /// Label printed in pass/fail messages.
    name: String,
    /// Prompt handed to the generator.
    prompt: String,
    /// Substrings that must appear in the output (case-insensitive).
    expected_contains: Vec<String>,
    /// Substrings that must not appear in the output (case-insensitive).
    expected_not_contains: Vec<String>,
}

impl GoldenTests {
    /// Runs every case through `generate` and returns `(passed, failed)`.
    ///
    /// A case passes when its output contains every `expected_contains` entry
    /// and none of the `expected_not_contains` entries, compared
    /// case-insensitively. Each violation is printed on its own line, so a
    /// failing case may print several messages; a passing case prints one.
    fn run(&self, generate: impl Fn(&str) -> String) -> (usize, usize) {
        let mut passed = 0;
        let mut failed = 0;

        for test in &self.tests {
            // Lowercase the output once per case instead of once per needle.
            let output = generate(&test.prompt).to_lowercase();
            let mut ok = true;

            for must_contain in &test.expected_contains {
                if !output.contains(&must_contain.to_lowercase()) {
                    println!("❌ [{}] Missing: '{}'", test.name, must_contain);
                    ok = false;
                }
            }
            for must_not in &test.expected_not_contains {
                if output.contains(&must_not.to_lowercase()) {
                    println!("❌ [{}] Should not contain: '{}'", test.name, must_not);
                    ok = false;
                }
            }

            if ok {
                println!("✅ [{}] passed", test.name);
                passed += 1;
            } else {
                failed += 1;
            }
        }
        (passed, failed)
    }
}

/// Demo entry point: runs one golden case against a canned "LLM" reply and
/// prints the pass/fail totals.
fn main() {
    let golden = GoldenTests {
        tests: vec![
            GoldenTestCase {
                name: "ownership_explanation".to_string(),
                prompt: "Explain Rust ownership".to_string(),
                expected_contains: vec!["owner".to_string(), "memory".to_string()],
                expected_not_contains: vec!["garbage collector".to_string()],
            },
        ],
    };

    // Simulate LLM output. The canned reply ignores the prompt, hence
    // `_prompt`; a plain `to_string()` replaces the argument-less `format!`.
    let (passed, failed) = golden.run(|_prompt| {
        "Rust ownership means each value has exactly one owner and memory is freed when owner drops."
            .to_string()
    });
    println!("\nResults: {} passed, {} failed", passed, failed);
}

LLM Rust Testing Strategy

LLM Rust Testing Strategy

Testing pyramid for LLM apps

Unit tests: prompt builders and parsers

Mock LLM client for integration tests

Golden output regression tests

Related reading

Related Guides

Building LLM Applications with Rust

LLM Rust Troubleshooting

Continue in This Topic

LLM Rust Team Workflow

LLM Rust Troubleshooting

More Rust Guides

Building LLM Applications with Rust

LLM API Gateway in Rust

LLM Rust Anti-Patterns

LLM Rust Benchmarking

LLM Rust Decision Matrix

LLM Rust Interview Q&A