Rust AI Inference Error Playbook

Error taxonomy

rust

use std::fmt;

#[derive(Debug, Clone, PartialEq)]
pub enum InferenceError {
    /// Input failed validation before reaching the model
    InvalidInput(String),
    /// Model not loaded or not ready
    ModelNotReady,
    /// Inference took longer than the configured timeout
    Timeout { duration_ms: u64 },
    /// Internal model error (NaN output, shape mismatch)
    ModelError(String),
    /// Downstream dependency failed (GPU OOM, driver error)
    InfrastructureError(String),
    /// Too many concurrent requests
    Overloaded { queue_depth: usize },
}

impl fmt::Display for InferenceError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidInput(msg) => write!(f, "invalid input: {}", msg),
            Self::ModelNotReady => write!(f, "model is not ready yet"),
            Self::Timeout { duration_ms } =>
                write!(f, "inference timed out after {}ms", duration_ms),
            Self::ModelError(msg) => write!(f, "model error: {}", msg),
            Self::InfrastructureError(msg) => write!(f, "infrastructure error: {}", msg),
            Self::Overloaded { queue_depth } =>
                write!(f, "service overloaded (queue depth: {})", queue_depth),
        }
    }
}

impl InferenceError {
    /// HTTP status code for this error
    pub fn http_status(&self) -> u16 {
        match self {
            Self::InvalidInput(_) => 400,
            Self::ModelNotReady => 503,
            Self::Timeout { .. } => 504,
            Self::ModelError(_) => 500,
            Self::InfrastructureError(_) => 503,
            Self::Overloaded { .. } => 429,
        }
    }

    /// Whether this error is safe to retry
    pub fn is_retryable(&self) -> bool {
        matches!(self, Self::Overloaded { .. } | Self::Timeout { .. } | Self::ModelNotReady)
    }
}

Retry with exponential backoff

rust

use std::time::Duration;

async fn infer_with_retry(
    input: Vec<f32>,
    max_retries: u32,
) -> Result<Vec<f32>, InferenceError> {
    let mut last_error = None;

    for attempt in 0..=max_retries {
        if attempt > 0 {
            let delay = Duration::from_millis(50 * 2u64.pow(attempt - 1));
            tokio::time::sleep(delay).await;
            println!("Retry attempt {} after {:?}", attempt, delay);
        }

        match do_infer(&input).await {
            Ok(result) => return Ok(result),
            Err(e) if e.is_retryable() => {
                last_error = Some(e);
                continue;
            }
            Err(e) => return Err(e), // Non-retryable: fail fast
        }
    }

    Err(last_error.unwrap())
}

async fn do_infer(input: &[f32]) -> Result<Vec<f32>, InferenceError> {
    // Simulate occasional overload
    use std::time::{SystemTime, UNIX_EPOCH};
    let t = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().subsec_nanos();
    if t % 3 == 0 {
        return Err(InferenceError::Overloaded { queue_depth: 150 });
    }
    Ok(input.iter().map(|x| x * 2.0).collect())
}

#[tokio::main]
async fn main() {
    match infer_with_retry(vec![1.0, 2.0, 3.0], 3).await {
        Ok(out) => println!("Success: {:?}", out),
        Err(e) => println!("Failed after retries: {} (HTTP {})", e, e.http_status()),
    }
}

Circuit breaker pattern

rust

use std::sync::atomic::{AtomicU32, AtomicBool, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::sync::Mutex;

struct CircuitBreaker {
    failure_count: AtomicU32,
    failure_threshold: u32,
    open: AtomicBool,
    opened_at: Mutex<Option<Instant>>,
    recovery_timeout: Duration,
}

impl CircuitBreaker {
    fn new(threshold: u32, recovery_timeout: Duration) -> Arc<Self> {
        Arc::new(Self {
            failure_count: AtomicU32::new(0),
            failure_threshold: threshold,
            open: AtomicBool::new(false),
            opened_at: Mutex::new(None),
            recovery_timeout,
        })
    }

    fn is_open(&self) -> bool {
        if !self.open.load(Ordering::Acquire) { return false; }
        // Check if recovery timeout has elapsed (half-open state)
        let opened_at = self.opened_at.lock().unwrap();
        if let Some(t) = *opened_at {
            if t.elapsed() > self.recovery_timeout {
                return false; // Allow one probe request
            }
        }
        true
    }

    fn record_success(&self) {
        self.failure_count.store(0, Ordering::Release);
        self.open.store(false, Ordering::Release);
    }

    fn record_failure(&self) {
        let count = self.failure_count.fetch_add(1, Ordering::AcqRel) + 1;
        if count >= self.failure_threshold {
            self.open.store(true, Ordering::Release);
            *self.opened_at.lock().unwrap() = Some(Instant::now());
            println!("⚡ Circuit breaker OPEN after {} failures", count);
        }
    }

    async fn call<F, Fut>(&self, f: F) -> Result<Vec<f32>, InferenceError>
    where
        F: FnOnce() -> Fut,
        Fut: std::future::Future<Output = Result<Vec<f32>, InferenceError>>,
    {
        if self.is_open() {
            return Err(InferenceError::InfrastructureError("circuit breaker open".into()));
        }
        match f().await {
            Ok(r) => { self.record_success(); Ok(r) }
            Err(e) => { self.record_failure(); Err(e) }
        }
    }
}

#[tokio::main]
async fn main() {
    let cb = CircuitBreaker::new(3, Duration::from_secs(30));

    for i in 0..6 {
        let result = cb.call(|| async move {
            if i < 4 {
                Err(InferenceError::InfrastructureError("GPU OOM".into()))
            } else {
                Ok(vec![1.0, 2.0])
            }
        }).await;
        println!("Call {}: {:?}", i, result.map_err(|e| e.to_string()));
    }
}

Fallback model pattern

rust

async fn infer_with_fallback(input: Vec<f32>) -> Vec<f32> {
    // Try primary (large, accurate) model first
    match run_primary_model(&input).await {
        Ok(result) => result,
        Err(e) => {
            eprintln!("Primary model failed ({}), falling back to lightweight model", e);
            run_fallback_model(&input).await
        }
    }
}

async fn run_primary_model(input: &[f32]) -> Result<Vec<f32>, InferenceError> {
    // Simulate primary failure
    Err(InferenceError::Timeout { duration_ms: 5000 })
}

async fn run_fallback_model(input: &[f32]) -> Vec<f32> {
    // Fast, less accurate model
    input.iter().map(|x| x * 1.5).collect()
}

#[tokio::main]
async fn main() {
    let result = infer_with_fallback(vec![1.0, 2.0, 3.0]).await;
    println!("Final result: {:?}", result);
}

Rust AI Inference Error Playbook

Rust AI Inference Error Playbook

Error taxonomy

Retry with exponential backoff

Circuit breaker pattern

Fallback model pattern

Related reading

Related Guides

Rust AI Inference Troubleshooting

Rust AI Inference Production Guide

Continue in This Topic

Rust AI Inference Decision Matrix

Rust AI Inference Interview Q&A

More Rust Guides

Building LLM Applications with Rust

LLM API Gateway in Rust

LLM Rust Anti-Patterns

LLM Rust Benchmarking

LLM Rust Decision Matrix

LLM Rust Interview Q&A