Rust By Example

LLM API Gateway in Rust

Build a production LLM API gateway in Rust with Axum: multi-provider routing, rate limiting, cost tracking, request logging, and streaming proxy for OpenAI and Anthropic.

Topic: LLM Rust

Search intent: High-intent search: "rust llm api gateway openai proxy"

LLM API Gateway in Rust

Architecture

An LLM gateway sits between your application and LLM providers. It adds:

  • Authentication — validate API keys before calling providers.
  • Rate limiting — per-tenant token budgets.
  • Cost tracking — log token usage for billing.
  • Fallback routing — switch providers on failure.
  • Caching — serve repeated prompts from cache.
  • Streaming proxy — forward SSE streams efficiently.

Runnable example — gateway core

rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};

/// Provider configuration
///
/// Describes one upstream LLM provider the gateway can route requests to.
#[derive(Clone, Debug)]
struct ProviderConfig {
    /// Short provider identifier, e.g. `"openai"` or `"anthropic"`.
    name: String,
    /// Root URL of the provider's HTTP API, e.g. `"https://api.openai.com/v1"`.
    base_url: String,
    /// NOTE(review): presumably the per-request timeout for calls to this
    /// provider; nothing in the visible code reads it yet — confirm.
    timeout: Duration,
    priority: u8, // lower = higher priority
}

/// Gateway configuration
///
/// Static settings shared by every request handled by the gateway.
#[derive(Clone, Debug)]
struct GatewayConfig {
    /// Upstream providers the gateway may route to.
    providers: Vec<ProviderConfig>,
    /// Token budget granted to each tenant per minute.
    max_tokens_per_min_per_tenant: u32,
    /// How long a cached response stays valid, in seconds.
    cache_ttl_secs: u64,
}

impl GatewayConfig {
    /// Builds the stock configuration: OpenAI (priority 1) and Anthropic
    /// (priority 2), each with a 30-second timeout, a budget of 100 000
    /// tokens per minute per tenant, and a one-hour cache TTL.
    fn default_config() -> Self {
        // Both stock providers share the same timeout; only the name, URL and
        // priority differ, so build them through one small closure.
        let provider = |name: &str, base_url: &str, priority: u8| ProviderConfig {
            name: name.to_string(),
            base_url: base_url.to_string(),
            timeout: Duration::from_secs(30),
            priority,
        };

        Self {
            providers: vec![
                provider("openai", "https://api.openai.com/v1", 1),
                provider("anthropic", "https://api.anthropic.com/v1", 2),
            ],
            max_tokens_per_min_per_tenant: 100_000,
            cache_ttl_secs: 3600,
        }
    }
}

impl Default for GatewayConfig {
    /// Same configuration as [`GatewayConfig::default_config`], exposed through
    /// the standard trait so the type works with `..Default::default()` and
    /// generic code that requires `Default`.
    fn default() -> Self {
        Self::default_config()
    }
}

/// Per-tenant rate limiter using token bucket
///
/// The bucket starts full, refills continuously at `refill_rate` tokens per
/// second, and never holds more than `max_tokens`.
struct TenantRateLimiter {
    /// Tokens currently available.
    tokens: f64,
    /// Bucket capacity (the per-minute budget).
    max_tokens: f64,
    /// Refill speed in tokens per second.
    refill_rate: f64,
    /// Instant up to which refill has already been credited.
    last_refill: Instant,
}

impl TenantRateLimiter {
    /// Creates a full bucket sized for `tokens_per_min` tokens per minute.
    fn new(tokens_per_min: u32) -> Self {
        // `f64::from` is lossless for every `u32`, unlike an `as` cast whose
        // safety has to be reasoned about at each call site.
        let capacity = f64::from(tokens_per_min);
        Self {
            tokens: capacity,
            max_tokens: capacity,
            refill_rate: capacity / 60.0, // tokens per second
            last_refill: Instant::now(),
        }
    }

    /// Tries to take `tokens` from the bucket.
    ///
    /// Refill for the time since the previous call is credited first (capped
    /// at `max_tokens`). Returns `true` and deducts the tokens when enough are
    /// available; returns `false` and deducts nothing otherwise. A request
    /// larger than `max_tokens` can therefore never succeed.
    fn consume(&mut self, tokens: u32) -> bool {
        // Read the clock exactly once. Measuring the elapsed time and then
        // calling `Instant::now()` again for the new timestamp would silently
        // drop the interval between the two reads from every refill.
        let now = Instant::now();
        let elapsed = now
            .saturating_duration_since(self.last_refill)
            .as_secs_f64();
        self.tokens = (self.tokens + elapsed * self.refill_rate).min(self.max_tokens);
        self.last_refill = now;

        let requested = f64::from(tokens);
        if self.tokens >= requested {
            self.tokens -= requested;
            true
        } else {
            false
        }
    }
}

/// Cost tracking
///
/// Running usage totals for a single tenant; the gateway keeps one record per
/// tenant id and adds to it after each provider call.
#[derive(Debug, Default, Clone)]
struct UsageRecord {
    /// Sum of prompt (input) tokens recorded so far.
    prompt_tokens: u64,
    /// Sum of completion (output) tokens recorded so far.
    completion_tokens: u64,
    /// Accumulated cost in US dollars, as computed by the gateway's pricing.
    total_cost_usd: f64,
}

/// LLM request/response types
///
/// An incoming request as submitted by a tenant.
#[derive(Debug, Serialize, Deserialize)]
struct LlmRequest {
    /// Requested model name, e.g. `"gpt-4o"`.
    model: String,
    /// Prompt text; its byte length drives the gateway's token estimate.
    prompt: String,
    /// Optional completion cap. In the visible code it only contributes to
    /// the cache key.
    max_tokens: Option<u32>,
    /// Tenant the request is rate-limited and billed against.
    tenant_id: String,
}

/// Response returned by the gateway for one request.
#[derive(Debug, Serialize)]
struct LlmResponse {
    /// Generated text.
    content: String,
    /// Model name copied from the request.
    model: String,
    /// Name of the provider that produced the response.
    provider: String,
    /// Prompt tokens attributed to the request.
    prompt_tokens: u32,
    /// Completion tokens attributed to the response.
    completion_tokens: u32,
    /// `true` when the response was served from the gateway cache.
    cached: bool,
}

/// The gateway
///
/// Holds the static configuration plus three independently locked pieces of
/// shared state: per-tenant rate limiters, per-tenant usage totals, and the
/// response cache.
struct LlmGateway {
    config: GatewayConfig,
    /// Token buckets, created lazily on a tenant's first request.
    rate_limiters: Mutex<HashMap<String, TenantRateLimiter>>,
    /// Accumulated usage keyed by tenant id.
    usage: Mutex<HashMap<String, UsageRecord>>,
    /// Cached responses with their insertion time, keyed by `cache_key`.
    cache: Mutex<HashMap<u64, (LlmResponse, Instant)>>,
}

impl LlmGateway {
    /// USD charged per one million prompt tokens (GPT-4o pricing: $5/1M input).
    const PROMPT_USD_PER_MILLION_TOKENS: f64 = 5.0;
    /// USD charged per one million completion tokens (GPT-4o pricing: $15/1M output).
    const COMPLETION_USD_PER_MILLION_TOKENS: f64 = 15.0;
    /// Rough heuristic: one token per four bytes of prompt text.
    const BYTES_PER_ESTIMATED_TOKEN: usize = 4;
    /// Flat allowance added to the prompt estimate when charging the rate limiter.
    const COMPLETION_TOKEN_ALLOWANCE: u32 = 100;

    /// Creates a gateway with empty rate-limiter, usage and cache tables,
    /// wrapped in an `Arc` so it can be shared.
    fn new(config: GatewayConfig) -> Arc<Self> {
        Arc::new(Self {
            config,
            rate_limiters: Mutex::new(HashMap::new()),
            usage: Mutex::new(HashMap::new()),
            cache: Mutex::new(HashMap::new()),
        })
    }

    /// Charges `estimated_tokens` against the tenant's bucket, creating the
    /// bucket on first use. Returns `false` when the budget is exhausted.
    fn check_rate_limit(&self, tenant_id: &str, estimated_tokens: u32) -> bool {
        let mut limiters = self.rate_limiters.lock().unwrap();
        limiters
            .entry(tenant_id.to_string())
            .or_insert_with(|| TenantRateLimiter::new(self.config.max_tokens_per_min_per_tenant))
            .consume(estimated_tokens)
    }

    /// Hashes `model`, `prompt` and `max_tokens` into the cache key.
    ///
    /// The tenant id is deliberately not part of the key, so identical
    /// requests from different tenants share one cache entry. The cache
    /// stores only this 64-bit hash, so a hash collision would return another
    /// request's response.
    fn cache_key(request: &LlmRequest) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        request.model.hash(&mut hasher);
        request.prompt.hash(&mut hasher);
        request.max_tokens.hash(&mut hasher);
        hasher.finish()
    }

    /// Copies a response field by field (the type does not implement `Clone`)
    /// and stamps the copy with the given `cached` flag.
    fn copy_response(source: &LlmResponse, cached: bool) -> LlmResponse {
        LlmResponse {
            content: source.content.clone(),
            model: source.model.clone(),
            provider: source.provider.clone(),
            prompt_tokens: source.prompt_tokens,
            completion_tokens: source.completion_tokens,
            cached,
        }
    }

    /// Returns a copy of the cached response for `key`, marked `cached: true`,
    /// if one exists and is younger than `cache_ttl_secs`.
    ///
    /// An entry found to be expired is removed on the spot; without that,
    /// stale responses would stay in memory until the same key happened to be
    /// overwritten.
    fn get_cached(&self, key: u64) -> Option<LlmResponse> {
        let mut cache = self.cache.lock().unwrap();
        let (stored, inserted_at) = cache.get(&key)?;
        if inserted_at.elapsed().as_secs() < self.config.cache_ttl_secs {
            return Some(Self::copy_response(stored, true));
        }
        cache.remove(&key);
        None
    }

    /// Adds the token counts to the tenant's running totals and bills them at
    /// the GPT-4o rates held in the pricing constants.
    fn record_usage(&self, tenant_id: &str, prompt_tokens: u32, completion_tokens: u32) {
        let mut usage = self.usage.lock().unwrap();
        let record = usage.entry(tenant_id.to_string()).or_default();
        record.prompt_tokens += u64::from(prompt_tokens);
        record.completion_tokens += u64::from(completion_tokens);
        record.total_cost_usd +=
            f64::from(prompt_tokens) * Self::PROMPT_USD_PER_MILLION_TOKENS / 1_000_000.0 +
            f64::from(completion_tokens) * Self::COMPLETION_USD_PER_MILLION_TOKENS / 1_000_000.0;
    }

    /// Returns a snapshot of the tenant's usage, or `None` if nothing has
    /// been recorded for that tenant.
    fn get_tenant_usage(&self, tenant_id: &str) -> Option<UsageRecord> {
        self.usage.lock().unwrap().get(tenant_id).cloned()
    }

    /// Route to best available provider
    ///
    /// Picks the provider with the numerically lowest `priority` (lower =
    /// higher priority) regardless of its position in the list; ties go to
    /// the provider listed first.
    ///
    /// # Panics
    ///
    /// Panics if the configuration contains no providers.
    fn route(&self) -> &ProviderConfig {
        self.config
            .providers
            .iter()
            .min_by_key(|provider| provider.priority)
            .expect("gateway must be configured with at least one provider")
    }

    /// Estimates the prompt's token count from its byte length.
    fn estimate_prompt_tokens(prompt: &str) -> u32 {
        let estimate = prompt.len() / Self::BYTES_PER_ESTIMATED_TOKEN;
        // Clamp instead of truncating: a plain `as u32` would wrap for huge
        // prompts and make them look almost free to the rate limiter.
        estimate.min(u32::MAX as usize) as u32
    }

    /// Process an LLM request through the gateway
    ///
    /// Steps, in order: charge the tenant's rate limiter, look up the cache,
    /// pick a provider, produce the (simulated) completion, record usage, and
    /// cache the result. Note that the rate limiter is charged before the
    /// cache lookup, so cache hits also consume budget; cache hits do not
    /// record usage.
    ///
    /// # Errors
    ///
    /// Returns `Err` when the tenant's rate limit is exceeded, or when a
    /// provider call is needed but no providers are configured.
    fn process(&self, request: LlmRequest) -> Result<LlmResponse, String> {
        let prompt_tokens = Self::estimate_prompt_tokens(&request.prompt);
        let estimated_tokens = prompt_tokens.saturating_add(Self::COMPLETION_TOKEN_ALLOWANCE);

        // Check rate limit
        if !self.check_rate_limit(&request.tenant_id, estimated_tokens) {
            return Err(format!(
                "rate limit exceeded for tenant {}",
                request.tenant_id
            ));
        }

        // Check cache
        let cache_key = Self::cache_key(&request);
        if let Some(cached) = self.get_cached(cache_key) {
            println!("[gateway] Cache hit for tenant {}", request.tenant_id);
            return Ok(cached);
        }

        // Route to provider. Report a misconfigured gateway as an error to the
        // caller rather than letting `route` panic.
        if self.config.providers.is_empty() {
            return Err("no providers configured".to_string());
        }
        let provider = self.route();
        println!("[gateway] Routing to provider: {}", provider.name);

        // Simulate LLM call (in production: make HTTP request)
        let completion_tokens = 150u32;
        let content = format!(
            "Response from {} via {}: Here is a helpful answer about Rust and AI...",
            request.model, provider.name
        );

        self.record_usage(&request.tenant_id, prompt_tokens, completion_tokens);

        let response = LlmResponse {
            content,
            model: request.model,
            provider: provider.name.clone(),
            prompt_tokens,
            completion_tokens,
            cached: false,
        };

        // Cache response: the stored copy keeps `cached: false`; the flag is
        // flipped only on the copies handed out by `get_cached`.
        self.cache.lock().unwrap().insert(
            cache_key,
            (Self::copy_response(&response, false), Instant::now()),
        );

        Ok(response)
    }
}

/// Demo driver: pushes three sample requests through a default-configured
/// gateway (the third repeats the first and should be served from cache),
/// then prints each tenant's accumulated usage.
fn main() {
    let gateway = LlmGateway::new(GatewayConfig::default_config());

    // (tenant, prompt, max_tokens) for each simulated request; every request
    // targets the same model.
    let scenarios: [(&str, &str, u32); 3] = [
        ("tenant-a", "Explain Rust ownership in simple terms", 256),
        ("tenant-b", "How do I use async/await in Rust?", 512),
        // Same request as first — should hit cache
        ("tenant-a", "Explain Rust ownership in simple terms", 256),
    ];

    for (tenant, prompt, max_tokens) in scenarios {
        let outcome = gateway.process(LlmRequest {
            model: "gpt-4o".to_string(),
            prompt: prompt.to_string(),
            max_tokens: Some(max_tokens),
            tenant_id: tenant.to_string(),
        });

        match outcome {
            Err(e) => println!("[{}] Error: {}", tenant, e),
            Ok(resp) => println!(
                "[{}] {} | cached={} | tokens={}/{}",
                tenant, resp.provider, resp.cached,
                resp.prompt_tokens, resp.completion_tokens
            ),
        }
    }

    // Usage summary, skipping tenants with nothing recorded.
    for tenant in ["tenant-a", "tenant-b"] {
        match gateway.get_tenant_usage(tenant) {
            Some(usage) => println!(
                "\n[usage] {}: {} prompt + {} completion tokens = ${:.6}",
                tenant, usage.prompt_tokens, usage.completion_tokens, usage.total_cost_usd
            ),
            None => {}
        }
    }
}

Related reading

Related Guides

Continue in This Topic

More Rust Guides