LLM API Gateway in Rust
Build a production LLM API gateway in Rust with Axum: multi-provider routing, rate limiting, cost tracking, request logging, and streaming proxy for OpenAI and Anthropic.
Topic: LLM Rust
Search intent: High-intent search: "rust llm api gateway openai proxy"
LLM API Gateway in Rust
Architecture
An LLM gateway sits between your application and LLM providers. It adds:
- Authentication — validate API keys before calling providers.
- Rate limiting — per-tenant token budgets.
- Cost tracking — log token usage for billing.
- Fallback routing — switch providers on failure.
- Caching — serve repeated prompts from cache.
- Streaming proxy — forward SSE streams efficiently.
Runnable example — gateway core (requires the `serde` crate with its `derive` feature)
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
/// Settings describing one upstream LLM provider.
#[derive(Clone, Debug)]
struct ProviderConfig {
    /// Short provider identifier (for example `"openai"`).
    name: String,
    /// Base URL of the provider's API.
    base_url: String,
    /// Timeout configured for this provider.
    timeout: Duration,
    /// Routing rank: a lower number means a higher priority.
    priority: u8,
}
/// Gateway-wide settings: the upstream providers plus per-tenant limits.
struct GatewayConfig {
    /// Upstream providers the gateway can route to.
    providers: Vec<ProviderConfig>,
    /// Size of each tenant's per-minute token budget.
    max_tokens_per_min_per_tenant: u32,
    /// Lifetime of a cached response, in seconds.
    cache_ttl_secs: u64,
}

impl GatewayConfig {
    /// Builds the stock configuration.
    ///
    /// It lists OpenAI (priority 1) and Anthropic (priority 2), each with a
    /// 30-second timeout, a budget of 100 000 tokens per minute per tenant,
    /// and a one-hour cache TTL.
    fn default_config() -> Self {
        // Both stock providers share the same timeout, so build them through
        // one small helper instead of repeating the struct literal.
        let provider = |name: &str, base_url: &str, priority: u8| ProviderConfig {
            name: name.to_string(),
            base_url: base_url.to_string(),
            timeout: Duration::from_secs(30),
            priority,
        };

        Self {
            providers: vec![
                provider("openai", "https://api.openai.com/v1", 1),
                provider("anthropic", "https://api.anthropic.com/v1", 2),
            ],
            max_tokens_per_min_per_tenant: 100_000,
            cache_ttl_secs: 3600,
        }
    }
}

/// `GatewayConfig::default()` yields the same value as
/// [`GatewayConfig::default_config`], so the type also works with
/// `..Default::default()` and generic `T: Default` code.
impl Default for GatewayConfig {
    fn default() -> Self {
        Self::default_config()
    }
}
/// Per-tenant rate limiter implemented as a token bucket.
///
/// The bucket starts full, is topped up at `refill_rate` tokens per second
/// each time [`TenantRateLimiter::consume`] is called, and never holds more
/// than `max_tokens`.
struct TenantRateLimiter {
    /// Tokens currently available.
    tokens: f64,
    /// Bucket capacity.
    max_tokens: f64,
    /// Refill speed in tokens per second.
    refill_rate: f64,
    /// Moment the bucket was last topped up.
    last_refill: Instant,
}

impl TenantRateLimiter {
    /// Creates a full bucket that holds `tokens_per_min` tokens and refills
    /// at `tokens_per_min / 60` tokens per second.
    fn new(tokens_per_min: u32) -> Self {
        // u32 -> f64 is lossless, so use the infallible conversion.
        let capacity = f64::from(tokens_per_min);
        Self {
            tokens: capacity,
            max_tokens: capacity,
            refill_rate: capacity / 60.0,
            last_refill: Instant::now(),
        }
    }

    /// Tops up the bucket for the time elapsed since the previous call, then
    /// tries to withdraw `tokens`.
    ///
    /// Returns `true` and debits the bucket when enough tokens are available;
    /// returns `false` and leaves the balance untouched otherwise.
    fn consume(&mut self, tokens: u32) -> bool {
        // Read the clock exactly once. Measuring the elapsed time and then
        // taking a second timestamp for `last_refill` would silently drop the
        // time that passes between the two reads on every call.
        let now = Instant::now();
        let elapsed = now.duration_since(self.last_refill).as_secs_f64();
        self.tokens = (self.tokens + elapsed * self.refill_rate).min(self.max_tokens);
        self.last_refill = now;

        let requested = f64::from(tokens);
        if self.tokens >= requested {
            self.tokens -= requested;
            true
        } else {
            false
        }
    }
}
/// Running totals of token consumption and cost for a single tenant.
#[derive(Clone, Debug, Default)]
struct UsageRecord {
    /// Prompt tokens accumulated so far.
    prompt_tokens: u64,
    /// Completion tokens accumulated so far.
    completion_tokens: u64,
    /// Accumulated cost in US dollars.
    total_cost_usd: f64,
}
/// A completion request submitted to the gateway on behalf of a tenant.
#[derive(Debug, Serialize, Deserialize)]
struct LlmRequest {
    /// Name of the requested model (for example `"gpt-4o"`).
    model: String,
    /// Prompt text; its byte length drives the gateway's token estimate.
    prompt: String,
    /// Optional `max_tokens` value sent with the request; part of the cache key.
    max_tokens: Option<u32>,
    /// Tenant the request is rate-limited and billed against.
    tenant_id: String,
}
/// The gateway's answer to an [`LlmRequest`].
#[derive(Debug, Serialize)]
struct LlmResponse {
    /// Generated text.
    content: String,
    /// Model name copied from the request.
    model: String,
    /// Name of the provider the request was routed to.
    provider: String,
    /// Prompt tokens attributed to the request.
    prompt_tokens: u32,
    /// Completion tokens attributed to the response.
    completion_tokens: u32,
    /// `true` when the response was served from the gateway cache.
    cached: bool,
}
/// Gateway state: static configuration plus three mutex-guarded tables.
struct LlmGateway {
    /// Provider list, tenant budget and cache TTL.
    config: GatewayConfig,
    /// One token bucket per tenant id, created on first use.
    rate_limiters: Mutex<HashMap<String, TenantRateLimiter>>,
    /// Accumulated usage per tenant id.
    usage: Mutex<HashMap<String, UsageRecord>>,
    /// Cached responses keyed by request hash, each stored with its insertion time.
    cache: Mutex<HashMap<u64, (LlmResponse, Instant)>>,
}
impl LlmGateway {
    /// Creates a gateway with empty rate-limiter, usage and cache tables and
    /// returns it wrapped in an `Arc`.
    fn new(config: GatewayConfig) -> Arc<Self> {
        Arc::new(Self {
            config,
            rate_limiters: Mutex::new(HashMap::new()),
            usage: Mutex::new(HashMap::new()),
            cache: Mutex::new(HashMap::new()),
        })
    }

    /// Debits `estimated_tokens` from the tenant's token bucket, creating the
    /// bucket on first use.
    ///
    /// Returns `false` when the tenant does not have enough budget left.
    /// Panics if the rate-limiter mutex is poisoned.
    fn check_rate_limit(&self, tenant_id: &str, estimated_tokens: u32) -> bool {
        let mut limiters = self.rate_limiters.lock().unwrap();
        limiters
            .entry(tenant_id.to_string())
            .or_insert_with(|| TenantRateLimiter::new(self.config.max_tokens_per_min_per_tenant))
            .consume(estimated_tokens)
    }

    /// Hashes the model, prompt and `max_tokens` of a request into a cache key.
    ///
    /// The tenant id is not part of the key, so identical requests from
    /// different tenants map to the same cache entry.
    fn cache_key(request: &LlmRequest) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        request.model.hash(&mut hasher);
        request.prompt.hash(&mut hasher);
        request.max_tokens.hash(&mut hasher);
        hasher.finish()
    }

    /// Estimates the prompt's token count as one token per four bytes.
    ///
    /// The result saturates at `u32::MAX` rather than wrapping for very long
    /// prompts.
    fn estimate_prompt_tokens(prompt: &str) -> u32 {
        (prompt.len() / 4).min(u32::MAX as usize) as u32
    }

    /// Copies a response field by field with the given `cached` flag.
    ///
    /// `LlmResponse` does not implement `Clone`, so the copy is spelled out
    /// here once instead of at every call site.
    fn copy_response(source: &LlmResponse, cached: bool) -> LlmResponse {
        LlmResponse {
            content: source.content.clone(),
            model: source.model.clone(),
            provider: source.provider.clone(),
            prompt_tokens: source.prompt_tokens,
            completion_tokens: source.completion_tokens,
            cached,
        }
    }

    /// Looks up a cached response.
    ///
    /// Returns a copy flagged `cached: true` when an entry exists and is
    /// younger than `cache_ttl_secs`. An expired entry is removed so stale
    /// keys do not accumulate. Panics if the cache mutex is poisoned.
    fn get_cached(&self, key: u64) -> Option<LlmResponse> {
        let ttl_secs = self.config.cache_ttl_secs;
        let mut cache = self.cache.lock().unwrap();

        match cache.get(&key) {
            Some((response, stored_at)) if stored_at.elapsed().as_secs() < ttl_secs => {
                return Some(Self::copy_response(response, true));
            }
            // Present but expired: fall through and evict it.
            Some(_) => {}
            None => return None,
        }

        cache.remove(&key);
        None
    }

    /// Adds the given token counts and their cost to the tenant's usage record.
    ///
    /// Panics if the usage mutex is poisoned.
    fn record_usage(&self, tenant_id: &str, prompt_tokens: u32, completion_tokens: u32) {
        // GPT-4o pricing: $5/1M input, $15/1M output. The same rates are
        // applied to every request regardless of model or provider.
        const INPUT_USD_PER_MILLION: f64 = 5.0;
        const OUTPUT_USD_PER_MILLION: f64 = 15.0;
        const TOKENS_PER_MILLION: f64 = 1_000_000.0;

        let mut usage = self.usage.lock().unwrap();
        let record = usage.entry(tenant_id.to_string()).or_default();
        record.prompt_tokens += u64::from(prompt_tokens);
        record.completion_tokens += u64::from(completion_tokens);
        record.total_cost_usd += f64::from(prompt_tokens) * INPUT_USD_PER_MILLION
            / TOKENS_PER_MILLION
            + f64::from(completion_tokens) * OUTPUT_USD_PER_MILLION / TOKENS_PER_MILLION;
    }

    /// Returns a copy of the tenant's usage record, if any usage was recorded.
    fn get_tenant_usage(&self, tenant_id: &str) -> Option<UsageRecord> {
        self.usage.lock().unwrap().get(tenant_id).cloned()
    }

    /// Picks the provider with the lowest `priority` value (lower = higher
    /// priority); ties go to the provider listed first.
    ///
    /// Panics if the configuration lists no providers.
    fn route(&self) -> &ProviderConfig {
        self.config
            .providers
            .iter()
            .min_by_key(|provider| provider.priority)
            .expect("gateway config must list at least one provider")
    }

    /// Processes an LLM request through the gateway.
    ///
    /// Steps, in order:
    /// 1. Debit the tenant's rate limiter with the estimated prompt tokens
    ///    plus a flat 100-token allowance. This happens before the cache
    ///    lookup, so cache hits also count against the budget.
    /// 2. Return the cached response if a fresh one exists.
    /// 3. Otherwise route to a provider, produce a simulated response,
    ///    record usage for the tenant and cache the response.
    ///
    /// Returns `Err` with a message when the tenant's rate limit is exceeded.
    fn process(&self, request: LlmRequest) -> Result<LlmResponse, String> {
        // One shared estimate for both rate limiting and usage accounting.
        let prompt_tokens = Self::estimate_prompt_tokens(&request.prompt);
        let estimated_tokens = prompt_tokens.saturating_add(100);

        // Check rate limit
        if !self.check_rate_limit(&request.tenant_id, estimated_tokens) {
            return Err(format!(
                "rate limit exceeded for tenant {}",
                request.tenant_id
            ));
        }

        // Check cache
        let cache_key = Self::cache_key(&request);
        if let Some(cached) = self.get_cached(cache_key) {
            println!("[gateway] Cache hit for tenant {}", request.tenant_id);
            return Ok(cached);
        }

        // Route to provider
        let provider = self.route();
        println!("[gateway] Routing to provider: {}", provider.name);

        // Simulate LLM call (in production: make HTTP request)
        let completion_tokens = 150u32;
        let content = format!(
            "Response from {} via {}: Here is a helpful answer about Rust and AI...",
            request.model, provider.name
        );
        self.record_usage(&request.tenant_id, prompt_tokens, completion_tokens);

        let response = LlmResponse {
            content,
            model: request.model,
            provider: provider.name.clone(),
            prompt_tokens,
            completion_tokens,
            cached: false,
        };

        // Cache response
        self.cache
            .lock()
            .unwrap()
            .insert(cache_key, (Self::copy_response(&response, false), Instant::now()));

        Ok(response)
    }
}
/// Demo driver: sends three requests through a default-configured gateway
/// and prints each outcome followed by a per-tenant usage summary.
fn main() {
    let gateway = LlmGateway::new(GatewayConfig::default_config());

    // Every demo request targets the same model, so build them with a helper.
    let build_request = |prompt: &str, max_tokens: u32, tenant_id: &str| LlmRequest {
        model: "gpt-4o".to_string(),
        prompt: prompt.to_string(),
        max_tokens: Some(max_tokens),
        tenant_id: tenant_id.to_string(),
    };

    // Simulated traffic from two tenants.
    let requests = vec![
        build_request("Explain Rust ownership in simple terms", 256, "tenant-a"),
        build_request("How do I use async/await in Rust?", 512, "tenant-b"),
        // Identical to the first request, so it should hit the cache.
        build_request("Explain Rust ownership in simple terms", 256, "tenant-a"),
    ];

    for request in requests {
        // `process` consumes the request, so keep the tenant id for logging.
        let tenant = request.tenant_id.clone();
        match gateway.process(request) {
            Err(e) => println!("[{}] Error: {}", tenant, e),
            Ok(resp) => println!(
                "[{}] {} | cached={} | tokens={}/{}",
                tenant, resp.provider, resp.cached, resp.prompt_tokens, resp.completion_tokens
            ),
        }
    }

    // Usage summary for each tenant that has recorded usage.
    for &tenant in &["tenant-a", "tenant-b"] {
        let usage = match gateway.get_tenant_usage(tenant) {
            Some(usage) => usage,
            None => continue,
        };
        println!(
            "\n[usage] {}: {} prompt + {} completion tokens = ${:.6}",
            tenant, usage.prompt_tokens, usage.completion_tokens, usage.total_cost_usd
        );
    }
}