Rust By Example
intermediate

Vector Similarity Search in Rust

Implement cosine similarity search over embedding vectors in Rust for semantic search, RAG, and recommendation systems.

Vector Similarity Search in Rust

Build an in-memory vector store for semantic search using cosine similarity.

Difficulty

Intermediate

Code

rust
use std::collections::BinaryHeap;
use std::cmp::Ordering;

/// Embedding vector with associated document ID.
///
/// One stored entry of the vector store: a caller-supplied identifier paired
/// with the raw `f32` components of its embedding.
#[derive(Clone)]
struct Embedding {
    /// Document identifier; cloned into the results returned by `VectorStore::search`.
    id: String,
    /// Embedding components; `VectorStore::add` asserts the length equals the store's `dim`.
    vector: Vec<f32>,
}

/// Heap entry for top-k search: a document ID together with its similarity score.
///
/// `BinaryHeap` is a max-heap, so callers wrap entries in `std::cmp::Reverse`
/// to obtain the min-heap behaviour needed for top-k selection.
///
/// Equality and ordering are written by hand rather than derived so that they
/// agree with each other (`a == b` exactly when `a.cmp(&b)` is `Equal`) and
/// form a genuine total order even when a score is NaN. Both properties are
/// part of the `Eq` / `Ord` contracts that `BinaryHeap` relies on.
#[derive(Debug)]
struct ScoredDoc {
    /// Similarity score of the document against the query.
    score: f32,
    /// Identifier of the scored document.
    id: String,
}

impl PartialEq for ScoredDoc {
    /// Two entries are equal exactly when `cmp` reports `Ordering::Equal`.
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for ScoredDoc {}

impl PartialOrd for ScoredDoc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ScoredDoc {
    /// Orders by score, with NaN ranked below every real score, then by `id`.
    ///
    /// The `id` tie-break keeps the ordering deterministic for equal scores and
    /// ensures entries for different documents never compare as equal.
    fn cmp(&self, other: &Self) -> Ordering {
        let by_score = match (self.score.is_nan(), other.score.is_nan()) {
            (true, true) => Ordering::Equal,
            // An undefined similarity must never outrank a real one.
            (true, false) => Ordering::Less,
            (false, true) => Ordering::Greater,
            // Neither side is NaN, so `partial_cmp` is always `Some` here.
            (false, false) => self
                .score
                .partial_cmp(&other.score)
                .unwrap_or(Ordering::Equal),
        };
        by_score.then_with(|| self.id.cmp(&other.id))
    }
}

/// Cosine similarity of `a` and `b`: `dot(a, b) / (|a| * |b|)`.
///
/// Returns `0.0` when either vector's Euclidean norm is below `1e-8`
/// (effectively a zero vector), which avoids dividing by zero. Otherwise the
/// quotient is clamped to `[-1.0, 1.0]` to absorb floating-point rounding.
///
/// The dot product only covers the common prefix of the two slices (`zip`
/// stops at the shorter one), while each norm is taken over its full slice.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Euclidean (L2) length of a slice.
    let magnitude = |v: &[f32]| v.iter().map(|c| c * c).sum::<f32>().sqrt();

    let len_a = magnitude(a);
    let len_b = magnitude(b);

    if len_a < 1e-8 || len_b < 1e-8 {
        0.0
    } else {
        let inner: f32 = a.iter().zip(b.iter()).map(|(p, q)| p * q).sum();
        (inner / (len_a * len_b)).clamp(-1.0, 1.0)
    }
}

/// In-memory vector store.
///
/// Keeps every embedding in a flat `Vec`; `search` scores each one in turn.
struct VectorStore {
    /// Stored embeddings, in insertion order.
    embeddings: Vec<Embedding>,
    /// Required length of every stored vector (asserted in `add`).
    dim: usize,
}

impl VectorStore {
    /// Creates an empty store whose vectors must all have exactly `dim` components.
    fn new(dim: usize) -> Self {
        Self { embeddings: Vec::new(), dim }
    }

    /// Stores `vector` under the document ID `id`.
    ///
    /// # Panics
    ///
    /// Panics with "dimension mismatch" if `vector.len()` differs from the
    /// store's dimension.
    fn add(&mut self, id: &str, vector: Vec<f32>) {
        assert_eq!(vector.len(), self.dim, "dimension mismatch");
        self.embeddings.push(Embedding { id: id.to_string(), vector });
    }

    /// Returns up to `top_k` `(id, score)` pairs for the documents most similar
    /// to `query`, ordered from highest to lowest cosine similarity.
    ///
    /// Documents whose similarity is NaN (for example because the query or the
    /// stored vector contains NaN or infinite components) are skipped: an
    /// undefined score cannot be ranked.
    ///
    /// Runs in O(n log k) for n stored embeddings and k = `top_k`.
    ///
    /// # Panics
    ///
    /// Panics with "dimension mismatch" if `query.len()` differs from the
    /// store's dimension, mirroring the check in `add`. Without it the scores
    /// would be computed over mismatched vectors and be meaningless.
    fn search(&self, query: &[f32], top_k: usize) -> Vec<(String, f32)> {
        assert_eq!(query.len(), self.dim, "dimension mismatch");
        if top_k == 0 {
            return Vec::new();
        }

        // `BinaryHeap` is a max-heap; wrapping entries in `Reverse` makes the
        // lowest score sit on top so it can be evicted in O(log k). The heap
        // briefly holds k + 1 entries between `push` and `pop`.
        let mut heap = BinaryHeap::with_capacity(top_k.min(self.embeddings.len()) + 1);

        for emb in &self.embeddings {
            let score = cosine_similarity(query, &emb.vector);
            if score.is_nan() {
                continue;
            }
            heap.push(std::cmp::Reverse(ScoredDoc { score, id: emb.id.clone() }));
            if heap.len() > top_k {
                heap.pop();
            }
        }

        // Ascending order of `Reverse<ScoredDoc>` is descending order of score,
        // so the sorted vector is already best-first.
        heap.into_sorted_vec()
            .into_iter()
            .map(|std::cmp::Reverse(d)| (d.id, d.score))
            .collect()
    }
}

/// Demo: index five toy 4-dimensional embeddings and print the three documents
/// most similar to a query vector.
fn main() {
    let mut store = VectorStore::new(4);

    // Add documents with their embeddings. `add` takes ownership of a
    // `Vec<f32>`, so each vector is built with `vec![...]`; passing a borrowed
    // array literal does not type-check.
    store.add("rust-async", vec![0.9, 0.8, 0.1, 0.2]);
    store.add("python-ml", vec![0.1, 0.2, 0.9, 0.8]);
    store.add("rust-ml", vec![0.8, 0.7, 0.6, 0.3]);
    store.add("go-server", vec![0.5, 0.9, 0.1, 0.2]);
    store.add("cpp-perf", vec![0.7, 0.6, 0.2, 0.1]);

    // "systems programming" embedding; only borrowed, so a plain array suffices.
    let query = [0.85f32, 0.75, 0.2, 0.15];
    println!("Query: Rust systems programming\n");

    let results = store.search(&query, 3);
    for (i, (doc, score)) in results.iter().enumerate() {
        println!("  {}. {} (similarity: {:.4})", i + 1, doc, score);
    }
}

Explanation

Cosine similarity measures the angle between two vectors, so embeddings are compared by direction regardless of their magnitude. Because Rust's BinaryHeap is a max-heap, entries are wrapped in std::cmp::Reverse to turn it into a min-heap; capping that heap at k entries maintains the top-k results in O(n log k) time.

Key Concepts

  • Cosine similarity normalizes for vector magnitude
  • BinaryHeap with fixed size gives O(n log k) top-k search
  • Clamp output to [-1.0, 1.0] to handle floating-point edge cases
  • For large datasets, use HNSW or IVF indexing (e.g., hnswlib)

Related Topics

Browse more examples in the ai-inference category to explore RAG and embeddings patterns.

More ai-inference Examples