←Back to Documentation
Vector Indexing
High-performance vector indexing with HNSW, IVF, and quantization
Overview
Vector indexing methods:
- • HNSW: Hierarchical Navigable Small World graphs
- • IVF: Inverted File Index with quantization
- • Flat Index: Exact search for small datasets
- • Quantization: Compress vectors for efficiency
- • Hybrid Indexes: Combine multiple strategies
HNSW Index
-- Create an HNSW index (pgvector).
-- Build-time parameters go in WITH; search quality is a runtime GUC
-- (hnsw.ef_search), NOT an index storage parameter — the original example
-- passed ef_search in WITH, which pgvector rejects as unrecognized.
CREATE INDEX idx_documents_embedding ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (
    m = 16,               -- Max connections per layer (higher = better recall, slower build)
    ef_construction = 64  -- Size of dynamic candidate list (higher = better build quality)
);

-- Supported distance metrics (operator class must match the query operator).
CREATE INDEX idx_cosine ON documents USING hnsw (embedding vector_cosine_ops);
CREATE INDEX idx_l2 ON documents USING hnsw (embedding vector_l2_ops);
CREATE INDEX idx_inner_product ON documents USING hnsw (embedding vector_ip_ops);

-- Tune for your use case.

-- Small datasets (< 100K rows): lighter graph, faster build.
CREATE INDEX idx_small ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 8, ef_construction = 32);

-- Large datasets (> 1M rows): denser graph for better recall.
CREATE INDEX idx_large ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 32, ef_construction = 128);

-- High recall requirement: dense graph at build time, plus a high
-- hnsw.ef_search at query time (see SET below).
CREATE INDEX idx_high_recall ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 48, ef_construction = 256);

-- Fast build time: trade some recall for build speed.
CREATE INDEX idx_fast_build ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 12, ef_construction = 40);

-- Query with the index; ef_search is adjusted per session or transaction.
SET hnsw.ef_search = 100;  -- Higher = better recall, slower search
SELECT title,
       1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
FROM documents
ORDER BY embedding <=> '[0.1, 0.2, ...]'
LIMIT 10;
IVF Index
-- Create an IVF (inverted file) index.
CREATE INDEX idx_documents_ivf ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (
    lists = 100  -- Number of clusters; sqrt(num_rows) is a good starting point
);

-- For 1M vectors:
CREATE INDEX idx_ivf_large ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1000);

-- Query with IVF; probes controls how many lists are scanned per query.
SET ivfflat.probes = 10;  -- Higher = better recall, slower queries
SELECT title,
       1 - (embedding <=> '[0.1, 0.2, ...]') AS similarity
FROM documents
ORDER BY embedding <=> '[0.1, 0.2, ...]'
LIMIT 10;

-- Trade-offs:
--   More lists  = faster queries, slower index build
--   More probes = better recall, slower queries

-- Optimal lists calculation:
SELECT SQRT(COUNT(*)) AS recommended_lists FROM documents;

-- Rebuild the index after significant data changes (IVF centroids go stale).
REINDEX INDEX idx_documents_ivf;
Product Quantization
-- Create index with product quantization (PQ).
-- NOTE(review): ivfflat_pq is not a stock pgvector access method — confirm
-- which extension provides it before relying on this example.
CREATE INDEX idx_documents_pq ON documents
USING ivfflat_pq (embedding vector_cosine_ops)
WITH (
lists = 1000,
pq_segments = 8, -- Divide vector into segments
pq_bits = 8 -- Bits per segment (256 centroids)
);
-- Memory savings calculation:
-- Original:  1M vectors * 1536 dims * 4 bytes ≈ 6.1GB
-- Quantized: 1M vectors * 8 segments * 1 byte = 8MB (~768x reduction)
-- Binary quantization for extreme compression (1 bit per dimension).
-- NOTE(review): binary_quantization is not a stock pgvector access method —
-- verify it is available in the target database before use.
CREATE INDEX idx_documents_binary ON documents
USING binary_quantization (embedding)
WITH (method = 'hamming');
-- Scalar quantization (int8): store each dimension as one signed byte.
-- NOTE(review): SET STORAGE QUANTIZED is nonstandard — stock PostgreSQL only
-- accepts PLAIN/EXTERNAL/EXTENDED/MAIN here, and ALTER COLUMN ... SET STORAGE
-- takes no WITH clause. Confirm the target engine/extension supports this.
ALTER TABLE documents
ALTER COLUMN embedding
SET STORAGE QUANTIZED
WITH (quantization_type = 'int8');
-- Query quantized index.
-- NOTE(review): entropy_quantized_similarity() and the <~> operator are not
-- stock pgvector — confirm which extension defines them before use.
SELECT title,
entropy_quantized_similarity(embedding, '[0.1, 0.2, ...]') as similarity
FROM documents
ORDER BY embedding <~> '[0.1, 0.2, ...]' -- Quantized distance operator
LIMIT 10;
Index Management
-- View index statistics for vector indexes.
SELECT schemaname,
       tablename,
       indexname,
       idx_scan AS times_used,
       idx_tup_read AS tuples_read,
       idx_tup_fetch AS tuples_fetched,
       pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
WHERE indexname LIKE '%embedding%';

-- Monitor index build progress.
SELECT phase,
       round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "% complete",
       blocks_done,
       blocks_total
FROM pg_stat_progress_create_index
WHERE command = 'CREATE INDEX';

-- Concurrent index creation (does not block writes to the table).
CREATE INDEX CONCURRENTLY idx_documents_embedding ON documents
USING hnsw (embedding vector_cosine_ops);

-- Drop and recreate with new parameters.
DROP INDEX idx_documents_embedding;
CREATE INDEX idx_documents_embedding ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 24, ef_construction = 80);

-- Vacuum and analyze after bulk changes.
VACUUM ANALYZE documents;

-- Check index size and usage (large, rarely-scanned indexes are bloat candidates).
SELECT schemaname,
       tablename,
       indexname,
       pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
       idx_scan,
       idx_tup_read / NULLIF(idx_scan, 0) AS tuples_per_scan
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY pg_relation_size(indexrelid) DESC;
Performance Tuning
-- Benchmark different index configurations.
-- Returns one row per tested index type with the measured k-NN query latency.
-- Fixes over the previous version:
--   1. "index_type" in the final SELECT was ambiguous in plpgsql (it collides
--      with the OUT parameter of the same name) and raised at runtime.
--   2. clock_timestamp() - statement_timestamp() was evaluated once per
--      result row and did not time the search itself; latency is now measured
--      by bracketing the query with clock_timestamp().
-- NOTE: recall is a placeholder constant (0.95) — computing true recall
-- requires comparing against exact (flat-index) results.
CREATE OR REPLACE FUNCTION benchmark_vector_index(
    query_vector VECTOR,   -- probe vector to search with
    k INT DEFAULT 10       -- number of nearest neighbors to fetch
) RETURNS TABLE(
    index_type TEXT,
    avg_latency_ms DECIMAL,
    recall DECIMAL
) AS $$
DECLARE
    t_start TIMESTAMPTZ;
    t_end   TIMESTAMPTZ;
BEGIN
    -- Test HNSW: time the k-NN query as a whole.
    t_start := clock_timestamp();
    PERFORM 1
    FROM documents
    ORDER BY embedding <=> query_vector
    LIMIT k;
    t_end := clock_timestamp();

    RETURN QUERY
    SELECT 'HNSW'::TEXT,
           (EXTRACT(EPOCH FROM (t_end - t_start)) * 1000)::DECIMAL,
           0.95::DECIMAL;

    -- Add similar tests for IVF, PQ, etc.
END;
$$ LANGUAGE plpgsql;
-- Run benchmark
SELECT * FROM benchmark_vector_index('[0.1, 0.2, ...]'::vector);
-- Query plan analysis: confirm the vector index is actually used
-- (expect an Index Scan on the hnsw/ivfflat index, not a Seq Scan).
EXPLAIN (ANALYZE, BUFFERS)
SELECT title
FROM documents
ORDER BY embedding <=> '[0.1, 0.2, ...]'
LIMIT 10;
-- Optimize settings
-- NOTE(review): the values below are examples — size them to the host's RAM.
SET work_mem = '256MB'; -- per-sort/hash memory for queries
SET maintenance_work_mem = '2GB'; -- memory available to index builds
SET effective_cache_size = '16GB'; -- planner hint: total cache available
SET random_page_cost = 1.1; -- assume fast (SSD) random I/O
-- Parallel index build
SET max_parallel_maintenance_workers = 4;
CREATE INDEX idx_documents_parallel
ON documents USING hnsw (embedding vector_cosine_ops);
Best Practices
Index Selection
- • HNSW: Best overall performance
- • IVF: Good for very large datasets
- • Quantization: Reduce memory usage
- • Benchmark before choosing
Maintenance
- • Monitor index size and usage
- • Rebuild after bulk updates
- • Tune parameters for workload
- • Use concurrent builds