diff --git a/helix-db/src/helix_engine/tests/traversal_tests/upsert_tests.rs b/helix-db/src/helix_engine/tests/traversal_tests/upsert_tests.rs index 7df407e1..50f9f976 100644 --- a/helix-db/src/helix_engine/tests/traversal_tests/upsert_tests.rs +++ b/helix-db/src/helix_engine/tests/traversal_tests/upsert_tests.rs @@ -530,7 +530,7 @@ fn test_upsert_v_creates_new_vector_when_none_exists() { } #[test] -fn test_upsert_v_creates_vector_with_default_data_when_none_provided() { +fn test_upsert_v_rejects_empty_vector_data() { let (_temp_dir, storage) = setup_test_db(); let arena = Bump::new(); let mut txn = storage.graph_env.write_txn().unwrap(); @@ -542,21 +542,9 @@ fn test_upsert_v_creates_vector_with_default_data_when_none_provided() { &arena, ) .upsert_v(&[], "placeholder", &[("status", Value::from("pending"))]) - .collect::, _>>() - .unwrap(); + .collect::, _>>(); - assert_eq!(result.len(), 1); - if let TraversalValue::Vector(vector) = &result[0] { - assert_eq!(vector.label, "placeholder"); - assert!(vector.data.is_empty()); // Default empty data - assert_eq!( - vector.get_property("status").unwrap(), - &Value::from("pending") - ); - } else { - panic!("Expected vector"); - } - txn.commit().unwrap(); + assert!(result.is_err()); } #[test] diff --git a/helix-db/src/helix_engine/tests/vector_tests.rs b/helix-db/src/helix_engine/tests/vector_tests.rs index 4799902d..e2accfc3 100644 --- a/helix-db/src/helix_engine/tests/vector_tests.rs +++ b/helix-db/src/helix_engine/tests/vector_tests.rs @@ -1,4 +1,6 @@ -use crate::helix_engine::vector_core::vector_distance::{MAX_DISTANCE, MIN_DISTANCE, ORTHOGONAL}; +use crate::helix_engine::vector_core::vector_distance::{ + cosine_similarity, MAX_DISTANCE, MIN_DISTANCE, ORTHOGONAL, +}; use crate::helix_engine::vector_core::vector::HVector; use bumpalo::Bump; @@ -36,8 +38,8 @@ fn test_hvector_distance_min() { #[test] fn test_hvector_distance_max() { let arena = Bump::new(); - let v1 = alloc_vector(&arena, &[0.0, 0.0]); - let v2 = alloc_vector(&arena, &[3.0, 4.0]); + let v1 = alloc_vector(&arena, &[-1.0, -2.0, -3.0]); + let v2 = alloc_vector(&arena, &[1.0, 2.0, 3.0]); let distance = v1.distance_to(&v2).unwrap(); assert_eq!(distance, MAX_DISTANCE); } @@ -99,3 +101,82 @@ fn test_hvector_cosine_similarity() { let similarity = v1.distance_to(&v2).unwrap(); assert!((similarity - (1.0 - 0.9746318461970762)).abs() < 1e-9); } + +#[test] +fn test_cosine_similarity_zero_vector_returns_error() { + let result = cosine_similarity(&[0.0, 0.0, 0.0], &[1.0, 2.0, 3.0]); + assert!(result.is_err()); +} + +#[test] +fn test_cosine_similarity_both_zero_vectors_returns_error() { + let result = cosine_similarity(&[0.0, 0.0], &[0.0, 0.0]); + assert!(result.is_err()); +} + +#[test] +fn test_cosine_similarity_empty_vectors_returns_error() { + let result = cosine_similarity(&[], &[]); + assert!(result.is_err()); +} + +#[test] +fn test_cosine_similarity_one_empty_vector_returns_error() { + let result = cosine_similarity(&[], &[1.0, 2.0]); + assert!(result.is_err()); +} + +#[test] +fn test_cosine_similarity_dimension_mismatch_returns_error() { + let result = cosine_similarity(&[1.0, 2.0], &[1.0, 2.0, 3.0]); + assert!(result.is_err()); +} + +#[test] +fn test_cosine_similarity_identical_vectors() { + let result = cosine_similarity(&[1.0, 2.0, 3.0], &[1.0, 2.0, 3.0]); + assert!((result.unwrap() - 1.0).abs() < 1e-10); +} + +#[test] +fn test_cosine_similarity_opposite_vectors() { + let result = cosine_similarity(&[1.0, 2.0, 3.0], &[-1.0, -2.0, -3.0]); + assert!((result.unwrap() - (-1.0)).abs() < 1e-10); +} + +#[test] +fn test_cosine_similarity_orthogonal_vectors() { + let result = cosine_similarity(&[1.0, 0.0], &[0.0, 1.0]); + assert!(result.unwrap().abs() < 1e-10); +} + +#[test] +fn test_cosine_similarity_single_element() { + let result = cosine_similarity(&[5.0], &[3.0]); + assert!((result.unwrap() - 1.0).abs() < 1e-10); +} + +#[test] +fn test_cosine_similarity_large_dimensions() { + let a: Vec = (0..1024).map(|i| (i as f64).sin()).collect(); + let b: Vec = (0..1024).map(|i| (i as f64).cos()).collect(); + let result = cosine_similarity(&a, &b); + assert!(result.is_ok()); + let sim = result.unwrap(); + assert!(sim >= -1.0 && sim <= 1.0); +} + +#[test] +fn test_hvector_distance_zero_vector_returns_error() { + let arena = Bump::new(); + let v1 = alloc_vector(&arena, &[0.0, 0.0]); + let v2 = alloc_vector(&arena, &[3.0, 4.0]); + assert!(v1.distance_to(&v2).is_err()); +} + +#[test] +fn test_cosine_similarity_near_zero_magnitude_returns_error() { + let tiny = f64::EPSILON * 0.1; + let result = cosine_similarity(&[tiny, 0.0], &[1.0, 2.0]); + assert!(result.is_err()); +} diff --git a/helix-db/src/helix_engine/vector_core/vector_core.rs b/helix-db/src/helix_engine/vector_core/vector_core.rs index e463feea..ff74acf4 100644 --- a/helix-db/src/helix_engine/vector_core/vector_core.rs +++ b/helix-db/src/helix_engine/vector_core/vector_core.rs @@ -288,18 +288,10 @@ impl VectorCore { neighbor.set_distance(neighbor.distance_to(query)?); - /* - let passes_filters = match filter { - Some(filter_slice) => filter_slice.iter().all(|f| f(&neighbor, txn)), - None => true, - }; - - if passes_filters { - result.push(neighbor); - } - */ - - if filter.is_none() || filter.unwrap().iter().all(|f| f(&neighbor, txn)) { + if filter + .as_ref() + .map_or(true, |f| f.iter().all(|f| f(&neighbor, txn))) + { result.push(neighbor); } } @@ -458,7 +450,7 @@ impl VectorCore { let (key, _) = result?; // Extract id from the key: v: (2 bytes) + id (16 bytes) + level (8 bytes) - if key.len() < VECTOR_PREFIX.len() + 16 { + if key.len() < VECTOR_PREFIX.len() + 16 + 8 { continue; // Skip malformed keys } @@ -505,6 +497,10 @@ impl HNSW for VectorCore { 'db: 'arena, 'arena: 'txn, { + if query.is_empty() { + return Err(VectorError::InvalidVectorData); + } + let query = HVector::from_slice(label, 0, query); // let temp_arena = bumpalo::Bump::new(); @@ -572,6 +568,10 @@ impl HNSW for VectorCore { 'db: 'arena, 'arena: 'txn, { + if data.is_empty() { + return Err(VectorError::InvalidVectorData); + } + let new_level = self.get_new_level(); let mut query = HVector::from_slice(label, 0, data); @@ -597,7 +597,7 @@ impl HNSW for VectorCore { let mut nearest = self.search_level::(txn, label, &query, &mut curr_ep, 1, level, None, arena)?; curr_ep = nearest.pop().ok_or(VectorError::VectorCoreError( - "emtpy search result".to_string(), + "empty search result".to_string(), ))?; } @@ -613,7 +613,7 @@ impl HNSW for VectorCore { arena, )?; curr_ep = *nearest.peek().ok_or(VectorError::VectorCoreError( - "emtpy search result".to_string(), + "empty search result".to_string(), ))?; let neighbors = diff --git a/helix-db/src/helix_engine/vector_core/vector_distance.rs b/helix-db/src/helix_engine/vector_core/vector_distance.rs index d92737e2..746be3fb 100644 --- a/helix-db/src/helix_engine/vector_core/vector_distance.rs +++ b/helix-db/src/helix_engine/vector_core/vector_distance.rs @@ -28,11 +28,13 @@ pub fn cosine_similarity(from: &[f64], to: &[f64]) -> Result { let len = from.len(); let other_len = to.len(); + if len == 0 || other_len == 0 { + return Err(VectorError::InvalidVectorData); + } + if len != other_len { - println!("mis-match in vector dimensions!\n{len} != {other_len}"); return Err(VectorError::InvalidVectorLength); } - //debug_assert_eq!(len, other.data.len(), "Vectors must have the same length"); #[cfg(target_feature = "avx2")] { @@ -78,17 +80,23 @@ pub fn cosine_similarity(from: &[f64], to: &[f64]) -> Result { magnitude_b += b_val * b_val; } - if magnitude_a.abs() == 0.0 || magnitude_b.abs() == 0.0 { - return Ok(-1.0); + if magnitude_a < f64::EPSILON || magnitude_b < f64::EPSILON { + return Err(VectorError::InvalidVectorData); + } + + let similarity = dot_product / (magnitude_a.sqrt() * magnitude_b.sqrt()); + + if similarity.is_nan() || similarity.is_infinite() { + return Err(VectorError::InvalidVectorData); } - Ok(dot_product / (magnitude_a.sqrt() * magnitude_b.sqrt())) + Ok(similarity) } // SIMD implementation using AVX2 (256-bit vectors) #[cfg(target_feature = "avx2")] #[inline(always)] -pub fn cosine_similarity_avx2(a: &[f64], b: &[f64]) -> f64 { +pub fn cosine_similarity_avx2(a: &[f64], b: &[f64]) -> Result { use std::arch::x86_64::*; let len = a.len(); @@ -133,10 +141,20 @@ pub fn cosine_similarity_avx2(a: &[f64], b: &[f64]) -> f64 { // Combine SIMD and scalar results let dot_product_total = dot_sum + dot_remainder; - let magnitude_a_total = (mag_a_sum + mag_a_remainder).sqrt(); - let magnitude_b_total = (mag_b_sum + mag_b_remainder).sqrt(); + let mag_a_total = mag_a_sum + mag_a_remainder; + let mag_b_total = mag_b_sum + mag_b_remainder; + + if mag_a_total < f64::EPSILON || mag_b_total < f64::EPSILON { + return Err(VectorError::InvalidVectorData); + } + + let similarity = dot_product_total / (mag_a_total.sqrt() * mag_b_total.sqrt()); + + if similarity.is_nan() || similarity.is_infinite() { + return Err(VectorError::InvalidVectorData); + } - dot_product_total / (magnitude_a_total * magnitude_b_total) + Ok(similarity) } }