From 920b1918861cb061a219888f3e9c369ee3c08882 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 21 Nov 2024 10:07:45 -0800 Subject: [PATCH] fix: handle filter on empty partition (#3151) It panicked when applying a filter to an empty partition: ``` thread 'index::vector::pq::tests::test_filter_on_empty_pq_code' panicked at rust/lance/src/index/vector/pq.rs:125:14: chunk size must be non-zero ``` --- rust/lance/src/index/vector/pq.rs | 59 ++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index e497517c24..91973d4a35 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -28,7 +28,7 @@ use lance_index::{ }; use lance_io::{traits::Reader, utils::read_fixed_stride_array}; use lance_linalg::distance::{DistanceType, MetricType}; -use log::info; +use log::{info, warn}; use roaring::RoaringBitmap; use serde_json::json; use snafu::{location, Location}; @@ -110,6 +110,10 @@ impl PQIndex { _num_sub_vectors: i32, ) -> Result<(Arc, Arc)> { let num_vectors = row_ids.len(); + if num_vectors == 0 { + warn!("Filtering on empty PQ code array"); + return Ok((code, row_ids)); + } let indices_to_keep = pre_filter.filter_row_ids(Box::new(row_ids.values().iter())); let indices_to_keep = UInt64Array::from(indices_to_keep); @@ -488,14 +492,18 @@ pub(crate) fn build_pq_storage( #[cfg(test)] mod tests { use super::*; - use crate::index::vector::ivf::build_ivf_model; + + use std::ops::Range; + use arrow::datatypes::Float32Type; use arrow_array::RecordBatchIterator; use arrow_schema::{Field, Schema}; + use tempfile::tempdir; + + use crate::index::vector::ivf::build_ivf_model; + use lance_core::utils::mask::RowIdMask; use lance_index::vector::ivf::IvfBuildParams; use lance_testing::datagen::generate_random_array_with_range; - use std::ops::Range; - use tempfile::tempdir; const DIM: usize = 128; async fn generate_dataset( @@ -618,4 +626,47 @@ mod tests 
{ distances ); } + + struct TestPreFilter { + row_ids: Vec, + } + + impl TestPreFilter { + fn new(row_ids: Vec) -> Self { + Self { row_ids } + } + } + + #[async_trait] + impl PreFilter for TestPreFilter { + async fn wait_for_ready(&self) -> Result<()> { + Ok(()) + } + + fn is_empty(&self) -> bool { + self.row_ids.is_empty() + } + + fn mask(&self) -> Arc { + RowIdMask::all_rows().into() + } + + fn filter_row_ids<'a>(&self, row_ids: Box + 'a>) -> Vec { + row_ids + .filter(|&row_id| self.row_ids.contains(row_id)) + .cloned() + .collect() + } + } + + #[test] + fn test_filter_on_empty_pq_code() { + let pre_filter = TestPreFilter::new(vec![1, 3, 5, 7, 9]); + let code = Arc::new(UInt8Array::from(Vec::::new())); + let row_ids = Arc::new(UInt64Array::from(Vec::::new())); + + let (code, row_ids) = PQIndex::filter_arrays(&pre_filter, code, row_ids, 16).unwrap(); + assert!(code.values().is_empty()); + assert!(row_ids.is_empty()); + } }