From 99ae76133f043002994651d506c0d2993870389e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 12 Dec 2024 16:33:38 -0800 Subject: [PATCH] fix: correctly copy null buffer when making deep copy (#3238) In some situations an array could be sliced in such a way that the array had no offset, but the array's null buffer did have an offset. In these cases we were not deep copying the array correctly and the offset of the null buffer was lost. This does mean, in some cases, the 2.0 writer could write incorrect nulls. However, the input conditions would mean that the user's data would have to originate from rust in such a way that it was sliced like this. It would be impossible for batches from the C data interface or from python to look like this. --- python/python/tests/test_scalar_index.py | 30 +++++++++++++ rust/lance-arrow/src/deepcopy.rs | 56 +++++++++++++++++------- rust/lance-encoding/src/data.rs | 22 +++++++++- 3 files changed, 92 insertions(+), 16 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index dd3df96d64..6669f27a7f 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -368,6 +368,36 @@ def check(has_index: bool): check(True) +def test_scalar_index_with_nulls(tmp_path): + # Create a test dataframe with 50% null values. + test_table_size = 10_000 + test_table = pa.table( + { + "item_id": list(range(test_table_size)), + "inner_id": list(range(test_table_size)), + "category": ["a", None] * (test_table_size // 2), + "numeric_int": [1, None] * (test_table_size // 2), + "numeric_float": [0.1, None] * (test_table_size // 2), + "boolean_col": [True, None] * (test_table_size // 2), + "timestamp_col": [datetime(2023, 1, 1), None] * (test_table_size // 2), + } + ) + ds = lance.write_dataset(test_table, tmp_path) + ds.create_scalar_index("inner_id", index_type="BTREE") + ds.create_scalar_index("category", index_type="BTREE") + ds.create_scalar_index("boolean_col", index_type="BTREE") + ds.create_scalar_index("timestamp_col", index_type="BTREE") + # Test querying with filters on columns with nulls. + k = test_table_size // 2 + result = ds.to_table(filter="category = 'a'", limit=k) + assert len(result) == k + # Booleans should be stored as strings in the table for backwards compatibility. + result = ds.to_table(filter="boolean_col IS TRUE", limit=k) + assert len(result) == k + result = ds.to_table(filter="timestamp_col IS NOT NULL", limit=k) + assert len(result) == k + + def test_label_list_index(tmp_path: Path): tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]) tag_list = pa.ListArray.from_arrays([0, 2, 4], tags) diff --git a/rust/lance-arrow/src/deepcopy.rs b/rust/lance-arrow/src/deepcopy.rs index 93b58fb9c8..f3c0c13fd0 100644 --- a/rust/lance-arrow/src/deepcopy.rs +++ b/rust/lance-arrow/src/deepcopy.rs @@ -4,22 +4,28 @@ use std::sync::Arc; use arrow_array::{make_array, Array, RecordBatch}; -use arrow_buffer::{Buffer, NullBuffer}; -use arrow_data::ArrayData; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer { Buffer::from(buffer.as_slice()) } -fn deep_copy_nulls(nulls: &NullBuffer) -> Buffer { - deep_copy_buffer(nulls.inner().inner()) +fn deep_copy_nulls(nulls: Option<&NullBuffer>) -> Option { + let nulls = nulls?; + let bit_buffer = deep_copy_buffer(nulls.inner().inner()); + Some(unsafe { + NullBuffer::new_unchecked( + BooleanBuffer::new(bit_buffer, nulls.offset(), nulls.len()), + nulls.null_count(), + ) + }) } pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData { let data_type = data.data_type().clone(); let len = data.len(); - let null_count = data.null_count(); - let null_bit_buffer = data.nulls().map(deep_copy_nulls); + let nulls = deep_copy_nulls(data.nulls()); let offset = data.offset(); let buffers = data .buffers() @@ -32,15 +38,13 @@ pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData { .map(deep_copy_array_data) .collect::>(); unsafe { - ArrayData::new_unchecked( - data_type, - len, - Some(null_count), - null_bit_buffer, - offset, - buffers, - child_data, - ) + ArrayDataBuilder::new(data_type) + .len(len) + .nulls(nulls) + .offset(offset) + .buffers(buffers) + .child_data(child_data) + .build_unchecked() } } @@ -58,3 +62,25 @@ pub fn deep_copy_batch(batch: &RecordBatch) -> crate::Result { .collect::>(); RecordBatch::try_new(batch.schema(), arrays) } + +#[cfg(test)] +pub mod tests { + use std::sync::Arc; + + use arrow_array::{Array, Int32Array}; + + #[test] + fn test_deep_copy_sliced_array_with_nulls() { + let array = Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + ])); + let sliced_array = array.slice(1, 3); + let copied_array = super::deep_copy_array(&sliced_array); + assert_eq!(sliced_array.len(), copied_array.len()); + assert_eq!(sliced_array.nulls(), copied_array.nulls()); + } +} diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index a4105b6d8c..c0d3e27791 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -1534,7 +1534,7 @@ mod tests { use arrow::datatypes::{Int32Type, Int8Type}; use arrow_array::{ make_array, new_null_array, ArrayRef, DictionaryArray, Int8Array, LargeBinaryArray, - StringArray, UInt8Array, + StringArray, UInt16Array, UInt8Array, }; use arrow_buffer::{BooleanBuffer, NullBuffer}; @@ -1548,6 +1548,26 @@ mod tests { use arrow::compute::concat; use arrow_array::Array; + + #[test] + fn test_sliced_to_data_block() { + let ints = UInt16Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8]); + let ints = ints.slice(2, 4); + let data = DataBlock::from_array(ints); + + let fixed_data = data.as_fixed_width().unwrap(); + assert_eq!(fixed_data.num_values, 4); + assert_eq!(fixed_data.data.len(), 8); + + let nullable_ints = + UInt16Array::from(vec![Some(0), None, Some(2), None, Some(4), None, Some(6)]); + let nullable_ints = nullable_ints.slice(1, 3); + let data = DataBlock::from_array(nullable_ints); + + let nullable = data.as_nullable().unwrap(); + assert_eq!(nullable.nulls, LanceBuffer::Owned(vec![0b00000010])); + } + #[test] fn test_string_to_data_block() { // Converting string arrays that contain nulls to DataBlock