fix: correctly copy null buffer when making deep copy (#3238)
In some situations an array could be sliced in such a way that the array
itself had no offset, but the array's null buffer did have an offset. In
these cases we were not deep copying the array correctly: the offset of
the null buffer was lost. This means that, in some cases, the 2.0 writer
could write incorrect nulls. However, to hit this condition the user's
data would have to originate from Rust and be sliced in exactly this
way; batches arriving through the C data interface or from Python cannot
look like this.
westonpace authored Dec 13, 2024
1 parent d3a4bc1 commit 99ae761
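
To make the triggering condition described above concrete: in recent arrow-rs, slicing a primitive array re-points the values buffer (so the array itself reports offset 0) but can only offset the validity bitmap at bit granularity, so the NullBuffer keeps an internal offset. A minimal sketch of that state, assuming current arrow-rs slicing behavior (the exact values are illustrative):

use arrow_array::{Array, Int32Array};

fn main() {
    let array = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]);
    let sliced = array.slice(1, 3); // logical view: [None, Some(3), None]
    let data = sliced.to_data();

    // The values buffer pointer was advanced, so the array has no offset ...
    assert_eq!(data.offset(), 0);
    // ... but validity bits cannot be pointer-advanced mid-byte, so the
    // null buffer carries offset 1 internally. This is the offset the old
    // deep-copy path dropped.
    assert_eq!(data.nulls().unwrap().offset(), 1);
}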
Showing 3 changed files with 92 additions and 16 deletions.
30 changes: 30 additions & 0 deletions python/python/tests/test_scalar_index.py
@@ -368,6 +368,36 @@ def check(has_index: bool):
     check(True)
 
 
+def test_scalar_index_with_nulls(tmp_path):
+    # Create a test table with 50% null values.
+    test_table_size = 10_000
+    test_table = pa.table(
+        {
+            "item_id": list(range(test_table_size)),
+            "inner_id": list(range(test_table_size)),
+            "category": ["a", None] * (test_table_size // 2),
+            "numeric_int": [1, None] * (test_table_size // 2),
+            "numeric_float": [0.1, None] * (test_table_size // 2),
+            "boolean_col": [True, None] * (test_table_size // 2),
+            "timestamp_col": [datetime(2023, 1, 1), None] * (test_table_size // 2),
+        }
+    )
+    ds = lance.write_dataset(test_table, tmp_path)
+    ds.create_scalar_index("inner_id", index_type="BTREE")
+    ds.create_scalar_index("category", index_type="BTREE")
+    ds.create_scalar_index("boolean_col", index_type="BTREE")
+    ds.create_scalar_index("timestamp_col", index_type="BTREE")
+    # Test querying with filters on columns with nulls.
+    k = test_table_size // 2
+    result = ds.to_table(filter="category = 'a'", limit=k)
+    assert len(result) == k
+    # Filtering on the boolean column should return only the non-null True rows.
+    result = ds.to_table(filter="boolean_col IS TRUE", limit=k)
+    assert len(result) == k
+    result = ds.to_table(filter="timestamp_col IS NOT NULL", limit=k)
+    assert len(result) == k
+
+
 def test_label_list_index(tmp_path: Path):
     tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"])
     tag_list = pa.ListArray.from_arrays([0, 2, 4], tags)
56 changes: 41 additions & 15 deletions rust/lance-arrow/src/deepcopy.rs
@@ -4,22 +4,28 @@
 use std::sync::Arc;
 
 use arrow_array::{make_array, Array, RecordBatch};
-use arrow_buffer::{Buffer, NullBuffer};
-use arrow_data::ArrayData;
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};
+use arrow_data::{ArrayData, ArrayDataBuilder};
 
 pub fn deep_copy_buffer(buffer: &Buffer) -> Buffer {
     Buffer::from(buffer.as_slice())
 }
 
-fn deep_copy_nulls(nulls: &NullBuffer) -> Buffer {
-    deep_copy_buffer(nulls.inner().inner())
+fn deep_copy_nulls(nulls: Option<&NullBuffer>) -> Option<NullBuffer> {
+    let nulls = nulls?;
+    let bit_buffer = deep_copy_buffer(nulls.inner().inner());
+    Some(unsafe {
+        NullBuffer::new_unchecked(
+            BooleanBuffer::new(bit_buffer, nulls.offset(), nulls.len()),
+            nulls.null_count(),
+        )
+    })
 }
 
 pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData {
     let data_type = data.data_type().clone();
     let len = data.len();
-    let null_count = data.null_count();
-    let null_bit_buffer = data.nulls().map(deep_copy_nulls);
+    let nulls = deep_copy_nulls(data.nulls());
     let offset = data.offset();
     let buffers = data
         .buffers()
@@ -32,15 +38,13 @@ pub fn deep_copy_array_data(data: &ArrayData) -> ArrayData {
         .map(deep_copy_array_data)
         .collect::<Vec<_>>();
     unsafe {
-        ArrayData::new_unchecked(
-            data_type,
-            len,
-            Some(null_count),
-            null_bit_buffer,
-            offset,
-            buffers,
-            child_data,
-        )
+        ArrayDataBuilder::new(data_type)
+            .len(len)
+            .nulls(nulls)
+            .offset(offset)
+            .buffers(buffers)
+            .child_data(child_data)
+            .build_unchecked()
     }
 }

@@ -58,3 +62,25 @@ pub fn deep_copy_batch(batch: &RecordBatch) -> crate::Result<RecordBatch> {
         .collect::<Vec<_>>();
     RecordBatch::try_new(batch.schema(), arrays)
 }
+
+#[cfg(test)]
+pub mod tests {
+    use std::sync::Arc;
+
+    use arrow_array::{Array, Int32Array};
+
+    #[test]
+    fn test_deep_copy_sliced_array_with_nulls() {
+        let array = Arc::new(Int32Array::from(vec![
+            Some(1),
+            None,
+            Some(3),
+            None,
+            Some(5),
+        ]));
+        let sliced_array = array.slice(1, 3);
+        let copied_array = super::deep_copy_array(&sliced_array);
+        assert_eq!(sliced_array.len(), copied_array.len());
+        assert_eq!(sliced_array.nulls(), copied_array.nulls());
+    }
+}
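
The shape of the fix is visible in the diff above: the old deep_copy_nulls copied only the raw bytes behind the bitmap (nulls.inner().inner()), silently discarding the bit offset, while the new version rebuilds a NullBuffer carrying the original offset and length. A small sketch of that distinction using only arrow-buffer (the literal values here are illustrative):

use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};

fn main() {
    // Validity bits for a 3-value slice starting at bit 1 of the byte
    // 0b0001_0101: bits 1..=3 are [0, 1, 0], i.e. [null, valid, null].
    let bits = Buffer::from(vec![0b0001_0101u8]);
    let nulls = NullBuffer::new(BooleanBuffer::new(bits, 1, 3));
    assert_eq!(nulls.offset(), 1);
    assert_eq!(nulls.null_count(), 2);

    // Copying just the raw bytes (the old behavior) forgets where the bits
    // start; the fix rebuilds the buffer with the original offset and length.
    let copied_bytes = Buffer::from(nulls.inner().inner().as_slice());
    let copied = NullBuffer::new(BooleanBuffer::new(
        copied_bytes,
        nulls.offset(),
        nulls.len(),
    ));
    assert_eq!(copied.null_count(), nulls.null_count());
    assert!(!copied.is_valid(0) && copied.is_valid(1) && !copied.is_valid(2));
}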
22 changes: 21 additions & 1 deletion rust/lance-encoding/src/data.rs
@@ -1534,7 +1534,7 @@ mod tests {
     use arrow::datatypes::{Int32Type, Int8Type};
     use arrow_array::{
         make_array, new_null_array, ArrayRef, DictionaryArray, Int8Array, LargeBinaryArray,
-        StringArray, UInt8Array,
+        StringArray, UInt16Array, UInt8Array,
     };
     use arrow_buffer::{BooleanBuffer, NullBuffer};

@@ -1548,6 +1548,26 @@
 
     use arrow::compute::concat;
     use arrow_array::Array;
+
+    #[test]
+    fn test_sliced_to_data_block() {
+        let ints = UInt16Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8]);
+        let ints = ints.slice(2, 4);
+        let data = DataBlock::from_array(ints);
+
+        let fixed_data = data.as_fixed_width().unwrap();
+        assert_eq!(fixed_data.num_values, 4);
+        assert_eq!(fixed_data.data.len(), 8);
+
+        let nullable_ints =
+            UInt16Array::from(vec![Some(0), None, Some(2), None, Some(4), None, Some(6)]);
+        let nullable_ints = nullable_ints.slice(1, 3);
+        let data = DataBlock::from_array(nullable_ints);
+
+        let nullable = data.as_nullable().unwrap();
+        assert_eq!(nullable.nulls, LanceBuffer::Owned(vec![0b00000010]));
+    }
+
     #[test]
     fn test_string_to_data_block() {
         // Converting string arrays that contain nulls to DataBlock
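For reference, the 0b00000010 expected in the new data.rs test falls out of the slice: the view [None, Some(2), None] has validity bits [0, 1, 0], and once the block re-packs the bitmap starting at bit zero, only bit 1 is set. A quick check of that arithmetic with plain arrow-rs, independent of the DataBlock internals:

use arrow_array::{Array, UInt16Array};

fn main() {
    let ints = UInt16Array::from(vec![Some(0), None, Some(2), None, Some(4), None, Some(6)]);
    let sliced = ints.slice(1, 3); // logical view: [None, Some(2), None]
    let nulls = sliced.nulls().unwrap();

    // The slice keeps a bit offset of 1 into the original bitmap ...
    assert_eq!(nulls.offset(), 1);
    // ... and its logical validity bits are [false, true, false]; packed
    // from bit zero that is the single byte 0b0000_0010.
    assert_eq!(nulls.iter().collect::<Vec<_>>(), vec![false, true, false]);
}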
