diff --git a/internal/core/src/common/Array.h b/internal/core/src/common/Array.h index 4d617a1145ba4..c78d93a950c30 100644 --- a/internal/core/src/common/Array.h +++ b/internal/core/src/common/Array.h @@ -35,6 +35,10 @@ class Array { ~Array() { delete[] data_; + if (offsets_ptr_) { + // only deallocate offsets for string type array + delete[] offsets_ptr_; + } } explicit Array(const ScalarArray& field_data) { @@ -97,17 +101,17 @@ class Array { case ScalarArray::kStringData: { element_type_ = DataType::STRING; length_ = field_data.string_data().data().size(); - offsets_.reserve(length_); + offsets_ptr_ = new uint32_t[length_]; for (int i = 0; i < length_; ++i) { - offsets_.push_back(size_); - size_ += field_data.string_data().data(i).size(); + offsets_ptr_[i] = reinterpret_cast(size_); + size_ += field_data.string_data().data(i).size(); //type risk here between uint32_t vs size_t } data_ = new char[size_]; for (int i = 0; i < length_; ++i) { std::copy_n(field_data.string_data().data(i).data(), field_data.string_data().data(i).size(), - data_ + offsets_[i]); + data_ + offsets_ptr_[i]); } break; } @@ -117,49 +121,33 @@ class Array { } } - Array(char* data, - size_t size, - DataType element_type, - std::vector&& element_offsets) - : size_(size), - offsets_(std::move(element_offsets)), - element_type_(element_type) { - delete[] data_; - data_ = new char[size]; - std::copy(data, data + size, data_); - if (IsVariableDataType(element_type_)) { - length_ = offsets_.size(); - } else { - // int8, int16, int32 are all promoted to int32 - if (element_type_ == DataType::INT8 || - element_type_ == DataType::INT16) { - length_ = size / sizeof(int32_t); - } else { - length_ = size / GetDataTypeSize(element_type_); - } - } - } - Array(const Array& array) noexcept : length_{array.length_}, size_{array.size_}, element_type_{array.element_type_} { - delete[] data_; data_ = new char[array.size_]; std::copy(array.data_, array.data_ + array.size_, data_); - offsets_ = array.offsets_; + if (IsVariableDataType(array.element_type_)) { + offsets_ptr_ = new uint32_t[array.length()]; + std::copy_n(array.get_offsets_data(), array.length(), offsets_ptr_); + } } Array& operator=(const Array& array) { delete[] data_; - - data_ = new char[array.size_]; - std::copy(array.data_, array.data_ + array.size_, data_); + if (offsets_ptr_) { + delete[] offsets_ptr_; + } length_ = array.length_; size_ = array.size_; - offsets_ = array.offsets_; element_type_ = array.element_type_; + data_ = new char[array.size_]; + std::copy(array.data_, array.data_ + array.size_, data_); + if (IsVariableDataType(array.get_element_type())) { + offsets_ptr_ = new uint32_t[array.length()]; + } + return *this; } @@ -242,9 +230,9 @@ class Array { if constexpr (std::is_same_v || std::is_same_v) { size_t element_length = (index == length_ - 1) - ? size_ - offsets_.back() - : offsets_[index + 1] - offsets_[index]; - return T(data_ + offsets_[index], element_length); + ? size_ - offsets_ptr_[length_ - 1] + : offsets_ptr_[index + 1] - offsets_ptr_[index]; + return T(data_ + offsets_ptr_[index], element_length); } if constexpr (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || @@ -272,14 +260,9 @@ class Array { return reinterpret_cast(data_)[index]; } - const std::vector& - get_offsets() const { - return offsets_; - } - - std::vector - get_offsets_in_copy() const { - return offsets_; + const uint32_t* + get_offsets_data() const { + return offsets_ptr_; } ScalarArray @@ -435,12 +418,9 @@ class Array { private: char* data_{nullptr}; int length_ = 0; - int size_ = 0; - std::vector offsets_{}; + uint32_t size_ = 0; DataType element_type_ = DataType::NONE; - - //offsets for mmap, padding to be consistent with ArrayView - const uint64_t* offsets_ptr_; + uint32_t* offsets_ptr_{nullptr}; }; class ArrayView { @@ -451,7 +431,7 @@ class ArrayView { int len, size_t size, DataType element_type, - const uint64_t* offsets_ptr) + const uint32_t* offsets_ptr) : data_(data), length_(len), size_(size), @@ -466,28 +446,6 @@ class ArrayView { } } - ArrayView(char* data, - size_t size, - DataType element_type, - std::vector&& element_offsets) - : size_(size), - offsets_(std::move(element_offsets)), - element_type_(element_type), - offsets_ptr_(nullptr) { - data_ = data; - if (IsVariableDataType(element_type_)) { - length_ = offsets_.size(); - } else { - // int8, int16, int32 are all promoted to int32 - if (element_type_ == DataType::INT8 || - element_type_ == DataType::INT16) { - length_ = size / sizeof(int32_t); - } else { - length_ = size / GetDataTypeSize(element_type_); - } - } - } - template T get_data(const int index) const { @@ -498,19 +456,11 @@ class ArrayView { if constexpr (std::is_same_v || std::is_same_v) { - if (offsets_ptr_) { - size_t element_length = - (index == length_ - 1) - ? size_ - offsets_ptr_[length_ - 1] - : offsets_ptr_[index + 1] - offsets_ptr_[index]; - return T(data_ + offsets_ptr_[index], element_length); - } else { - size_t element_length = - (index == length_ - 1) - ? size_ - offsets_.back() - : offsets_[index + 1] - offsets_[index]; - return T(data_ + offsets_[index], element_length); - } + size_t element_length = + (index == length_ - 1) + ? size_ - offsets_ptr_[length_ - 1] + : offsets_ptr_[index + 1] - offsets_ptr_[index]; + return T(data_ + offsets_ptr_[index], element_length); } if constexpr (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v) { @@ -612,15 +562,6 @@ class ArrayView { data() const { return data_; } - // copy to result - std::vector - get_offsets_in_copy() const { - if (offsets_ptr_) { - return std::vector( - offsets_ptr_, offsets_ptr_ + sizeof(uint64_t) * length_); - } - return offsets_; - } bool is_same_array(const proto::plan::Array& arr2) const { @@ -697,11 +638,10 @@ class ArrayView { char* data_{nullptr}; int length_ = 0; int size_ = 0; - std::vector offsets_{}; DataType element_type_ = DataType::NONE; - //offsets for mmap - const uint64_t* offsets_ptr_; + //offsets ptr + const uint32_t* offsets_ptr_; }; } // namespace milvus diff --git a/internal/core/src/common/Chunk.cpp b/internal/core/src/common/Chunk.cpp index b1de873d40f2f..ca912e128c0dd 100644 --- a/internal/core/src/common/Chunk.cpp +++ b/internal/core/src/common/Chunk.cpp @@ -53,10 +53,10 @@ ArrayChunk::ConstructViews() { int len = offsets_lens_[2 * i + 1]; auto data_ptr = data_ + offset; auto offsets_bytes_len = 0; - uint64_t* offsets_ptr = nullptr; + uint32_t* offsets_ptr = nullptr; if (IsStringDataType(element_type_)) { - offsets_bytes_len = len * sizeof(uint64_t); - offsets_ptr = reinterpret_cast(data_ptr); + offsets_bytes_len = len * sizeof(uint32_t); + offsets_ptr = reinterpret_cast(data_ptr); } views_.emplace_back(data_ptr + offsets_bytes_len, len, diff --git a/internal/core/src/common/ChunkWriter.cpp b/internal/core/src/common/ChunkWriter.cpp index dc2d9e4bfe45e..ac4ebd877a71c 100644 --- a/internal/core/src/common/ChunkWriter.cpp +++ b/internal/core/src/common/ChunkWriter.cpp @@ -205,32 +205,32 @@ ArrayChunkWriter::write(std::shared_ptr data) { int offsets_num = row_nums_ + 1; int len_num = row_nums_; - int offset_start_pos = - target_->tell() + sizeof(uint64_t) * (offsets_num + len_num); - std::vector offsets; - std::vector lens; - for (auto& arr : arrays) { - offsets.push_back(offset_start_pos); - lens.push_back(arr.length()); - offset_start_pos += - is_string ? sizeof(uint64_t) * arr.get_offsets().size() : 0; + uint32_t offset_start_pos = + target_->tell() + sizeof(uint32_t) * (offsets_num + len_num); + std::vector offsets(offsets_num); + std::vector lens(len_num); + for (auto i = 0; i < arrays.size(); i++) { + auto& arr = arrays[i]; + offsets[i] = offset_start_pos; + lens[i] = arr.length(); + offset_start_pos += is_string ? sizeof(uint32_t) * lens[i] : 0; offset_start_pos += arr.byte_size(); } offsets.push_back(offset_start_pos); for (int i = 0; i < offsets.size(); i++) { if (i == offsets.size() - 1) { - target_->write(&offsets[i], sizeof(uint64_t)); + target_->write(&offsets[i], sizeof(uint32_t)); break; } - target_->write(&offsets[i], sizeof(uint64_t)); - target_->write(&lens[i], sizeof(uint64_t)); + target_->write(&offsets[i], sizeof(uint32_t)); + target_->write(&lens[i], sizeof(uint32_t)); } for (auto& arr : arrays) { if (is_string) { - target_->write(arr.get_offsets().data(), - arr.get_offsets().size() * sizeof(uint64_t)); + target_->write(arr.get_offsets_data(), + arr.length() * sizeof(uint32_t)); } target_->write(arr.data(), arr.byte_size()); } diff --git a/internal/core/src/mmap/ChunkData.h b/internal/core/src/mmap/ChunkData.h index ec70d94ab6423..3f7f0ed9e347f 100644 --- a/internal/core/src/mmap/ChunkData.h +++ b/internal/core/src/mmap/ChunkData.h @@ -202,9 +202,10 @@ VariableLengthChunk::set(const Array* src, char* data_ptr = buf + offset; std::copy(src[i].data(), src[i].data() + src[i].byte_size(), data_ptr); data_[i + begin] = ArrayView(data_ptr, + src[i].length(), data_size, src[i].get_element_type(), - src[i].get_offsets_in_copy()); + src[i].get_offsets_data()); offset += data_size; } } diff --git a/internal/core/src/mmap/ChunkVector.h b/internal/core/src/mmap/ChunkVector.h index 74341195c8edc..5c0a2b4de828c 100644 --- a/internal/core/src/mmap/ChunkVector.h +++ b/internal/core/src/mmap/ChunkVector.h @@ -119,9 +119,10 @@ class ThreadSafeChunkVector : public ChunkVectorBase { } else if constexpr (std::is_same_v) { auto& src = chunk[chunk_offset]; return ArrayView(const_cast(src.data()), + src.length(), src.byte_size(), src.get_element_type(), - src.get_offsets_in_copy()); + src.get_offsets_data()); } else { return chunk[chunk_offset]; } diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index 0862f53b052de..d69adc9d792be 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -918,7 +918,8 @@ class SingleChunkArrayColumn : public SingleChunkColumnBase { void Append(const Array& array, bool valid_data = false) { indices_.emplace_back(data_size_); - element_indices_.emplace_back(array.get_offsets()); + element_indices_.emplace_back(array.get_offsets_data(), array.get_offsets_data() + array.length()); + // have to copy element offsets from external array if (nullable_) { return SingleChunkColumnBase::Append( static_cast(array.data()), @@ -931,7 +932,7 @@ class SingleChunkArrayColumn : public SingleChunkColumnBase { void Seal(std::vector&& indices = {}, - std::vector>&& element_indices = {}) { + std::vector>&& element_indices = {}) { if (!indices.empty()) { indices_ = std::move(indices); element_indices_ = std::move(element_indices); @@ -946,20 +947,22 @@ class SingleChunkArrayColumn : public SingleChunkColumnBase { views_.reserve(indices_.size()); for (size_t i = 0; i < indices_.size() - 1; i++) { views_.emplace_back(data_ + indices_[i], + element_indices_[i].size(), indices_[i + 1] - indices_[i], element_type_, - std::move(element_indices_[i])); + element_indices_[i].data()); } views_.emplace_back(data_ + indices_.back(), + element_indices_[indices_.size() - 1].size(), data_size_ - indices_.back(), element_type_, - std::move(element_indices_[indices_.size() - 1])); + element_indices_[indices_.size() - 1].data()); element_indices_.clear(); } private: std::vector indices_{}; - std::vector> element_indices_{}; + std::vector> element_indices_{}; // Compatible with current Span type std::vector views_{}; DataType element_type_; diff --git a/internal/core/src/mmap/Utils.h b/internal/core/src/mmap/Utils.h index c93165df31812..eff46c1c679fb 100644 --- a/internal/core/src/mmap/Utils.h +++ b/internal/core/src/mmap/Utils.h @@ -90,7 +90,7 @@ WriteFieldData(File& file, const FieldDataPtr& data, uint64_t& total_written, std::vector& indices, - std::vector>& element_indices, + std::vector>& element_indices, FixedVector& valid_data) { if (IsVariableDataType(data_type)) { // use buffered writer to reduce fwrite/write syscall @@ -131,7 +131,7 @@ WriteFieldData(File& file, indices.push_back(total_written); auto array = static_cast(data->RawValue(i)); bw.Write(array->data(), array->byte_size()); - element_indices.emplace_back(array->get_offsets()); + element_indices.emplace_back(array->get_offsets_data(), array->get_offsets_data() + array->length()); total_written += array->byte_size(); } break; diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index ee8652da06d50..d8145859d0360 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -544,7 +544,7 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) { FieldDataPtr field_data; uint64_t total_written = 0; std::vector indices{}; - std::vector> element_indices{}; + std::vector> element_indices{}; FixedVector valid_data{}; while (data.channel->pop(field_data)) { WriteFieldData(file, diff --git a/internal/core/unittest/test_array.cpp b/internal/core/unittest/test_array.cpp index 37caa3b64c555..b9c964dbd8ac4 100644 --- a/internal/core/unittest/test_array.cpp +++ b/internal/core/unittest/test_array.cpp @@ -33,24 +33,28 @@ TEST(Array, TestConstructArray) { } ASSERT_TRUE(int_array.is_same_array(field_int_array)); auto int_array_tmp = Array(const_cast(int_array.data()), + int_array.length(), int_array.byte_size(), int_array.get_element_type(), - {}); + int_array.get_offsets_data()); auto int_8_array = Array(const_cast(int_array.data()), + int_array.length(), int_array.byte_size(), DataType::INT8, - {}); + int_array.get_offsets_data()); ASSERT_EQ(int_array.length(), int_8_array.length()); auto int_16_array = Array(const_cast(int_array.data()), + int_array.length(), int_array.byte_size(), DataType::INT16, - {}); + int_array.get_offsets_data()); ASSERT_EQ(int_array.length(), int_16_array.length()); ASSERT_TRUE(int_array_tmp == int_array); auto int_array_view = ArrayView(const_cast(int_array.data()), + int_array.length(), int_array.byte_size(), int_array.get_element_type(), - {}); + int_array.get_offsets_data()); ASSERT_EQ(int_array.length(), int_array_view.length()); ASSERT_EQ(int_array.byte_size(), int_array_view.byte_size()); ASSERT_EQ(int_array.get_element_type(), int_array_view.get_element_type()); @@ -70,14 +74,16 @@ TEST(Array, TestConstructArray) { } ASSERT_TRUE(long_array.is_same_array(field_int_array)); auto long_array_tmp = Array(const_cast(long_array.data()), + long_array.length(), long_array.byte_size(), long_array.get_element_type(), - {}); + long_array.get_offsets_data()); ASSERT_TRUE(long_array_tmp == long_array); auto long_array_view = ArrayView(const_cast(long_array.data()), + long_array.length(), long_array.byte_size(), long_array.get_element_type(), - {}); + long_array.get_offsets_data()); ASSERT_EQ(long_array.length(), long_array_view.length()); ASSERT_EQ(long_array.byte_size(), long_array_view.byte_size()); ASSERT_EQ(long_array.get_element_type(), @@ -107,14 +113,16 @@ TEST(Array, TestConstructArray) { string_view_element_offsets.emplace_back(offset); } auto string_array_tmp = Array(const_cast(string_array.data()), + string_array.length(), string_array.byte_size(), string_array.get_element_type(), - std::move(string_element_offsets)); + string_array.get_offsets_data()); ASSERT_TRUE(string_array_tmp == string_array); auto string_array_view = ArrayView(const_cast(string_array.data()), + string_array.length(), string_array.byte_size(), string_array.get_element_type(), - std::move(string_view_element_offsets)); + string_array.get_offsets_data()); ASSERT_EQ(string_array.length(), string_array_view.length()); ASSERT_EQ(string_array.byte_size(), string_array_view.byte_size()); ASSERT_EQ(string_array.get_element_type(), @@ -135,14 +143,16 @@ TEST(Array, TestConstructArray) { } ASSERT_TRUE(bool_array.is_same_array(field_bool_array)); auto bool_array_tmp = Array(const_cast(bool_array.data()), + bool_array.length(), bool_array.byte_size(), bool_array.get_element_type(), - {}); + bool_array.get_offsets_data()); ASSERT_TRUE(bool_array_tmp == bool_array); auto bool_array_view = ArrayView(const_cast(bool_array.data()), + bool_array.length(), bool_array.byte_size(), bool_array.get_element_type(), - {}); + bool_array.get_offsets_data()); ASSERT_EQ(bool_array.length(), bool_array_view.length()); ASSERT_EQ(bool_array.byte_size(), bool_array_view.byte_size()); ASSERT_EQ(bool_array.get_element_type(), @@ -163,14 +173,16 @@ TEST(Array, TestConstructArray) { } ASSERT_TRUE(float_array.is_same_array(field_float_array)); auto float_array_tmp = Array(const_cast(float_array.data()), + float_array.length(), float_array.byte_size(), float_array.get_element_type(), - {}); + float_array.get_offsets_data()); ASSERT_TRUE(float_array_tmp == float_array); auto float_array_view = ArrayView(const_cast(float_array.data()), + float_array.length(), float_array.byte_size(), float_array.get_element_type(), - {}); + float_array.get_offsets_data()); ASSERT_EQ(float_array.length(), float_array_view.length()); ASSERT_EQ(float_array.byte_size(), float_array_view.byte_size()); ASSERT_EQ(float_array.get_element_type(), @@ -192,14 +204,16 @@ TEST(Array, TestConstructArray) { } ASSERT_TRUE(double_array.is_same_array(field_double_array)); auto double_array_tmp = Array(const_cast(double_array.data()), + double_array.length(), double_array.byte_size(), double_array.get_element_type(), - {}); + double_array.get_offsets_data()); ASSERT_TRUE(double_array_tmp == double_array); auto double_array_view = ArrayView(const_cast(double_array.data()), + double_array.length(), double_array.byte_size(), double_array.get_element_type(), - {}); + double_array.get_offsets_data()); ASSERT_EQ(double_array.length(), double_array_view.length()); ASSERT_EQ(double_array.byte_size(), double_array_view.byte_size()); ASSERT_EQ(double_array.get_element_type(),