Skip to content

Commit

Permalink
feat!: index_statistics returns the concrete index type (#2716)
Browse files Browse the repository at this point in the history
Previously, `index_statistics` read `index_type` from the statistics of the
first index, but many index implementations do not include `index_type` there.

Breaking changes:
- IVF_PQ and IVF_HNSW_PQ indices previously returned `index_type=IVF`; they
now return the concrete index type (IVF_PQ, IVF_HNSW_PQ).
- Scalar indices previously returned `index_type=None`; they now
return their own type.

---------

Signed-off-by: BubbleCal <[email protected]>
  • Loading branch information
BubbleCal authored Aug 9, 2024
1 parent 304f590 commit 089716e
Show file tree
Hide file tree
Showing 10 changed files with 67 additions and 40 deletions.
32 changes: 16 additions & 16 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ exclude = ["python"]
resolver = "2"

[workspace.package]
version = "0.16.2"
version = "0.17.0"
edition = "2021"
authors = ["Lance Devs <[email protected]>"]
license = "Apache-2.0"
Expand All @@ -44,20 +44,20 @@ categories = [
rust-version = "1.78"

[workspace.dependencies]
lance = { version = "=0.16.2", path = "./rust/lance" }
lance-arrow = { version = "=0.16.2", path = "./rust/lance-arrow" }
lance-core = { version = "=0.16.2", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.16.2", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.16.2", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.16.2", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.16.2", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.16.2", path = "./rust/lance-file" }
lance-index = { version = "=0.16.2", path = "./rust/lance-index" }
lance-io = { version = "=0.16.2", path = "./rust/lance-io" }
lance-linalg = { version = "=0.16.2", path = "./rust/lance-linalg" }
lance-table = { version = "=0.16.2", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.16.2", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.16.2", path = "./rust/lance-testing" }
lance = { version = "=0.17.0", path = "./rust/lance" }
lance-arrow = { version = "=0.17.0", path = "./rust/lance-arrow" }
lance-core = { version = "=0.17.0", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.17.0", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.17.0", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.17.0", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.17.0", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.17.0", path = "./rust/lance-file" }
lance-index = { version = "=0.17.0", path = "./rust/lance-index" }
lance-io = { version = "=0.17.0", path = "./rust/lance-io" }
lance-linalg = { version = "=0.17.0", path = "./rust/lance-linalg" }
lance-table = { version = "=0.17.0", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.17.0", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.17.0", path = "./rust/lance-testing" }
approx = "0.5.1"
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false, features = ["prettyprint"] }
Expand Down Expand Up @@ -110,7 +110,7 @@ datafusion-physical-expr = { version = "40.0", features = [
] }
deepsize = "0.2.0"
either = "1.0"
fsst = { version = "=0.16.2", path = "./rust/lance-encoding/compression-algo/fsst" }
fsst = { version = "=0.17.0", path = "./rust/lance-encoding/compression-algo/fsst" }
futures = "0.3"
http = "0.2.9"
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
Expand Down
2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "pylance"
version = "0.16.2"
version = "0.17.0"
edition = "2021"
authors = ["Lance Devs <[email protected]>"]
rust-version = "1.65"
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def data_table(indexed_dataset: lance.LanceDataset):

def test_load_indices(indexed_dataset: lance.LanceDataset):
indices = indexed_dataset.list_indices()
vec_idx = next(idx for idx in indices if idx["type"] == "Vector")
vec_idx = next(idx for idx in indices if idx["type"] == "IVF_PQ")
scalar_idx = next(idx for idx in indices if idx["type"] == "BTree")
assert vec_idx is not None
assert scalar_idx is not None
Expand Down
2 changes: 1 addition & 1 deletion python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path):
if platform.system() == "Windows":
expected_filepath = expected_filepath.replace("\\", "/")
expected_statistics = {
"index_type": "IVF",
"index_type": "IVF_PQ",
"uuid": index_uuid,
"uri": expected_filepath,
"metric_type": "l2",
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-encoding/compression-algo/fsst/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fsst"
version = "0.16.2"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
Expand Down
18 changes: 15 additions & 3 deletions rust/lance-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ pub enum IndexType {

// 100+ and up for vector index.
/// Legacy vector index; kept as an alias of `IvfPq` and displayed as `IVF_PQ`.
Vector = 100,
Vector = 100, // Legacy vector index, alias to IvfPq
IvfFlat = 101,
IvfSq = 102,
IvfPq = 103,
IvfHnswSq = 104,
IvfHnswPq = 105,
}

impl std::fmt::Display for IndexType {
Expand All @@ -89,7 +94,11 @@ impl std::fmt::Display for IndexType {
Self::Bitmap => write!(f, "Bitmap"),
Self::LabelList => write!(f, "LabelList"),
Self::Inverted => write!(f, "Inverted"),
Self::Vector => write!(f, "Vector"),
Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
Self::IvfFlat => write!(f, "IVF_FLAT"),
Self::IvfSq => write!(f, "IVF_SQ"),
Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
}
}
}
Expand All @@ -103,7 +112,10 @@ impl IndexType {
}

pub fn is_vector(&self) -> bool {
matches!(self, Self::Vector)
matches!(
self,
Self::Vector | Self::IvfPq | Self::IvfHnswSq | Self::IvfHnswPq
)
}
}

Expand Down
2 changes: 1 addition & 1 deletion rust/lance/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2489,7 +2489,7 @@ mod tests {
serde_json::from_str(&dataset.index_statistics("embeddings_idx").await.unwrap())
.unwrap();
let actual_statistics = actual_statistics.as_object().unwrap();
assert_eq!(actual_statistics["index_type"].as_str().unwrap(), "IVF");
assert_eq!(actual_statistics["index_type"].as_str().unwrap(), "IVF_PQ");

let deltas = actual_statistics["indices"].as_array().unwrap();
assert_eq!(deltas.len(), 1);
Expand Down
3 changes: 2 additions & 1 deletion rust/lance/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ impl DatasetIndexExt for Dataset {
.map(|idx| idx.statistics())
.collect::<Result<Vec<_>>>()?;

let index_type = indices[0].index_type().to_string();
let unindexed_fragments = self.unindexed_fragments(index_name).await?;
let mut num_unindexed_rows = 0;
for f in unindexed_fragments.iter() {
Expand All @@ -515,7 +516,7 @@ impl DatasetIndexExt for Dataset {
let num_indexed_rows = self.count_rows(None).await? - num_unindexed_rows;

let stats = json!({
"index_type": indices_stats[0]["index_type"],
"index_type": index_type,
"name": index_name,
"num_indices": metadatas.len(),
"indices": indices_stats,
Expand Down
22 changes: 20 additions & 2 deletions rust/lance/src/index/vector/ivf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,25 @@ impl Index for IVFIndex {
}

fn index_type(&self) -> IndexType {
IndexType::Vector
if self.sub_index.as_any().downcast_ref::<PQIndex>().is_some() {
IndexType::IvfPq
} else if self
.sub_index
.as_any()
.downcast_ref::<HNSWIndex<ScalarQuantizer>>()
.is_some()
{
IndexType::IvfHnswSq
} else if self
.sub_index
.as_any()
.downcast_ref::<HNSWIndex<ProductQuantizer>>()
.is_some()
{
IndexType::IvfHnswPq
} else {
IndexType::Vector
}
}

fn statistics(&self) -> Result<serde_json::Value> {
Expand All @@ -728,7 +746,7 @@ impl Index for IVFIndex {
let centroid_vecs = centroids_to_vectors(self.ivf.centroids.as_ref().unwrap())?;

Ok(serde_json::to_value(IvfIndexStatistics {
index_type: "IVF".to_string(),
index_type: self.index_type().to_string(),
uuid: self.uuid.clone(),
uri: to_local_path(self.reader.path()),
metric_type: self.metric_type.to_string(),
Expand Down
22 changes: 9 additions & 13 deletions rust/lance/src/index/vector/ivf/v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,14 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> Index for IVFIndex<S,
}

fn index_type(&self) -> IndexType {
IndexType::Vector
match self.sub_index_type() {
(SubIndexType::Flat, QuantizationType::Flat) => IndexType::IvfFlat,
(SubIndexType::Flat, QuantizationType::Product) => IndexType::IvfPq,
(SubIndexType::Flat, QuantizationType::Scalar) => IndexType::IvfSq,
(SubIndexType::Hnsw, QuantizationType::Product) => IndexType::IvfHnswPq,
(SubIndexType::Hnsw, QuantizationType::Scalar) => IndexType::IvfHnswSq,
_ => IndexType::Vector,
}
}

fn statistics(&self) -> Result<serde_json::Value> {
Expand All @@ -314,18 +321,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> Index for IVFIndex<S,

let centroid_vecs = centroids_to_vectors(self.ivf.centroids.as_ref().unwrap())?;

let index_type = match self.sub_index_type() {
(sub_index_type, QuantizationType::Flat) => format!("IVF_{}", sub_index_type), // ignore FLAT quantization
(sub_index_type, quantization_type) => {
if sub_index_type.to_string() == quantization_type.to_string() {
// ignore redundant quantization type
// e.g. IVF_PQ_PQ should be IVF_PQ
format!("IVF_{}", sub_index_type)
} else {
format!("IVF_{}_{}", sub_index_type, quantization_type)
}
}
};
let index_type = self.index_type().to_string();
let mut sub_index_stats: serde_json::Value =
if let Some(metadata) = self.sub_index_metadata.iter().find(|m| !m.is_empty()) {
serde_json::from_str(metadata)?
Expand Down

0 comments on commit 089716e

Please sign in to comment.