Skip to content

Commit

Permalink
ci: add benchmark suite (#3165)
Browse files Browse the repository at this point in the history
Benchmarks report to
https://bencher.dev/console/projects/weston-lancedb/plots

At some point it may be nice for these to be used for regression
detection in PRs. However, we need to get a stable baseline first.

These benchmarks rely on a private runner hosted by LanceDB and some
private datasets. They run against GCS. It would be good to get some
NVME & Azure & S3 benchmarks at some point.
  • Loading branch information
westonpace authored Nov 25, 2024
1 parent 196ec06 commit e6c2343
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 0 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/ci-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Run the regression benchmark suite on every push to main and report the
# results to bencher.dev (weston-lancedb project).
name: Run Regression Benchmarks

on:
  push:
    branches:
      - main

jobs:
  bench_regress:
    timeout-minutes: 30
    # Private LanceDB-hosted runner with access to the GCS benchmark datasets
    runs-on: warp-custom-gcp-storage-benchmark
    env:
      # Need up-to-date compilers for kernels
      CC: clang-18
      CXX: clang-18
    defaults:
      run:
        shell: bash
        working-directory: python
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - name: Authenticate with GCS
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.GCLOUD_BENCH_STORAGE_USER_KEY }}"
      - name: Install bencher
        uses: bencherdev/bencher@main
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.11 # Ray does not support 3.12 yet.
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: python
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          # The venv is created once here and re-activated by later steps
          python -m venv venv
          source venv/bin/activate
          pip install maturin duckdb requests pytest pytest-benchmark
          maturin develop --locked --release
      - name: Generate datasets
        run: |
          source venv/bin/activate
          python python/ci_benchmarks/datagen/gen_all.py
      - name: Run benchmarks
        run: |
          source venv/bin/activate
          bencher run --project weston-lancedb --token ${{ secrets.LANCE_BENCHER_TOKEN }} --adapter python_pytest \
            --branch main --testbed google-genoa --err --file results.json "python -mpytest --benchmark-json \
            results.json python/ci_benchmarks"
2 changes: 2 additions & 0 deletions python/python/ci_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
24 changes: 24 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_random_access.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import random

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

DATASETS = ["tpch"]


@pytest.mark.parametrize("dataset", DATASETS)
def test_random_access(benchmark, dataset):
    """Benchmark `Dataset.take` of a small set of random row indices."""
    NUM_INDICES = 10
    dataset_uri = get_dataset_uri(dataset)

    ds = lance.dataset(dataset_uri)
    # randrange excludes the upper bound.  The original randint(0, count_rows())
    # is inclusive and could produce count_rows() itself, one past the last
    # valid row index.
    random_indices = [random.randrange(ds.count_rows()) for _ in range(NUM_INDICES)]

    def bench(random_indices):
        ds.take(random_indices)

    # Indices are generated once, outside the benchmark, so every round
    # performs identical work.
    benchmark.pedantic(bench, args=(random_indices,), rounds=5)
33 changes: 33 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_scan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

DATASETS = ["tpch"]


@pytest.mark.parametrize("dataset", DATASETS)
def test_full_scan(benchmark, dataset):
    """Benchmark reading every row of the dataset into an in-memory table."""
    uri = get_dataset_uri(dataset)

    def run_scan():
        # Open inside the benchmark so dataset-open cost is included
        lance.dataset(uri).to_table()

    benchmark.pedantic(run_scan, rounds=1, iterations=1)


@pytest.mark.parametrize("dataset", DATASETS)
def test_scan_slice(benchmark, dataset):
    """Benchmark reading a small slice (50 rows) near the end of the dataset."""
    uri = get_dataset_uri(dataset)

    # Row count is resolved once, up front, so the benchmark itself only
    # measures the open + sliced scan.
    total_rows = lance.dataset(uri).count_rows()

    def run_slice():
        sliced = lance.dataset(uri)
        sliced.to_table(offset=total_rows - 100, limit=50)

    benchmark.pedantic(run_slice, rounds=1, iterations=1)
36 changes: 36 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

COLUMN_LABELS = ["bools", "normals"]
COLUMNS = [["bools"], ["normals"]]
FILTERS = [None, "bools IS TRUE"]


@pytest.mark.parametrize("columns", COLUMNS, ids=COLUMN_LABELS)
@pytest.mark.parametrize("filt", FILTERS)
def test_eda_search(benchmark, columns, filt):
    """Benchmark EDA-style scans of the image_eda dataset.

    Runs a projection of `columns` with an optional `filt` predicate,
    tuning batch size / fragment readahead / limit per configuration.
    """
    dataset_uri = get_dataset_uri("image_eda")

    # BUG FIX: the original tested `filter is None`, which compares the
    # *builtin* `filter` (never None) instead of the `filt` parameter.  As a
    # result `limit` was always 100000 and `frag_readahead` always None.
    batch_size = 32 if columns == ["image_data"] else None
    limit = None if filt is None else 100000
    frag_readahead = (
        4
        if (columns == ["image_data"] or columns == ["strings"]) and filt is None
        else None
    )
    # NOTE(review): "image_data" and "strings" are not in COLUMNS above, so
    # batch_size/frag_readahead are currently always None — presumably these
    # column sets are planned additions; confirm before removing.

    def bench():
        ds = lance.dataset(dataset_uri)
        ds.to_table(
            columns=columns,
            filter=filt,
            batch_size=batch_size,
            fragment_readahead=frag_readahead,
            limit=limit,
        )

    benchmark.pedantic(bench, rounds=1, iterations=1)
2 changes: 2 additions & 0 deletions python/python/ci_benchmarks/datagen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
10 changes: 10 additions & 0 deletions python/python/ci_benchmarks/datagen/gen_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import logging

from ci_benchmarks.datagen.lineitems import gen_tcph

if __name__ == "__main__":
    # Entry point used by CI: generate every benchmark dataset that is built
    # locally (currently only the TPC-H lineitems table).
    logging.basicConfig(level=logging.INFO)
    gen_tcph()
48 changes: 48 additions & 0 deletions python/python/ci_benchmarks/datagen/lineitems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

# Creates a dataset containing the TPC-H lineitems table using a prebuilt Parquet file
import logging

import duckdb
import lance

from ci_benchmarks.datasets import get_dataset_uri

NUM_ROWS = 59986052


def _gen_data():
    """Generate the TPC-H lineitem table (scale factor 10) via DuckDB.

    Returns the result as a PyArrow table.
    """
    logging.info("Using DuckDB to generate TPC-H dataset")
    connection = duckdb.connect(database=":memory:")
    # Load DuckDB's tpch extension and materialize the sf=10 dataset
    connection.execute("INSTALL tpch; LOAD tpch")
    connection.execute("CALL dbgen(sf=10)")
    return connection.query("SELECT * FROM lineitem").to_arrow_table()


def _create(dataset_uri: str):
    """Create the TPC-H lineitem dataset at ``dataset_uri`` if needed.

    Idempotent: a dataset that already holds the expected row count is left
    untouched, an empty dataset is filled, and anything else raises because
    it doesn't appear to be the same dataset.
    """
    try:
        ds = lance.dataset(dataset_uri)
        # Hoisted: the original called count_rows() up to three times and
        # leaked the value via a bare print(); use logging like the rest of
        # this package.
        num_rows = ds.count_rows()
        logging.info("Found existing dataset with %d rows", num_rows)
        if num_rows == NUM_ROWS:
            return
        elif num_rows == 0:
            lance.write_dataset(
                _gen_data(), dataset_uri, mode="append", use_legacy_format=False
            )
        else:
            raise Exception(
                "Cannot generate TPC-H dataset because a dataset with the URI "
                f"{dataset_uri} already exists and doesn't appear to be the "
                "same dataset"
            )
    except ValueError:
        # lance.dataset raises ValueError when no dataset exists at the URI,
        # so this branch is the "create from scratch" path.
        lance.write_dataset(
            _gen_data(), dataset_uri, mode="create", use_legacy_format=False
        )


def gen_tcph():
    """Ensure the TPC-H benchmark dataset exists in the current environment."""
    _create(get_dataset_uri("tpch"))
45 changes: 45 additions & 0 deletions python/python/ci_benchmarks/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import logging
from functools import cache
from pathlib import Path

import requests


def _is_on_google() -> bool:
    """Return True when running on Google Cloud (metadata server reachable)."""
    logging.info("Testing if running on Google Cloud")
    try:
        rsp = requests.get("http://metadata.google.internal", timeout=5)
        flavor = rsp.headers.get("Metadata-Flavor")
        logging.info("Metadata-Flavor: %s", flavor)
        # Use the .get value: the original indexed rsp.headers["Metadata-Flavor"]
        # which raises KeyError (not caught by RequestException below) if a
        # non-GCE host answers the probe without the header.
        return flavor == "Google"
    except requests.exceptions.RequestException as ex:
        logging.info("Failed to connect to metadata server: %s", ex)
        return False


@cache
def _get_base_uri() -> str:
    """Return the base URI for datasets: a GCS bucket on GCE, else a local dir."""
    if _is_on_google():
        logging.info(
            "Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/"
        )
        return "gs://lance-benchmarks-ci-datasets/"
    local_dir = Path.home() / "lance-benchmarks-ci-datasets"
    # mkdir(exist_ok=True) is already a no-op for an existing directory, so no
    # separate exists() check is needed.
    local_dir.mkdir(parents=True, exist_ok=True)
    logging.info("Running locally, using %s", local_dir)
    return f"{local_dir}/"


def get_dataset_uri(name: str) -> str:
    """Given a dataset name, return the URI appropriate for the current environment."""
    if name == "image_eda":
        # Custom-built dataset on a unique bucket that is too big to
        # reproduce locally, so it is only served from GCS.
        if not _is_on_google():
            raise ValueError("The image_eda dataset is only available on Google Cloud")
        return "gs://lance-benchmarks-ci-datasets/image_eda.lance"
    return _get_base_uri() + name

0 comments on commit e6c2343

Please sign in to comment.