Skip to content

Commit

Permalink
ci: add benchmark suite (#3165)
Browse files Browse the repository at this point in the history
Benchmarks report to
https://bencher.dev/console/projects/weston-lancedb/plots

At some point it may be nice for these to be used for regression
detection in PRs. However, we need to get a stable baseline first.

These benchmarks rely on a private runner hosted by LanceDB and some
private datasets. They run against GCS. It would be good to get some
NVME & Azure & S3 benchmarks at some point.
  • Loading branch information
westonpace authored Nov 25, 2024
1 parent 196ec06 commit e6c2343
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 0 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/ci-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Run the regression benchmark suite on every push to main and report the
# results to bencher.dev (weston-lancedb project).
name: Run Regression Benchmarks

on:
  push:
    branches:
      - main

jobs:
  bench_regress:
    timeout-minutes: 30
    # Private LanceDB-hosted runner with access to the GCS benchmark datasets
    runs-on: warp-custom-gcp-storage-benchmark
    env:
      # Need up-to-date compilers for kernels
      CC: clang-18
      CXX: clang-18
    defaults:
      run:
        shell: bash
        working-directory: python
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true
      - name: Authenticate with GCS
        uses: "google-github-actions/auth@v2"
        with:
          credentials_json: "${{ secrets.GCLOUD_BENCH_STORAGE_USER_KEY }}"
      - name: Install bencher
        uses: bencherdev/bencher@main
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: 3.11 # Ray does not support 3.12 yet.
      - uses: Swatinem/rust-cache@v2
        with:
          workspaces: python
      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
      - name: Build
        run: |
          # The venv is created once here and re-activated by later steps
          python -m venv venv
          source venv/bin/activate
          pip install maturin duckdb requests pytest pytest-benchmark
          maturin develop --locked --release
      - name: Generate datasets
        run: |
          source venv/bin/activate
          python python/ci_benchmarks/datagen/gen_all.py
      - name: Run benchmarks
        run: |
          source venv/bin/activate
          bencher run --project weston-lancedb --token ${{ secrets.LANCE_BENCHER_TOKEN }} --adapter python_pytest \
            --branch main --testbed google-genoa --err --file results.json "python -mpytest --benchmark-json \
            results.json python/ci_benchmarks"
2 changes: 2 additions & 0 deletions python/python/ci_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
24 changes: 24 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_random_access.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import random

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

DATASETS = ["tpch"]


@pytest.mark.parametrize("dataset", DATASETS)
def test_random_access(benchmark, dataset):
    """Benchmark `Dataset.take` of a small set of random row indices."""
    NUM_INDICES = 10
    dataset_uri = get_dataset_uri(dataset)

    ds = lance.dataset(dataset_uri)
    # randrange excludes the upper bound.  The original randint(0, count_rows())
    # is inclusive and could produce count_rows() itself, one past the last
    # valid row index.
    random_indices = [random.randrange(ds.count_rows()) for _ in range(NUM_INDICES)]

    def bench(random_indices):
        ds.take(random_indices)

    # Indices are generated once, outside the benchmark, so every round
    # performs identical work.
    benchmark.pedantic(bench, args=(random_indices,), rounds=5)
33 changes: 33 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_scan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

DATASETS = ["tpch"]


@pytest.mark.parametrize("dataset", DATASETS)
def test_full_scan(benchmark, dataset):
    """Benchmark reading every row of the dataset into an in-memory table."""
    uri = get_dataset_uri(dataset)

    def run_scan():
        # Open inside the benchmark so dataset-open cost is included
        lance.dataset(uri).to_table()

    benchmark.pedantic(run_scan, rounds=1, iterations=1)


@pytest.mark.parametrize("dataset", DATASETS)
def test_scan_slice(benchmark, dataset):
    """Benchmark reading a small slice (50 rows) near the end of the dataset."""
    uri = get_dataset_uri(dataset)

    # Row count is resolved once, up front, so the benchmark itself only
    # measures the open + sliced scan.
    total_rows = lance.dataset(uri).count_rows()

    def run_slice():
        sliced = lance.dataset(uri)
        sliced.to_table(offset=total_rows - 100, limit=50)

    benchmark.pedantic(run_slice, rounds=1, iterations=1)
36 changes: 36 additions & 0 deletions python/python/ci_benchmarks/benchmarks/test_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri

COLUMN_LABELS = ["bools", "normals"]
COLUMNS = [["bools"], ["normals"]]
FILTERS = [None, "bools IS TRUE"]


@pytest.mark.parametrize("columns", COLUMNS, ids=COLUMN_LABELS)
@pytest.mark.parametrize("filt", FILTERS)
def test_eda_search(benchmark, columns, filt):
    """Benchmark EDA-style scans of the image_eda dataset.

    Runs a projection of `columns` with an optional `filt` predicate,
    tuning batch size / fragment readahead / limit per configuration.
    """
    dataset_uri = get_dataset_uri("image_eda")

    # BUG FIX: the original tested `filter is None`, which compares the
    # *builtin* `filter` (never None) instead of the `filt` parameter.  As a
    # result `limit` was always 100000 and `frag_readahead` always None.
    batch_size = 32 if columns == ["image_data"] else None
    limit = None if filt is None else 100000
    frag_readahead = (
        4
        if (columns == ["image_data"] or columns == ["strings"]) and filt is None
        else None
    )
    # NOTE(review): "image_data" and "strings" are not in COLUMNS above, so
    # batch_size/frag_readahead are currently always None — presumably these
    # column sets are planned additions; confirm before removing.

    def bench():
        ds = lance.dataset(dataset_uri)
        ds.to_table(
            columns=columns,
            filter=filt,
            batch_size=batch_size,
            fragment_readahead=frag_readahead,
            limit=limit,
        )

    benchmark.pedantic(bench, rounds=1, iterations=1)
2 changes: 2 additions & 0 deletions python/python/ci_benchmarks/datagen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors
10 changes: 10 additions & 0 deletions python/python/ci_benchmarks/datagen/gen_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import logging

from ci_benchmarks.datagen.lineitems import gen_tcph

if __name__ == "__main__":
    # Entry point used by CI: generate every benchmark dataset that is built
    # locally (currently only the TPC-H lineitems table).
    logging.basicConfig(level=logging.INFO)
    gen_tcph()
48 changes: 48 additions & 0 deletions python/python/ci_benchmarks/datagen/lineitems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

# Creates a dataset containing the TPC-H lineitems table using a prebuilt Parquet file
import logging

import duckdb
import lance

from ci_benchmarks.datasets import get_dataset_uri

NUM_ROWS = 59986052


def _gen_data():
    """Generate the TPC-H lineitem table (scale factor 10) via DuckDB.

    Returns the result as a PyArrow table.
    """
    logging.info("Using DuckDB to generate TPC-H dataset")
    connection = duckdb.connect(database=":memory:")
    # Load DuckDB's tpch extension and materialize the sf=10 dataset
    connection.execute("INSTALL tpch; LOAD tpch")
    connection.execute("CALL dbgen(sf=10)")
    return connection.query("SELECT * FROM lineitem").to_arrow_table()


def _create(dataset_uri: str):
    """Create the TPC-H lineitem dataset at ``dataset_uri`` if needed.

    Idempotent: a dataset that already holds the expected row count is left
    untouched, an empty dataset is filled, and anything else raises because
    it doesn't appear to be the same dataset.
    """
    try:
        ds = lance.dataset(dataset_uri)
        # Hoisted: the original called count_rows() up to three times and
        # leaked the value via a bare print(); use logging like the rest of
        # this package.
        num_rows = ds.count_rows()
        logging.info("Found existing dataset with %d rows", num_rows)
        if num_rows == NUM_ROWS:
            return
        elif num_rows == 0:
            lance.write_dataset(
                _gen_data(), dataset_uri, mode="append", use_legacy_format=False
            )
        else:
            raise Exception(
                "Cannot generate TPC-H dataset because a dataset with the URI "
                f"{dataset_uri} already exists and doesn't appear to be the "
                "same dataset"
            )
    except ValueError:
        # lance.dataset raises ValueError when no dataset exists at the URI,
        # so this branch is the "create from scratch" path.
        lance.write_dataset(
            _gen_data(), dataset_uri, mode="create", use_legacy_format=False
        )


def gen_tcph():
    """Ensure the TPC-H benchmark dataset exists in the current environment."""
    _create(get_dataset_uri("tpch"))
45 changes: 45 additions & 0 deletions python/python/ci_benchmarks/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The Lance Authors

import logging
from functools import cache
from pathlib import Path

import requests


def _is_on_google() -> bool:
    """Return True when running on Google Cloud (metadata server reachable)."""
    logging.info("Testing if running on Google Cloud")
    try:
        rsp = requests.get("http://metadata.google.internal", timeout=5)
        flavor = rsp.headers.get("Metadata-Flavor")
        logging.info("Metadata-Flavor: %s", flavor)
        # Use the .get value: the original indexed rsp.headers["Metadata-Flavor"]
        # which raises KeyError (not caught by RequestException below) if a
        # non-GCE host answers the probe without the header.
        return flavor == "Google"
    except requests.exceptions.RequestException as ex:
        logging.info("Failed to connect to metadata server: %s", ex)
        return False


@cache
def _get_base_uri() -> str:
    """Return the base URI for datasets: a GCS bucket on GCE, else a local dir."""
    if _is_on_google():
        logging.info(
            "Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/"
        )
        return "gs://lance-benchmarks-ci-datasets/"
    local_dir = Path.home() / "lance-benchmarks-ci-datasets"
    # mkdir(exist_ok=True) is already a no-op for an existing directory, so no
    # separate exists() check is needed.
    local_dir.mkdir(parents=True, exist_ok=True)
    logging.info("Running locally, using %s", local_dir)
    return f"{local_dir}/"


def get_dataset_uri(name: str) -> str:
    """Given a dataset name, return the URI appropriate for the current environment."""
    if name == "image_eda":
        # Custom-built dataset on a unique bucket that is too big to
        # reproduce locally, so it is only served from GCS.
        if not _is_on_google():
            raise ValueError("The image_eda dataset is only available on Google Cloud")
        return "gs://lance-benchmarks-ci-datasets/image_eda.lance"
    return _get_base_uri() + name

0 comments on commit e6c2343

Please sign in to comment.