Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/python/ci_benchmarks/benchmarks/test_random_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import lance
import pytest
from ci_benchmarks.datasets import open_dataset
from ci_benchmarks.datasets import is_on_google, open_dataset

# POSIX fadvise flag to drop page cache
POSIX_FADV_DONTNEED = 4
Expand Down Expand Up @@ -65,6 +65,7 @@ def setup():

@pytest.mark.parametrize("dataset", DATASETS)
@pytest.mark.parametrize("rows_per_take", [1, 10, 100])
@pytest.mark.skipif(is_on_google(), reason="Requires too many IOPS for cloud storage")
def test_parallel_random_access(benchmark, dataset, rows_per_take):
TAKES_PER_ITER = 100

Expand Down
17 changes: 15 additions & 2 deletions python/python/ci_benchmarks/benchmarks/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri
from ci_benchmarks.datasets import get_dataset_uri, is_on_google
from ci_benchmarks.utils import wipe_os_cache

COLUMN_LABELS = ["bools", "normals"]
Expand Down Expand Up @@ -177,11 +177,24 @@ def test_basic_btree_search(
]


# The no_cache variant is currently far too expensive to run on Google Cloud,
# so it is dropped from the parametrization there.
def use_cache_param():
    """Return the ``use_cache`` parametrize values for this environment."""
    return [True] if is_on_google() else [True, False]


def use_cache_ids():
    """Return test ids aligned one-to-one with ``use_cache_param()``."""
    ids = ["cache"]
    if not is_on_google():
        # Off Google Cloud the uncached variant also runs; add its id.
        ids.append("no_cache")
    return ids


# Repeats the same test for the basic dataset which is easier to test with locally
# This benchmark is not part of the CI job as the EDA dataset is better for that
@pytest.mark.parametrize("filt", BASIC_BITMAP_FILTERS, ids=BASIC_BITMAP_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "small_strings", "integers"])
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
@pytest.mark.parametrize("use_cache", use_cache_param(), ids=use_cache_ids())
def test_basic_bitmap_search(
benchmark, filt: str | None, payload: str | None, use_cache: bool
):
Expand Down
6 changes: 3 additions & 3 deletions python/python/ci_benchmarks/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lance.log import LOGGER


def _is_on_google() -> bool:
def is_on_google() -> bool:
LOGGER.info("Testing if running on Google Cloud")
try:
rsp = requests.get("http://metadata.google.internal", timeout=5)
Expand All @@ -22,7 +22,7 @@ def _is_on_google() -> bool:

@cache
def _get_base_uri() -> str:
if _is_on_google():
if is_on_google():
LOGGER.info("Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/")
return "gs://lance-benchmarks-ci-datasets/"
else:
Expand All @@ -38,7 +38,7 @@ def get_dataset_uri(name: str) -> str:
# This is a custom-built dataset, on a unique bucket, that is too big to reproduce
# locally
if name == "image_eda":
if not _is_on_google():
if not is_on_google():
raise ValueError("The image_eda dataset is only available on Google Cloud")
return "gs://lance-benchmarks-ci-datasets/image_eda.lance"
return f"{_get_base_uri()}{name}"
Expand Down
Loading