From 51d1cc7440d54528111b2bb3534f1692afb9fe51 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 8 Dec 2025 11:16:14 -0800 Subject: [PATCH 1/3] Don't run parallel random access on GCS --- .../python/ci_benchmarks/benchmarks/test_random_access.py | 3 ++- python/python/ci_benchmarks/datasets.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/python/ci_benchmarks/benchmarks/test_random_access.py b/python/python/ci_benchmarks/benchmarks/test_random_access.py index e5fea790224..dc86d1c4b5c 100644 --- a/python/python/ci_benchmarks/benchmarks/test_random_access.py +++ b/python/python/ci_benchmarks/benchmarks/test_random_access.py @@ -9,7 +9,7 @@ import lance import pytest -from ci_benchmarks.datasets import open_dataset +from ci_benchmarks.datasets import is_on_google, open_dataset # POSIX fadvise flag to drop page cache POSIX_FADV_DONTNEED = 4 @@ -65,6 +65,7 @@ def setup(): @pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize("rows_per_take", [1, 10, 100]) +@pytest.mark.skipif(is_on_google(), reason="Requires too many IOPS for cloud storage") def test_parallel_random_access(benchmark, dataset, rows_per_take): TAKES_PER_ITER = 100 diff --git a/python/python/ci_benchmarks/datasets.py b/python/python/ci_benchmarks/datasets.py index 3fc901ff3c8..fa2070a26b1 100644 --- a/python/python/ci_benchmarks/datasets.py +++ b/python/python/ci_benchmarks/datasets.py @@ -9,7 +9,7 @@ from lance.log import LOGGER -def _is_on_google() -> bool: +def is_on_google() -> bool: LOGGER.info("Testing if running on Google Cloud") try: rsp = requests.get("http://metadata.google.internal", timeout=5) @@ -22,7 +22,7 @@ def _is_on_google() -> bool: @cache def _get_base_uri() -> str: - if _is_on_google(): + if is_on_google(): LOGGER.info("Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/") return "gs://lance-benchmarks-ci-datasets/" else: @@ -38,7 +38,7 @@ def get_dataset_uri(name: str) -> str: # This is a custom-built dataset, on a unique bucket, that is too big to reproduce # locally if name == "image_eda": - if not _is_on_google(): + if not is_on_google(): raise ValueError("The image_eda dataset is only available on Google Cloud") return "gs://lance-benchmarks-ci-datasets/image_eda.lance" return f"{_get_base_uri()}{name}" From 33ea6eed61ade89bc411391559f7ca7662cda829 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 8 Dec 2025 13:04:37 -0800 Subject: [PATCH 2/3] Don't run bitmap use_cache=False on GCS --- python/python/ci_benchmarks/benchmarks/test_search.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/python/ci_benchmarks/benchmarks/test_search.py b/python/python/ci_benchmarks/benchmarks/test_search.py index f7ab517701d..203fb97f13c 100644 --- a/python/python/ci_benchmarks/benchmarks/test_search.py +++ b/python/python/ci_benchmarks/benchmarks/test_search.py @@ -5,7 +5,7 @@ import lance import pytest -from ci_benchmarks.datasets import get_dataset_uri +from ci_benchmarks.datasets import get_dataset_uri, is_on_google from ci_benchmarks.utils import wipe_os_cache COLUMN_LABELS = ["bools", "normals"] @@ -177,6 +177,13 @@ def test_basic_btree_search( ] +# Don't run the no_cache test on Google Cloud as it is way too expensive at the moment +def use_cache_param(): + if is_on_google(): + return [True] + return [True, False] + + # Repeats the same test for the basic dataset which is easier to test with locally # This benchmark is not part of the CI job as the EDA dataset is better for that @pytest.mark.parametrize("filt", BASIC_BITMAP_FILTERS, ids=BASIC_BITMAP_FILTER_LABELS) From b2466e59683972af5c667848371f70fc4c947e35 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 8 Dec 2025 13:50:31 -0800 Subject: [PATCH 3/3] Don't run bitmap use_cache=False on GCS --- python/python/ci_benchmarks/benchmarks/test_search.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/python/ci_benchmarks/benchmarks/test_search.py b/python/python/ci_benchmarks/benchmarks/test_search.py index 203fb97f13c..7f0eb2f84b3 100644 --- a/python/python/ci_benchmarks/benchmarks/test_search.py +++ b/python/python/ci_benchmarks/benchmarks/test_search.py @@ -184,11 +184,17 @@ def use_cache_param(): return [True, False] +def use_cache_ids(): + if is_on_google(): + return ["cache"] + return ["cache", "no_cache"] + + # Repeats the same test for the basic dataset which is easier to test with locally # This benchmark is not part of the CI job as the EDA dataset is better for that @pytest.mark.parametrize("filt", BASIC_BITMAP_FILTERS, ids=BASIC_BITMAP_FILTER_LABELS) @pytest.mark.parametrize("payload", [None, "small_strings", "integers"]) -@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"]) +@pytest.mark.parametrize("use_cache", use_cache_param(), ids=use_cache_ids()) def test_basic_bitmap_search( benchmark, filt: str | None, payload: str | None, use_cache: bool ):