Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion python/python/ci_benchmarks/benchmarks/test_random_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import lance
import pytest
from ci_benchmarks.datasets import open_dataset
from ci_benchmarks.datasets import is_on_google, open_dataset

# POSIX fadvise flag to drop page cache
POSIX_FADV_DONTNEED = 4
Expand Down Expand Up @@ -65,6 +65,7 @@ def setup():

@pytest.mark.parametrize("dataset", DATASETS)
@pytest.mark.parametrize("rows_per_take", [1, 10, 100])
@pytest.mark.skipif(is_on_google(), reason="Requires too many IOPS for cloud storage")
def test_parallel_random_access(benchmark, dataset, rows_per_take):
TAKES_PER_ITER = 100

Expand Down
17 changes: 15 additions & 2 deletions python/python/ci_benchmarks/benchmarks/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import lance
import pytest
from ci_benchmarks.datasets import get_dataset_uri
from ci_benchmarks.datasets import get_dataset_uri, is_on_google
from ci_benchmarks.utils import wipe_os_cache

COLUMN_LABELS = ["bools", "normals"]
Expand Down Expand Up @@ -177,11 +177,24 @@ def test_basic_btree_search(
]


# The no_cache variant is currently far too expensive to run on Google Cloud,
# so it is dropped from the parametrization there.
def use_cache_param():
    """Return the ``use_cache`` parametrize values for this environment."""
    return [True] if is_on_google() else [True, False]


def use_cache_ids():
    """Return test ids aligned one-to-one with ``use_cache_param()``."""
    ids = ["cache"]
    if not is_on_google():
        # Off Google Cloud the uncached variant also runs; add its id.
        ids.append("no_cache")
    return ids


# Repeats the same test for the basic dataset which is easier to test with locally
# This benchmark is not part of the CI job as the EDA dataset is better for that
@pytest.mark.parametrize("filt", BASIC_BITMAP_FILTERS, ids=BASIC_BITMAP_FILTER_LABELS)
@pytest.mark.parametrize("payload", [None, "small_strings", "integers"])
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
@pytest.mark.parametrize("use_cache", use_cache_param(), ids=use_cache_ids())
def test_basic_bitmap_search(
benchmark, filt: str | None, payload: str | None, use_cache: bool
):
Expand Down
6 changes: 3 additions & 3 deletions python/python/ci_benchmarks/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from lance.log import LOGGER


def _is_on_google() -> bool:
def is_on_google() -> bool:
LOGGER.info("Testing if running on Google Cloud")
try:
rsp = requests.get("http://metadata.google.internal", timeout=5)
Expand All @@ -22,7 +22,7 @@ def _is_on_google() -> bool:

@cache
def _get_base_uri() -> str:
if _is_on_google():
if is_on_google():
LOGGER.info("Running on Google Cloud, using gs://lance-benchmarks-ci-datasets/")
return "gs://lance-benchmarks-ci-datasets/"
else:
Expand All @@ -38,7 +38,7 @@ def get_dataset_uri(name: str) -> str:
# This is a custom-built dataset, on a unique bucket, that is too big to reproduce
# locally
if name == "image_eda":
if not _is_on_google():
if not is_on_google():
raise ValueError("The image_eda dataset is only available on Google Cloud")
return "gs://lance-benchmarks-ci-datasets/image_eda.lance"
return f"{_get_base_uri()}{name}"
Expand Down
Loading