From 05bc413c495dd947b89afa2a8e4bbe2c0621114b Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 30 Dec 2025 11:56:16 +1100 Subject: [PATCH 1/2] fix statpopgen early exit Signed-off-by: Connor Tsui --- .github/workflows/sql-benchmarks.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 051bfdf265e..b674cd9fdbb 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -196,8 +196,11 @@ jobs: # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas # Lance is filtered out of df_formats because it uses a separate binary (lance-bench). - df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') - ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') + # + # The `|| true` is needed because some benchmarks don't use all engines. grep returns exit code 1 when no matches are found, and + # GitHub Actions uses `bash -eo pipefail` by default. + df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' || true) | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') + ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false") # Build options string if scale_factor is set @@ -266,8 +269,11 @@ jobs: # Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet" # # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas - df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') - ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') + # + # The `|| true` is needed because some benchmarks don't use all engines. grep returns exit + # code 1 when no matches are found, and GitHub Actions uses `bash -eo pipefail`. + df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') + ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') # Build options string if scale_factor is set opts="--opt remote-data-dir=${{ matrix.remote_storage }}" From a91fe81f83504a0c2cc405dd407e525cbd047360 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 30 Dec 2025 12:07:44 +1100 Subject: [PATCH 2/2] extract inline bash script from `sql-benchmarks.yml` Signed-off-by: Connor Tsui --- .github/scripts/run-sql-bench.sh | 128 +++++++++++++++++++++++++++ .github/workflows/sql-benchmarks.yml | 109 ++--------------------- 2 files changed, 134 insertions(+), 103 deletions(-) create mode 100755 .github/scripts/run-sql-bench.sh diff --git a/.github/scripts/run-sql-bench.sh b/.github/scripts/run-sql-bench.sh new file mode 100755 index 00000000000..682702a0966 --- /dev/null +++ b/.github/scripts/run-sql-bench.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets. +# This script is used by the sql-benchmarks.yml workflow. +# +# Usage: +# run-sql-bench.sh [options] +# +# Arguments: +# subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds) +# targets Comma-separated list of engine:format pairs +# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet") +# +# Options: +# --scale-factor Scale factor for the benchmark (e.g., 1.0, 10.0) +# --remote-storage Remote storage URL (e.g., s3://bucket/path/) +# If provided, runs in remote mode (no lance support). +# --benchmark-id Benchmark ID for error messages (e.g., tpch-s3) + +set -Eeu -o pipefail + +subcommand="$1" +targets="$2" +shift 2 + +scale_factor="" +remote_storage="" +benchmark_id="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --scale-factor) + scale_factor="$2" + shift 2 + ;; + --remote-storage) + remote_storage="$2" + shift 2 + ;; + --benchmark-id) + benchmark_id="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +is_remote=false +if [[ -n "$remote_storage" ]]; then + is_remote=true +fi + +# Lance on remote storage is not supported. The infrastructure to generate and upload lance files +# to S3 does not exist. If you need lance on S3, you must first implement: +# 1. Lance data generation in data-gen (or a separate step) +# 2. Lance data upload to S3 before this step runs +if $is_remote && echo "$targets" | grep -q 'lance'; then + echo "ERROR: Lance format is not supported for remote storage benchmarks." + echo "Remove 'datafusion:lance' from targets for benchmark '${benchmark_id:-unknown}'." + exit 1 +fi + +# Extract formats for each engine from the targets string. +# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet" +# +# Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas +# +# Lance is filtered out of df_formats because it uses a separate binary (lance-bench). +# +# The `|| true` is needed because some benchmarks don't use all engines (e.g., statpopgen only has +# duckdb targets). grep returns exit code 1 when no matches are found. +df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' || true) | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') +ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') +has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false") + +# Build options string. +opts="" +if $is_remote; then + opts="--opt remote-data-dir=$remote_storage" +fi +if [[ -n "$scale_factor" ]]; then + if [[ -n "$opts" ]]; then + opts="--opt scale-factor=$scale_factor $opts" + else + opts="--opt scale-factor=$scale_factor" + fi +fi + +touch results.json + +if [[ -n "$df_formats" ]]; then + # shellcheck disable=SC2086 + target/release_debug/datafusion-bench "$subcommand" \ + -d gh-json \ + --formats "$df_formats" \ + $opts \ + -o df-results.json + + cat df-results.json >> results.json +fi + +if [[ -n "$ddb_formats" ]]; then + # shellcheck disable=SC2086 + target/release_debug/duckdb-bench "$subcommand" \ + -d gh-json \ + --formats "$ddb_formats" \ + $opts \ + --delete-duckdb-database \ + -o ddb-results.json + + cat ddb-results.json >> results.json +fi + +# Lance-bench only runs for local benchmarks. +if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/lance-bench" ]]; then + # shellcheck disable=SC2086 + target/release_debug/lance-bench "$subcommand" \ + -d gh-json \ + $opts \ + -o lance-results.json + + cat lance-results.json >> results.json +fi diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index b674cd9fdbb..a99c324801d 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -190,59 +190,8 @@ jobs: OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}" OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}" run: | - # Extract formats for each engine from the targets string. - # Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet" - # - # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas - - # Lance is filtered out of df_formats because it uses a separate binary (lance-bench). - # - # The `|| true` is needed because some benchmarks don't use all engines. grep returns exit code 1 when no matches are found, and - # GitHub Actions uses `bash -eo pipefail` by default. - df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' || true) | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') - ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') - has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false") - - # Build options string if scale_factor is set - opts="" - if [ -n "${{ matrix.scale_factor }}" ]; then - opts="--opt scale-factor=${{ matrix.scale_factor }}" - fi - - touch results.json - - # Run datafusion-bench - if [ -n "$df_formats" ]; then - target/release_debug/datafusion-bench ${{ matrix.subcommand }} \ - -d gh-json \ - --formats "$df_formats" \ - $opts \ - -o df-results.json - - cat df-results.json >> results.json - fi - - # Run duckdb-bench - if [ -n "$ddb_formats" ]; then - target/release_debug/duckdb-bench ${{ matrix.subcommand }} \ - -d gh-json \ - --formats "$ddb_formats" \ - $opts \ - --delete-duckdb-database \ - -o ddb-results.json - - cat ddb-results.json >> results.json - fi - - # Run lance-bench - if [ "$has_lance" = "true" ] && [ -f "target/release_debug/lance-bench" ]; then - target/release_debug/lance-bench ${{ matrix.subcommand }} \ - -d gh-json \ - $opts \ - -o lance-results.json - - cat lance-results.json >> results.json - fi + .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \ + ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }} - name: Run ${{ matrix.name }} benchmark (remote) if: matrix.remote_storage != null && (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) @@ -255,56 +204,10 @@ jobs: OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}" OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}" run: | - # Lance on remote storage is not supported. The infrastructure to generate and upload - # lance files to S3 does not exist. If you need lance on S3, you must first implement: - # 1. Lance data generation in data-gen (or a separate step) - # 2. Lance data upload to S3 before this step runs - if echo "${{ matrix.targets }}" | grep -q 'lance'; then - echo "ERROR: Lance format is not supported for remote storage benchmarks." - echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'." - exit 1 - fi - - # Extract formats for each engine from the targets string. - # Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet" - # - # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas - # - # The `|| true` is needed because some benchmarks don't use all engines. grep returns exit - # code 1 when no matches are found, and GitHub Actions uses `bash -eo pipefail`. - df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') - ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') - - # Build options string if scale_factor is set - opts="--opt remote-data-dir=${{ matrix.remote_storage }}" - if [ -n "${{ matrix.scale_factor }}" ]; then - opts="--opt scale-factor=${{ matrix.scale_factor }} ${opts}" - fi - - touch results.json - - # Run datafusion-bench with remote storage - if [ -n "$df_formats" ]; then - target/release_debug/datafusion-bench ${{ matrix.subcommand }} \ - -d gh-json \ - --formats "$df_formats" \ - $opts \ - -o df-results.json - - cat df-results.json >> results.json - fi - - # Run duckdb-bench with remote storage - if [ -n "$ddb_formats" ]; then - target/release_debug/duckdb-bench ${{ matrix.subcommand }} \ - -d gh-json \ - --formats "$ddb_formats" \ - $opts \ - --delete-duckdb-database \ - -o ddb-results.json - - cat ddb-results.json >> results.json - fi + .github/scripts/run-sql-bench.sh "${{ matrix.subcommand }}" "${{ matrix.targets }}" \ + --remote-storage "${{ matrix.remote_storage }}" \ + --benchmark-id "${{ matrix.id }}" \ + ${{ matrix.scale_factor && format('--scale-factor {0}', matrix.scale_factor) || '' }} - name: Install uv if: inputs.mode == 'pr'