From 448d2df68c541f1826b7abde18d7fc640368aeac Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Dec 2025 12:58:59 -0500 Subject: [PATCH 1/4] Automatically download tpcds data --- benchmarks/bench.sh | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 975f4ec08fa88..fe2eeda06fdd6 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -631,20 +631,22 @@ data_tpch() { # Points to TPCDS data generation instructions data_tpcds() { - TPCDS_DIR="${DATA_DIR}" - - # Check if TPCDS data directory exists - if [ ! -d "${TPCDS_DIR}" ]; then - echo "" - echo "For TPC-DS data generation, please clone the datafusion-benchmarks repository:" - echo " git clone https://github.com/apache/datafusion-benchmarks" - echo "" - return 1 + TPCDS_DIR="${DATA_DIR}/tpcds_sf1" + + # Check if `web_site.parquet` exists in the TPCDS data directory to verify data presence + echo "Checking TPC-DS data directory: ${TPCDS_DIR}" + if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then + mkdir -p "${TPCDS_DIR}" + # Download the DataFusion benchmarks repository zip if it is not already downloaded + if [ ! -f "${DATA_DIR}/datafusion-benchmarks.zip" ]; then + echo "Downloading DataFusion benchmarks repository zip to: ${DATA_DIR}/datafusion-benchmarks.zip" + wget -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip + fi + echo "Extracting TPC-DS parquet data to ${TPCDS_DIR}..." + unzip -o -j -d "${TPCDS_DIR}" "${DATA_DIR}/datafusion-benchmarks.zip" datafusion-benchmarks-main/tpcds/data/sf1/* + echo "TPC-DS data extracted." fi - - echo "" - echo "TPC-DS data already exists in ${TPCDS_DIR}" - echo "" + echo "Done." } # Runs the tpch benchmark @@ -682,11 +684,11 @@ run_tpch_mem() { # Runs the tpcds benchmark run_tpcds() { - TPCDS_DIR="${DATA_DIR}" + TPCDS_DIR="${DATA_DIR}/tpcds_sf1" # Check if TPCDS data directory exists if [ ! -d "${TPCDS_DIR}" ]; then - echo "Error: TPC-DS data directory does not exist: ${TPCDS_DIR}" >&2 + echo "Creating TPC-DS data directory: ${TPCDS_DIR}" >&2 echo "" >&2 echo "Please prepare TPC-DS data first by following instructions:" >&2 echo " ./bench.sh data tpcds" >&2 From 8adbf6f9bcaf1f5b882fa676ee70271b206070eb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 9 Dec 2025 13:02:38 -0500 Subject: [PATCH 2/4] cleanup --- benchmarks/bench.sh | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index fe2eeda06fdd6..293d5888674ea 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -686,19 +686,8 @@ run_tpch_mem() { run_tpcds() { TPCDS_DIR="${DATA_DIR}/tpcds_sf1" - # Check if TPCDS data directory exists - if [ ! -d "${TPCDS_DIR}" ]; then - echo "Creating TPC-DS data directory: ${TPCDS_DIR}" >&2 - echo "" >&2 - echo "Please prepare TPC-DS data first by following instructions:" >&2 - echo " ./bench.sh data tpcds" >&2 - echo "" >&2 - exit 1 - fi - - # Check if directory contains parquet files - if ! find "${TPCDS_DIR}" -name "*.parquet" -print -quit | grep -q .; then - echo "Error: TPC-DS data directory exists but contains no parquet files: ${TPCDS_DIR}" >&2 + # Check if TPCDS data directory and representative file exists + if [ ! -f "${TPCDS_DIR}/web_site.parquet" ]; then echo "" >&2 echo "Please prepare TPC-DS data first by following instructions:" >&2 echo " ./bench.sh data tpcds" >&2 From 0462ee2224f85047a8fcd8c6687457a5a130192e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Dec 2025 17:15:37 -0500 Subject: [PATCH 3/4] update comment --- benchmarks/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 293d5888674ea..64af9c95a145b 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -629,7 +629,7 @@ data_tpch() { exit 1 } -# Points to TPCDS data generation instructions +# Downloads TPC-DS data data_tpcds() { TPCDS_DIR="${DATA_DIR}/tpcds_sf1" From 714d7219aef7f42629a898b7d449113a03e07f33 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Dec 2025 17:17:39 -0500 Subject: [PATCH 4/4] Apply suggestion from @martin-g Co-authored-by: Martin Grigorov --- benchmarks/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 293d5888674ea..9932f0c76dc0d 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -640,7 +640,7 @@ data_tpcds() { # Download the DataFusion benchmarks repository zip if it is not already downloaded if [ ! -f "${DATA_DIR}/datafusion-benchmarks.zip" ]; then echo "Downloading DataFusion benchmarks repository zip to: ${DATA_DIR}/datafusion-benchmarks.zip" - wget -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip + wget --timeout=30 --tries=3 -O "${DATA_DIR}/datafusion-benchmarks.zip" https://github.com/apache/datafusion-benchmarks/archive/refs/heads/main.zip fi echo "Extracting TPC-DS parquet data to ${TPCDS_DIR}..." unzip -o -j -d "${TPCDS_DIR}" "${DATA_DIR}/datafusion-benchmarks.zip" datafusion-benchmarks-main/tpcds/data/sf1/*