From 4a8af102c3dc9d210c781263a2f1cc01c88d6d49 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 8 Jul 2023 13:14:53 -0400 Subject: [PATCH 1/3] Minor: Add TPCH scale factor 10 to bench.sh --- benchmarks/bench.sh | 81 +++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index dee6896aec388..aacbe65ad0095 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -66,6 +66,8 @@ compare: Comares results from benchmark runs all(default): Data/Run/Compare for all benchmarks tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory +tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~1GB), single parquet file per table +tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~1GB), query from memory parquet: Benchmark of parquet reader's filtering speed sort: Benchmark of sorting speed @@ -124,14 +126,22 @@ main() { echo "***************************" case "$BENCHMARK" in all) - data_tpch + data_tpch "1" + data_tpch "10" ;; tpch) - data_tpch + data_tpch "1" ;; tpch_mem) - # same data for tpch_mem - data_tpch + # same data as for tpch + data_tpch "1" + ;; + tpch10) + data_tpch "10" + ;; + tpch_mem10) + # same data as for tpch10 + data_tpch "10" ;; *) echo "Error: unknown benchmark '$BENCHMARK' for data generation" @@ -162,16 +172,24 @@ main() { mkdir -p "${RESULTS_DIR}" case "$BENCHMARK" in all) - run_tpch - run_tpch_mem + run_tpch "1" + run_tpch_mem "1" + run_tpch "10" + run_tpch_mem "10" run_parquet run_sort ;; tpch) - run_tpch + run_tpch "1" ;; tpch_mem) - run_tpch_mem + run_tpch_mem "1" + ;; + tpch10) + run_tpch "10" + ;; + tpch_mem10) + run_tpch_mem "10" ;; parquet) run_parquet @@ -202,59 +220,80 @@ main() { # Creates TPCH data if it doesn't already exist +# call like: data_tpch($scale_factor) data_tpch() { - echo "Creating tpch dataset..." 
+ SCALE_FACTOR=$1 + if [ -z "$SCALE_FACTOR" ] ; then + echo "Internal error: Scale factor not specified" + exit 1 + fi + + TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" + echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in ${TPCH_DIR}..." # Ensure the target data directory exists - mkdir -p "${DATA_DIR}" + mkdir -p "${TPCH_DIR}" # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already exist - SCALE_FACTOR=1 - FILE="${DATA_DIR}/supplier.tbl" + FILE="${TPCH_DIR}/supplier.tbl" if test -f "${FILE}"; then echo " tbl files exist ($FILE exists)." else echo " creating tbl files with tpch_dbgen..." - docker run -v "${DATA_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s ${SCALE_FACTOR} + docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/databloom-ai/tpch-docker:main -vf -s ${SCALE_FACTOR} fi # Copy expected answers into the ./data/answers directory if it does not already exist - FILE="${DATA_DIR}/answers/q1.out" + FILE="${TPCH_DIR}/answers/q1.out" if test -f "${FILE}"; then echo " Expected answers exist (${FILE} exists)." else - echo " Copying answers to ${DATA_DIR}/answers" - mkdir -p "${DATA_DIR}/answers" - docker run -v "${DATA_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" + echo " Copying answers to ${TPCH_DIR}/answers" + mkdir -p "${TPCH_DIR}/answers" + docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm ghcr.io/databloom-ai/tpch-docker:main -c "cp -f /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" fi # Create 'parquet' files from tbl - FILE="${DATA_DIR}/supplier" + FILE="${TPCH_DIR}/supplier" if test -d "${FILE}"; then echo " parquet files exist ($FILE exists)." else echo " creating parquet files using benchmark binary ..." 
pushd "${SCRIPT_DIR}" > /dev/null - $CARGO_COMMAND --bin tpch -- convert --input "${DATA_DIR}" --output "${DATA_DIR}" --format parquet + $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet popd > /dev/null fi } # Runs the tpch benchmark run_tpch() { + SCALE_FACTOR=$1 + if [ -z "$SCALE_FACTOR" ] ; then + echo "Internal error: Scale factor not specified" + exit 1 + fi + TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" + RESULTS_FILE="${RESULTS_DIR}/tpch.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch benchmark..." - $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" --format parquet -o ${RESULTS_FILE} + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE} } # Runs the tpch in memory run_tpch_mem() { + SCALE_FACTOR=$1 + if [ -z "$SCALE_FACTOR" ] ; then + echo "Internal error: Scale factor not specified" + exit 1 + fi + TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" + RESULTS_FILE="${RESULTS_DIR}/tpch_mem.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch_mem benchmark..." 
# -m means in memory - $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o ${RESULTS_FILE} + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE} } # Runs the parquet filter benchmark From 2cc5c55dc53c1e7a6d3726a1ccaea5ade7ab2cfd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 9 Jul 2023 06:24:12 -0400 Subject: [PATCH 2/3] improve comments --- benchmarks/bench.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index aacbe65ad0095..d318b35992479 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -66,8 +66,8 @@ compare: Comares results from benchmark runs all(default): Data/Run/Compare for all benchmarks tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory -tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~1GB), single parquet file per table -tpch10_mem: TPCH inspired benchmark on Scale Factor (SF) 10 (~1GB), query from memory +tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table +tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory parquet: Benchmark of parquet reader's filtering speed sort: Benchmark of sorting speed @@ -219,8 +219,14 @@ main() { -# Creates TPCH data if it doesn't already exist +# Creates TPCH data at a certain scale factor, if it doesn't already +# exist +# # call like: data_tpch($scale_factor) +# +# Creates data in $DATA_DIR/tpch_sf1 for scale factor 1 +# Creates data in $DATA_DIR/tpch_sf10 for scale factor 10 +# etc data_tpch() { SCALE_FACTOR=$1 if [ -z "$SCALE_FACTOR" ] ; then From bd6812d3d6708a27f33c6a813397b96c23e62519 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 10 Jul 2023 15:46:08 -0400 Subject: [PATCH 3/3] Run 10
iterations --- benchmarks/bench.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index d318b35992479..05236ad5ade6d 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -283,7 +283,7 @@ run_tpch() { RESULTS_FILE="${RESULTS_DIR}/tpch.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch benchmark..." - $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE} + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" --format parquet -o ${RESULTS_FILE} } # Runs the tpch in memory @@ -299,7 +299,7 @@ run_tpch_mem() { echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch_mem benchmark..." # -m means in memory - $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE} + $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 10 --path "${TPCH_DIR}" -m --format parquet -o ${RESULTS_FILE} } # Runs the parquet filter benchmark @@ -307,7 +307,7 @@ run_parquet() { RESULTS_FILE="${RESULTS_DIR}/parquet.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running parquet filter benchmark..." - $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE} + $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE} } # Runs the sort benchmark @@ -315,7 +315,7 @@ run_sort() { RESULTS_FILE="${RESULTS_DIR}/sort.json" echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running sort benchmark..." - $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE} + $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 10 -o ${RESULTS_FILE} } compare_benchmarks() {