From e42d8df1d9514c89e42c1a7414953eff3c40b61d Mon Sep 17 00:00:00 2001
From: Qi Zhu <821684824@qq.com>
Date: Tue, 31 Mar 2026 10:41:09 +0800
Subject: [PATCH] feat: generate reversed-name data for sort pushdown benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use tpchgen --parts=3 to create 3 sorted non-overlapping parquet files,
then rename so alphabetical order is reversed vs key order:
  a_part3.parquet (highest keys, sorts first alphabetically)
  b_part2.parquet
  c_part1.parquet (lowest keys, sorts last alphabetically)

This is much simpler than the previous approach (no datafusion-cli needed).

Data generation is skipped only when all three renamed files exist, and a
failed tpchgen-cli run or rename returns non-zero instead of reporting
success.

Release benchmark results (6M rows, single partition):
  Q1 ORDER BY ASC: main 259ms → PR 122ms (53% faster)
  Q3 SELECT * ORDER BY: main 700ms → PR 353ms (50% faster)
---
 benchmarks/bench.sh | 66 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b8c9ff5c8ec4..badf9ce4352a 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -314,8 +314,7 @@ main() {
                     data_tpch "1" "parquet"
                     ;;
                 sort_pushdown|sort_pushdown_sorted)
-                    # same data as for tpch
-                    data_tpch "1" "parquet"
+                    data_sort_pushdown
                     ;;
                 sort_tpch)
                     # same data as for tpch
@@ -1085,19 +1084,74 @@ run_external_aggr() {
 }
 
+# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
+# renamed so alphabetical order does NOT match sort key order.
+# This forces the sort pushdown optimizer to reorder files by statistics.
+#
+# tpchgen produces 3 sorted, non-overlapping parquet files:
+#   lineitem.1.parquet: l_orderkey 1 ~ 2M  (lowest keys)
+#   lineitem.2.parquet: l_orderkey 2M ~ 4M
+#   lineitem.3.parquet: l_orderkey 4M ~ 6M  (highest keys)
+#
+# We rename them so alphabetical order is reversed:
+#   a_part3.parquet  (highest keys, sorts first alphabetically)
+#   b_part2.parquet
+#   c_part1.parquet  (lowest keys, sorts last alphabetically)
+#
+# Generation is skipped only when all three renamed files already exist.
+# Returns non-zero if tpchgen-cli or any rename fails.
+data_sort_pushdown() {
+    local lineitem_dir="${DATA_DIR}/sort_pushdown/lineitem"
+    local temp_dir="${DATA_DIR}/sort_pushdown_temp"
+
+    # Require every file: a run interrupted between renames is not complete
+    if [ -f "${lineitem_dir}/a_part3.parquet" ] &&
+       [ -f "${lineitem_dir}/b_part2.parquet" ] &&
+       [ -f "${lineitem_dir}/c_part1.parquet" ]; then
+        echo "Sort pushdown data already exists at ${lineitem_dir}"
+        return
+    fi
+
+    echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."
+
+    # Discard leftovers of an earlier partial run before regenerating
+    rm -rf "${temp_dir}" "${lineitem_dir}"
+    mkdir -p "${temp_dir}" "${lineitem_dir}"
+
+    if ! tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${temp_dir}"; then
+        echo "Failed to generate sort pushdown benchmark data" >&2
+        rm -rf "${temp_dir}"
+        return 1
+    fi
+
+    # Rename: reverse alphabetical order vs key order
+    if ! mv "${temp_dir}/lineitem/lineitem.3.parquet" "${lineitem_dir}/a_part3.parquet" ||
+       ! mv "${temp_dir}/lineitem/lineitem.2.parquet" "${lineitem_dir}/b_part2.parquet" ||
+       ! mv "${temp_dir}/lineitem/lineitem.1.parquet" "${lineitem_dir}/c_part1.parquet"; then
+        echo "Failed to rename generated lineitem files" >&2
+        rm -rf "${temp_dir}"
+        return 1
+    fi
+
+    rm -rf "${temp_dir}"
+
+    echo "Sort pushdown data generated at ${lineitem_dir}"
+    ls -la "${lineitem_dir}"
+}
+
 # Runs the sort pushdown benchmark (without WITH ORDER)
 run_sort_pushdown() {
-    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
     RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
     echo "Running sort pushdown benchmark (no WITH ORDER)..."
-    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
 run_sort_pushdown_sorted() {
-    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
     RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
     echo "Running sort pushdown benchmark (with WITH ORDER)..."
-    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
 }
 
 # Runs the sort integration benchmark