From ff1ef378542523ca2b1ee28899c3587653941a63 Mon Sep 17 00:00:00 2001 From: comphead Date: Mon, 29 Dec 2025 09:52:41 -0800 Subject: [PATCH 1/2] chore: Add TPCDS benchmark comparison for PR --- benchmarks/README.md | 17 ++++++++++++++++- benchmarks/bench.sh | 2 +- datafusion/expr/src/udf.rs | 6 +++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 0b71628b2db1..b0aa4277e687 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -157,7 +157,7 @@ To get data in `DATA_DIR` for TPCDS, please follow instructions in `./benchmarks DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ ./benchmarks/compare_tpcds.sh main mybranch ``` -Alternatively you can compare manually followng the example velor +Alternatively, you can compare manually following the example below ```shell git checkout main @@ -240,6 +240,21 @@ Benchmark tpch_mem.json └──────────────┴──────────────┴──────────────┴───────────────┘ ``` +## Comparing performance of main and a PR + +### TPCDS + +Considering you already have TPCDS data locally + +```shell +export DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ +export PR_NUMBER=19464 +gh pr checkout $PR_NUMBER --repo apache/datafusion -b pr-$PR_NUMBER +git checkout main +git pull +./benchmarks/compare_tpcds.sh main pr-$PR_NUMBER +``` + ### Running Benchmarks Manually Assuming data is in the `data` directory, the `tpch` benchmark can be run with a command like this: diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index d5fa52d7f00e..6679405623d0 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -684,7 +684,7 @@ run_tpch_mem() { # Runs the tpcds benchmark run_tpcds() { - TPCDS_DIR="${DATA_DIR}/tpcds_sf1" + TPCDS_DIR="${DATA_DIR}" # Check if TPCDS data directory and representative file exists if [ ! 
-f "${TPCDS_DIR}/web_site.parquet" ]; then diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 26d7fc99cb17..28a07ad76101 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -24,10 +24,10 @@ use crate::sort_properties::{ExprProperties, SortProperties}; use crate::udf_eq::UdfEq; use crate::{ColumnarValue, Documentation, Expr, Signature}; use arrow::datatypes::{DataType, Field, FieldRef}; +#[cfg(debug_assertions)] +use datafusion_common::assert_or_internal_err; use datafusion_common::config::ConfigOptions; -use datafusion_common::{ - ExprSchema, Result, ScalarValue, assert_or_internal_err, not_impl_err, -}; +use datafusion_common::{ExprSchema, Result, ScalarValue, not_impl_err}; use datafusion_expr_common::dyn_eq::{DynEq, DynHash}; use datafusion_expr_common::interval_arithmetic::Interval; use std::any::Any; From e8f5f97c021591bb8a4b1f942837943954a297a2 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 1 Jan 2026 19:55:26 -0800 Subject: [PATCH 2/2] chore: Add TPCDS benchmark comparison for PR --- benchmarks/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index b0aa4277e687..7e9818aef24f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -249,12 +249,14 @@ Considering you already have TPCDS data locally ```shell export DATA_DIR=../../datafusion-benchmarks/tpcds/data/sf1/ export PR_NUMBER=19464 -gh pr checkout $PR_NUMBER --repo apache/datafusion -b pr-$PR_NUMBER +git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER git checkout main git pull ./benchmarks/compare_tpcds.sh main pr-$PR_NUMBER ``` +Note: if `gh` is installed, you can also run `gh pr checkout $PR_NUMBER -b pr-$PR_NUMBER` instead of `git fetch upstream pull/$PR_NUMBER/head:pr-$PR_NUMBER` (the `-b` flag names the local branch `pr-$PR_NUMBER`, which the compare command above expects). + ### Running Benchmarks Manually Assuming data is in the `data` directory, the `tpch` benchmark can be run with a command like this: