From 288313e72cd25d4fdcc8156d1ff78d1cd77ea76b Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 16:03:57 +0800 Subject: [PATCH 01/28] Add UDF equality checks and new test module for UDFs --- datafusion/expr/src/test/mod.rs | 2 + datafusion/expr/src/test/udf_equals.rs | 86 ++++++++++++++++++++++++++ datafusion/expr/src/udf.rs | 9 ++- 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 datafusion/expr/src/test/udf_equals.rs diff --git a/datafusion/expr/src/test/mod.rs b/datafusion/expr/src/test/mod.rs index 04e1ccc47465..f58e6902e0bb 100644 --- a/datafusion/expr/src/test/mod.rs +++ b/datafusion/expr/src/test/mod.rs @@ -16,3 +16,5 @@ // under the License. pub mod function_stub; +#[cfg(test)] +pub mod udf_equals; diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs new file mode 100644 index 000000000000..083bd35aad4a --- /dev/null +++ b/datafusion/expr/src/test/udf_equals.rs @@ -0,0 +1,86 @@ +use crate::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, +}; +use arrow::datatypes::DataType; +use datafusion_common::{not_impl_err, Result}; +use std::any::Any; + +#[derive(Debug)] +#[allow(dead_code)] +struct ParamUdf { + param: i32, + signature: Signature, +} + +impl ParamUdf { + fn new(param: i32) -> Self { + Self { + param, + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for ParamUdf { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "param_udf" + } + fn signature(&self) -> &Signature { + &self.signature + } + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int32) + } + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + not_impl_err!("not used") + } +} + +#[derive(Debug)] +#[allow(dead_code)] +struct AnotherParamUdf { + signature: Signature, +} + +impl AnotherParamUdf { + fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for AnotherParamUdf { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "param_udf" + } + fn signature(&self) -> &Signature { + &self.signature + } + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int32) + } + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + not_impl_err!("not used") + } +} + +#[test] +fn different_instances_not_equal() { + let udf1 = ScalarUDF::from(ParamUdf::new(1)); + let udf2 = ScalarUDF::from(ParamUdf::new(2)); + assert_ne!(udf1, udf2); +} + +#[test] +fn different_types_not_equal() { + let udf1 = ScalarUDF::from(ParamUdf::new(1)); + let udf2 = ScalarUDF::from(AnotherParamUdf::new()); + assert_ne!(udf1, udf2); +} diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 81865a836d2c..6badb08b6db1 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -703,9 +703,14 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// - symmetric: `a.equals(b)` implies `b.equals(a)`; /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. /// - /// By default, compares [`Self::name`] and [`Self::signature`]. + /// By default, compares [`Self::as_any().type_id`], [`Self::name`] and + /// [`Self::signature`]. Different instances of the same function type are + /// therefore considered equal only if they are pointer equal. fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - self.name() == other.name() && self.signature() == other.signature() + std::ptr::eq(self.as_any(), other.as_any()) + || (self.as_any().type_id() == other.as_any().type_id() + && self.name() == other.name() + && self.signature() == other.signature()) } /// Returns a hash value for this scalar UDF. From 37a37f113574dfc5bb6bd5e14652b02cee616074 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 16:23:54 +0800 Subject: [PATCH 02/28] Enhance equality check for Scalar UDFs to improve accuracy in comparison logic --- datafusion/expr/src/udf.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 6badb08b6db1..c3a489edf1e9 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -707,10 +707,30 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// [`Self::signature`]. Different instances of the same function type are /// therefore considered equal only if they are pointer equal. fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - std::ptr::eq(self.as_any(), other.as_any()) - || (self.as_any().type_id() == other.as_any().type_id() - && self.name() == other.name() - && self.signature() == other.signature()) + // if the pointers are the same, the UDFs are the same + if std::ptr::eq(self.as_any(), other.as_any()) { + return true; + } + + // if the types are different, the UDFs are not the same + if self.as_any().type_id() != other.as_any().type_id() { + return false; + } + + // if the names are different, the UDFs are not the same + if self.name() != other.name() { + return false; + } + + // if the signatures are different, the UDFs are not the same + if self.signature() != other.signature() { + return false; + } + + // if the types, names and signatures are the same, we can't know if they are the same so we + // assume they are not. + // If a UDF has internal state that should be compared, it should implement this method + false } /// Returns a hash value for this scalar UDF. From b49303ef4279e4f036f8747338b7a460f2b597c7 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 16:38:51 +0800 Subject: [PATCH 03/28] Add equality tests for Scalar UDF instances and types --- datafusion/expr/src/test/udf_equals.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index 083bd35aad4a..33a70f747c6a 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -84,3 +84,17 @@ fn different_types_not_equal() { let udf2 = ScalarUDF::from(AnotherParamUdf::new()); assert_ne!(udf1, udf2); } + +#[test] +fn same_instances_equal() { + let udf1 = ScalarUDF::from(ParamUdf::new(1)); + let udf2 = ScalarUDF::from(ParamUdf::new(1)); + assert_eq!(udf1, udf2); +} + +#[test] +fn same_types_equal() { + let udf1 = ScalarUDF::from(AnotherParamUdf::new()); + let udf2 = ScalarUDF::from(AnotherParamUdf::new()); + assert_eq!(udf1, udf2); +} \ No newline at end of file From 0184ef2ab978ab2a9ba2792fbe5d7a0d21e4e4cc Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:03:02 +0800 Subject: [PATCH 04/28] Refactor equality check in ScalarUDFImpl to simplify comparison logic --- datafusion/expr/src/udf.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index c3a489edf1e9..d0cd19630738 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -712,21 +712,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { return true; } - // if the types are different, the UDFs are not the same - if self.as_any().type_id() != other.as_any().type_id() { - return false; - } - - // if the names are different, the UDFs are not the same - if self.name() != other.name() { - return false; - } - - // if the signatures are different, the UDFs are not the same - if self.signature() != other.signature() { - return false; - } - // if the types, names and signatures are the same, we can't know if they are the same so we // assume they are not. // If a UDF has internal state that should be compared, it should implement this method From 3b1bd00e933acfa03bb33faff38a30306b2a79a1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:03:29 +0800 Subject: [PATCH 05/28] Implement equality checks for ParamUdf and AnotherParamUdf to enhance comparison logic --- datafusion/expr/src/test/udf_equals.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index 33a70f747c6a..c85c9646f188 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -37,6 +37,13 @@ impl ScalarUDFImpl for ParamUdf { fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { not_impl_err!("not used") } + fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + if let Some(other) = other.as_any().downcast_ref::() { + self.param == other.param && self.type_id() == other.type_id() + } else { + false + } + } } #[derive(Debug)] @@ -69,6 +76,13 @@ impl ScalarUDFImpl for AnotherParamUdf { fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { not_impl_err!("not used") } + fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + if let Some(other) = other.as_any().downcast_ref::() { + self.type_id() == other.type_id() + } else { + false + } + } } #[test] @@ -86,7 +100,7 @@ fn different_types_not_equal() { } #[test] -fn same_instances_equal() { +fn same_state_equal() { let udf1 = ScalarUDF::from(ParamUdf::new(1)); let udf2 = ScalarUDF::from(ParamUdf::new(1)); assert_eq!(udf1, udf2); @@ -97,4 +111,4 @@ fn same_types_equal() { let udf1 = ScalarUDF::from(AnotherParamUdf::new()); let udf2 = ScalarUDF::from(AnotherParamUdf::new()); assert_eq!(udf1, udf2); -} \ No newline at end of file +} From d0b131800f52021e38d2fc05c02ee5c039e20d37 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:05:09 +0800 Subject: [PATCH 06/28] Refactor AnotherParamUdf to SignatureUdf for consistency in UDF implementation --- datafusion/expr/src/test/udf_equals.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index c85c9646f188..3008a5211ecb 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -48,11 +48,11 @@ impl ScalarUDFImpl for ParamUdf { #[derive(Debug)] #[allow(dead_code)] -struct AnotherParamUdf { +struct SignatureUdf { signature: Signature, } -impl AnotherParamUdf { +impl SignatureUdf { fn new() -> Self { Self { signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), @@ -60,7 +60,7 @@ impl AnotherParamUdf { } } -impl ScalarUDFImpl for AnotherParamUdf { +impl ScalarUDFImpl for SignatureUdf { fn as_any(&self) -> &dyn Any { self } @@ -77,7 +77,7 @@ impl ScalarUDFImpl for AnotherParamUdf { not_impl_err!("not used") } fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - if let Some(other) = other.as_any().downcast_ref::() { + if let Some(other) = other.as_any().downcast_ref::() { self.type_id() == other.type_id() } else { false @@ -95,7 +95,7 @@ fn different_instances_not_equal() { #[test] fn different_types_not_equal() { let udf1 = ScalarUDF::from(ParamUdf::new(1)); - let udf2 = ScalarUDF::from(AnotherParamUdf::new()); + let udf2 = ScalarUDF::from(SignatureUdf::new()); assert_ne!(udf1, udf2); } @@ -108,7 +108,7 @@ fn same_state_equal() { #[test] fn same_types_equal() { - let udf1 = ScalarUDF::from(AnotherParamUdf::new()); - let udf2 = ScalarUDF::from(AnotherParamUdf::new()); + let udf1 = ScalarUDF::from(SignatureUdf::new()); + let udf2 = ScalarUDF::from(SignatureUdf::new()); assert_eq!(udf1, udf2); } From 612877995c81843c244cf0dd29f1daf13b946c31 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:09:03 +0800 Subject: [PATCH 07/28] Add DefaultParamUdf implementation and equality tests for comparison logic --- datafusion/expr/src/test/udf_equals.rs | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index 3008a5211ecb..6265ac72ab9e 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -85,6 +85,40 @@ impl ScalarUDFImpl for SignatureUdf { } } +#[derive(Debug)] +#[allow(dead_code)] +struct DefaultParamUdf { + param: i32, + signature: Signature, +} + +impl DefaultParamUdf { + fn new(param: i32) -> Self { + Self { + param, + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for DefaultParamUdf { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "default_param_udf" + } + fn signature(&self) -> &Signature { + &self.signature + } + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int32) + } + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + not_impl_err!("not used") + } +} + #[test] fn different_instances_not_equal() { let udf1 = ScalarUDF::from(ParamUdf::new(1)); @@ -112,3 +146,17 @@ fn same_types_equal() { let udf2 = ScalarUDF::from(SignatureUdf::new()); assert_eq!(udf1, udf2); } + +#[test] +fn default_udfs_with_same_param_not_equal() { + let udf1 = ScalarUDF::from(DefaultParamUdf::new(1)); + let udf2 = ScalarUDF::from(DefaultParamUdf::new(1)); + assert_ne!(udf1, udf2); +} + +#[test] +fn default_udfs_with_different_param_not_equal() { + let udf1 = ScalarUDF::from(DefaultParamUdf::new(1)); + let udf2 = ScalarUDF::from(DefaultParamUdf::new(2)); + assert_ne!(udf1, udf2); +} From 579f3c49a07ac71480733f879fbbcf9bb11ef293 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:17:49 +0800 Subject: [PATCH 08/28] Enhance documentation for equals method in ScalarUDFImpl to clarify equality rules and provide implementation examples --- datafusion/expr/src/udf.rs | 39 ++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index d0cd19630738..4b4eb3d53b17 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -696,16 +696,39 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// Return true if this scalar UDF is equal to the other. /// - /// Allows customizing the equality of scalar UDFs. - /// Must be consistent with [`Self::hash_value`] and follow the same rules as [`Eq`]: + /// This method allows customizing the equality of scalar UDFs. It must adhere to the rules of equivalence: /// - /// - reflexive: `a.equals(a)`; - /// - symmetric: `a.equals(b)` implies `b.equals(a)`; - /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. + /// - Reflexive: `a.equals(a)` must return true. + /// - Symmetric: `a.equals(b)` implies `b.equals(a)`. + /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`. /// - /// By default, compares [`Self::as_any().type_id`], [`Self::name`] and - /// [`Self::signature`]. Different instances of the same function type are - /// therefore considered equal only if they are pointer equal. + /// # Default Behavior + /// By default, this method compares the type IDs, names, and signatures of the two UDFs. If these match, + /// the method assumes the UDFs are not equal unless their pointers are the same. This conservative approach + /// ensures that different instances of the same function type are not mistakenly considered equal. + /// + /// # Custom Implementation + /// If a UDF has internal state or additional properties that should be considered for equality, this method + /// should be overridden. For example, a UDF with parameters might compare those parameters in addition to + /// the default checks. + /// + /// # Example + /// ```rust + /// impl ScalarUDFImpl for MyUdf { + /// fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + /// if let Some(other) = other.as_any().downcast_ref::() { + /// self.param == other.param && self.name() == other.name() + /// } else { + /// false + /// } + /// } + /// } + /// ``` + /// + /// # Notes + /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, + /// their hash values must also be the same. + /// - Ensure that the implementation does not panic or cause undefined behavior for any input. fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { // if the pointers are the same, the UDFs are the same if std::ptr::eq(self.as_any(), other.as_any()) { From 83b7d352c712d4d27e11933b4a38da986e696019 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:25:42 +0800 Subject: [PATCH 09/28] Add hash_value method to ParamUdf for improved equality checks --- datafusion/expr/src/test/udf_equals.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index 6265ac72ab9e..de333a9287e0 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -44,6 +44,12 @@ impl ScalarUDFImpl for ParamUdf { false } } + fn hash_value(&self) -> u64 { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + self.param.hash(&mut hasher); + self.type_id().hash(&mut hasher); + hasher.finish() + } } #[derive(Debug)] From a7a8c021a3b901fcf858c65a8213eec532a0634e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:25:50 +0800 Subject: [PATCH 10/28] Document breaking change in ScalarUDFImpl::equals method default implementation --- docs/source/library-user-guide/upgrading.md | 27 +++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index dd702c263614..9164ba73926e 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -62,6 +62,32 @@ DataFusionError::SchemaError( [#16652]: https://github.com/apache/datafusion/issues/16652 +#### `ScalarUDFImpl::equals` Default Implementation + +The default implementation of the `equals` method in the `ScalarUDFImpl` trait has been updated. Previously, it compared only the type IDs, names, and signatures of UDFs. Now, it assumes UDFs are not equal unless their pointers are the same. + +**Impact:** +- This change may affect any custom UDF implementations relying on the default `equals` behavior. +- If your UDFs have internal state or additional properties that should be considered for equality, you must override the `equals` method to include those comparisons. + +**Action Required:** +- Review your UDF implementations and ensure the `equals` method is overridden where necessary. +- Update any tests or logic that depend on the previous default behavior. + +**Example:** +```rust +impl ScalarUDFImpl for MyUdf { + fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + if let Some(other) = other.as_any().downcast_ref::() { + self.param == other.param && self.name() == other.name() + } else { + false + } + } +} +``` +[#16677] https://github.com/apache/datafusion/issues/16677 + ### `datafusion.execution.collect_statistics` now defaults to `true` The default value of the `datafusion.execution.collect_statistics` configuration @@ -765,3 +791,4 @@ take care of constructing the `TypeSignature` for you: - `Signature::array` [ticket]: https://github.com/apache/datafusion/issues/13286 + From c3030710536e500b1614510ed15f9b6443ca03d4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 17:32:08 +0800 Subject: [PATCH 11/28] Add missing import for ScalarUDFImpl in udf.rs --- datafusion/expr/src/test/udf_equals.rs | 8 +++++--- datafusion/expr/src/udf.rs | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index de333a9287e0..d0fdbfc95833 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -3,8 +3,10 @@ use crate::{ }; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, Result}; -use std::any::Any; - +use std::{ + any::Any, + hash::{Hash, Hasher}, +}; #[derive(Debug)] #[allow(dead_code)] struct ParamUdf { @@ -71,7 +73,7 @@ impl ScalarUDFImpl for SignatureUdf { self } fn name(&self) -> &str { - "param_udf" + "signature_udf" } fn signature(&self) -> &Signature { &self.signature diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 4b4eb3d53b17..48fd3291bad7 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -714,7 +714,32 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// /// # Example /// ```rust + /// use std::any::Any; + /// use arrow::datatypes::DataType; + /// use datafusion_common::{not_impl_err, Result}; + /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; + /// #[derive(Debug)] + /// struct MyUdf { + /// param: i32, + /// signature: Signature, + /// } + /// /// impl ScalarUDFImpl for MyUdf { + /// fn as_any(&self) -> &dyn Any { + /// self + /// } + /// fn name(&self) -> &str { + /// "my_udf" + /// } + /// fn signature(&self) -> &Signature { + /// &self.signature + /// } + /// fn return_type(&self, _arg_types: &[DataType]) -> Result { + /// Ok(DataType::Int32) + /// } + /// fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + /// not_impl_err!("not used") + /// } /// fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { /// if let Some(other) = other.as_any().downcast_ref::() { /// self.param == other.param && self.name() == other.name() From 186f70d540912c6697595de5947b8d6c7cc5ea26 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 18:00:46 +0800 Subject: [PATCH 12/28] Add hash_value method to ScalarUDFImpl for improved hashing functionality --- datafusion/expr/src/udf.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 48fd3291bad7..faa3e9acea48 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -715,6 +715,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// # Example /// ```rust /// use std::any::Any; + /// use std::hash::{DefaultHasher, Hash, Hasher}; /// use arrow::datatypes::DataType; /// use datafusion_common::{not_impl_err, Result}; /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; @@ -747,6 +748,12 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// false /// } /// } + /// fn hash_value(&self) -> u64 { + /// let mut hasher = DefaultHasher::new(); + /// self.param.hash(&mut hasher); + /// self.name().hash(&mut hasher); + /// hasher.finish() + /// } /// } /// ``` /// From 88afc7a6168196869267ddd7a3d134dd23f75641 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 18:40:03 +0800 Subject: [PATCH 13/28] Add license header to udf_equals.rs --- datafusion/expr/src/test/udf_equals.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index d0fdbfc95833..ccf99f257699 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::{ ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, }; From 7e41e4ec025712e25e7f1090e87471c5e06d3a8b Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 18:48:28 +0800 Subject: [PATCH 14/28] fix prettier errors --- docs/source/library-user-guide/upgrading.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 9164ba73926e..b4d2f3e1ef22 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -67,14 +67,17 @@ DataFusionError::SchemaError( The default implementation of the `equals` method in the `ScalarUDFImpl` trait has been updated. Previously, it compared only the type IDs, names, and signatures of UDFs. Now, it assumes UDFs are not equal unless their pointers are the same. **Impact:** + - This change may affect any custom UDF implementations relying on the default `equals` behavior. - If your UDFs have internal state or additional properties that should be considered for equality, you must override the `equals` method to include those comparisons. **Action Required:** + - Review your UDF implementations and ensure the `equals` method is overridden where necessary. - Update any tests or logic that depend on the previous default behavior. **Example:** + ```rust impl ScalarUDFImpl for MyUdf { fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { @@ -86,6 +89,7 @@ impl ScalarUDFImpl for MyUdf { } } ``` + [#16677] https://github.com/apache/datafusion/issues/16677 ### `datafusion.execution.collect_statistics` now defaults to `true` @@ -791,4 +795,3 @@ take care of constructing the `TypeSignature` for you: - `Signature::array` [ticket]: https://github.com/apache/datafusion/issues/13286 - From ec5dcb391c71385f3882f96d57afc6f078cb7f16 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 4 Jul 2025 22:27:52 +0800 Subject: [PATCH 15/28] Improve ScalarUDF equality check for pointer equality optimization --- .../physical_optimizer/projection_pushdown.rs | 20 +++++++++++++++---- datafusion/expr/src/udf.rs | 6 +++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 6964965a6431..ced8e450df28 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -100,6 +100,7 @@ impl ScalarUDFImpl for DummyUDF { #[test] fn test_update_matching_exprs() -> Result<()> { + let udf = Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())); let exprs: Vec> = vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("a", 3)), @@ -114,7 +115,7 @@ fn test_update_matching_exprs() -> Result<()> { Arc::new(NegativeExpr::new(Arc::new(Column::new("f", 4)))), Arc::new(ScalarFunctionExpr::new( "scalar_expr", - Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())), + udf.clone(), vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("b", 1)), @@ -179,7 +180,7 @@ fn test_update_matching_exprs() -> Result<()> { Arc::new(NegativeExpr::new(Arc::new(Column::new("f", 5)))), Arc::new(ScalarFunctionExpr::new( "scalar_expr", - Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())), + udf, vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("b", 1)), @@ -231,8 +232,19 @@ fn test_update_matching_exprs() -> Result<()> { Ok(()) } +#[test] +fn test_scalar_udf_pointer_equality() { + let udf_a = ScalarUDF::new_from_impl(DummyUDF::new()); + let udf_b = ScalarUDF::new_from_impl(DummyUDF::new()); + assert_ne!(udf_a, udf_b); + + let udf_a_clone = udf_a.clone(); + assert_eq!(udf_a, udf_a_clone); +} + #[test] fn test_update_projected_exprs() -> Result<()> { + let udf = Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())); let exprs: Vec> = vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("a", 3)), @@ -247,7 +259,7 @@ fn test_update_projected_exprs() -> Result<()> { Arc::new(NegativeExpr::new(Arc::new(Column::new("f", 4)))), Arc::new(ScalarFunctionExpr::new( "scalar_expr", - Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())), + udf.clone(), vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("b", 1)), @@ -312,7 +324,7 @@ fn test_update_projected_exprs() -> Result<()> { Arc::new(NegativeExpr::new(Arc::new(Column::new("f_new", 5)))), Arc::new(ScalarFunctionExpr::new( "scalar_expr", - Arc::new(ScalarUDF::new_from_impl(DummyUDF::new())), + udf, vec![ Arc::new(BinaryExpr::new( Arc::new(Column::new("b_new", 1)), diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index faa3e9acea48..1d6d07b8f52a 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -60,7 +60,11 @@ pub struct ScalarUDF { impl PartialEq for ScalarUDF { fn eq(&self, other: &Self) -> bool { - self.inner.equals(other.inner.as_ref()) + if Arc::ptr_eq(&self.inner, &other.inner) { + true + } else { + self.inner.equals(other.inner.as_ref()) + } } } From 8dc7e11fde62ceb203b1aa938d343056cc1310ee Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 09:39:01 +0800 Subject: [PATCH 16/28] Refactor: derive PartialEq ParamUdf equality and hashing implementations for improved clarity and correctness --- datafusion/expr/src/test/udf_equals.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index ccf99f257699..5ba056c20925 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -24,7 +24,7 @@ use std::{ any::Any, hash::{Hash, Hasher}, }; -#[derive(Debug)] +#[derive(Debug, PartialEq)] #[allow(dead_code)] struct ParamUdf { param: i32, @@ -58,7 +58,7 @@ impl ScalarUDFImpl for ParamUdf { } fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { if let Some(other) = other.as_any().downcast_ref::() { - self.param == other.param && self.type_id() == other.type_id() + self == other } else { false } @@ -66,7 +66,7 @@ impl ScalarUDFImpl for ParamUdf { fn hash_value(&self) -> u64 { let mut hasher = std::collections::hash_map::DefaultHasher::new(); self.param.hash(&mut hasher); - self.type_id().hash(&mut hasher); + self.signature.hash(&mut hasher); hasher.finish() } } From 676545ca661d86138049bcfe835bcab9d8824d74 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 09:56:34 +0800 Subject: [PATCH 17/28] Enhance ScalarUDFImpl equality method documentation with alternative comparison approach details --- datafusion/expr/src/test/udf_equals.rs | 1 - datafusion/expr/src/udf.rs | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index 5ba056c20925..50f1e0fc4ba3 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -25,7 +25,6 @@ use std::{ hash::{Hash, Hasher}, }; #[derive(Debug, PartialEq)] -#[allow(dead_code)] struct ParamUdf { param: i32, signature: Signature, diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 1d6d07b8f52a..f21e35aa576c 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -774,6 +774,13 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { // if the types, names and signatures are the same, we can't know if they are the same so we // assume they are not. // If a UDF has internal state that should be compared, it should implement this method + // + // Alternative approach: we could potentially do bit-by-bit comparison if both objects + // are the same concrete type, but this requires: + // 1. Both objects to have identical TypeId + // 2. Careful handling of potential padding bytes in structs + // 3. The concrete type to be safely comparable via memcmp + // For now, we use the conservative approach of returning false false } From dc6264b0a178772aa96fd0507e6dff2879a53725 Mon Sep 17 00:00:00 2001 From: kosiew Date: Mon, 7 Jul 2025 10:14:16 +0800 Subject: [PATCH 18/28] Minor comment adjustment of use statement Co-authored-by: Piotr Findeisen --- datafusion/expr/src/udf.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 1d6d07b8f52a..8aaf60fee86d 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -723,6 +723,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// use arrow::datatypes::DataType; /// use datafusion_common::{not_impl_err, Result}; /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; + /// /// #[derive(Debug)] /// struct MyUdf { /// param: i32, From d5b813b75a70e66ac92a39f57cadf0b9fb998152 Mon Sep 17 00:00:00 2001 From: kosiew Date: Mon, 7 Jul 2025 10:14:54 +0800 Subject: [PATCH 19/28] Add blankline Co-authored-by: Piotr Findeisen --- datafusion/expr/src/test/udf_equals.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/src/test/udf_equals.rs index ccf99f257699..b80100407c1b 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/src/test/udf_equals.rs @@ -24,6 +24,7 @@ use std::{ any::Any, hash::{Hash, Hasher}, }; + #[derive(Debug)] #[allow(dead_code)] struct ParamUdf { From 2ee9c8f081dd19d34b3ce3b14dc8c8bd99b04eea Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 10:18:57 +0800 Subject: [PATCH 20/28] Refactor: derive PartialEq for MyUdf struct and simplify equality check in equals method --- datafusion/expr/src/udf.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index c9400ae65281..a45d01a4aeb4 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -724,7 +724,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// use datafusion_common::{not_impl_err, Result}; /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature}; /// - /// #[derive(Debug)] + /// #[derive(Debug, PartialEq)] /// struct MyUdf { /// param: i32, /// signature: Signature, @@ -748,7 +748,7 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// } /// fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { /// if let Some(other) = other.as_any().downcast_ref::() { - /// self.param == other.param && self.name() == other.name() + /// self == other /// } else { /// false /// } From f9c8408c39d7bf7177a90c48ac7a4135857aef3a Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 10:33:26 +0800 Subject: [PATCH 21/28] Move udf_equals module to datafusion/expr/tests --- datafusion/expr/src/test/mod.rs | 2 -- datafusion/expr/{src/test => tests}/udf_equals.rs | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) rename datafusion/expr/{src/test => tests}/udf_equals.rs (99%) diff --git a/datafusion/expr/src/test/mod.rs b/datafusion/expr/src/test/mod.rs index f58e6902e0bb..04e1ccc47465 100644 --- a/datafusion/expr/src/test/mod.rs +++ b/datafusion/expr/src/test/mod.rs @@ -16,5 +16,3 @@ // under the License. pub mod function_stub; -#[cfg(test)] -pub mod udf_equals; diff --git a/datafusion/expr/src/test/udf_equals.rs b/datafusion/expr/tests/udf_equals.rs similarity index 99% rename from datafusion/expr/src/test/udf_equals.rs rename to datafusion/expr/tests/udf_equals.rs index 50f1e0fc4ba3..7fc2ab6dc00f 100644 --- a/datafusion/expr/src/test/udf_equals.rs +++ b/datafusion/expr/tests/udf_equals.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use crate::{ - ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, -}; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, Result}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, +}; use std::{ any::Any, hash::{Hash, Hasher}, From 73678dbb7855daf0aa6e2f152bfc6068ae785185 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 17:06:57 +0800 Subject: [PATCH 22/28] Fix doc tests --- docs/source/library-user-guide/upgrading.md | 35 +++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index b4d2f3e1ef22..6206c0727dfa 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -79,7 +79,42 @@ The default implementation of the `equals` method in the `ScalarUDFImpl` trait h **Example:** ```rust +# use datafusion::logical_expr::{ScalarUDFImpl, Signature, Volatility}; +# use datafusion_common::{DataFusionError, Result}; +# use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +# use arrow::datatypes::DataType; +# use std::any::Any; +# +# #[derive(Debug)] +# struct MyUdf { +# param: i32, +# } +# +# impl MyUdf { +# fn name(&self) -> &str { "my_udf" } +# } +# impl ScalarUDFImpl for MyUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "my_udf" + } + + fn signature(&self) -> &Signature { + todo!() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + todo!() + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + todo!() + } + fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { if let Some(other) = other.as_any().downcast_ref::() { self.param == other.param && self.name() == other.name() From ec7930c824baafd689a9d0de62e06328f54d3f69 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 7 Jul 2025 17:11:49 +0800 Subject: [PATCH 23/28] Fix prettier error --- docs/source/library-user-guide/upgrading.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 6206c0727dfa..7c820ebd66a6 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -84,16 +84,16 @@ The default implementation of the `equals` method in the `ScalarUDFImpl` trait h # use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; # use arrow::datatypes::DataType; # use std::any::Any; -# +# # #[derive(Debug)] # struct MyUdf { # param: i32, # } -# +# # impl MyUdf { # fn name(&self) -> &str { "my_udf" } # } -# +# impl ScalarUDFImpl for MyUdf { fn as_any(&self) -> &dyn Any { self From dd4aec2b8d5042cbdb9b3b30fd3780f5d1e9e83b Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 10 Jul 2025 15:52:37 +0800 Subject: [PATCH 24/28] resolve merge conflict --- docs/source/library-user-guide/upgrading.md | 64 --------------------- 1 file changed, 64 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 04689f29d290..36d51b008328 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -127,70 +127,6 @@ impl ScalarUDFImpl for MyUdf { [#16677] https://github.com/apache/datafusion/issues/16677 -#### `ScalarUDFImpl::equals` Default Implementation - -The default implementation of the `equals` method in the `ScalarUDFImpl` trait has been updated. Previously, it compared only the type IDs, names, and signatures of UDFs. Now, it assumes UDFs are not equal unless their pointers are the same. - -**Impact:** - -- This change may affect any custom UDF implementations relying on the default `equals` behavior. -- If your UDFs have internal state or additional properties that should be considered for equality, you must override the `equals` method to include those comparisons. - -**Action Required:** - -- Review your UDF implementations and ensure the `equals` method is overridden where necessary. -- Update any tests or logic that depend on the previous default behavior. - -**Example:** - -```rust -# use datafusion::logical_expr::{ScalarUDFImpl, Signature, Volatility}; -# use datafusion_common::{DataFusionError, Result}; -# use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -# use arrow::datatypes::DataType; -# use std::any::Any; -# -# #[derive(Debug)] -# struct MyUdf { -# param: i32, -# } -# -# impl MyUdf { -# fn name(&self) -> &str { "my_udf" } -# } -# -impl ScalarUDFImpl for MyUdf { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - "my_udf" - } - - fn signature(&self) -> &Signature { - todo!() - } - - fn return_type(&self, _arg_types: &[DataType]) -> Result { - todo!() - } - - fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { - todo!() - } - - fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - if let Some(other) = other.as_any().downcast_ref::() { - self.param == other.param && self.name() == other.name() - } else { - false - } - } -} -``` - -[#16677] https://github.com/apache/datafusion/issues/16677 ### Metadata on Arrow Types is now represented by `FieldMetadata` From 67db2dfe91eb3b36e008510fabc65257a16443fb Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 10 Jul 2025 16:05:48 +0800 Subject: [PATCH 25/28] fix prettier error --- docs/source/library-user-guide/upgrading.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 36d51b008328..ed02f830fc4a 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -127,7 +127,6 @@ impl ScalarUDFImpl for MyUdf { [#16677] https://github.com/apache/datafusion/issues/16677 - ### Metadata on Arrow Types is now represented by `FieldMetadata` Metadata from the Arrow `Field` is now stored using the `FieldMetadata` From e8f22aa5f632f59930e4bb94360172c632f46d94 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 15 Jul 2025 17:26:36 +0800 Subject: [PATCH 26/28] refactor: improve equality check for ScalarUDFImpl to ensure type safety and handle padding considerations --- datafusion/expr/src/udf.rs | 46 ++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index a45d01a4aeb4..6c213d2b22e0 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -766,24 +766,36 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, /// their hash values must also be the same. /// - Ensure that the implementation does not panic or cause undefined behavior for any input. - fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - // if the pointers are the same, the UDFs are the same - if std::ptr::eq(self.as_any(), other.as_any()) { - return true; - } + use std::{any::TypeId, mem, ptr, slice}; + +fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + // 1. If the pointers are identical, it’s definitely the same UDF. + if ptr::eq(self.as_any(), other.as_any()) { + return true; + } + + // 2. Otherwise, check that they’re the same concrete Rust type. + let self_any = self.as_any(); + let other_any = other.as_any(); + if TypeId::of_val(self_any) != TypeId::of_val(other_any) { + // Different types can never be equal. + return false; + } + + // 3. Now we know they're the same struct type. In theory, since Rust moves + // values by `memcpy`-ing their bytes, we could `memcmp` them byte-for-byte: + // + // However, Rust doesn't guarantee that padding bytes are set the same way, + // so two equal structs might have different padding and compare as not equal. + // + // If your UDF type has no padding, or you make sure all padding is zeroed + // (for example, with #[repr(C)] and a safe initializer), you can use memcmp + // Otherwise, it's safer to just return false. + + // 4. Fallback: we can’t prove they’re identical, so we say “not equal.” + false +} - // if the types, names and signatures are the same, we can't know if they are the same so we - // assume they are not. - // If a UDF has internal state that should be compared, it should implement this method - // - // Alternative approach: we could potentially do bit-by-bit comparison if both objects - // are the same concrete type, but this requires: - // 1. Both objects to have identical TypeId - // 2. Careful handling of potential padding bytes in structs - // 3. The concrete type to be safely comparable via memcmp - // For now, we use the conservative approach of returning false - false - } /// Returns a hash value for this scalar UDF. /// From 1581b7605ecb3d92ea7d5f2ec4c1d49d30adc777 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 15 Jul 2025 17:42:05 +0800 Subject: [PATCH 27/28] refactor: streamline use --- datafusion/expr/src/udf.rs | 70 +++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 6c213d2b22e0..0792f9f04ad3 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -25,12 +25,14 @@ use crate::{ColumnarValue, Documentation, Expr, Signature}; use arrow::datatypes::{DataType, Field, FieldRef}; use datafusion_common::{not_impl_err, ExprSchema, Result, ScalarValue}; use datafusion_expr_common::interval_arithmetic::Interval; -use std::any::Any; -use std::cmp::Ordering; -use std::fmt::Debug; -use std::hash::{DefaultHasher, Hash, Hasher}; -use std::sync::Arc; - +use std::{ + any::Any, + cmp::Ordering, + fmt::Debug, + hash::{DefaultHasher, Hash, Hasher}, + ptr, + sync::Arc, +}; /// Logical representation of a Scalar User Defined Function. /// /// A scalar function produces a single row output for each row of input. This @@ -766,36 +768,34 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, /// their hash values must also be the same. /// - Ensure that the implementation does not panic or cause undefined behavior for any input. - use std::{any::TypeId, mem, ptr, slice}; - -fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { - // 1. If the pointers are identical, it’s definitely the same UDF. - if ptr::eq(self.as_any(), other.as_any()) { - return true; - } - - // 2. Otherwise, check that they’re the same concrete Rust type. - let self_any = self.as_any(); - let other_any = other.as_any(); - if TypeId::of_val(self_any) != TypeId::of_val(other_any) { - // Different types can never be equal. - return false; - } - - // 3. Now we know they're the same struct type. In theory, since Rust moves - // values by `memcpy`-ing their bytes, we could `memcmp` them byte-for-byte: - // - // However, Rust doesn't guarantee that padding bytes are set the same way, - // so two equal structs might have different padding and compare as not equal. - // - // If your UDF type has no padding, or you make sure all padding is zeroed - // (for example, with #[repr(C)] and a safe initializer), you can use memcmp - // Otherwise, it's safer to just return false. - - // 4. Fallback: we can’t prove they’re identical, so we say “not equal.” - false -} + fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { + // 1. If the pointers are identical, it’s definitely the same UDF. + if ptr::eq(self.as_any(), other.as_any()) { + return true; + } + + // 2. Otherwise, check that they’re the same concrete Rust type. + let self_any = self.as_any(); + let other_any = other.as_any(); + if self_any.type_id() != other_any.type_id() { + // Different types can never be equal. + return false; + } + + // 3. Now we know they're the same struct type. In theory, since Rust moves + // values by `memcpy`-ing their bytes, we could `memcmp` them byte-for-byte: + // + // However, Rust doesn't guarantee that padding bytes are set the same way, + // so two equal structs might have different padding and compare as not equal. + // + // If your UDF type has no padding, or you make sure all padding is zeroed + // (for example, with #[repr(C)] and a safe initializer), you can use memcmp + // Otherwise, it's safer to just return false. + + // 4. Fallback: we can’t prove they’re identical, so we say “not equal.” + false + } /// Returns a hash value for this scalar UDF. /// From 3e466d763cc73b506a25121c813156dd62366ba0 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 16 Jul 2025 22:31:08 +0800 Subject: [PATCH 28/28] fix clippy error --- datafusion/expr/src/udf.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 0792f9f04ad3..6d2a62333b5a 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -768,7 +768,6 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// - This method must be consistent with [`Self::hash_value`]. If `equals` returns true for two UDFs, /// their hash values must also be the same. /// - Ensure that the implementation does not panic or cause undefined behavior for any input. - fn equals(&self, other: &dyn ScalarUDFImpl) -> bool { // 1. If the pointers are identical, it’s definitely the same UDF. if ptr::eq(self.as_any(), other.as_any()) {