Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 38 additions & 16 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,16 +481,22 @@ fn type_union_resolution_coercion(
}
}

/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation
/// Unlike `coerced_from`, usually the coerced type is for comparison only.
/// For example, compare with Dictionary and Dictionary, only value type is what we care about
/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a
/// comparison operation
///
/// Example comparison operations are `lhs = rhs` and `lhs > rhs`
///
/// Binary comparison kernels require the two arguments to be the (exact) same
/// data type. However, users can write queries where the two arguments are
/// different data types. In such cases, the data types are automatically cast
/// (coerced) to a single data type to pass to the kernels.
pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
if lhs_type == rhs_type {
// same type => equality is possible
return Some(lhs_type.clone());
}
binary_numeric_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_coercion(lhs_type, rhs_type, true))
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, true))
.or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
.or_else(|| list_coercion(lhs_type, rhs_type))
Expand All @@ -501,7 +507,11 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D
.or_else(|| struct_coercion(lhs_type, rhs_type))
}

/// Coerce `lhs_type` and `rhs_type` to a common type for value exprs
/// Coerce `lhs_type` and `rhs_type` to a common type for `VALUES` expression
///
/// For example `VALUES (1, 2), (3.0, 4.0)` where the first row is `Int32` and
/// the second row is `Float64` will coerce to `Float64`
///
pub fn values_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
if lhs_type == rhs_type {
// same type => equality is possible
Expand Down Expand Up @@ -883,7 +893,7 @@ fn both_numeric_or_null_and_numeric(lhs_type: &DataType, rhs_type: &DataType) ->
///
/// Not all operators support dictionaries, if `preserve_dictionaries` is true
/// dictionaries will be preserved if possible
fn dictionary_coercion(
fn dictionary_comparison_coercion(
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the only code change (and it is not a public function and thus this is not an API change)

lhs_type: &DataType,
rhs_type: &DataType,
preserve_dictionaries: bool,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ideally dictionary_comparison_coercion should not preserve the dictionary (always false), only value type matters. But since it is used in comparison_coercion, the name looks good to me

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my mind, most single argument functions / operations / coercions would preserve dictionary encoding if possible as that would keep the data encoded in a more efficent form (dictionary) longer

For example substr(dict_column, 3, 10) would ideally produce a Dictionary as output (after applying the function to the values)

However, most of our functions don't know how to work with Dictionary arrays, so this would be a larger change.

Expand Down Expand Up @@ -1044,7 +1054,7 @@ pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
string_coercion(lhs_type, rhs_type)
.or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
.or_else(|| null_coercion(lhs_type, rhs_type))
}
Expand All @@ -1064,7 +1074,7 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
/// This is a union of string coercion rules and dictionary coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
string_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
}

Expand Down Expand Up @@ -1324,38 +1334,50 @@ mod tests {

let lhs_type = Dictionary(Box::new(Int8), Box::new(Int32));
let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
assert_eq!(dictionary_coercion(&lhs_type, &rhs_type, true), Some(Int32));
assert_eq!(
dictionary_coercion(&lhs_type, &rhs_type, false),
dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
Some(Int32)
);
assert_eq!(
dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
Some(Int32)
);

// Since we can coerce values of Int16 to Utf8 can support this
let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16));
assert_eq!(dictionary_coercion(&lhs_type, &rhs_type, true), Some(Utf8));
assert_eq!(
dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
Some(Utf8)
);

// Since we can coerce values of Utf8 to Binary can support this
let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
let rhs_type = Dictionary(Box::new(Int8), Box::new(Binary));
assert_eq!(
dictionary_coercion(&lhs_type, &rhs_type, true),
dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
Some(Binary)
);

let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
let rhs_type = Utf8;
assert_eq!(dictionary_coercion(&lhs_type, &rhs_type, false), Some(Utf8));
assert_eq!(
dictionary_coercion(&lhs_type, &rhs_type, true),
dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
Some(Utf8)
);
assert_eq!(
dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
Some(lhs_type.clone())
);

let lhs_type = Utf8;
let rhs_type = Dictionary(Box::new(Int8), Box::new(Utf8));
assert_eq!(dictionary_coercion(&lhs_type, &rhs_type, false), Some(Utf8));
assert_eq!(
dictionary_coercion(&lhs_type, &rhs_type, true),
dictionary_comparison_coercion(&lhs_type, &rhs_type, false),
Some(Utf8)
);
assert_eq!(
dictionary_comparison_coercion(&lhs_type, &rhs_type, true),
Some(rhs_type.clone())
);
}
Expand Down