Describe the bug
While upgrading to DataFusion 52 (I know, I'm getting to 53 slowly) tests started failing for CASE WHEN for us because we have cases where we use custom Column PhysicalExpr rather than DataFusion one.
this should not affect correctness.
This was done as an optimization in this PR by @pepijnve
because it find the columns referenced by traversing the expression tree and finding the Column Physical Expression. however, when having custom Column impl it will not add that used column to the projection
To Reproduce
This is the same test as:
|
#[test] |
|
fn case_without_expr() -> Result<()> { |
|
let batch = case_test_batch()?; |
|
let schema = batch.schema(); |
|
|
|
// CASE WHEN a = 'foo' THEN 123 WHEN a = 'bar' THEN 456 END |
|
let when1 = binary( |
|
col("a", &schema)?, |
|
Operator::Eq, |
|
lit("foo"), |
|
&batch.schema(), |
|
)?; |
|
let then1 = lit(123i32); |
|
let when2 = binary( |
|
col("a", &schema)?, |
|
Operator::Eq, |
|
lit("bar"), |
|
&batch.schema(), |
|
)?; |
|
let then2 = lit(456i32); |
|
|
|
let expr = generate_case_when_with_type_coercion( |
|
None, |
|
vec![(when1, then1), (when2, then2)], |
|
None, |
|
schema.as_ref(), |
|
)?; |
|
let result = expr |
|
.evaluate(&batch)? |
|
.into_array(batch.num_rows()) |
|
.expect("Failed to convert to array"); |
|
let result = as_int32_array(&result)?; |
|
|
|
let expected = &Int32Array::from(vec![Some(123), None, None, Some(456)]); |
|
|
|
assert_eq!(expected, result); |
|
|
|
Ok(()) |
|
} |
but using Custom impl of Column
#[test]
fn case_without_expr_and_with_custom_column_impl() -> Result<()> {
/// Custom impl of column
#[derive(Debug, Hash, PartialEq, Eq, Clone)]
pub struct CustomColumn {
/// The name of the column (used for debugging and display purposes)
name: String,
/// The index of the column in its schema
index: usize,
data_type: DataType,
nullable: bool,
}
impl CustomColumn {
pub fn new_with_schema(name: &str, schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> {
let index = schema.index_of(name)?;
let field = schema.field(index);
Ok(Arc::new(CustomColumn {
name: name.to_string(),
index,
data_type: field.data_type().clone(),
nullable: field.is_nullable(),
}))
}
}
impl std::fmt::Display for CustomColumn {
fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
write!(f, "{}@{}", self.name, self.index)
}
}
impl PhysicalExpr for CustomColumn {
fn as_any(&self) -> &dyn Any {
self
}
fn data_type(&self, _input_schema: &Schema) -> Result<DataType> {
Ok(self.data_type.clone())
}
fn nullable(&self, _input_schema: &Schema) -> Result<bool> {
Ok(self.nullable)
}
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
self.bounds_check(batch.schema().as_ref())?;
Ok(ColumnarValue::Array(Arc::clone(batch.column(self.index))))
}
fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
vec![]
}
fn with_new_children(
self: Arc<Self>,
_children: Vec<Arc<dyn PhysicalExpr>>,
) -> Result<Arc<dyn PhysicalExpr>> {
Ok(self)
}
fn fmt_sql(&self, _: &mut Formatter<'_>) -> std::fmt::Result {
unimplemented!()
}
}
impl CustomColumn {
fn bounds_check(&self, input_schema: &Schema) -> Result<()> {
if self.index < input_schema.fields.len() {
Ok(())
} else {
internal_err!(
"PhysicalExpr CustomColumn references column '{}' at index {} (zero-based) but input schema only has {} columns: {:?}",
self.name,
self.index,
input_schema.fields.len(),
input_schema.fields().iter().map(|f| f.name()).collect::<Vec<_>>()
)
}
}
}
let batch = case_test_batch()?;
let schema = batch.schema();
// CASE WHEN a = 'foo' THEN 123 WHEN a = 'bar' THEN 456 END
let when1 = binary(
CustomColumn::new_with_schema("a", &schema)?,
Operator::Eq,
lit("foo"),
&batch.schema(),
)?;
let then1 = lit(123i32);
let when2 = binary(
CustomColumn::new_with_schema("a", &schema)?,
Operator::Eq,
lit("bar"),
&batch.schema(),
)?;
let then2 = lit(456i32);
let expr = generate_case_when_with_type_coercion(
None,
vec![(when1, then1), (when2, then2)],
None,
schema.as_ref(),
)?;
let result = expr
.evaluate(&batch)?
.into_array(batch.num_rows())
.expect("Failed to convert to array");
let result = as_int32_array(&result)?;
let expected = &Int32Array::from(vec![Some(123), None, None, Some(456)]);
assert_eq!(expected, result);
Ok(())
}
Expected behavior
to work with any custom column impl, and hopefully still having that optimization
Additional context
also might affect lambda support:
Describe the bug
While upgrading to DataFusion 52 (I know, I'm getting to 53 slowly) tests started failing for CASE WHEN for us because we have cases where we use custom
ColumnPhysicalExprrather than DataFusion one.this should not affect correctness.
This was done as an optimization in this PR by @pepijnve
CASEevaluation #18329because it find the columns referenced by traversing the expression tree and finding the
ColumnPhysical Expression. however, when having customColumnimpl it will not add that used column to the projectionTo Reproduce
This is the same test as:
datafusion/datafusion/physical-expr/src/expressions/case.rs
Lines 1694 to 1732 in 1a0af76
but using Custom impl of
ColumnExpected behavior
to work with any custom column impl, and hopefully still having that optimization
Additional context
also might affect lambda support: