apache · andygrove · Mar 28, 2020 · Mar 28, 2020 · Mar 28, 2020 · Mar 28, 2020
diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs
@@ -39,9 +39,11 @@ use crate::execution::physical_plan::expressions::{
 };
 use crate::execution::physical_plan::hash_aggregate::HashAggregateExec;
 use crate::execution::physical_plan::limit::LimitExec;
+use crate::execution::physical_plan::math_expressions::register_math_functions;
 use crate::execution::physical_plan::merge::MergeExec;
 use crate::execution::physical_plan::projection::ProjectionExec;
 use crate::execution::physical_plan::selection::SelectionExec;
+use crate::execution::physical_plan::udf::{ScalarFunction, ScalarFunctionExpr};
 use crate::execution::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr};
 use crate::execution::table_impl::TableImpl;
 use crate::logicalplan::*;
@@ -57,14 +59,18 @@ use sqlparser::sqlast::{SQLColumnDef, SQLType};
 /// Execution context for registering data sources and executing queries
 pub struct ExecutionContext {
     datasources: HashMap<String, Box<dyn TableProvider>>,
+    scalar_functions: HashMap<String, Box<ScalarFunction>>, // keyed by function name
 }
 
 impl ExecutionContext {
     /// Create a new execution context for in-memory queries
     pub fn new() -> Self {
-        Self {
+        let mut ctx = Self {
             datasources: HashMap::new(),
-        }
+            scalar_functions: HashMap::new(),
+        };
+        register_math_functions(&mut ctx); // pre-register the built-in math UDFs
+        ctx
     }
 
     /// Execute a SQL query and produce a Relation (a schema-aware iterator over a series
@@ -120,6 +126,7 @@ impl ExecutionContext {
             DFASTNode::ANSI(ansi) => {
                 let schema_provider = ExecutionContextSchemaProvider {
                     datasources: &self.datasources,
+                    scalar_functions: &self.scalar_functions,
                 };
 
                 // create a query planner
@@ -150,6 +157,16 @@ impl ExecutionContext {
         }
     }
 
+    /// Register a scalar UDF under its own name, replacing any function with that name
+    pub fn register_udf(&mut self, f: ScalarFunction) {
+        self.scalar_functions.insert(f.name.clone(), Box::new(f));
+    }
+
+    /// Borrow the map of registered scalar functions, keyed by function name
+    pub fn scalar_functions(&self) -> &HashMap<String, Box<ScalarFunction>> {
+        &self.scalar_functions
+    }
+
     fn build_schema(&self, columns: Vec<SQLColumnDef>) -> Result<Schema> {
         let mut fields = Vec::new();
 
@@ -239,7 +256,7 @@ impl ExecutionContext {
         let rules: Vec<Box<dyn OptimizerRule>> = vec![
             Box::new(ResolveColumnsRule::new()),
             Box::new(ProjectionPushDown::new()),
-            Box::new(TypeCoercionRule::new()),
+            Box::new(TypeCoercionRule::new(&self.scalar_functions)),
         ];
         let mut plan = plan.clone();
         for mut rule in rules {
@@ -403,6 +420,28 @@ impl ExecutionContext {
                 input_schema,
                 data_type.clone(),
             )?)),
+            Expr::ScalarFunction {
+                name,
+                args,
+                return_type,
+            } => match &self.scalar_functions.get(name) {
+                Some(f) => {
+                    let mut physical_args = vec![];
+                    for e in args {
+                        physical_args.push(self.create_physical_expr(e, input_schema)?);
+                    }
+                    Ok(Arc::new(ScalarFunctionExpr::new(
+                        name,
+                        Box::new(f.fun.clone()),
+                        physical_args,
+                        return_type,
+                    )))
+                }
+                _ => Err(ExecutionError::General(format!(
+                    "Invalid scalar function '{:?}'",
+                    name
+                ))),
+            },
             other => Err(ExecutionError::NotImplemented(format!(
                 "Physical plan does not support logical expression {:?}",
                 other
@@ -519,23 +558,35 @@ impl ExecutionContext {
 
 struct ExecutionContextSchemaProvider<'a> {
     datasources: &'a HashMap<String, Box<dyn TableProvider>>,
+    scalar_functions: &'a HashMap<String, Box<ScalarFunction>>, // borrowed UDF registry
 }
 
 impl SchemaProvider for ExecutionContextSchemaProvider<'_> {
     fn get_table_meta(&self, name: &str) -> Option<Arc<Schema>> {
         self.datasources.get(name).map(|ds| ds.schema().clone())
     }
 
-    fn get_function_meta(&self, _name: &str) -> Option<Arc<FunctionMeta>> {
-        None
+    fn get_function_meta(&self, name: &str) -> Option<Arc<FunctionMeta>> {
+        let udf = self.scalar_functions.get(name)?;
+        // Publish the UDF's declared arguments and return type as scalar metadata.
+        Some(Arc::new(FunctionMeta::new(
+            name.to_owned(),
+            udf.args.clone(),
+            udf.return_type.clone(),
+            FunctionType::Scalar,
+        )))
     }
 }
 
 #[cfg(test)]
 mod tests {
 
     use super::*;
+    use crate::datasource::MemTable;
+    use crate::execution::physical_plan::udf::ScalarUdf;
     use crate::test;
+    use arrow::array::{ArrayRef, Int32Array};
+    use arrow::compute::add;
     use std::fs::File;
     use std::io::prelude::*;
     use tempdir::TempDir;
@@ -806,6 +857,99 @@ mod tests {
         Ok(())
     }
 
+    /// Exercises a user-defined scalar function end to end: `my_add` is registered on
+    /// the context, used in a projection over table `t`, and the computed column is
+    /// checked row by row.
+    #[test]
+    fn scalar_udf() -> Result<()> {
+        // Source data: non-nullable Int32 columns `a` and `b`, four rows each.
+        let fields = vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ];
+        let schema = Arc::new(Schema::new(fields));
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(vec![1, 10, 10, 100])),
+            Arc::new(Int32Array::from(vec![2, 12, 12, 120])),
+        ];
+        let batch = RecordBatch::try_new(schema.clone(), columns)?;
+
+        // Register the batch as table `t` through a `MemTable` provider on a fresh
+        // execution context.
+        let mut ctx = ExecutionContext::new();
+        ctx.register_table("t", Box::new(MemTable::new(schema, vec![batch])?));
+
+        // The UDF under test adds its two arguments element-wise; both must be
+        // Int32 arrays or the downcast below panics.
+        let myfunc: ScalarUdf = |args: &Vec<ArrayRef>| {
+            // View one argument as an Int32 array.
+            fn int32(array: &ArrayRef) -> &Int32Array {
+                array
+                    .as_any()
+                    .downcast_ref::<Int32Array>()
+                    .expect("cast failed")
+            }
+            let total = add(int32(&args[0]), int32(&args[1]))?;
+            Ok(Arc::new(total))
+        };
+
+        // Register it as `my_add` with two nullable Int32 parameters.
+        let params = vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ];
+        let my_add = ScalarFunction::new("my_add", params, DataType::Int32, myfunc);
+        ctx.register_udf(my_add);
+
+        // Build a logical plan over `t` that projects both input columns and the
+        // result of `my_add(a, b)`.
+        let table = ctx.table("t")?;
+        let projection = vec![
+            col("a"),
+            col("b"),
+            scalar_function("my_add", vec![col("a"), col("b")], DataType::Int32),
+        ];
+        let plan = LogicalPlanBuilder::from(&table.to_logical_plan())
+            .project(projection)?
+            .build()?;
+
+        // Before optimization, the plan's debug output shows the UDF call by name
+        // and a scan without projection.
+        let want = "Projection: #a, #b, my_add(#a, #b)\n  TableScan: t projection=None";
+        assert_eq!(format!("{:?}", plan), want);
+
+        // Run the optimizer rules, create the physical plan and collect the
+        // result batches.
+        let optimized = ctx.optimize(&plan)?;
+        let physical = ctx.create_physical_plan(&optimized, 1024)?;
+        let batches = ctx.collect(physical.as_ref())?;
+
+        // The first result batch holds the two inputs plus the computed column.
+        let out = &batches[0];
+        assert_eq!(3, out.num_columns());
+        assert_eq!(4, out.num_rows());
+
+        // All three output columns must be Int32 arrays.
+        let a = out.column(0).as_any().downcast_ref::<Int32Array>();
+        let a = a.expect("failed to cast a");
+        let b = out.column(1).as_any().downcast_ref::<Int32Array>();
+        let b = b.expect("failed to cast b");
+        let sum = out.column(2).as_any().downcast_ref::<Int32Array>();
+        let sum = sum.expect("failed to cast sum");
+
+        // No rows may be dropped or added by the projection.
+        assert_eq!(4, a.len());
+        assert_eq!(4, b.len());
+        assert_eq!(4, sum.len());
+
+        // Each output value must equal the sum of the inputs in the same row.
+        for row in 0..sum.len() {
+            assert_eq!(a.value(row) + b.value(row), sum.value(row));
+        }
+
+        Ok(())
+    }
+
     /// Execute SQL and return results
     fn collect(ctx: &mut ExecutionContext, sql: &str) -> Result<Vec<RecordBatch>> {
         let logical_plan = ctx.create_logical_plan(sql)?;

diff --git a/rust/datafusion/src/execution/physical_plan/math_expressions.rs b/rust/datafusion/src/execution/physical_plan/math_expressions.rs
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Math expressions
+
+use crate::error::ExecutionError;
+use crate::execution::context::ExecutionContext;
+use crate::execution::physical_plan::udf::ScalarFunction;
+
+use arrow::array::{Array, ArrayRef, Float64Array, Float64Builder};
+use arrow::datatypes::{DataType, Field};
+
+use std::sync::Arc;
+
+/// Builds the `ScalarFunction` `$NAME`, which applies `f64::$FUNC` to every value
+/// of its single Float64 argument and keeps null slots null.
+macro_rules! math_unary_function {
+    ($NAME:expr, $FUNC:ident) => {
+        ScalarFunction::new(
+            $NAME,
+            vec![Field::new("n", DataType::Float64, true)],
+            DataType::Float64,
+            |args: &Vec<ArrayRef>| {
+                // A missing or non-Float64 argument is an error, not an index panic.
+                let array = args
+                    .first()
+                    .and_then(|arg| arg.as_any().downcast_ref::<Float64Array>())
+                    .ok_or_else(|| format!("Invalid data type for {}", $NAME))
+                    .map_err(ExecutionError::General)?;
+                // Apply the function value by value; nulls pass through.
+                let mut builder = Float64Builder::new(array.len());
+                for i in 0..array.len() {
+                    if array.is_null(i) {
+                        builder.append_null()?;
+                    } else {
+                        builder.append_value(array.value(i).$FUNC())?;
+                    }
+                }
+                Ok(Arc::new(builder.finish()))
+            },
+        )
+    };
+}
+
+/// Register the built-in unary math functions as scalar UDFs on the context
+pub fn register_math_functions(ctx: &mut ExecutionContext) {
+    ctx.register_udf(math_unary_function!("sqrt", sqrt));
+    ctx.register_udf(math_unary_function!("sin", sin));
+    ctx.register_udf(math_unary_function!("cos", cos));
+    ctx.register_udf(math_unary_function!("tan", tan));
+    ctx.register_udf(math_unary_function!("asin", asin));
+    ctx.register_udf(math_unary_function!("acos", acos));
+    ctx.register_udf(math_unary_function!("atan", atan));
+    ctx.register_udf(math_unary_function!("floor", floor));
+    ctx.register_udf(math_unary_function!("ceil", ceil));
+    ctx.register_udf(math_unary_function!("round", round));
+    ctx.register_udf(math_unary_function!("trunc", trunc));
+    ctx.register_udf(math_unary_function!("abs", abs));
+    ctx.register_udf(math_unary_function!("signum", signum));
+    ctx.register_udf(math_unary_function!("exp", exp));
+    ctx.register_udf(math_unary_function!("log", ln)); // "log" is the natural logarithm
+    ctx.register_udf(math_unary_function!("log2", log2));
+    ctx.register_udf(math_unary_function!("log10", log10));
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::error::Result;
+    use crate::logicalplan::{sqrt, Expr, LogicalPlanBuilder};
+    use arrow::datatypes::Schema;
+
+    /// Plans `sqrt(c0)` over a scan whose only column `c0` has type `data_type`,
+    /// runs the optimizer, and returns the debug rendering of the resulting plan.
+    fn optimized_sqrt_plan(data_type: DataType) -> Result<String> {
+        let schema = Schema::new(vec![Field::new("c0", data_type, true)]);
+        let logical = LogicalPlanBuilder::scan("", "", &schema, None)?
+            .project(vec![sqrt(Expr::UnresolvedColumn("c0".to_owned()))])?
+            .build()?;
+        let ctx = ExecutionContext::new();
+        let optimized = ctx.optimize(&logical)?;
+        Ok(format!("{:?}", optimized))
+    }
+
+    #[test]
+    fn cast_i8_input() -> Result<()> {
+        let expected = "Projection: sqrt(CAST(#0 AS Float64))\
+        \n  TableScan:  projection=Some([0])";
+        assert_eq!(optimized_sqrt_plan(DataType::Int8)?, expected);
+        Ok(())
+    }
+
+    #[test]
+    fn no_cast_f64_input() -> Result<()> {
+        let expected = "Projection: sqrt(#0)\
+        \n  TableScan:  projection=Some([0])";
+        assert_eq!(optimized_sqrt_plan(DataType::Float64)?, expected);
+        Ok(())
+    }
+}
diff --git a/rust/datafusion/src/execution/physical_plan/mod.rs b/rust/datafusion/src/execution/physical_plan/mod.rs
@@ -91,7 +91,9 @@ pub mod datasource;
 pub mod expressions;
 pub mod hash_aggregate;
 pub mod limit;
+pub mod math_expressions;
 pub mod merge;
 pub mod parquet;
 pub mod projection;
 pub mod selection;
+pub mod udf;