diff --git a/datafusion/functions/benches/encoding.rs b/datafusion/functions/benches/encoding.rs index 8a7c2b7b664b7..af0385b6fc90d 100644 --- a/datafusion/functions/benches/encoding.rs +++ b/datafusion/functions/benches/encoding.rs @@ -19,7 +19,7 @@ extern crate criterion; use arrow::array::Array; use arrow::datatypes::{DataType, Field}; -use arrow::util::bench_util::create_string_array_with_len; +use arrow::util::bench_util::create_binary_array; use criterion::{Criterion, criterion_group, criterion_main}; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; @@ -32,20 +32,22 @@ fn criterion_benchmark(c: &mut Criterion) { let config_options = Arc::new(ConfigOptions::default()); for size in [1024, 4096, 8192] { - let str_array = Arc::new(create_string_array_with_len::(size, 0.2, 32)); + let bin_array = Arc::new(create_binary_array::(size, 0.2)); c.bench_function(&format!("base64_decode/{size}"), |b| { let method = ColumnarValue::Scalar("base64".into()); let encoded = encoding::encode() .invoke_with_args(ScalarFunctionArgs { - args: vec![ColumnarValue::Array(str_array.clone()), method.clone()], + args: vec![ColumnarValue::Array(bin_array.clone()), method.clone()], arg_fields: vec![ - Field::new("a", str_array.data_type().to_owned(), true).into(), + Field::new("a", bin_array.data_type().to_owned(), true).into(), Field::new("b", method.data_type().to_owned(), true).into(), ], number_rows: size, return_field: Field::new("f", DataType::Utf8, true).into(), config_options: Arc::clone(&config_options), }) + .unwrap() + .cast_to(&DataType::Binary, None) .unwrap(); let arg_fields = vec![ @@ -61,7 +63,7 @@ fn criterion_benchmark(c: &mut Criterion) { args: args.clone(), arg_fields: arg_fields.clone(), number_rows: size, - return_field: Field::new("f", DataType::Utf8, true).into(), + return_field: Field::new("f", DataType::Binary, true).into(), config_options: Arc::clone(&config_options), }) .unwrap(), @@ -72,24 +74,26 @@ fn criterion_benchmark(c: &mut Criterion) { c.bench_function(&format!("hex_decode/{size}"), |b| { let method = ColumnarValue::Scalar("hex".into()); let arg_fields = vec![ - Field::new("a", str_array.data_type().to_owned(), true).into(), + Field::new("a", bin_array.data_type().to_owned(), true).into(), Field::new("b", method.data_type().to_owned(), true).into(), ]; let encoded = encoding::encode() .invoke_with_args(ScalarFunctionArgs { - args: vec![ColumnarValue::Array(str_array.clone()), method.clone()], + args: vec![ColumnarValue::Array(bin_array.clone()), method.clone()], arg_fields, number_rows: size, return_field: Field::new("f", DataType::Utf8, true).into(), config_options: Arc::clone(&config_options), }) + .unwrap() + .cast_to(&DataType::Binary, None) .unwrap(); let arg_fields = vec![ Field::new("a", encoded.data_type().to_owned(), true).into(), Field::new("b", method.data_type().to_owned(), true).into(), ]; - let return_field = Field::new("f", DataType::Utf8, true).into(); + let return_field = Field::new("f", DataType::Binary, true).into(); let args = vec![encoded, method]; b.iter(|| { diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index ce7f534506d61..4ad67b78178f2 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -52,6 +52,12 @@ const BASE64_ENGINE: GeneralPurpose = GeneralPurpose::new( .with_decode_padding_mode(DecodePaddingMode::Indifferent), ); +// Generate padding characters when encoding +const BASE64_ENGINE_PADDED: GeneralPurpose = GeneralPurpose::new( + &base64::alphabet::STANDARD, + GeneralPurposeConfig::new().with_encode_padding(true), +); + #[user_doc( doc_section(label = "Binary String Functions"), description = "Encode binary data into a textual representation.", @@ -62,7 +68,7 @@ const BASE64_ENGINE: GeneralPurpose = GeneralPurpose::new( ), argument( name = "format", - description = "Supported formats are: `base64`, `hex`" + description = "Supported formats are: `base64`, `base64pad`, `hex`" ), related_udf(name = "decode") )] @@ -319,12 +325,18 @@ fn decode_array(array: &ArrayRef, encoding: Encoding) -> Result { #[derive(Debug, Copy, Clone)] enum Encoding { Base64, + Base64Padded, Hex, } impl fmt::Display for Encoding { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", format!("{self:?}").to_lowercase()) + let name = match self { + Self::Base64 => "base64", + Self::Base64Padded => "base64pad", + Self::Hex => "hex", + }; + write!(f, "{name}") } } @@ -345,9 +357,10 @@ impl TryFrom<&ColumnarValue> for Encoding { }; match encoding { "base64" => Ok(Self::Base64), + "base64pad" => Ok(Self::Base64Padded), "hex" => Ok(Self::Hex), _ => { - let options = [Self::Base64, Self::Hex] + let options = [Self::Base64, Self::Base64Padded, Self::Hex] .iter() .map(|i| i.to_string()) .collect::>() @@ -364,15 +377,18 @@ impl Encoding { fn encode_bytes(self, value: &[u8]) -> String { match self { Self::Base64 => BASE64_ENGINE.encode(value), + Self::Base64Padded => BASE64_ENGINE_PADDED.encode(value), Self::Hex => hex::encode(value), } } fn decode_bytes(self, value: &[u8]) -> Result> { match self { - Self::Base64 => BASE64_ENGINE.decode(value).map_err(|e| { - exec_datafusion_err!("Failed to decode value using base64: {e}") - }), + Self::Base64 | Self::Base64Padded => { + BASE64_ENGINE.decode(value).map_err(|e| { + exec_datafusion_err!("Failed to decode value using {self}: {e}") + }) + } Self::Hex => hex::decode(value).map_err(|e| { exec_datafusion_err!("Failed to decode value using hex: {e}") }), @@ -396,6 +412,13 @@ impl Encoding { .collect(); Ok(Arc::new(array)) } + Self::Base64Padded => { + let array: GenericStringArray = array + .iter() + .map(|x| x.map(|x| BASE64_ENGINE_PADDED.encode(x))) + .collect(); + Ok(Arc::new(array)) + } Self::Hex => { let array: GenericStringArray = array.iter().map(|x| x.map(hex::encode)).collect(); @@ -430,7 +453,7 @@ impl Encoding { } match self { - Self::Base64 => { + Self::Base64 | Self::Base64Padded => { let upper_bound = base64::decoded_len_estimate(approx_data_size); delegated_decode::<_, _, OutputOffset>(base64_decode, value, upper_bound) } diff --git a/datafusion/spark/src/function/string/base64.rs b/datafusion/spark/src/function/string/base64.rs new file mode 100644 index 0000000000000..a171d4823b0fa --- /dev/null +++ b/datafusion/spark/src/function/string/base64.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::DataType; +use datafusion_common::arrow::datatypes::{Field, FieldRef}; +use datafusion_common::types::{NativeType, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err, internal_err}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{Coercion, Expr, ReturnFieldArgs, TypeSignatureClass, lit}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion_functions::expr_fn::{decode, encode}; + +/// Apache Spark base64 uses padded base64 encoding. +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkBase64 { + signature: Signature, +} + +impl Default for SparkBase64 { + fn default() -> Self { + Self::new() + } +} + +impl SparkBase64 { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![Coercion::new_implicit( + TypeSignatureClass::Binary, + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Binary, + )], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkBase64 { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "base64" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_type should not be called for {}", self.name()) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result { + let [bin] = take_function_args(self.name(), args.arg_fields)?; + let return_type = match bin.data_type() { + DataType::LargeBinary => DataType::LargeUtf8, + _ => DataType::Utf8, + }; + Ok(Arc::new(Field::new( + self.name(), + return_type, + bin.is_nullable(), + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + exec_err!( + "invoke should not be called on a simplified {} function", + self.name() + ) + } + + fn simplify( + &self, + args: Vec, + _info: &SimplifyContext, + ) -> Result { + let [bin] = take_function_args(self.name(), args)?; + Ok(ExprSimplifyResult::Simplified(encode( + bin, + lit("base64pad"), + ))) + } +} + +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkUnBase64 { + signature: Signature, +} + +impl Default for SparkUnBase64 { + fn default() -> Self { + Self::new() + } +} + +impl SparkUnBase64 { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![Coercion::new_implicit( + TypeSignatureClass::Binary, + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Binary, + )], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkUnBase64 { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "unbase64" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_type should not be called for {}", self.name()) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result { + let [str] = take_function_args(self.name(), args.arg_fields)?; + let return_type = match str.data_type() { + DataType::LargeBinary => DataType::LargeBinary, + _ => DataType::Binary, + }; + Ok(Arc::new(Field::new( + self.name(), + return_type, + str.is_nullable(), + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + exec_err!("{} should have been simplified", self.name()) + } + + fn simplify( + &self, + args: Vec, + _info: &SimplifyContext, + ) -> Result { + let [bin] = take_function_args(self.name(), args)?; + Ok(ExprSimplifyResult::Simplified(decode( + bin, + lit("base64pad"), + ))) + } +} diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 1f0108cf509c7..8859beca77996 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -16,6 +16,7 @@ // under the License. pub mod ascii; +pub mod base64; pub mod char; pub mod concat; pub mod elt; @@ -32,6 +33,7 @@ use datafusion_functions::make_udf_function; use std::sync::Arc; make_udf_function!(ascii::SparkAscii, ascii); +make_udf_function!(base64::SparkBase64, base64); make_udf_function!(char::CharFunc, char); make_udf_function!(concat::SparkConcat, concat); make_udf_function!(ilike::SparkILike, ilike); @@ -42,6 +44,7 @@ make_udf_function!(luhn_check::SparkLuhnCheck, luhn_check); make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); +make_udf_function!(base64::SparkUnBase64, unbase64); pub mod expr_fn { use datafusion_functions::export_functions; @@ -51,6 +54,11 @@ pub mod expr_fn { "Returns the ASCII code point of the first character of string.", arg1 )); + export_functions!(( + base64, + "Encodes the input binary `bin` into a base64 string.", + bin + )); export_functions!(( char, "Returns the ASCII character having the binary equivalent to col. If col is larger than 256 the result is equivalent to char(col % 256).", @@ -97,11 +105,17 @@ pub mod expr_fn { "Returns the substring from string `str` starting at position `pos` with length `length.", str pos length )); + export_functions!(( + unbase64, + "Decodes the input string `str` from a base64 string into binary data.", + str + )); } pub fn functions() -> Vec> { vec![ ascii(), + base64(), char(), concat(), elt(), @@ -112,5 +126,6 @@ pub fn functions() -> Vec> { format_string(), space(), substring(), + unbase64(), ] } diff --git a/datafusion/sqllogictest/test_files/encoding.slt b/datafusion/sqllogictest/test_files/encoding.slt index 1b1acbc385348..b04d5061825b4 100644 --- a/datafusion/sqllogictest/test_files/encoding.slt +++ b/datafusion/sqllogictest/test_files/encoding.slt @@ -20,21 +20,41 @@ SELECT encode(arrow_cast('tom', 'Utf8View'),'base64'); ---- dG9t +query T +SELECT encode(arrow_cast('tommy', 'Utf8View'),'base64pad'); +---- +dG9tbXk= + query T SELECT arrow_cast(decode(arrow_cast('dG9t', 'Utf8View'),'base64'), 'Utf8'); ---- tom +query T +SELECT arrow_cast(decode(arrow_cast('dG9tbXk=', 'Utf8View'),'base64pad'), 'Utf8'); +---- +tommy + query T SELECT encode(arrow_cast('tom', 'BinaryView'),'base64'); ---- dG9t +query T +SELECT encode(arrow_cast('tommy', 'BinaryView'),'base64pad'); +---- +dG9tbXk= + query T SELECT arrow_cast(decode(arrow_cast('dG9t', 'BinaryView'),'base64'), 'Utf8'); ---- tom +query T +SELECT arrow_cast(decode(arrow_cast('dG9tbXk=', 'BinaryView'),'base64pad'), 'Utf8'); +---- +tommy + # test for hex digest query T select encode(digest('hello', 'sha256'), 'hex'); @@ -61,10 +81,10 @@ select encode(12, 'hex'); query error DataFusion error: Error during planning: Function 'decode' requires TypeSignatureClass::Binary, but received Int64 \(DataType: Int64\) select decode(12, 'hex'); -query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, hex +query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, base64pad, hex select encode('', 'non_encoding'); -query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, hex +query error DataFusion error: Error during planning: There is no built\-in encoding named 'non_encoding', currently supported encodings are: base64, base64pad, hex select decode('', 'non_encoding'); query error DataFusion error: Execution error: Encoding must be a non-null string @@ -124,11 +144,21 @@ select encode(bin_field, 'base64') FROM test WHERE num = 3; ---- j1DT9g6uNw3b+FyGIZxVEIo1AWU +query T +select encode(bin_field, 'base64pad') FROM test WHERE num = 3; +---- +j1DT9g6uNw3b+FyGIZxVEIo1AWU= + query B select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3; ---- true +query B +select decode(encode(bin_field, 'base64pad'), 'base64pad') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3; +---- +true + statement ok drop table test @@ -144,18 +174,20 @@ FROM VALUES ('Raphael', 'R'), (NULL, 'R'); -query TTTT +query TTTTTT SELECT encode(column1_utf8view, 'base64') AS column1_base64, + encode(column1_utf8view, 'base64pad') AS column1_base64pad, encode(column1_utf8view, 'hex') AS column1_hex, encode(column2_utf8view, 'base64') AS column2_base64, + encode(column2_utf8view, 'base64pad') AS column2_base64pad, encode(column2_utf8view, 'hex') AS column2_hex FROM test_utf8view; ---- -QW5kcmV3 416e64726577 WA 58 -WGlhbmdwZW5n 5869616e6770656e67 WGlhbmdwZW5n 5869616e6770656e67 -UmFwaGFlbA 5261706861656c Ug 52 -NULL NULL Ug 52 +QW5kcmV3 QW5kcmV3 416e64726577 WA WA== 58 +WGlhbmdwZW5n WGlhbmdwZW5n 5869616e6770656e67 WGlhbmdwZW5n WGlhbmdwZW5n 5869616e6770656e67 +UmFwaGFlbA UmFwaGFlbA== 5261706861656c Ug Ug== 52 +NULL NULL NULL Ug Ug== 52 query TTTTTT SELECT @@ -172,6 +204,22 @@ WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA UmFwaGFlbA NULL NULL NULL NULL NULL NULL + +query TTTTTT +SELECT + encode(arrow_cast(column1_utf8view, 'Utf8'), 'base64pad'), + encode(arrow_cast(column1_utf8view, 'LargeUtf8'), 'base64pad'), + encode(arrow_cast(column1_utf8view, 'Utf8View'), 'base64pad'), + encode(arrow_cast(column1_utf8view, 'Binary'), 'base64pad'), + encode(arrow_cast(column1_utf8view, 'LargeBinary'), 'base64pad'), + encode(arrow_cast(column1_utf8view, 'BinaryView'), 'base64pad') +FROM test_utf8view; +---- +QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 QW5kcmV3 +WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n WGlhbmdwZW5n +UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== UmFwaGFlbA== +NULL NULL NULL NULL NULL NULL + statement ok drop table test_utf8view @@ -180,26 +228,31 @@ statement ok CREATE TABLE test_fsb AS SELECT arrow_cast(X'0123456789ABCDEF', 'FixedSizeBinary(8)') as fsb_col; -query ?? +query ??? SELECT decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'base64'), 'base64'), + decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'base64pad'), 'base64pad'), decode(encode(arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)'), 'hex'), 'hex'); ---- -0123456789abcdef 0123456789abcdef +0123456789abcdef 0123456789abcdef 0123456789abcdef -query ?? +query ??? SELECT decode(encode(column1, 'base64'), 'base64'), + decode(encode(column1, 'base64pad'), 'base64pad'), decode(encode(column1, 'hex'), 'hex') FROM values (arrow_cast(X'0123456789abcdef', 'FixedSizeBinary(8)')), (arrow_cast(X'ffffffffffffffff', 'FixedSizeBinary(8)')); ---- -0123456789abcdef 0123456789abcdef -ffffffffffffffff ffffffffffffffff +0123456789abcdef 0123456789abcdef 0123456789abcdef +ffffffffffffffff ffffffffffffffff ffffffffffffffff query error DataFusion error: Execution error: Failed to decode value using base64 select decode('invalid', 'base64'); +query error DataFusion error: Execution error: Failed to decode value using base64pad +select decode('invalid', 'base64pad'); + query error DataFusion error: Execution error: Failed to decode value using hex select decode('invalid', 'hex'); diff --git a/datafusion/sqllogictest/test_files/spark/string/base64.slt b/datafusion/sqllogictest/test_files/spark/string/base64.slt index 66edbe8442158..03b488de0ee9a 100644 --- a/datafusion/sqllogictest/test_files/spark/string/base64.slt +++ b/datafusion/sqllogictest/test_files/spark/string/base64.slt @@ -15,18 +15,101 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT base64('Spark SQL'); -## PySpark 3.5.5 Result: {'base64(Spark SQL)': 'U3BhcmsgU1FM', 'typeof(base64(Spark SQL))': 'string', 'typeof(Spark SQL)': 'string'} -#query -#SELECT base64('Spark SQL'::string); - -## Original Query: SELECT base64(x'537061726b2053514c'); -## PySpark 3.5.5 Result: {"base64(X'537061726B2053514C')": 'U3BhcmsgU1FM', "typeof(base64(X'537061726B2053514C'))": 'string', "typeof(X'537061726B2053514C')": 'binary'} -#query -#SELECT base64(X'537061726B2053514C'::binary); +query T +SELECT base64('Spark SQL'::string); +---- +U3BhcmsgU1FM + +query T +SELECT base64('Spark SQ'::string); +---- +U3BhcmsgU1E= + +query T +SELECT base64('Spark S'::string); +---- +U3BhcmsgUw== + +query T +SELECT base64('Spark SQL'::bytea); +---- +U3BhcmsgU1FM + +query T +SELECT base64(NULL::string); +---- +NULL + +query T +SELECT base64(NULL::bytea); +---- +NULL + +query T +SELECT base64(column1) +FROM VALUES +('Spark SQL'::bytea), +('Spark SQ'::bytea), +('Spark S'::bytea), +(NULL::bytea); +---- +U3BhcmsgU1FM +U3BhcmsgU1E= +U3BhcmsgUw== +NULL + +query error Function 'base64' requires TypeSignatureClass::Binary, but received Int32 \(DataType: Int32\) +SELECT base64(12::integer); + + +query T +SELECT arrow_cast(unbase64('U3BhcmsgU1FM'::string), 'Utf8'); +---- +Spark SQL + +query T +SELECT arrow_cast(unbase64('U3BhcmsgU1E='::string), 'Utf8'); +---- +Spark SQ + +query T +SELECT arrow_cast(unbase64('U3BhcmsgUw=='::string), 'Utf8'); +---- +Spark S + +query T +SELECT arrow_cast(unbase64('U3BhcmsgU1FM'::bytea), 'Utf8'); +---- +Spark SQL + +query ? +SELECT unbase64(NULL::string); +---- +NULL + +query ? +SELECT unbase64(NULL::bytea); +---- +NULL + +query T +SELECT arrow_cast(unbase64(column1), 'Utf8') +FROM VALUES +('U3BhcmsgU1FM'::string), +('U3BhcmsgU1E='::string), +('U3BhcmsgUw=='::string), +(NULL::string); +---- +Spark SQL +Spark SQ +Spark S +NULL + +query error Failed to decode value using base64 +SELECT unbase64('123'::string); + +query error Failed to decode value using base64 +SELECT unbase64('123'::bytea); + +query error Function 'unbase64' requires TypeSignatureClass::Binary, but received Int32 \(DataType: Int32\) +SELECT unbase64(12::integer); diff --git a/datafusion/sqllogictest/test_files/spark/string/unbase64.slt b/datafusion/sqllogictest/test_files/spark/string/unbase64.slt deleted file mode 100644 index 5cf3fbee0455d..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/string/unbase64.slt +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT unbase64('U3BhcmsgU1FM'); -## PySpark 3.5.5 Result: {'unbase64(U3BhcmsgU1FM)': bytearray(b'Spark SQL'), 'typeof(unbase64(U3BhcmsgU1FM))': 'binary', 'typeof(U3BhcmsgU1FM)': 'string'} -#query -#SELECT unbase64('U3BhcmsgU1FM'::string); diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 605c3285c322c..473d7ad84b982 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2175,7 +2175,7 @@ encode(expression, format) #### Arguments - **expression**: Expression containing string or binary data -- **format**: Supported formats are: `base64`, `hex` +- **format**: Supported formats are: `base64`, `base64pad`, `hex` **Related functions**: