From dd90917de8d91a813985616a6baf79fe1a699e83 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 25 Dec 2020 23:56:39 +0000 Subject: [PATCH 1/5] Added bench for cast. --- rust/arrow/benches/cast_kernels.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs index b9e33cce978..c73442d228d 100644 --- a/rust/arrow/benches/cast_kernels.rs +++ b/rust/arrow/benches/cast_kernels.rs @@ -192,6 +192,9 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("cast utf8 to f32", |b| { b.iter(|| cast_array(&f32_utf8_array, DataType::Float32)) + }); + c.bench_function("cast i64 to string 512", |b| { + b.iter(|| cast_array(&i64_array, DataType::Utf8)) }); c.bench_function("cast timestamp_ms to i64 512", |b| { From 15dcf80c0b7c729b948f0bc71cf3363b07a8f3be Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 25 Dec 2020 18:45:55 +0000 Subject: [PATCH 2/5] Improved performance of casting. --- rust/arrow/src/compute/kernels/cast.rs | 62 +++++++++----------------- 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index f1128769525..dc8201bd911 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -351,17 +351,13 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { Float32 => cast_bool_to_numeric::(array), Float64 => cast_bool_to_numeric::(array), Utf8 => { - let from = array.as_any().downcast_ref::().unwrap(); - let mut b = StringBuilder::new(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - b.append(false)?; - } else { - b.append_value(if from.value(i) { "1" } else { "0" })?; - } - } - - Ok(Arc::new(b.finish()) as ArrayRef) + let array = array.as_any().downcast_ref::().unwrap(); + Ok(Arc::new( + array + .iter() + .map(|value| value.map(|value| if value { "1" } else { "0" })) + .collect::(), + )) } _ => Err(ArrowError::ComputeError(format!( "Casting from {:?} to {:?} not supported", @@ -431,20 +427,15 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { Float32 => cast_numeric_to_string::(array), Float64 => cast_numeric_to_string::(array), Binary => { - let from = array.as_any().downcast_ref::().unwrap(); - let mut b = StringBuilder::new(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - b.append_null()?; - } else { - match str::from_utf8(from.value(i)) { - Ok(s) => b.append_value(s)?, - Err(_) => b.append_null()?, // not valid UTF8 - } - } - } - - Ok(Arc::new(b.finish()) as ArrayRef) + let array = array.as_any().downcast_ref::().unwrap(); + Ok(Arc::new( + array + .iter() + .map(|maybe_value| { + maybe_value.and_then(|value| str::from_utf8(value).ok()) + }) + .collect::(), + )) } _ => Err(ArrowError::ComputeError(format!( "Casting from {:?} to {:?} not supported", @@ -892,31 +883,22 @@ where FROM: ArrowNumericType, FROM::Native: std::string::ToString, { - numeric_to_string_cast::( + Ok(Arc::new(numeric_to_string_cast::( array .as_any() .downcast_ref::>() .unwrap(), - ) - .map(|to| Arc::new(to) as ArrayRef) + ))) } -fn numeric_to_string_cast(from: &PrimitiveArray) -> Result +fn numeric_to_string_cast(from: &PrimitiveArray) -> StringArray where T: ArrowPrimitiveType + ArrowNumericType, T::Native: std::string::ToString, { - let mut b = StringBuilder::new(from.len()); - - for i in 0..from.len() { - if from.is_null(i) { - b.append(false)?; - } else { - b.append_value(&from.value(i).to_string())?; - } - } - - Ok(b.finish()) + from.iter() + .map(|maybe_value| maybe_value.map(|value| value.to_string())) + .collect() } /// Cast numeric types to Utf8 From b4691cddbaa664b58c75a693b3b02aa4180406a3 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 26 Dec 2020 05:35:40 +0000 Subject: [PATCH 3/5] Minor improvement. --- rust/arrow/src/compute/kernels/cast.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs index dc8201bd911..6725fc0926c 100644 --- a/rust/arrow/src/compute/kernels/cast.rs +++ b/rust/arrow/src/compute/kernels/cast.rs @@ -2696,11 +2696,8 @@ mod tests { fn test_cast_string_array_to_dict() { use DataType::*; - let mut builder = StringBuilder::new(10); - builder.append_value("one").unwrap(); - builder.append_null().unwrap(); - builder.append_value("three").unwrap(); - let array: ArrayRef = Arc::new(builder.finish()); + let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")])) + as ArrayRef; let expected = vec!["one", "null", "three"]; From 341979f95481f8fd16cf0b79fe7c3697ab643979 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 26 Dec 2020 05:50:17 +0000 Subject: [PATCH 4/5] Extended idea to readers. --- rust/arrow/src/csv/reader.rs | 15 ++++----------- rust/arrow/src/json/reader.rs | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index ac779b0331c..3fca7b23e8c 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -51,7 +51,7 @@ use std::sync::Arc; use csv as csv_crate; -use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringBuilder}; +use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray}; use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::RecordBatch; @@ -449,16 +449,9 @@ fn parse( &DataType::Date64(_) => { build_primitive_array::(line_number, rows, i) } - &DataType::Utf8 => { - let mut builder = StringBuilder::new(rows.len()); - for row in rows.iter() { - match row.get(i) { - Some(s) => builder.append_value(s).unwrap(), - _ => builder.append(false).unwrap(), - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } + &DataType::Utf8 => Ok(Arc::new( + rows.iter().map(|row| row.get(i)).collect::(), + ) as ArrayRef), other => Err(ArrowError::ParseError(format!( "Unsupported data type {:?}", other diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index d43b02cee98..a4e6f932110 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -1115,21 +1115,14 @@ impl Decoder { t ))), }, - DataType::Utf8 => { - let mut builder = StringBuilder::new(rows.len()); - for row in rows { - if let Some(value) = row.get(field.name()) { - if let Some(str_v) = value.as_str() { - builder.append_value(str_v)? - } else { - builder.append(false)? - } - } else { - builder.append(false)? - } - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } + DataType::Utf8 => Ok(Arc::new( + rows.iter() + .map(|row| { + let maybe_value = row.get(field.name()); + maybe_value.and_then(|value| value.as_str()) + }) + .collect::(), + ) as ArrayRef), DataType::List(ref list_field) => { match list_field.data_type() { DataType::Dictionary(ref key_ty, _) => { From 52e1ecbddd029f3e043f707bafbc89e20c3ef45c Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Thu, 31 Dec 2020 07:14:49 +0000 Subject: [PATCH 5/5] Ben --- rust/arrow/benches/cast_kernels.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs index c73442d228d..81232e5fdc1 100644 --- a/rust/arrow/benches/cast_kernels.rs +++ b/rust/arrow/benches/cast_kernels.rs @@ -192,7 +192,7 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("cast utf8 to f32", |b| { b.iter(|| cast_array(&f32_utf8_array, DataType::Float32)) - }); + }); c.bench_function("cast i64 to string 512", |b| { b.iter(|| cast_array(&i64_array, DataType::Utf8)) });