From 160c244974c8cb148221a1e60d584a1f01d0cf88 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Sat, 10 Oct 2020 22:27:44 +0200 Subject: [PATCH 1/5] ARROW-10249 - Support nested dictionaries inside list arrays for arrow json reader --- rust/arrow/src/json/reader.rs | 298 ++++++++++++++---- .../test/data/list_string_dict_nested.json | 3 + 2 files changed, 238 insertions(+), 63 deletions(-) create mode 100644 rust/arrow/test/data/list_string_dict_nested.json diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index aaec8459102..fa570771bbf 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -532,64 +532,18 @@ impl Reader { DataType::Float64 => self.build_list_array::(rows, field.name()), DataType::Null => unimplemented!(), DataType::Boolean => self.build_boolean_list_array(rows, field.name()), - DataType::Utf8 => { - let values_builder = StringBuilder::new(rows.len() * 5); - let mut builder = ListBuilder::new(values_builder); - for row in rows { - if let Some(value) = row.get(field.name()) { - // value can be an array or a scalar - let vals: Vec> = if let Value::String(v) = value { - vec![Some(v.to_string())] - } else if let Value::Array(n) = value { - n.iter().map(|v: &Value| { - if v.is_string() { - Some(v.as_str().unwrap().to_string()) - } else if v.is_array() || v.is_object() { - // implicitly drop nested values - // TODO support deep-nesting - None - } else { - Some(v.to_string()) - } - }).collect() - } else if let Value::Null = value { - vec![None] - } else if !value.is_object() { - vec![Some(value.to_string())] - } else { - return Err(ArrowError::JsonError("Only scalars are currently supported in JSON arrays".to_string())); - }; - for val in vals { - if let Some(v) = val { - builder.values().append_value(&v)? - } else { - builder.values().append_null()? - }; - } - } - builder.append(true)? - } - Ok(Arc::new(builder.finish()) as ArrayRef) - } - _ => Err(ArrowError::JsonError("Data type is currently not supported in a list".to_string())), + ref dtype @ DataType::Utf8 => { + // UInt64Type passed down below is a fake type for dictionary builder. + // It is there to make compiler happy. + self.list_array_string_array_builder::(&dtype, field.name(), rows) + }, + DataType::Dictionary(ref key_ty, _) => { + self.build_wrapped_list_array(rows, field.name(), key_ty) + }, + ref e => Err(ArrowError::JsonError(format!("Data type is currently not supported in a list : {:?}", e))), }, - DataType::Dictionary(ref key_typ, ref value_type) => { - if let DataType::Utf8 = **value_type { - match **key_typ { - DataType::Int8 => self.build_dictionary_array::(rows, field.name()), - DataType::Int16 => self.build_dictionary_array::(rows, field.name()), - DataType::Int32 => self.build_dictionary_array::(rows, field.name()), - DataType::Int64 => self.build_dictionary_array::(rows, field.name()), - DataType::UInt8 => self.build_dictionary_array::(rows, field.name()), - DataType::UInt16 => self.build_dictionary_array::(rows, field.name()), - DataType::UInt32 => self.build_dictionary_array::(rows, field.name()), - DataType::UInt64 => self.build_dictionary_array::(rows, field.name()), - _ => Err(ArrowError::JsonError("unsupported dictionary key type".to_string())) - } - } else { - Err(ArrowError::JsonError("dictionary types other than UTF-8 not yet supported".to_string())) - } - } + DataType::Dictionary(ref key_ty, ref val_ty) => + self.build_string_dictionary_array(rows, field.name(), key_ty, val_ty), DataType::Struct(_) => Err(ArrowError::JsonError("struct types are not yet supported".to_string())), _ => Err(ArrowError::JsonError(format!("{:?} type is not supported", field.data_type()))), } @@ -612,6 +566,175 @@ impl Reader { arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some)) } + fn build_wrapped_list_array( + &self, + rows: &[Value], + col_name: &str, + key_type: &DataType + ) -> Result { + match *key_type { + DataType::Int8 => { + let dtype = DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + DataType::Int16 => { + let dtype = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + DataType::Int32 => { + let dtype = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + DataType::Int64 => { + let dtype = DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + DataType::UInt8 => { + let dtype = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + DataType::UInt16 => { + let dtype = DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + } + DataType::UInt32 => { + let dtype = DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + } + DataType::UInt64 => { + let dtype = DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)); + self.list_array_string_array_builder::(&dtype, col_name, rows) + }, + ref e => Err(ArrowError::JsonError(format!("Data type is currently not supported for dictionaries in list : {:?}", e))), + } + } + + #[inline(always)] + fn list_array_string_array_builder( + &self, + data_type: &DataType, + col_name: &str, + rows: &[Value] + ) -> Result + where + DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType + { + let builder: Box = match data_type { + DataType::Utf8 => { + let values_builder = StringBuilder::new(rows.len() * 5); + Box::new(ListBuilder::new(values_builder)) + }, + DataType::Dictionary(_, _) => { + let values_builder = self.build_string_dictionary_builder::(rows.len() * 5)?; + Box::new(ListBuilder::new(values_builder)) + } + e => return Err(ArrowError::JsonError( + format!("Nested list data builder type is not supported: {:?}", e) + )) + }; + let mut builder = Box::leak(builder); + + for row in rows { + if let Some(value) = row.get(col_name) { + // value can be an array or a scalar + let vals: Vec> = if let Value::String(v) = value { + vec![Some(v.to_string())] + } else if let Value::Array(n) = value { + n.iter().map(|v: &Value| { + if v.is_string() { + Some(v.as_str().unwrap().to_string()) + } else if v.is_array() || v.is_object() { + // implicitly drop nested values + // TODO support deep-nesting + None + } else { + Some(v.to_string()) + } + }).collect() + } else if let Value::Null = value { + vec![None] + } else if !value.is_object() { + vec![Some(value.to_string())] + } else { + return Err(ArrowError::JsonError("Only scalars are currently supported in JSON arrays".to_string())); + }; + + // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them. + match data_type { + DataType::Utf8 => { + let builder: &mut &mut ListBuilder = unsafe { std::mem::transmute(&mut builder) }; + for val in vals { + if let Some(v) = val { + builder.values().append_value(&v)? + } else { + builder.values().append_null()? + }; + } + + // Amend to the list + builder.append(true)?; + }, + DataType::Dictionary(_, _) => { + let builder: &mut &mut ListBuilder> = unsafe { std::mem::transmute(&mut builder) }; + for val in vals { + if let Some(v) = val { + let _ = builder.values().append(&v)?; + } else { + builder.values().append_null()? + }; + } + + // Amend to the list + builder.append(true)?; + }, + e => return Err(ArrowError::JsonError( + format!("Nested list data builder type is not supported: {:?}", e) + )) + } + + } + } + unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) } + } + + #[inline(always)] + fn build_string_dictionary_builder( + &self, + row_len: usize + ) -> Result> + where + T: ArrowPrimitiveType + ArrowDictionaryKeyType + { + let key_builder = PrimitiveBuilder::::new(row_len); + let values_builder = StringBuilder::new(row_len * 5); + Ok(StringDictionaryBuilder::new(key_builder, values_builder)) + } + + #[inline(always)] + fn build_string_dictionary_array( + &self, + rows: &[Value], + col_name: &str, + key_type: &DataType, + value_type: &DataType + ) -> Result { + if let DataType::Utf8 = *value_type { + match *key_type { + DataType::Int8 => self.build_dictionary_array::(rows, col_name), + DataType::Int16 => self.build_dictionary_array::(rows, col_name), + DataType::Int32 => self.build_dictionary_array::(rows, col_name), + DataType::Int64 => self.build_dictionary_array::(rows, col_name), + DataType::UInt8 => self.build_dictionary_array::(rows, col_name), + DataType::UInt16 => self.build_dictionary_array::(rows, col_name), + DataType::UInt32 => self.build_dictionary_array::(rows, col_name), + DataType::UInt64 => self.build_dictionary_array::(rows, col_name), + _ => Err(ArrowError::JsonError("unsupported dictionary key type".to_string())) + } + } else { + Err(ArrowError::JsonError("dictionary types other than UTF-8 not yet supported".to_string())) + } + } + fn build_boolean_array(&self, rows: &[Value], col_name: &str) -> Result { let mut builder = BooleanBuilder::new(rows.len()); for row in rows { @@ -722,18 +845,18 @@ impl Reader { Ok(Arc::new(builder.finish())) } - fn build_dictionary_array( + #[inline(always)] + fn build_dictionary_array( &self, rows: &[Value], col_name: &str, ) -> Result where T::Native: num::NumCast, - T: ArrowDictionaryKeyType, + T: ArrowPrimitiveType + ArrowDictionaryKeyType, { - let key_builder = PrimitiveBuilder::::new(rows.len()); - let value_builder = StringBuilder::new(100); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); + let mut builder: StringDictionaryBuilder = + self.build_string_dictionary_builder(rows.len())?; for row in rows { if let Some(value) = row.get(&col_name) { if let Some(str_v) = value.as_str() { @@ -855,7 +978,7 @@ impl ReaderBuilder { #[cfg(test)] mod tests { - use crate::datatypes::DataType::Dictionary; + use crate::datatypes::DataType::{Dictionary, List}; use super::*; use flate2::read::GzDecoder; @@ -1385,6 +1508,54 @@ mod tests { ); } + #[test] + fn test_list_of_string_dictionary_from_json() { + let schema = Schema::new(vec![Field::new( + "events", + List(Box::new(Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)))), + true, + )]); + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .with_batch_size(64); + let mut reader: Reader = builder + .build::(File::open("test/data/list_string_dict_nested.json").unwrap()) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + + assert_eq!(1, batch.num_columns()); + assert_eq!(3, batch.num_rows()); + + let schema = reader.schema(); + let batch_schema = batch.schema(); + assert_eq!(schema, batch_schema); + + let events = schema.column_with_name("events").unwrap(); + assert_eq!( + &List(Box::new(Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)))), + events.1.data_type() + ); + + let evs_list = batch + .column(events.0) + .as_any() + .downcast_ref::() + .unwrap(); + let evs_list = evs_list.values(); + let evs_list = evs_list.as_any().downcast_ref::>().unwrap(); + assert_eq!(6, evs_list.len()); + assert_eq!(true, evs_list.is_valid(1)); + assert_eq!(DataType::Utf8, evs_list.value_type()); + + // dict from the events list + let dict_el = evs_list.values(); + let dict_el = dict_el.as_any().downcast_ref::().unwrap(); + assert_eq!(3, dict_el.len()); + assert_eq!("Elect Leader", dict_el.value(0)); + assert_eq!("Do Ballot", dict_el.value(1)); + assert_eq!("Send Data", dict_el.value(2)); + } + #[test] fn test_dictionary_from_json_uint8() { let schema = Schema::new(vec![Field::new( @@ -1471,6 +1642,7 @@ mod tests { d.1.data_type() ); } + #[test] fn test_with_multiple_batches() { let builder = ReaderBuilder::new() diff --git a/rust/arrow/test/data/list_string_dict_nested.json b/rust/arrow/test/data/list_string_dict_nested.json new file mode 100644 index 00000000000..d215b318bae --- /dev/null +++ b/rust/arrow/test/data/list_string_dict_nested.json @@ -0,0 +1,3 @@ +{"machine": "a", "events": ["Elect Leader", "Do Ballot"]} +{"machine": "b", "events": ["Do Ballot", "Send Data", "Elect Leader"]} +{"machine": "c", "events": ["Send Data"]} From 60055af859aeb99d4f595808d29f7821dda4ce53 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Sat, 10 Oct 2020 22:42:55 +0200 Subject: [PATCH 2/5] Use explicit pointer cast --- rust/arrow/src/json/reader.rs | 446 +++++++++++++++++++++++----------- 1 file changed, 299 insertions(+), 147 deletions(-) diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index fa570771bbf..3eabfb51135 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -448,107 +448,185 @@ impl Reader { let rows = &rows[..]; let projection = self.projection.clone().unwrap_or_else(Vec::new); - let arrays: Result> = self - .schema - .clone() - .fields() - .iter() - .filter(|field| { - if projection.is_empty() { - return true; - } - projection.contains(field.name()) - }) - .map(|field| { - match field.data_type().clone() { - DataType::Null => unimplemented!(), - DataType::Boolean => self.build_boolean_array(rows, field.name()), - DataType::Float64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Float32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::Int64 => self.build_primitive_array::(rows, field.name()), - DataType::Int32 => self.build_primitive_array::(rows, field.name()), - DataType::Int16 => self.build_primitive_array::(rows, field.name()), - DataType::Int8 => self.build_primitive_array::(rows, field.name()), - DataType::UInt64 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt32 => { - self.build_primitive_array::(rows, field.name()) - } - DataType::UInt16 => { - self.build_primitive_array::(rows, field.name()) + let arrays: Result> = + self.schema + .clone() + .fields() + .iter() + .filter(|field| { + if projection.is_empty() { + return true; } - DataType::UInt8 => self.build_primitive_array::(rows, field.name()), - DataType::Timestamp(unit, _) => - match unit { - TimeUnit::Second => self.build_primitive_array::(rows, field.name()), - TimeUnit::Microsecond => self.build_primitive_array::(rows, field.name()), - TimeUnit::Millisecond => self.build_primitive_array::(rows, field.name()), - TimeUnit::Nanosecond => self.build_primitive_array::(rows, field.name()), + projection.contains(field.name()) + }) + .map(|field| { + match field.data_type().clone() { + DataType::Null => unimplemented!(), + DataType::Boolean => self.build_boolean_array(rows, field.name()), + DataType::Float64 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Float32 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Int64 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Int32 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Int16 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Int8 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::UInt64 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::UInt32 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::UInt16 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::UInt8 => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => self + .build_primitive_array::( + rows, + field.name(), + ), + TimeUnit::Microsecond => self + .build_primitive_array::( + rows, + field.name(), + ), + TimeUnit::Millisecond => self + .build_primitive_array::( + rows, + field.name(), + ), + TimeUnit::Nanosecond => self + .build_primitive_array::( + rows, + field.name(), + ), }, - DataType::Date64(_) => self.build_primitive_array::(rows, field.name()), - DataType::Date32(_) => self.build_primitive_array::(rows, field.name()), - DataType::Time64(unit) => - match unit { - TimeUnit::Microsecond => self.build_primitive_array::(rows, field.name()), - TimeUnit::Nanosecond => self.build_primitive_array::(rows, field.name()), + DataType::Date64(_) => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Date32(_) => { + self.build_primitive_array::(rows, field.name()) + } + DataType::Time64(unit) => match unit { + TimeUnit::Microsecond => self + .build_primitive_array::( + rows, + field.name(), + ), + TimeUnit::Nanosecond => self + .build_primitive_array::( + rows, + field.name(), + ), _ => unimplemented!(), }, - DataType::Time32(unit) => - match unit { - TimeUnit::Second => self.build_primitive_array::(rows, field.name()), - TimeUnit::Millisecond => self.build_primitive_array::(rows, field.name()), + DataType::Time32(unit) => match unit { + TimeUnit::Second => self + .build_primitive_array::( + rows, + field.name(), + ), + TimeUnit::Millisecond => self + .build_primitive_array::( + rows, + field.name(), + ), _ => unimplemented!(), }, - DataType::Utf8 => { - let mut builder = StringBuilder::new(rows.len()); - for row in rows { - if let Some(value) = row.get(field.name()) { - if let Some(str_v) = value.as_str() { - builder.append_value(str_v)? + DataType::Utf8 => { + let mut builder = StringBuilder::new(rows.len()); + for row in rows { + if let Some(value) = row.get(field.name()) { + if let Some(str_v) = value.as_str() { + builder.append_value(str_v)? + } else { + builder.append(false)? + } } else { builder.append(false)? } - } else { - builder.append(false)? + } + Ok(Arc::new(builder.finish()) as ArrayRef) + } + DataType::List(ref t) => { + match **t { + DataType::Int8 => { + self.build_list_array::(rows, field.name()) + } + DataType::Int16 => { + self.build_list_array::(rows, field.name()) + } + DataType::Int32 => { + self.build_list_array::(rows, field.name()) + } + DataType::Int64 => { + self.build_list_array::(rows, field.name()) + } + DataType::UInt8 => { + self.build_list_array::(rows, field.name()) + } + DataType::UInt16 => self + .build_list_array::(rows, field.name()), + DataType::UInt32 => self + .build_list_array::(rows, field.name()), + DataType::UInt64 => self + .build_list_array::(rows, field.name()), + DataType::Float32 => self + .build_list_array::(rows, field.name()), + DataType::Float64 => self + .build_list_array::(rows, field.name()), + DataType::Null => unimplemented!(), + DataType::Boolean => { + self.build_boolean_list_array(rows, field.name()) + } + ref dtype @ DataType::Utf8 => { + // UInt64Type passed down below is a fake type for dictionary builder. + // It is there to make compiler happy. + self.list_array_string_array_builder::( + &dtype, + field.name(), + rows, + ) + } + DataType::Dictionary(ref key_ty, _) => self + .build_wrapped_list_array(rows, field.name(), key_ty), + ref e => Err(ArrowError::JsonError(format!( + "Data type is currently not supported in a list : {:?}", + e + ))), } } - Ok(Arc::new(builder.finish()) as ArrayRef) + DataType::Dictionary(ref key_ty, ref val_ty) => self + .build_string_dictionary_array( + rows, + field.name(), + key_ty, + val_ty, + ), + DataType::Struct(_) => Err(ArrowError::JsonError( + "struct types are not yet supported".to_string(), + )), + _ => Err(ArrowError::JsonError(format!( + "{:?} type is not supported", + field.data_type() + ))), } - DataType::List(ref t) => match **t { - DataType::Int8 => self.build_list_array::(rows, field.name()), - DataType::Int16 => self.build_list_array::(rows, field.name()), - DataType::Int32 => self.build_list_array::(rows, field.name()), - DataType::Int64 => self.build_list_array::(rows, field.name()), - DataType::UInt8 => self.build_list_array::(rows, field.name()), - DataType::UInt16 => self.build_list_array::(rows, field.name()), - DataType::UInt32 => self.build_list_array::(rows, field.name()), - DataType::UInt64 => self.build_list_array::(rows, field.name()), - DataType::Float32 => self.build_list_array::(rows, field.name()), - DataType::Float64 => self.build_list_array::(rows, field.name()), - DataType::Null => unimplemented!(), - DataType::Boolean => self.build_boolean_list_array(rows, field.name()), - ref dtype @ DataType::Utf8 => { - // UInt64Type passed down below is a fake type for dictionary builder. - // It is there to make compiler happy. - self.list_array_string_array_builder::(&dtype, field.name(), rows) - }, - DataType::Dictionary(ref key_ty, _) => { - self.build_wrapped_list_array(rows, field.name(), key_ty) - }, - ref e => Err(ArrowError::JsonError(format!("Data type is currently not supported in a list : {:?}", e))), - }, - DataType::Dictionary(ref key_ty, ref val_ty) => - self.build_string_dictionary_array(rows, field.name(), key_ty, val_ty), - DataType::Struct(_) => Err(ArrowError::JsonError("struct types are not yet supported".to_string())), - _ => Err(ArrowError::JsonError(format!("{:?} type is not supported", field.data_type()))), - } - }) - .collect(); + }) + .collect(); let projected_fields: Vec = if projection.is_empty() { self.schema.fields().to_vec() @@ -570,42 +648,69 @@ impl Reader { &self, rows: &[Value], col_name: &str, - key_type: &DataType + key_type: &DataType, ) -> Result { match *key_type { DataType::Int8 => { - let dtype = DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::Int8), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, + } DataType::Int16 => { - let dtype = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::Int16), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, + } DataType::Int32 => { - let dtype = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, + } DataType::Int64 => { - let dtype = DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::Int64), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, + } DataType::UInt8 => { - let dtype = DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::UInt8), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, + } DataType::UInt16 => { - let dtype = DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::UInt16), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) } DataType::UInt32 => { - let dtype = DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::UInt32), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) } DataType::UInt64 => { - let dtype = DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)); + let dtype = DataType::Dictionary( + Box::new(DataType::UInt64), + Box::new(DataType::Utf8), + ); self.list_array_string_array_builder::(&dtype, col_name, rows) - }, - ref e => Err(ArrowError::JsonError(format!("Data type is currently not supported for dictionaries in list : {:?}", e))), + } + ref e => Err(ArrowError::JsonError(format!( + "Data type is currently not supported for dictionaries in list : {:?}", + e + ))), } } @@ -614,23 +719,27 @@ impl Reader { &self, data_type: &DataType, col_name: &str, - rows: &[Value] + rows: &[Value], ) -> Result where - DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType + DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType, { let builder: Box = match data_type { DataType::Utf8 => { let values_builder = StringBuilder::new(rows.len() * 5); Box::new(ListBuilder::new(values_builder)) - }, + } DataType::Dictionary(_, _) => { - let values_builder = self.build_string_dictionary_builder::(rows.len() * 5)?; + let values_builder = + self.build_string_dictionary_builder::(rows.len() * 5)?; Box::new(ListBuilder::new(values_builder)) } - e => return Err(ArrowError::JsonError( - format!("Nested list data builder type is not supported: {:?}", e) - )) + e => { + return Err(ArrowError::JsonError(format!( + "Nested list data builder type is not supported: {:?}", + e + ))) + } }; let mut builder = Box::leak(builder); @@ -640,29 +749,36 @@ impl Reader { let vals: Vec> = if let Value::String(v) = value { vec![Some(v.to_string())] } else if let Value::Array(n) = value { - n.iter().map(|v: &Value| { - if v.is_string() { - Some(v.as_str().unwrap().to_string()) - } else if v.is_array() || v.is_object() { - // implicitly drop nested values - // TODO support deep-nesting - None - } else { - Some(v.to_string()) - } - }).collect() + n.iter() + .map(|v: &Value| { + if v.is_string() { + Some(v.as_str().unwrap().to_string()) + } else if v.is_array() || v.is_object() { + // implicitly drop nested values + // TODO support deep-nesting + None + } else { + Some(v.to_string()) + } + }) + .collect() } else if let Value::Null = value { vec![None] } else if !value.is_object() { vec![Some(value.to_string())] } else { - return Err(ArrowError::JsonError("Only scalars are currently supported in JSON arrays".to_string())); + return Err(ArrowError::JsonError( + "Only scalars are currently supported in JSON arrays".to_string(), + )); }; // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them. match data_type { DataType::Utf8 => { - let builder: &mut &mut ListBuilder = unsafe { std::mem::transmute(&mut builder) }; + let builder: &mut &mut ListBuilder = unsafe { + &mut *(&mut builder as *mut &mut dyn ArrayBuilder + as *mut &mut ListBuilder) + }; for val in vals { if let Some(v) = val { builder.values().append_value(&v)? @@ -673,9 +789,16 @@ impl Reader { // Amend to the list builder.append(true)?; - }, + } DataType::Dictionary(_, _) => { - let builder: &mut &mut ListBuilder> = unsafe { std::mem::transmute(&mut builder) }; + let builder: &mut &mut ListBuilder< + StringDictionaryBuilder, + > = unsafe { + &mut *(&mut builder as *mut &mut dyn ArrayBuilder + as *mut &mut ListBuilder< + StringDictionaryBuilder, + >) + }; for val in vals { if let Some(v) = val { let _ = builder.values().append(&v)?; @@ -686,12 +809,14 @@ impl Reader { // Amend to the list builder.append(true)?; - }, - e => return Err(ArrowError::JsonError( - format!("Nested list data builder type is not supported: {:?}", e) - )) + } + e => { + return Err(ArrowError::JsonError(format!( + "Nested list data builder type is not supported: {:?}", + e + ))) + } } - } } unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) } @@ -700,10 +825,10 @@ impl Reader { #[inline(always)] fn build_string_dictionary_builder( &self, - row_len: usize + row_len: usize, ) -> Result> where - T: ArrowPrimitiveType + ArrowDictionaryKeyType + T: ArrowPrimitiveType + ArrowDictionaryKeyType, { let key_builder = PrimitiveBuilder::::new(row_len); let values_builder = StringBuilder::new(row_len * 5); @@ -716,22 +841,40 @@ impl Reader { rows: &[Value], col_name: &str, key_type: &DataType, - value_type: &DataType + value_type: &DataType, ) -> Result { if let DataType::Utf8 = *value_type { match *key_type { DataType::Int8 => self.build_dictionary_array::(rows, col_name), - DataType::Int16 => self.build_dictionary_array::(rows, col_name), - DataType::Int32 => self.build_dictionary_array::(rows, col_name), - DataType::Int64 => self.build_dictionary_array::(rows, col_name), - DataType::UInt8 => self.build_dictionary_array::(rows, col_name), - DataType::UInt16 => self.build_dictionary_array::(rows, col_name), - DataType::UInt32 => self.build_dictionary_array::(rows, col_name), - DataType::UInt64 => self.build_dictionary_array::(rows, col_name), - _ => Err(ArrowError::JsonError("unsupported dictionary key type".to_string())) + DataType::Int16 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::Int32 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::Int64 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::UInt8 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::UInt16 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::UInt32 => { + self.build_dictionary_array::(rows, col_name) + } + DataType::UInt64 => { + self.build_dictionary_array::(rows, col_name) + } + _ => Err(ArrowError::JsonError( + "unsupported dictionary key type".to_string(), + )), } } else { - Err(ArrowError::JsonError("dictionary types other than UTF-8 not yet supported".to_string())) + Err(ArrowError::JsonError( + "dictionary types other than UTF-8 not yet supported".to_string(), + )) } } @@ -1512,7 +1655,10 @@ mod tests { fn test_list_of_string_dictionary_from_json() { let schema = Schema::new(vec![Field::new( "events", - List(Box::new(Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)))), + List(Box::new(Dictionary( + Box::new(DataType::UInt64), + Box::new(DataType::Utf8), + ))), true, )]); let builder = ReaderBuilder::new() @@ -1532,7 +1678,10 @@ mod tests { let events = schema.column_with_name("events").unwrap(); assert_eq!( - &List(Box::new(Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)))), + &List(Box::new(Dictionary( + Box::new(DataType::UInt64), + Box::new(DataType::Utf8) + ))), events.1.data_type() ); @@ -1542,7 +1691,10 @@ mod tests { .downcast_ref::() .unwrap(); let evs_list = evs_list.values(); - let evs_list = evs_list.as_any().downcast_ref::>().unwrap(); + let evs_list = evs_list + .as_any() + .downcast_ref::>() + .unwrap(); assert_eq!(6, evs_list.len()); assert_eq!(true, evs_list.is_valid(1)); assert_eq!(DataType::Utf8, evs_list.value_type()); From 26f74cd5562fdb128d173bb27efaecd04d275983 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Mon, 12 Oct 2020 23:13:30 +0200 Subject: [PATCH 3/5] Reword the comment --- rust/arrow/src/json/reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 3eabfb51135..670ea665915 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -787,7 +787,7 @@ impl Reader { }; } - // Amend to the list + // Append to the list builder.append(true)?; } DataType::Dictionary(_, _) => { @@ -807,7 +807,7 @@ impl Reader { }; } - // Amend to the list + // Append to the list builder.append(true)?; } e => { From afc070df6c8b2c0b983d6a79cbdb1541555c0402 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Sun, 18 Oct 2020 19:33:13 +0200 Subject: [PATCH 4/5] Write docs for methods --- rust/arrow/src/json/reader.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 670ea665915..ac115ef7129 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -741,6 +741,11 @@ impl Reader { ))) } }; + // Builders have different APIs if ArrayBuilder has covered the methods we've used dynamic + // dispatch without leaking the pointer and reacquire back for dropping. + // XXX: Be careful about not prolonging this reference's lifetime outside of this method. + // Before exiting from this method we reacquire the Box back again and dtor is called + // properly. There is no leak for it. let mut builder = Box::leak(builder); for row in rows { @@ -772,9 +777,13 @@ impl Reader { )); }; - // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them. + // TODO: ARROW-10335: APIs of dictionary arrays and others are different. Unify + // them. match data_type { DataType::Utf8 => { + // instead of using transmute, we are using casts for reference to array + // builder. This is just a clippy lint called "transmute_ptr_to_ptr" which + // makes sense here. let builder: &mut &mut ListBuilder = unsafe { &mut *(&mut builder as *mut &mut dyn ArrayBuilder as *mut &mut ListBuilder) @@ -789,8 +798,12 @@ impl Reader { // Append to the list builder.append(true)?; + // Mutable ref of ref is dropped here. } DataType::Dictionary(_, _) => { + // instead of using transmute, we are using casts for reference to array + // builder. This is just a clippy lint called "transmute_ptr_to_ptr" which + // makes sense here. let builder: &mut &mut ListBuilder< StringDictionaryBuilder, > = unsafe { @@ -809,6 +822,7 @@ impl Reader { // Append to the list builder.append(true)?; + // Mutable ref of ref is dropped here. } e => { return Err(ArrowError::JsonError(format!( @@ -819,6 +833,11 @@ impl Reader { } } } + + // Actual off heap pointer acquired back again and dtor called after the `finish` method. + // XXX: Mind that this method is not breaking the lifetime rules, since it is all happening + // inside of this method's body. As already mentioned before, don't return any raw pointer + // beyond this method, or pass it down to the called methods of this method. unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) } } From a0b488cf047a2f1a68485d4c9026d8d9038b8766 Mon Sep 17 00:00:00 2001 From: Mahmut Bulut Date: Tue, 20 Oct 2020 23:38:29 +0200 Subject: [PATCH 5/5] Remove unsafe code This will slow down parsing on nested structures drastically because of RefCell overhead on every array block which is going to be appended to the builder. --- rust/arrow/src/json/reader.rs | 50 +++++++++++------------------------ 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index ac115ef7129..a8375dcb33e 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -44,6 +44,7 @@ use indexmap::map::IndexMap as HashMap; use indexmap::set::IndexSet as HashSet; +use std::cell::RefCell; use std::io::{BufRead, BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; @@ -724,15 +725,15 @@ impl Reader { where DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType, { - let builder: Box = match data_type { + let mut builder: RefCell> = match data_type { DataType::Utf8 => { let values_builder = StringBuilder::new(rows.len() * 5); - Box::new(ListBuilder::new(values_builder)) + RefCell::new(Box::new(ListBuilder::new(values_builder))) } DataType::Dictionary(_, _) => { let values_builder = self.build_string_dictionary_builder::(rows.len() * 5)?; - Box::new(ListBuilder::new(values_builder)) + RefCell::new(Box::new(ListBuilder::new(values_builder))) } e => { return Err(ArrowError::JsonError(format!( @@ -741,12 +742,6 @@ impl Reader { ))) } }; - // Builders have different APIs if ArrayBuilder has covered the methods we've used dynamic - // dispatch without leaking the pointer and reacquire back for dropping. - // XXX: Be careful about not prolonging this reference's lifetime outside of this method. - // Before exiting from this method we reacquire the Box back again and dtor is called - // properly. There is no leak for it. - let mut builder = Box::leak(builder); for row in rows { if let Some(value) = row.get(col_name) { @@ -781,13 +776,13 @@ impl Reader { // them. match data_type { DataType::Utf8 => { - // instead of using transmute, we are using casts for reference to array - // builder. This is just a clippy lint called "transmute_ptr_to_ptr" which - // makes sense here. - let builder: &mut &mut ListBuilder = unsafe { - &mut *(&mut builder as *mut &mut dyn ArrayBuilder - as *mut &mut ListBuilder) - }; + let builder = &mut builder.borrow_mut(); + let builder = builder + .as_any_mut() + .downcast_mut::>() + .ok_or(ArrowError::JsonError( + "Cast failed for ListBuilder during nested data parsing".to_string(), + ))?; for val in vals { if let Some(v) = val { builder.values().append_value(&v)? @@ -798,20 +793,12 @@ impl Reader { // Append to the list builder.append(true)?; - // Mutable ref of ref is dropped here. } DataType::Dictionary(_, _) => { - // instead of using transmute, we are using casts for reference to array - // builder. This is just a clippy lint called "transmute_ptr_to_ptr" which - // makes sense here. - let builder: &mut &mut ListBuilder< - StringDictionaryBuilder, - > = unsafe { - &mut *(&mut builder as *mut &mut dyn ArrayBuilder - as *mut &mut ListBuilder< - StringDictionaryBuilder, - >) - }; + let builder = &mut builder.borrow_mut(); + let builder = builder.as_any_mut().downcast_mut::>>().ok_or(ArrowError::JsonError( + "Cast failed for ListBuilder during nested data parsing".to_string(), + ))?; for val in vals { if let Some(v) = val { let _ = builder.values().append(&v)?; @@ -822,7 +809,6 @@ impl Reader { // Append to the list builder.append(true)?; - // Mutable ref of ref is dropped here. } e => { return Err(ArrowError::JsonError(format!( @@ -834,11 +820,7 @@ impl Reader { } } - // Actual off heap pointer acquired back again and dtor called after the `finish` method. - // XXX: Mind that this method is not breaking the lifetime rules, since it is all happening - // inside of this method's body. As already mentioned before, don't return any raw pointer - // beyond this method, or pass it down to the called methods of this method. - unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) } + Ok(builder.get_mut().finish() as ArrayRef) } #[inline(always)]