From 3456eacfb18a0314ca6071994f7b4b3222058765 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 28 Dec 2020 18:00:38 +0800 Subject: [PATCH 01/14] Add custom metadata to Field --- rust/arrow/src/array/builder.rs | 2 +- rust/arrow/src/datatypes.rs | 51 +++++++++++++++++--- rust/arrow/src/ipc/convert.rs | 20 +++++++- rust/datafusion/src/logical_plan/dfschema.rs | 4 +- rust/datafusion/src/physical_plan/planner.rs | 6 ++- 5 files changed, 69 insertions(+), 14 deletions(-) diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs index ac5410c2583..30cce75d00c 100644 --- a/rust/arrow/src/array/builder.rs +++ b/rust/arrow/src/array/builder.rs @@ -2974,7 +2974,7 @@ mod tests { #[test] #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false }) is not currently supported" + expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }) is not currently supported" )] fn test_struct_array_builder_from_schema_unsupported_type() { let mut fields = Vec::new(); diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index b613a4d167a..e06809c66a8 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -22,6 +22,7 @@ //! * [`Field`](crate::datatypes::Field) to describe one field within a schema. //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. +use std::collections::BTreeMap; use std::collections::HashMap; use std::default::Default; use std::fmt; @@ -193,6 +194,9 @@ pub struct Field { nullable: bool, dict_id: i64, dict_is_ordered: bool, + /// A map of key-value pairs containing additional custom meta data. + #[serde(skip_serializing_if = "Option::is_none")] + metadata: Option>, } pub trait ArrowNativeType: @@ -1279,6 +1283,7 @@ impl Field { nullable, dict_id: 0, dict_is_ordered: false, + metadata: None, } } @@ -1296,9 +1301,22 @@ impl Field { nullable, dict_id, dict_is_ordered, + metadata: None, } } + /// Sets the `Field`'s optional custom metadata. + #[inline] + pub fn set_metadata(&mut self, metadata: Option>) { + self.metadata = metadata; + } + + /// Returns the immutable reference to the `Field`'s optional custom metadata. + #[inline] + pub const fn metadata(&self) -> &Option> { + &self.metadata + } + /// Returns an immutable reference to the `Field`'s name #[inline] pub const fn name(&self) -> &String { @@ -1461,6 +1479,7 @@ impl Field { data_type, dict_id, dict_is_ordered, + metadata: None, }) } _ => Err(ArrowError::ParseError( @@ -1616,7 +1635,7 @@ impl Field { impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}: {:?}", self.name, self.data_type) + write!(f, "{:?}", self) } } @@ -2687,12 +2706,14 @@ mod tests { #[test] fn create_schema_string() { let schema = person_schema(); - assert_eq!(schema.to_string(), "first_name: Utf8, \ - last_name: Utf8, \ - address: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false }]), \ - interests: Dictionary(Int32, Utf8)") + assert_eq!(schema.to_string(), + "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \ + Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"address\", data_type: Struct([\ + Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\ + ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }") } #[test] @@ -2710,6 +2731,14 @@ mod tests { assert_eq!(first_name.dict_id(), None); assert_eq!(first_name.dict_is_ordered(), None); + let metadata = first_name.metadata(); + assert!(metadata.is_some()); + let md = metadata.as_ref().unwrap(); + assert_eq!(md.len(), 1); + let key = md.get("k"); + assert!(md.get("k").is_some()); + assert_eq!(key.unwrap(), "v"); + let interests = &schema.fields()[3]; assert_eq!(interests.name(), "interests"); assert_eq!( @@ -2816,8 +2845,14 @@ mod tests { } fn person_schema() -> Schema { + let kv_array = [("k".to_string(), "v".to_string())]; + let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + + let mut first_name = Field::new("first_name", DataType::Utf8, false); + first_name.set_metadata(Some(field_metadata)); + Schema::new(vec![ - Field::new("first_name", DataType::Utf8, false), + first_name, Field::new("last_name", DataType::Utf8, false), Field::new( "address", diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index f003d6ebb79..19eca0da2d9 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -24,7 +24,7 @@ use crate::ipc; use flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use DataType::*; @@ -72,7 +72,7 @@ pub fn schema_to_fb_offset<'a>( /// Convert an IPC Field to Arrow Field impl<'a> From> for Field { fn from(field: ipc::Field) -> Field { - if let Some(dictionary) = field.dictionary() { + let mut arrow_field = if let Some(dictionary) = field.dictionary() { Field::new_dict( field.name().unwrap(), get_data_type(field, true), @@ -86,7 +86,23 @@ impl<'a> From> for Field { get_data_type(field, true), field.nullable(), ) + }; + + let mut metadata = None; + if let Some(list) = field.custom_metadata() { + let mut metadata_map = BTreeMap::default(); + for kv in list { + if let (Some(k), Some(v)) = (kv.key(), kv.value()) { + metadata_map.insert(k.to_string(), v.to_string()); + } + } + if !metadata_map.is_empty() { + metadata = Some(metadata_map); + } } + + arrow_field.set_metadata(metadata); + arrow_field } } diff --git a/rust/datafusion/src/logical_plan/dfschema.rs b/rust/datafusion/src/logical_plan/dfschema.rs index b6c1d21752d..0305f07acdc 100644 --- a/rust/datafusion/src/logical_plan/dfschema.rs +++ b/rust/datafusion/src/logical_plan/dfschema.rs @@ -390,7 +390,9 @@ mod tests { fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified("t1", &test_schema_1())?; let arrow_schema: Schema = schema.into(); - assert_eq!("t1.c0: Boolean, t1.c1: Boolean", arrow_schema.to_string()); + let expected = "Field { name: \"t1.c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"t1.c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }"; + assert_eq!(expected, arrow_schema.to_string()); Ok(()) } diff --git a/rust/datafusion/src/physical_plan/planner.rs b/rust/datafusion/src/physical_plan/planner.rs index fd414890fb6..ad866809b1b 100644 --- a/rust/datafusion/src/physical_plan/planner.rs +++ b/rust/datafusion/src/physical_plan/planner.rs @@ -860,7 +860,8 @@ mod tests { data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false } }\ + dict_is_ordered: false, \ + metadata: None } }\ ] }, \ ExecutionPlan schema: Schema { fields: [\ Field { \ @@ -868,7 +869,8 @@ mod tests { data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false }\ + dict_is_ordered: false, \ + metadata: None }\ ], metadata: {} }"; match plan { Ok(_) => panic!("Expected planning failure"), From 6582133cf14249888eda0aa7c35cdeca625e6c69 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 28 Dec 2020 19:02:38 +0800 Subject: [PATCH 02/14] Test Field from/to JSON --- rust/arrow/src/datatypes.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index e06809c66a8..3afb83b5066 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1306,9 +1306,16 @@ impl Field { } /// Sets the `Field`'s optional custom metadata. + /// If the map is empty, then the metadata is set as none. #[inline] pub fn set_metadata(&mut self, metadata: Option>) { - self.metadata = metadata; + if let Some(v) = metadata { + if !v.is_empty() { + self.metadata = Some(v); + return + } + } + self.metadata = None; } /// Returns the immutable reference to the `Field`'s optional custom metadata. @@ -1992,9 +1999,20 @@ mod tests { #[test] fn serde_struct_type() { + let kv_array = [("k".to_string(), "v".to_string())]; + let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); + + // Non-empty map: should be converted as JSON obj { ... } + let mut first_name = Field::new("first_name", DataType::Utf8, false); + first_name.set_metadata(Some(field_metadata)); + + // Empty map: should be omitted. + let mut last_name = Field::new("last_name", DataType::Utf8, false); + last_name.set_metadata(Some(BTreeMap::default())); + let person = DataType::Struct(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), + first_name, + last_name, Field::new( "address", DataType::Struct(vec![ @@ -2012,7 +2030,7 @@ mod tests { assert_eq!( "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ + {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ {\"name\":\"address\",\"data_type\":{\"Struct\":\ [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ From e2e2548672571cca4cc5881cc0b6e6e1b8f77bd5 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 28 Dec 2020 20:33:08 +0800 Subject: [PATCH 03/14] ArrowJsonField::from(): convert metadata --- rust/arrow/src/datatypes.rs | 4 ++-- rust/arrow/src/util/integration_util.rs | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 3afb83b5066..cde19e4dd2e 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1312,7 +1312,7 @@ impl Field { if let Some(v) = metadata { if !v.is_empty() { self.metadata = Some(v); - return + return; } } self.metadata = None; @@ -2754,7 +2754,7 @@ mod tests { let md = metadata.as_ref().unwrap(); assert_eq!(md.len(), 1); let key = md.get("k"); - assert!(md.get("k").is_some()); + assert!(key.is_some()); assert_eq!(key.unwrap(), "v"); let interests = &schema.fields()[3]; diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 6a6e7ea77de..1fcbfba882e 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -20,7 +20,7 @@ //! These utilities define structs that read the integration JSON format for integration testing purposes. use serde_derive::{Deserialize, Serialize}; -use serde_json::{Number as VNumber, Value}; +use serde_json::{Map as VMap, Number as VNumber, Value}; use crate::array::*; use crate::datatypes::*; @@ -60,13 +60,22 @@ pub struct ArrowJsonField { impl From<&Field> for ArrowJsonField { fn from(field: &Field) -> Self { + let mut metadata_value = None; + if let Some(kv_list) = field.metadata() { + let mut json_map = VMap::new(); + for (k, v) in kv_list { + json_map.insert(k.clone(), Value::String(v.clone())); + } + metadata_value = Some(Value::Object(json_map)); + } + Self { name: field.name().to_string(), field_type: field.data_type().to_json(), nullable: field.is_nullable(), children: vec![], - dictionary: None, // TODO: not enough info - metadata: None, // TODO(ARROW-10259) + dictionary: None, // TODO: not enough info + metadata: metadata_value, // TODO(ARROW-10259): metadata is not used. } } } From 9ae2f681c695aa858d7814ec30ea28eb64882743 Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 30 Dec 2020 18:21:19 +0800 Subject: [PATCH 04/14] Update for json integration test --- rust/arrow/src/datatypes.rs | 43 ++++++++++++++++++++++++- rust/arrow/src/util/integration_util.rs | 38 ++++++++++++++++------ rust/arrow/test/data/integration.json | 25 ++++++++++++++ 3 files changed, 95 insertions(+), 11 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index cde19e4dd2e..9501b1bfa29 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1388,6 +1388,46 @@ impl Field { )); } }; + let metadata = match map.get("metadata") { + Some(&Value::Array(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for value in values { + match value.as_object() { + Some(value) => { + if value.len() != 1 { + return Err(ArrowError::ParseError( + "Field 'metadata' must have exact one json map for a key-value pair".to_string(), + )); + } + for (k, v) in value { + if let Some(str_value) = v.as_str() { + res.insert( + k.clone(), + str_value.to_string().clone(), + ); + } else { + return Err(ArrowError::ParseError( + format!("Field 'metadata' contains non-string value for key {}", k), + )); + } + } + } + _ => { + return Err(ArrowError::ParseError( + "Field 'metadata' contains non-object key-value pair".to_string(), + )); + } + } + } + Some(res) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field `metadata` is not json array".to_string(), + )); + } + _ => None, + }; // if data_type is a struct or list, get its children let data_type = match data_type { DataType::List(_) @@ -1486,7 +1526,7 @@ impl Field { data_type, dict_id, dict_is_ordered, - metadata: None, + metadata, }) } _ => Err(ArrowError::ParseError( @@ -1640,6 +1680,7 @@ impl Field { } } +// TODO: improve display with crate https://crates.io/crates/derive_more ? impl fmt::Display for Field { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:?}", self) diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 1fcbfba882e..bb3ca18519b 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -20,7 +20,7 @@ //! These utilities define structs that read the integration JSON format for integration testing purposes. use serde_derive::{Deserialize, Serialize}; -use serde_json::{Map as VMap, Number as VNumber, Value}; +use serde_json::{Map as SJMap, Number as VNumber, Value}; use crate::array::*; use crate::datatypes::*; @@ -60,22 +60,30 @@ pub struct ArrowJsonField { impl From<&Field> for ArrowJsonField { fn from(field: &Field) -> Self { - let mut metadata_value = None; - if let Some(kv_list) = field.metadata() { - let mut json_map = VMap::new(); - for (k, v) in kv_list { - json_map.insert(k.clone(), Value::String(v.clone())); + let metadata_value = match field.metadata() { + Some(kv_list) => { + let mut array = Vec::new(); + for (k, v) in kv_list { + let mut kv_map = SJMap::new(); + kv_map.insert(k.clone(), Value::String(v.clone())); + array.push(Value::Object(kv_map)); + } + if !array.is_empty() { + Some(Value::Array(array)) + } else { + None + } } - metadata_value = Some(Value::Object(json_map)); - } + _ => None, + }; Self { name: field.name().to_string(), field_type: field.data_type().to_json(), nullable: field.is_nullable(), children: vec![], - dictionary: None, // TODO: not enough info - metadata: metadata_value, // TODO(ARROW-10259): metadata is not used. + dictionary: None, // TODO: not enough info + metadata: metadata_value, } } } @@ -718,7 +726,15 @@ mod tests { let millis_tz = Some("America/New_York".to_string()); let micros_tz = Some("UTC".to_string()); let nanos_tz = Some("Africa/Johannesburg".to_string()); + + let mut field_bools_with_metadata = + Field::new("bools-with-metadata", DataType::Boolean, true); + let mut field_metadata = std::collections::BTreeMap::new(); + field_metadata.insert("k".to_string(), "v".to_string()); + field_bools_with_metadata.set_metadata(Some(field_metadata)); + let schema = Schema::new(vec![ + field_bools_with_metadata, Field::new("bools", DataType::Boolean, true), Field::new("int8s", DataType::Int8, true), Field::new("int16s", DataType::Int16, true), @@ -788,6 +804,7 @@ mod tests { ), ]); + let bools_with_metadata = BooleanArray::from(vec![Some(true), None, Some(false)]); let bools = BooleanArray::from(vec![Some(true), None, Some(false)]); let int8s = Int8Array::from(vec![Some(1), None, Some(3)]); let int16s = Int16Array::from(vec![Some(1), None, Some(3)]); @@ -876,6 +893,7 @@ mod tests { let record_batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ + Arc::new(bools_with_metadata), Arc::new(bools), Arc::new(int8s), Arc::new(int16s), diff --git a/rust/arrow/test/data/integration.json b/rust/arrow/test/data/integration.json index 193636ff136..a055aa44f82 100644 --- a/rust/arrow/test/data/integration.json +++ b/rust/arrow/test/data/integration.json @@ -1,6 +1,17 @@ { "schema": { "fields": [ + { + "name": "bools-with-metadata", + "type": { + "name": "bool" + }, + "nullable": true, + "metadata": [ + {"k": "v"} + ], + "children": [] + }, { "name": "bools", "type": { @@ -301,6 +312,20 @@ { "count": 3, "columns": [ + { + "name": "bools-with-metadata", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "DATA": [ + true, + true, + false + ] + }, { "name": "bools", "count": 3, From 06f5f2bc78c7ea9b4affdbf0f672f83cc1379713 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 31 Dec 2020 00:20:00 +0800 Subject: [PATCH 05/14] Support read field metadata from JSON oject(map) --- rust/arrow/src/datatypes.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 9501b1bfa29..e4a51735bc0 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1388,6 +1388,7 @@ impl Field { )); } }; + // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { Some(&Value::Array(ref values)) => { let mut res: BTreeMap = BTreeMap::new(); @@ -1421,6 +1422,21 @@ impl Field { } Some(res) } + // We also support map format, because Schema's metadata supports this. + // See https://github.com/apache/arrow/pull/5907 + Some(&Value::Object(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for (k, v) in values { + if let Some(str_value) = v.as_str() { + res.insert(k.clone(), str_value.to_string().clone()); + } else { + return Err(ArrowError::ParseError( + format!("Field 'metadata' contains non-string value for key {}", k), + )); + } + } + Some(res) + } Some(_) => { return Err(ArrowError::ParseError( "Field `metadata` is not json array".to_string(), From 4750df938932ad0fc49f896555675d6a82b02fb7 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 31 Dec 2020 15:30:11 +0800 Subject: [PATCH 06/14] try merge field metadata --- rust/arrow/src/datatypes.rs | 115 ++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index e4a51735bc0..5ccbffa696d 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1582,6 +1582,7 @@ impl Field { } /// Merge field into self if it is compatible. Struct will be merged recursively. + /// NOTE: `self` may be updated to unexpected state in case of merge failure. /// /// Example: /// @@ -1593,6 +1594,28 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<()> { + // merge metadata + match (self.metadata(), from.metadata()) { + (Some(self_metadata), Some(from_metadata)) => { + let mut merged = self_metadata.clone(); + for (key, from_value) in from_metadata { + if let Some(self_value) = self_metadata.get(key) { + if self_value != from_value { + return Err(ArrowError::SchemaError(format!( + "Fail to merge field due to conflicting metadata data value for key {}", key), + )); + } + } else { + merged.insert(key.clone(), from_value.clone()); + } + } + self.set_metadata(Some(merged)); + } + (None, Some(from_metadata)) => { + self.set_metadata(Some(from_metadata.clone())); + } + _ => {} + } if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError( "Fail to merge schema Field due to conflicting dict_id".to_string(), @@ -2947,6 +2970,98 @@ mod tests { ]) } + #[test] + fn test_try_merge_field_with_metadata() { + // 1. Different values for the same key should cause error. + let mut f1 = Field::new("first_name", DataType::Utf8, false); + let metadata1: BTreeMap = + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(); + f1.set_metadata(Some(metadata1)); + + let mut f2 = Field::new("first_name", DataType::Utf8, false); + let metadata2: BTreeMap = + [("foo".to_string(), "baz".to_string())] + .iter() + .cloned() + .collect(); + f2.set_metadata(Some(metadata2)); + + assert!( + Schema::try_merge(&[Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err() + ); + + // 2. None + Some + let mut f1 = Field::new("first_name", DataType::Utf8, false); + let mut f2 = Field::new("first_name", DataType::Utf8, false); + let metadata2: BTreeMap = + [("missing".to_string(), "value".to_string())] + .iter() + .cloned() + .collect(); + f2.set_metadata(Some(metadata2)); + + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata.is_some()); + assert_eq!(f1.metadata.unwrap(), f2.metadata.unwrap()); + + // 3. Some + Some + let mut f1 = Field::new("first_name", DataType::Utf8, false); + f1.set_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let mut f2 = Field::new("first_name", DataType::Utf8, false); + f2.set_metadata(Some( + [("foo2".to_string(), "bar2".to_string())] + .iter() + .cloned() + .collect(), + )); + + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata.is_some()); + assert_eq!( + f1.metadata.unwrap(), + [ + ("foo".to_string(), "bar".to_string()), + ("foo2".to_string(), "bar2".to_string()) + ] + .iter() + .cloned() + .collect() + ); + + // 4. Some + None. + let mut f1 = Field::new("first_name", DataType::Utf8, false); + f1.set_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let f2 = Field::new("first_name", DataType::Utf8, false); + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata.is_some()); + assert_eq!( + f1.metadata.unwrap(), + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect() + ); + + // 5. None + None. + let mut f1 = Field::new("first_name", DataType::Utf8, false); + let f2 = Field::new("first_name", DataType::Utf8, false); + assert!(f1.try_merge(&f2).is_ok()); + assert!(f1.metadata.is_none()); + } + #[test] fn test_schema_merge() -> Result<()> { let merged = Schema::try_merge(&[ From 6dc7b3f80b23088a7bd53790fe3f09744eac72ea Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 31 Dec 2020 16:02:42 +0800 Subject: [PATCH 07/14] the test schema_equality(): add field metadata --- rust/arrow/src/datatypes.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 5ccbffa696d..16682629bdf 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -2918,6 +2918,20 @@ mod tests { assert!(schema2 != schema3); assert!(schema2 != schema4); assert!(schema3 != schema4); + + let mut f = Field::new("c1", DataType::Utf8, false); + f.set_metadata(Some( + [("foo".to_string(), "bar".to_string())] + .iter() + .cloned() + .collect(), + )); + let schema5 = Schema::new(vec![ + f, + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::LargeBinary, true), + ]); + assert!(schema1 != schema5); } #[test] From c77587339e9c6a2c479374d7fb4f640f52ef6f57 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 4 Jan 2021 17:12:34 +0800 Subject: [PATCH 08/14] set_metadata -> with_metadata --- rust/arrow/src/datatypes.rs | 52 ++++++++++++------------- rust/arrow/src/ipc/convert.rs | 7 +--- rust/arrow/src/util/integration_util.rs | 14 ++++--- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 16682629bdf..ead7140f087 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1306,16 +1306,17 @@ impl Field { } /// Sets the `Field`'s optional custom metadata. - /// If the map is empty, then the metadata is set as none. + /// The metadata is set as `None` for empty map. #[inline] - pub fn set_metadata(&mut self, metadata: Option>) { + pub fn with_metadata(&mut self, metadata: Option>) -> Self { + // To make serde happy, convert Some(empty_map) to None. + self.metadata = None; if let Some(v) = metadata { if !v.is_empty() { self.metadata = Some(v); - return; } } - self.metadata = None; + self.clone() } /// Returns the immutable reference to the `Field`'s optional custom metadata. @@ -1388,6 +1389,7 @@ impl Field { )); } }; + // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz let metadata = match map.get("metadata") { Some(&Value::Array(ref values)) => { @@ -1444,6 +1446,7 @@ impl Field { } _ => None, }; + // if data_type is a struct or list, get its children let data_type = match data_type { DataType::List(_) @@ -1609,10 +1612,10 @@ impl Field { merged.insert(key.clone(), from_value.clone()); } } - self.set_metadata(Some(merged)); + self.with_metadata(Some(merged)); } (None, Some(from_metadata)) => { - self.set_metadata(Some(from_metadata.clone())); + self.with_metadata(Some(from_metadata.clone())); } _ => {} } @@ -2083,12 +2086,12 @@ mod tests { let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); // Non-empty map: should be converted as JSON obj { ... } - let mut first_name = Field::new("first_name", DataType::Utf8, false); - first_name.set_metadata(Some(field_metadata)); + let first_name = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(field_metadata)); // Empty map: should be omitted. - let mut last_name = Field::new("last_name", DataType::Utf8, false); - last_name.set_metadata(Some(BTreeMap::default())); + let last_name = Field::new("last_name", DataType::Utf8, false) + .with_metadata(Some(BTreeMap::default())); let person = DataType::Struct(vec![ first_name, @@ -2919,8 +2922,7 @@ mod tests { assert!(schema2 != schema4); assert!(schema3 != schema4); - let mut f = Field::new("c1", DataType::Utf8, false); - f.set_metadata(Some( + let f = Field::new("c1", DataType::Utf8, false).with_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() @@ -2959,9 +2961,8 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - - let mut first_name = Field::new("first_name", DataType::Utf8, false); - first_name.set_metadata(Some(field_metadata)); + let mut first_name = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(field_metadata)); Schema::new(vec![ first_name, @@ -2987,21 +2988,21 @@ mod tests { #[test] fn test_try_merge_field_with_metadata() { // 1. Different values for the same key should cause error. - let mut f1 = Field::new("first_name", DataType::Utf8, false); let metadata1: BTreeMap = [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(); - f1.set_metadata(Some(metadata1)); + let f1 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata1)); - let mut f2 = Field::new("first_name", DataType::Utf8, false); let metadata2: BTreeMap = [("foo".to_string(), "baz".to_string())] .iter() .cloned() .collect(); - f2.set_metadata(Some(metadata2)); + let f2 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata2)); assert!( Schema::try_merge(&[Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err() @@ -3009,28 +3010,26 @@ mod tests { // 2. None + Some let mut f1 = Field::new("first_name", DataType::Utf8, false); - let mut f2 = Field::new("first_name", DataType::Utf8, false); let metadata2: BTreeMap = [("missing".to_string(), "value".to_string())] .iter() .cloned() .collect(); - f2.set_metadata(Some(metadata2)); + let f2 = Field::new("first_name", DataType::Utf8, false) + .with_metadata(Some(metadata2)); assert!(f1.try_merge(&f2).is_ok()); assert!(f1.metadata.is_some()); assert_eq!(f1.metadata.unwrap(), f2.metadata.unwrap()); // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false); - f1.set_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(), )); - let mut f2 = Field::new("first_name", DataType::Utf8, false); - f2.set_metadata(Some( + let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( [("foo2".to_string(), "bar2".to_string())] .iter() .cloned() @@ -3051,8 +3050,7 @@ mod tests { ); // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false); - f1.set_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 19eca0da2d9..c8705499dfb 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -96,13 +96,10 @@ impl<'a> From> for Field { metadata_map.insert(k.to_string(), v.to_string()); } } - if !metadata_map.is_empty() { - metadata = Some(metadata_map); - } + metadata = Some(metadata_map); } - arrow_field.set_metadata(metadata); - arrow_field + arrow_field.with_metadata(metadata) } } diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index bb3ca18519b..799804b30e4 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -727,11 +727,15 @@ mod tests { let micros_tz = Some("UTC".to_string()); let nanos_tz = Some("Africa/Johannesburg".to_string()); - let mut field_bools_with_metadata = - Field::new("bools-with-metadata", DataType::Boolean, true); - let mut field_metadata = std::collections::BTreeMap::new(); - field_metadata.insert("k".to_string(), "v".to_string()); - field_bools_with_metadata.set_metadata(Some(field_metadata)); + let field_bools_with_metadata = + Field::new("bools-with-metadata", DataType::Boolean, true).with_metadata( + Some( + [("k".to_string(), "v".to_string())] + .iter() + .cloned() + .collect(), + ), + ); let schema = Schema::new(vec![ field_bools_with_metadata, From 11c8357ce06608c73750981eb45f0d0281919b61 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 4 Jan 2021 17:47:54 +0800 Subject: [PATCH 09/14] clippy and doc --- rust/arrow/src/datatypes.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index ead7140f087..09b94bb7252 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1307,6 +1307,7 @@ impl Field { /// Sets the `Field`'s optional custom metadata. /// The metadata is set as `None` for empty map. + /// Return cloned `Self`. #[inline] pub fn with_metadata(&mut self, metadata: Option>) -> Self { // To make serde happy, convert Some(empty_map) to None. @@ -2961,7 +2962,7 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - let mut first_name = Field::new("first_name", DataType::Utf8, false) + let first_name = Field::new("first_name", DataType::Utf8, false) .with_metadata(Some(field_metadata)); Schema::new(vec![ From 4b10cb9c3931b31b43887344135c42c5439472eb Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 5 Jan 2021 19:37:31 +0800 Subject: [PATCH 10/14] Enable generate_custom_metadata_case for Rust --- dev/archery/archery/integration/datagen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 1b97259e555..35ab289cc33 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1568,8 +1568,7 @@ def _temp_path(): generate_custom_metadata_case() .skip_category('Go') - .skip_category('JS') - .skip_category('Rust'), # TODO(ARROW-10259) + .skip_category('JS'), generate_duplicate_fieldnames_case() .skip_category('Go') From ec5a7afebff8b861f609c7f193e12501b6bd6602 Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 6 Jan 2021 14:56:16 +0800 Subject: [PATCH 11/14] Fix field metadata --- rust/arrow/src/datatypes.rs | 24 ++-- rust/arrow/src/util/integration_util.rs | 170 +++++++++++++----------- rust/arrow/test/data/integration.json | 34 ++++- 3 files changed, 134 insertions(+), 94 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 09b94bb7252..87e2942cba8 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1397,23 +1397,27 @@ impl Field { let mut res: BTreeMap = BTreeMap::new(); for value in values { match value.as_object() { - Some(value) => { - if value.len() != 1 { + Some(map) => { + if map.len() != 2 { return Err(ArrowError::ParseError( - "Field 'metadata' must have exact one json map for a key-value pair".to_string(), + "Field 'metadata' must have exact two entries for each key-value map".to_string(), )); } - for (k, v) in value { - if let Some(str_value) = v.as_str() { + if let (Some(k), Some(v)) = + (map.get("key"), map.get("value")) + { + if let (Some(k_str), Some(v_str)) = + (k.as_str(), v.as_str()) + { res.insert( - k.clone(), - str_value.to_string().clone(), + k_str.to_string().clone(), + v_str.to_string().clone(), ); } else { - return Err(ArrowError::ParseError( - format!("Field 'metadata' contains non-string value for key {}", k), - )); + return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); } + } else { + return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); } } _ => { diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index 799804b30e4..ffaffe9a234 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -727,88 +727,95 @@ mod tests { let micros_tz = Some("UTC".to_string()); let nanos_tz = Some("Africa/Johannesburg".to_string()); - let field_bools_with_metadata = - Field::new("bools-with-metadata", DataType::Boolean, true).with_metadata( - Some( - [("k".to_string(), "v".to_string())] - .iter() - .cloned() - .collect(), + let schema = + Schema::new(vec![ + Field::new("bools-with-metadata-map", DataType::Boolean, true) + .with_metadata(Some( + [("k".to_string(), "v".to_string())] + .iter() + .cloned() + .collect(), + )), + Field::new("bools-with-metadata-vec", DataType::Boolean, true) + .with_metadata(Some( + [("k2".to_string(), "v2".to_string())] + .iter() + .cloned() + .collect(), + )), + Field::new("bools", DataType::Boolean, true), + Field::new("int8s", DataType::Int8, true), + Field::new("int16s", DataType::Int16, true), + Field::new("int32s", DataType::Int32, true), + Field::new("int64s", DataType::Int64, true), + Field::new("uint8s", DataType::UInt8, true), + Field::new("uint16s", DataType::UInt16, true), + Field::new("uint32s", DataType::UInt32, true), + Field::new("uint64s", DataType::UInt64, true), + Field::new("float32s", DataType::Float32, true), + Field::new("float64s", DataType::Float64, true), + Field::new("date_days", DataType::Date32(DateUnit::Day), true), + Field::new("date_millis", DataType::Date64(DateUnit::Millisecond), true), + Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), + Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), + Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), + Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), + Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), + Field::new( + "ts_millis", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, ), - ); - - let schema = Schema::new(vec![ - field_bools_with_metadata, - Field::new("bools", DataType::Boolean, true), - Field::new("int8s", DataType::Int8, true), - Field::new("int16s", DataType::Int16, true), - Field::new("int32s", DataType::Int32, true), - Field::new("int64s", DataType::Int64, true), - Field::new("uint8s", DataType::UInt8, true), - Field::new("uint16s", DataType::UInt16, true), - Field::new("uint32s", DataType::UInt32, true), - Field::new("uint64s", DataType::UInt64, true), - Field::new("float32s", DataType::Float32, true), - Field::new("float64s", DataType::Float64, true), - Field::new("date_days", DataType::Date32(DateUnit::Day), true), - Field::new("date_millis", DataType::Date64(DateUnit::Millisecond), true), - Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), - Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), - Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), - Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), - Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), - Field::new( - "ts_millis", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - ), - Field::new( - "ts_micros", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ), - Field::new( - "ts_nanos", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - Field::new( - "ts_secs_tz", - DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), - true, - ), - Field::new( - "ts_millis_tz", - DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), - true, - ), - Field::new( - "ts_micros_tz", - DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), - true, - ), - Field::new( - "ts_nanos_tz", - DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), - true, - ), - Field::new("utf8s", DataType::Utf8, true), - Field::new( - "lists", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), - true, - ), - Field::new( - "structs", - DataType::Struct(vec![ - Field::new("int32s", DataType::Int32, true), - Field::new("utf8s", DataType::Utf8, true), - ]), - true, - ), - ]); + Field::new( + "ts_micros", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "ts_nanos", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + Field::new( + "ts_secs_tz", + DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), + true, + ), + Field::new( + "ts_millis_tz", + DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), + true, + ), + Field::new( + "ts_micros_tz", + DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), + true, + ), + Field::new( + "ts_nanos_tz", + DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), + true, + ), + Field::new("utf8s", DataType::Utf8, true), + Field::new( + "lists", + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new( + "structs", + DataType::Struct(vec![ + Field::new("int32s", DataType::Int32, true), + Field::new("utf8s", DataType::Utf8, true), + ]), + true, + ), + ]); - let bools_with_metadata = BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_map = + BooleanArray::from(vec![Some(true), None, Some(false)]); + let bools_with_metadata_vec = + BooleanArray::from(vec![Some(true), None, Some(false)]); let bools = BooleanArray::from(vec![Some(true), None, Some(false)]); let int8s = Int8Array::from(vec![Some(1), None, Some(3)]); let int16s = Int16Array::from(vec![Some(1), None, Some(3)]); @@ -897,7 +904,8 @@ mod tests { let record_batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ - Arc::new(bools_with_metadata), + Arc::new(bools_with_metadata_map), + Arc::new(bools_with_metadata_vec), Arc::new(bools), Arc::new(int8s), Arc::new(int16s), diff --git a/rust/arrow/test/data/integration.json b/rust/arrow/test/data/integration.json index a055aa44f82..7e4a22cddba 100644 --- a/rust/arrow/test/data/integration.json +++ b/rust/arrow/test/data/integration.json @@ -2,13 +2,27 @@ "schema": { "fields": [ { - "name": "bools-with-metadata", + "name": "bools-with-metadata-map", + "type": { + "name": "bool" + }, + "nullable": true, + "metadata": { + "k": "v" + }, + "children": [] + }, + { + "name": "bools-with-metadata-vec", "type": { "name": "bool" }, "nullable": true, "metadata": [ - {"k": "v"} + { + "key": "k2", + "value": "v2" + } ], "children": [] }, @@ -313,7 +327,21 @@ "count": 3, "columns": [ { - "name": "bools-with-metadata", + "name": "bools-with-metadata-map", + "count": 3, + "VALIDITY": [ + 1, + 0, + 1 + ], + "DATA": [ + true, + true, + false + ] + }, + { + "name": "bools-with-metadata-vec", "count": 3, "VALIDITY": [ 1, From d70f6bd7f0ebf47532bf1cc91c21c2c44b8b0b31 Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 6 Jan 2021 18:14:06 +0800 Subject: [PATCH 12/14] ipc: build field metadata --- rust/arrow/src/ipc/convert.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index c8705499dfb..9c03ffbd359 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -326,6 +326,23 @@ pub(crate) fn build_field<'a>( fbb: &mut FlatBufferBuilder<'a>, field: &Field, ) -> WIPOffset> { + // Optional custom metadata. + let mut fb_metadata = None; + if let Some(metadata) = field.metadata() { + if !metadata.is_empty() { + let mut kv_vec = vec![]; + for (k, v) in metadata { + let kv_args = ipc::KeyValueArgs { + key: Some(fbb.create_string(k.as_str())), + value: Some(fbb.create_string(v.as_str())), + }; + let kv_offset = ipc::KeyValue::create(fbb, &kv_args); + kv_vec.push(kv_offset); + } + fb_metadata = Some(fbb.create_vector(&kv_vec)); + } + }; + let fb_field_name = fbb.create_string(field.name().as_str()); let field_type = get_fb_field_type(field.data_type(), field.is_nullable(), fbb); @@ -356,6 +373,11 @@ pub(crate) fn build_field<'a>( Some(children) => field_builder.add_children(children), }; field_builder.add_type_(field_type.type_); + + if let Some(fb_metadata) = fb_metadata { + field_builder.add_custom_metadata(fb_metadata); + } + field_builder.finish() } From 5d170a7466e0734f7bccc218c81a4d5e65fef756 Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 6 Jan 2021 19:39:08 +0800 Subject: [PATCH 13/14] Add test data to convert_schema_round_trip --- rust/arrow/src/ipc/convert.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index 9c03ffbd359..a6bc615295d 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -690,9 +690,13 @@ mod tests { .iter() .cloned() .collect(); + let field_md: BTreeMap = [("k".to_string(), "v".to_string())] + .iter() + .cloned() + .collect(); let schema = Schema::new_with_metadata( vec![ - Field::new("uint8", DataType::UInt8, false), + Field::new("uint8", DataType::UInt8, false).with_metadata(Some(field_md)), Field::new("uint16", DataType::UInt16, true), Field::new("uint32", DataType::UInt32, false), Field::new("uint64", DataType::UInt64, true), From a578fd65d5f5c7c9cdb2bc2d2db6ea882344d161 Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 6 Jan 2021 21:16:27 +0800 Subject: [PATCH 14/14] Change signature of with_metadata --- rust/arrow/src/datatypes.rs | 44 +++--- rust/arrow/src/ipc/convert.rs | 9 +- rust/arrow/src/util/integration_util.rs | 175 ++++++++++++------------ 3 files changed, 121 insertions(+), 107 deletions(-) diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 87e2942cba8..7b16d95a868 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -1307,9 +1307,8 @@ impl Field { /// Sets the `Field`'s optional custom metadata. /// The metadata is set as `None` for empty map. - /// Return cloned `Self`. #[inline] - pub fn with_metadata(&mut self, metadata: Option>) -> Self { + pub fn set_metadata(&mut self, metadata: Option>) { // To make serde happy, convert Some(empty_map) to None. self.metadata = None; if let Some(v) = metadata { @@ -1317,7 +1316,6 @@ impl Field { self.metadata = Some(v); } } - self.clone() } /// Returns the immutable reference to the `Field`'s optional custom metadata. @@ -1617,10 +1615,10 @@ impl Field { merged.insert(key.clone(), from_value.clone()); } } - self.with_metadata(Some(merged)); + self.set_metadata(Some(merged)); } (None, Some(from_metadata)) => { - self.with_metadata(Some(from_metadata.clone())); + self.set_metadata(Some(from_metadata.clone())); } _ => {} } @@ -2091,12 +2089,12 @@ mod tests { let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); // Non-empty map: should be converted as JSON obj { ... } - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); + let mut first_name = Field::new("first_name", DataType::Utf8, false); + first_name.set_metadata(Some(field_metadata)); // Empty map: should be omitted. - let last_name = Field::new("last_name", DataType::Utf8, false) - .with_metadata(Some(BTreeMap::default())); + let mut last_name = Field::new("last_name", DataType::Utf8, false); + last_name.set_metadata(Some(BTreeMap::default())); let person = DataType::Struct(vec![ first_name, @@ -2927,7 +2925,8 @@ mod tests { assert!(schema2 != schema4); assert!(schema3 != schema4); - let f = Field::new("c1", DataType::Utf8, false).with_metadata(Some( + let mut f = Field::new("c1", DataType::Utf8, false); + f.set_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() @@ -2966,8 +2965,8 @@ mod tests { fn person_schema() -> Schema { let kv_array = [("k".to_string(), "v".to_string())]; let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - let first_name = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(field_metadata)); + let mut first_name = Field::new("first_name", DataType::Utf8, false); + first_name.set_metadata(Some(field_metadata)); Schema::new(vec![ first_name, @@ -2998,16 +2997,16 @@ mod tests { .iter() .cloned() .collect(); - let f1 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata1)); + let mut f1 = Field::new("first_name", DataType::Utf8, false); + f1.set_metadata(Some(metadata1)); let metadata2: BTreeMap = [("foo".to_string(), "baz".to_string())] .iter() .cloned() .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); + let mut f2 = Field::new("first_name", DataType::Utf8, false); + f2.set_metadata(Some(metadata2)); assert!( Schema::try_merge(&[Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err() @@ -3020,21 +3019,23 @@ mod tests { .iter() .cloned() .collect(); - let f2 = Field::new("first_name", DataType::Utf8, false) - .with_metadata(Some(metadata2)); + let mut f2 = Field::new("first_name", DataType::Utf8, false); + f2.set_metadata(Some(metadata2)); assert!(f1.try_merge(&f2).is_ok()); assert!(f1.metadata.is_some()); assert_eq!(f1.metadata.unwrap(), f2.metadata.unwrap()); // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false); + f1.set_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() .collect(), )); - let f2 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + let mut f2 = Field::new("first_name", DataType::Utf8, false); + f2.set_metadata(Some( [("foo2".to_string(), "bar2".to_string())] .iter() .cloned() @@ -3055,7 +3056,8 @@ mod tests { ); // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false).with_metadata(Some( + let mut f1 = Field::new("first_name", DataType::Utf8, false); + f1.set_metadata(Some( [("foo".to_string(), "bar".to_string())] .iter() .cloned() diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs index a6bc615295d..8733b0ee35c 100644 --- a/rust/arrow/src/ipc/convert.rs +++ b/rust/arrow/src/ipc/convert.rs @@ -99,7 +99,8 @@ impl<'a> From> for Field { metadata = Some(metadata_map); } - arrow_field.with_metadata(metadata) + arrow_field.set_metadata(metadata); + arrow_field } } @@ -696,7 +697,11 @@ mod tests { .collect(); let schema = Schema::new_with_metadata( vec![ - Field::new("uint8", DataType::UInt8, false).with_metadata(Some(field_md)), + { + let mut f = Field::new("uint8", DataType::UInt8, false); + f.set_metadata(Some(field_md)); + f + }, Field::new("uint16", DataType::UInt16, true), Field::new("uint32", DataType::UInt32, false), Field::new("uint64", DataType::UInt64, true), diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs index ffaffe9a234..4a4b865e8f4 100644 --- a/rust/arrow/src/util/integration_util.rs +++ b/rust/arrow/src/util/integration_util.rs @@ -727,90 +727,97 @@ mod tests { let micros_tz = Some("UTC".to_string()); let nanos_tz = Some("Africa/Johannesburg".to_string()); - let schema = - Schema::new(vec![ - Field::new("bools-with-metadata-map", DataType::Boolean, true) - .with_metadata(Some( - [("k".to_string(), "v".to_string())] - .iter() - .cloned() - .collect(), - )), - Field::new("bools-with-metadata-vec", DataType::Boolean, true) - .with_metadata(Some( - [("k2".to_string(), "v2".to_string())] - .iter() - .cloned() - .collect(), - )), - Field::new("bools", DataType::Boolean, true), - Field::new("int8s", DataType::Int8, true), - Field::new("int16s", DataType::Int16, true), - Field::new("int32s", DataType::Int32, true), - Field::new("int64s", DataType::Int64, true), - Field::new("uint8s", DataType::UInt8, true), - Field::new("uint16s", DataType::UInt16, true), - Field::new("uint32s", DataType::UInt32, true), - Field::new("uint64s", DataType::UInt64, true), - Field::new("float32s", DataType::Float32, true), - Field::new("float64s", DataType::Float64, true), - Field::new("date_days", DataType::Date32(DateUnit::Day), true), - Field::new("date_millis", DataType::Date64(DateUnit::Millisecond), true), - Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), - Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), - Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), - Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), - Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), - Field::new( - "ts_millis", - DataType::Timestamp(TimeUnit::Millisecond, None), - true, - ), - Field::new( - "ts_micros", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ), - Field::new( - "ts_nanos", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - Field::new( - "ts_secs_tz", - DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), - true, - ), - Field::new( - "ts_millis_tz", - DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), - true, - ), - Field::new( - "ts_micros_tz", - DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), - true, - ), - Field::new( - "ts_nanos_tz", - DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), - true, - ), - Field::new("utf8s", DataType::Utf8, true), - Field::new( - "lists", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), - true, - ), - Field::new( - "structs", - DataType::Struct(vec![ - Field::new("int32s", DataType::Int32, true), - Field::new("utf8s", DataType::Utf8, true), - ]), - true, - ), - ]); + let schema = Schema::new(vec![ + { + let mut f = + Field::new("bools-with-metadata-map", DataType::Boolean, true); + f.set_metadata(Some( + [("k".to_string(), "v".to_string())] + .iter() + .cloned() + .collect(), + )); + f + }, + { + let mut f = + Field::new("bools-with-metadata-vec", DataType::Boolean, true); + f.set_metadata(Some( + [("k2".to_string(), "v2".to_string())] + .iter() + .cloned() + .collect(), + )); + f + }, + Field::new("bools", DataType::Boolean, true), + Field::new("int8s", DataType::Int8, true), + Field::new("int16s", DataType::Int16, true), + Field::new("int32s", DataType::Int32, true), + Field::new("int64s", DataType::Int64, true), + Field::new("uint8s", DataType::UInt8, true), + Field::new("uint16s", DataType::UInt16, true), + Field::new("uint32s", DataType::UInt32, true), + Field::new("uint64s", DataType::UInt64, true), + Field::new("float32s", DataType::Float32, true), + Field::new("float64s", DataType::Float64, true), + Field::new("date_days", DataType::Date32(DateUnit::Day), true), + Field::new("date_millis", DataType::Date64(DateUnit::Millisecond), true), + Field::new("time_secs", DataType::Time32(TimeUnit::Second), true), + Field::new("time_millis", DataType::Time32(TimeUnit::Millisecond), true), + Field::new("time_micros", DataType::Time64(TimeUnit::Microsecond), true), + Field::new("time_nanos", DataType::Time64(TimeUnit::Nanosecond), true), + Field::new("ts_secs", DataType::Timestamp(TimeUnit::Second, None), true), + Field::new( + "ts_millis", + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + ), + Field::new( + "ts_micros", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + ), + Field::new( + "ts_nanos", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + Field::new( + "ts_secs_tz", + DataType::Timestamp(TimeUnit::Second, secs_tz.clone()), + true, + ), + Field::new( + "ts_millis_tz", + DataType::Timestamp(TimeUnit::Millisecond, millis_tz.clone()), + true, + ), + Field::new( + "ts_micros_tz", + DataType::Timestamp(TimeUnit::Microsecond, micros_tz.clone()), + true, + ), + Field::new( + "ts_nanos_tz", + DataType::Timestamp(TimeUnit::Nanosecond, nanos_tz.clone()), + true, + ), + Field::new("utf8s", DataType::Utf8, true), + Field::new( + "lists", + DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + true, + ), + Field::new( + "structs", + DataType::Struct(vec![ + Field::new("int32s", DataType::Int32, true), + Field::new("utf8s", DataType::Utf8, true), + ]), + true, + ), + ]); let bools_with_metadata_map = BooleanArray::from(vec![Some(true), None, Some(false)]);