From 63bb888246b89c939279ab3ccb2ba55b1ad0a434 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 17:19:44 +0200 Subject: [PATCH 1/5] Use more compact Debug formatting of Field --- arrow-schema/Cargo.toml | 1 + arrow-schema/src/field.rs | 53 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 314c8f7a3515..7c3c208649ba 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -53,6 +53,7 @@ all-features = true [dev-dependencies] bincode = { version = "1.3.3", default-features = false } criterion = { version = "0.5", default-features = false } +insta = "1.43.1" [[bench]] name = "ffi" diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 8017fa81b5ea..1f5f8341fb6d 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -44,7 +44,7 @@ pub type FieldRef = Arc; /// /// Arrow Extension types, are encoded in `Field`s metadata. See /// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any. -#[derive(Clone, Debug)] +#[derive(Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Field { name: String, @@ -60,6 +60,39 @@ pub struct Field { metadata: HashMap, } +impl std::fmt::Debug for Field { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #![expect(deprecated)] // Must still print dict_id, if set + let Self { + name, + data_type, + nullable, + dict_id, + dict_is_ordered, + metadata, + } = self; + + let mut s = f.debug_struct("Field"); + let s = s + .field("name", name) + .field("data_type", data_type) + .field("nullable", nullable); + + if *dict_id != 0 { + s.field("dict_id", dict_id); + } + + if *dict_is_ordered { + s.field("dict_is_ordered", dict_is_ordered); + } + + if !metadata.is_empty() { + s.field("metadata", metadata); + } + s.finish() + } +} + // Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered` // into comparison. However, these properties are only used in IPC context // for matching dictionary encoded data. They are not necessary to be same @@ -914,6 +947,24 @@ mod test { Field::new_dict(s, DataType::Int64, false, 4, false); } + #[test] + fn test_debug_format_field() { + insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, true), @r#" + Field { + name: "item", + data_type: UInt8, + nullable: true, + } + "#); + insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r#" + Field { + name: "item", + data_type: UInt8, + nullable: false, + } + "#); + } + #[test] fn test_merge_incompatible_types() { let mut field = Field::new("c1", DataType::Int64, false); From 63aa15cfd9d6864b4eca2ecacc36ef3cc15e6450 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 17:37:46 +0200 Subject: [PATCH 2/5] Can't run miri on the test --- arrow-schema/src/field.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 1f5f8341fb6d..964c687c9403 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -948,19 +948,20 @@ mod test { } #[test] + #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro fn test_debug_format_field() { - insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, true), @r#" + insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r#" Field { name: "item", data_type: UInt8, - nullable: true, + nullable: false, } "#); - insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r#" + insta::assert_debug_snapshot!(Field::new("column", DataType::LargeUtf8, true), @r#" Field { - name: "item", - data_type: UInt8, - nullable: false, + name: "column", + data_type: LargeUtf8, + nullable: true, } "#); } From 5a24576ca732e746f176fca823a281d1f4fbd306 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 17:40:48 +0200 Subject: [PATCH 3/5] Add a snapshot test of Debug formatting of DataType::List too --- arrow-schema/src/datatype.rs | 15 +++++++++++++++ arrow-schema/src/field.rs | 1 + 2 files changed, 16 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 32bce3347404..a4c018eee0a7 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -1175,4 +1175,19 @@ mod tests { let data_type: DataType = "UInt64".parse().unwrap(); assert_eq!(data_type, DataType::UInt64); } + + #[test] + #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro + fn test_debug_format_field() { + // Make sure the `Debug` formatting of `DataType` is readable and not too long + insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r#" + List( + Field { + name: "item", + data_type: Int8, + nullable: false, + }, + ) + "#); + } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 964c687c9403..5510dda88265 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -950,6 +950,7 @@ mod test { #[test] #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro fn test_debug_format_field() { + // Make sure the `Debug` formatting of `Field` is readable and not too long insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r#" Field { name: "item", From c3a3d1a5068b53b7e6d426e9e677a892ea078fa5 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 17:51:40 +0200 Subject: [PATCH 4/5] Even shorter --- arrow-schema/src/datatype.rs | 6 ++---- arrow-schema/src/field.rs | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index a4c018eee0a7..42708aec63de 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -1180,14 +1180,12 @@ mod tests { #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro fn test_debug_format_field() { // Make sure the `Debug` formatting of `DataType` is readable and not too long - insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r#" + insta::assert_debug_snapshot!(DataType::new_list(DataType::Int8, false), @r" List( Field { - name: "item", data_type: Int8, - nullable: false, }, ) - "#); + "); } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 5510dda88265..23ea16841fa1 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -73,10 +73,17 @@ impl std::fmt::Debug for Field { } = self; let mut s = f.debug_struct("Field"); - let s = s - .field("name", name) - .field("data_type", data_type) - .field("nullable", nullable); + + if name != "item" { + // Keep it short when debug-formatting `DataType::List` + s.field("name", name); + } + + s.field("data_type", data_type); + + if *nullable { + s.field("nullable", nullable); + } if *dict_id != 0 { s.field("dict_id", dict_id); @@ -951,13 +958,11 @@ mod test { #[cfg_attr(miri, ignore)] // Can't handle the inlined strings of the assert_debug_snapshot macro fn test_debug_format_field() { // Make sure the `Debug` formatting of `Field` is readable and not too long - insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r#" + insta::assert_debug_snapshot!(Field::new("item", DataType::UInt8, false), @r" Field { - name: "item", data_type: UInt8, - nullable: false, } - "#); + "); insta::assert_debug_snapshot!(Field::new("column", DataType::LargeUtf8, true), @r#" Field { name: "column", From 64ca2306d8b51786be5d4d0aafafab75dd24a230 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 17:51:47 +0200 Subject: [PATCH 5/5] Ignore temp files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 05091a4e975d..127182a8f99e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ __blobstorage__ *.bak2 # OS-specific .gitignores +# cargo insta temp files +*.pending-snap + # Mac .gitignore # General .DS_Store @@ -99,4 +102,4 @@ parquet/pytest/venv/ __pycache__/ # Parquet file from arrow_reader_clickbench -hits_1.parquet \ No newline at end of file +hits_1.parquet