From 91ecd66277a59476bf385e6a8c6a8cbe32c1f08c Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Sat, 20 Feb 2021 01:25:36 -0800 Subject: [PATCH 1/3] ARROW-11707: [Rust] support CSV schema inference without file IO --- rust/arrow/src/csv/reader.rs | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index c6f90ae4633..fde707b2135 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -99,7 +99,27 @@ fn infer_field_schema(string: &str) -> DataType { /// If `max_read_records` is not set, the whole file is read to infer its schema. /// /// Return infered schema and number of records used for inference. -fn infer_file_schema<R: Read + Seek>( +pub fn infer_file_schema<R: Read + Seek>( + reader: &mut R, + delimiter: u8, + max_read_records: Option<usize>, + has_header: bool, +) -> Result<(Schema, usize)> { + let (schema, records_count) = + infer_schema_from_reader(reader, delimiter, max_read_records, has_header)?; + // return the reader seek back to the start + reader.seek(SeekFrom::Start(0))?; + + Ok((schema, records_count)) +} + +/// Infer schema of CSV records provided by a struct that implements the `Read` trait. +/// +/// `max_read_records` controls the maximum number of records to read. If `max_read_records` is +/// not set, all records are read to infer the schema. +/// +/// Return infered schema and number of records used for inference.
+pub fn infer_schema_from_reader<R: Read>( reader: &mut R, delimiter: u8, max_read_records: Option<usize>, @@ -121,18 +141,12 @@ fn infer_file_schema<R: Read + Seek>( .collect() }; - // save the csv reader position after reading headers - let position = csv_reader.position().clone(); - let header_length = headers.len(); // keep track of inferred field types let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new(); header_length]; // keep track of columns with nulls let mut nulls: Vec<bool> = vec![false; header_length]; - // return csv reader position to after headers - csv_reader.seek(position)?; - let mut records_count = 0; let mut fields = vec![]; @@ -184,9 +198,6 @@ fn infer_file_schema<R: Read + Seek>( } } - // return the reader seek back to the start - csv_reader.into_inner().seek(SeekFrom::Start(0))?; - Ok((Schema::new(fields), records_count)) } From b1471977fa250ee1bd798a9299bd0dc4d45a1b7c Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Sun, 21 Feb 2021 13:11:30 -0800 Subject: [PATCH 2/3] address code review --- rust/arrow/src/csv/reader.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index fde707b2135..1a6b0bd51fe 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -98,7 +98,8 @@ fn infer_field_schema(string: &str) -> DataType { /// /// If `max_read_records` is not set, the whole file is read to infer its schema. /// -/// Return infered schema and number of records used for inference. +/// Return infered schema and number of records used for inference. This function will also +/// automatically reset reader cursor offset to 0 (start of the byte stream).
pub fn infer_file_schema<R: Read + Seek>( reader: &mut R, delimiter: u8, @@ -106,7 +107,7 @@ pub fn infer_file_schema<R: Read + Seek>( has_header: bool, ) -> Result<(Schema, usize)> { let (schema, records_count) = - infer_schema_from_reader(reader, delimiter, max_read_records, has_header)?; + infer_reader_schema(reader, delimiter, max_read_records, has_header)?; // return the reader seek back to the start reader.seek(SeekFrom::Start(0))?; @@ -119,7 +120,7 @@ pub fn infer_file_schema<R: Read + Seek>( /// not set, all records are read to infer the schema. /// /// Return infered schema and number of records used for inference. -pub fn infer_schema_from_reader<R: Read>( +pub fn infer_reader_schema<R: Read>( reader: &mut R, delimiter: u8, max_read_records: Option<usize>, From e3312f2f7791ac4a96708bea184c0be00831ed8d Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Sat, 27 Feb 2021 22:57:17 -0800 Subject: [PATCH 3/3] rewind reader offset to what it started at instead of 0 --- rust/arrow/src/csv/reader.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 1a6b0bd51fe..985c88b4978 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -98,18 +98,21 @@ fn infer_field_schema(string: &str) -> DataType { /// /// If `max_read_records` is not set, the whole file is read to infer its schema. /// -/// Return infered schema and number of records used for inference. This function will also -/// automatically reset reader cursor offset to 0 (start of the byte stream). +/// Return inferred schema and number of records used for inference. This function does not change +/// reader cursor offset.
pub fn infer_file_schema<R: Read + Seek>( reader: &mut R, delimiter: u8, max_read_records: Option<usize>, has_header: bool, ) -> Result<(Schema, usize)> { + let saved_offset = reader.seek(SeekFrom::Current(0))?; + let (schema, records_count) = infer_reader_schema(reader, delimiter, max_read_records, has_header)?; + // return the reader seek back to the start - reader.seek(SeekFrom::Start(0))?; + reader.seek(SeekFrom::Start(saved_offset))?; Ok((schema, records_count)) }