From bf384550401122d73b51265739a6164d35ddede4 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 00:31:37 +0100 Subject: [PATCH 1/6] Parse manually, remove lazy static dependency --- rust/arrow/Cargo.toml | 1 - rust/arrow/src/csv/reader.rs | 34 ++++++++++++++++------------------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml index ff53dc9c445..880aa0a0b28 100644 --- a/rust/arrow/Cargo.toml +++ b/rust/arrow/Cargo.toml @@ -44,7 +44,6 @@ rand = "0.7" csv = "1.1" num = "0.3" regex = "1.3" -lazy_static = "1.4" packed_simd = { version = "0.3.4", optional = true, package = "packed_simd_2" } chrono = "0.4" flatbuffers = "0.6" diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 9ed2d1fd4bc..569028a7d11 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -40,8 +40,6 @@ //! let batch = csv.next().unwrap().unwrap(); //! ``` -use lazy_static::lazy_static; -use regex::{Regex, RegexBuilder}; use std::fs::File; use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; @@ -60,13 +58,8 @@ use crate::{ use self::csv_crate::{Error, StringRecord, StringRecordsIntoIter}; -lazy_static! { - static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.\d+)$").unwrap(); - static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); - static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") - .case_insensitive(true) - .build() - .unwrap(); +fn all_digit(string: &str) -> bool { + string.chars().all(|c| c.is_ascii_digit()) } /// Infer the data type of a record @@ -77,15 +70,20 @@ fn infer_field_schema(string: &str) -> DataType { return DataType::Utf8; } // match regex in a particular order - if BOOLEAN_RE.is_match(string) { - DataType::Boolean - } else if DECIMAL_RE.is_match(string) { - DataType::Float64 - } else if INTEGER_RE.is_match(string) { - DataType::Int64 - } else { - DataType::Utf8 + let lower = string.to_ascii_lowercase(); + if lower == "true" || lower == "false" { + return DataType::Boolean; + } + let skip_minus = if string.starts_with('-') { 1 } else { 0 }; + let mut parts = string[skip_minus..].split('.'); + let (left, right) = (parts.next(), parts.next()); + let left_is_number = left.map_or(false, all_digit); + if left_is_number && right.map_or(false, all_digit) { + return DataType::Float64; + } else if left_is_number { + return DataType::Int64; } + DataType::Utf8 } /// Infer the schema of a CSV file by reading through the first n records of the file, @@ -888,7 +886,7 @@ mod tests { format!("{:?}", e) ), Ok(_) => panic!("should have failed"), - } + }, None => panic!("should have failed"), } } From 2a8dc8140db63024f90cc92f1d87d67429aec8d9 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 01:27:21 +0100 Subject: [PATCH 2/6] Test against remaining part in float --- rust/arrow/src/csv/reader.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 569028a7d11..18e026b4c8a 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -75,11 +75,16 @@ fn infer_field_schema(string: &str) -> DataType { return DataType::Boolean; } let skip_minus = if string.starts_with('-') { 1 } else { 0 }; - let mut parts = string[skip_minus..].split('.'); + let mut parts = string[skip_minus..].splitn(3, '.'); let (left, right) = (parts.next(), parts.next()); let left_is_number = left.map_or(false, all_digit); if left_is_number && right.map_or(false, all_digit) { - return DataType::Float64; + let no_remainder = parts.next().is_none(); + if no_remainder { + return DataType::Float64; + } else { + return DataType::Utf8; + } } else if left_is_number { return DataType::Int64; } From ad9de4c1f3378258b05d444228792a83800dbaf8 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 01:35:25 +0100 Subject: [PATCH 3/6] Use matching on parts for clarity and fixing Int64 match --- rust/arrow/src/csv/reader.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 18e026b4c8a..7e1168eae34 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -77,18 +77,19 @@ fn infer_field_schema(string: &str) -> DataType { let skip_minus = if string.starts_with('-') { 1 } else { 0 }; let mut parts = string[skip_minus..].splitn(3, '.'); let (left, right) = (parts.next(), parts.next()); - let left_is_number = left.map_or(false, all_digit); - if left_is_number && right.map_or(false, all_digit) { - let no_remainder = parts.next().is_none(); - if no_remainder { - return DataType::Float64; - } else { - return DataType::Utf8; + + match (left, right) { + (Some(l), None) if all_digit(l) => DataType::Int64, + (Some(l), Some(r)) if all_digit(l) && all_digit(r) => { + let no_remainder = parts.next().is_none(); + if no_remainder { + return DataType::Float64; + } else { + return DataType::Utf8; + } } - } else if left_is_number { - return DataType::Int64; + _ => DataType::Utf8, } - DataType::Utf8 } /// Infer the schema of a CSV file by reading through the first n records of the file, From 6a9f60f2fe874972de7658446e121cafe137999e Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 01:37:27 +0100 Subject: [PATCH 4/6] Use split iterator in match --- rust/arrow/src/csv/reader.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 7e1168eae34..38c452dea37 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -76,11 +76,10 @@ fn infer_field_schema(string: &str) -> DataType { } let skip_minus = if string.starts_with('-') { 1 } else { 0 }; let mut parts = string[skip_minus..].splitn(3, '.'); - let (left, right) = (parts.next(), parts.next()); - match (left, right) { - (Some(l), None) if all_digit(l) => DataType::Int64, - (Some(l), Some(r)) if all_digit(l) && all_digit(r) => { + match (parts.next(), parts.next()) { + (Some(left), None) if all_digit(left) => DataType::Int64, + (Some(left), Some(right)) if all_digit(left) && all_digit(right) => { let no_remainder = parts.next().is_none(); if no_remainder { return DataType::Float64; From 8305c1e89dfc47fdc9ec0b9bf3ecb1cea3741e81 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 01:52:37 +0100 Subject: [PATCH 5/6] Use eq_ignore_ascii_case --- rust/arrow/src/csv/reader.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 38c452dea37..33a16c5b2b8 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -70,8 +70,7 @@ fn infer_field_schema(string: &str) -> DataType { return DataType::Utf8; } // match regex in a particular order - let lower = string.to_ascii_lowercase(); - if lower == "true" || lower == "false" { + if string.eq_ignore_ascii_case("true") || string.eq_ignore_ascii_case("false") { return DataType::Boolean; } let skip_minus = if string.starts_with('-') { 1 } else { 0 }; From c7901ea50edccd8254713da35d12332f369bf590 Mon Sep 17 00:00:00 2001 From: "Heres, Daniel" Date: Thu, 19 Nov 2020 01:56:44 +0100 Subject: [PATCH 6/6] Clippy --- rust/arrow/src/csv/reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 33a16c5b2b8..6e1d4efd0ca 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -81,9 +81,9 @@ fn infer_field_schema(string: &str) -> DataType { (Some(left), Some(right)) if all_digit(left) && all_digit(right) => { let no_remainder = parts.next().is_none(); if no_remainder { - return DataType::Float64; + DataType::Float64 } else { - return DataType::Utf8; + DataType::Utf8 } } _ => DataType::Utf8,