diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index ff53dc9c445..880aa0a0b28 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -44,7 +44,6 @@ rand = "0.7"
 csv = "1.1"
 num = "0.3"
 regex = "1.3"
-lazy_static = "1.4"
 packed_simd = { version = "0.3.4", optional = true, package = "packed_simd_2" }
 chrono = "0.4"
 flatbuffers = "0.6"
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index 9ed2d1fd4bc..6e1d4efd0ca 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -40,8 +40,6 @@
 //! let batch = csv.next().unwrap().unwrap();
 //! ```
 
-use lazy_static::lazy_static;
-use regex::{Regex, RegexBuilder};
 use std::fs::File;
 use std::io::{BufReader, Read, Seek, SeekFrom};
 use std::sync::Arc;
@@ -60,13 +58,13 @@ use crate::{
 
 use self::csv_crate::{Error, StringRecord, StringRecordsIntoIter};
 
-lazy_static! {
-    static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.\d+)$").unwrap();
-    static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap();
-    static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$")
-        .case_insensitive(true)
-        .build()
-        .unwrap();
+/// Returns `true` when `string` is non-empty and consists solely of ASCII digits.
+///
+/// The emptiness check matters: `Iterator::all` is vacuously true on an empty
+/// iterator, which would otherwise let inputs such as `-`, `.`, `1.` or `.5`
+/// be inferred as numeric even though the removed regexes required `\d+`.
+fn all_digit(string: &str) -> bool {
+    !string.is_empty() && string.chars().all(|c| c.is_ascii_digit())
 }
 
 /// Infer the data type of a record
@@ -77,14 +75,23 @@ fn infer_field_schema(string: &str) -> DataType {
         return DataType::Utf8;
     }
-    // match regex in a particular order
-    if BOOLEAN_RE.is_match(string) {
-        DataType::Boolean
-    } else if DECIMAL_RE.is_match(string) {
-        DataType::Float64
-    } else if INTEGER_RE.is_match(string) {
-        DataType::Int64
-    } else {
-        DataType::Utf8
+    // check the candidate types in a particular order
+    if string.eq_ignore_ascii_case("true") || string.eq_ignore_ascii_case("false") {
+        return DataType::Boolean;
+    }
+    let skip_minus = if string.starts_with('-') { 1 } else { 0 };
+    let mut parts = string[skip_minus..].splitn(3, '.');
+
+    match (parts.next(), parts.next()) {
+        (Some(left), None) if all_digit(left) => DataType::Int64,
+        (Some(left), Some(right)) if all_digit(left) && all_digit(right) => {
+            let no_remainder = parts.next().is_none();
+            if no_remainder {
+                DataType::Float64
+            } else {
+                DataType::Utf8
+            }
+        }
+        _ => DataType::Utf8,
     }
 }
 
@@ -888,7 +895,7 @@ mod tests {
                     format!("{:?}", e)
                 ),
                 Ok(_) => panic!("should have failed"),
-            }
+            },
             None => panic!("should have failed"),
         }
     }