From cad1b778c6fb177b5cfffc5200f32ba783376000 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 7 Apr 2023 19:34:29 +0800 Subject: [PATCH 1/5] arrow_cast to cast to timestamptz --- .../sqllogictests/test_files/arrow_typeof.slt | 8 +- datafusion/sql/src/expr/arrow_cast.rs | 82 ++++++++++++++++--- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt index 5166d291ac260..89b051ae4716e 100644 --- a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt +++ b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt @@ -102,7 +102,7 @@ query error Error unrecognized word: unknown SELECT arrow_cast('1', 'unknown') # Round Trip tests: -query TTTTTTTTTTTTTTTTTTT +query TTTTTTTTTTTTTTTTTTTTTTT SELECT arrow_typeof(arrow_cast(1, 'Int8')) as col_i8, arrow_typeof(arrow_cast(1, 'Int16')) as col_i16, @@ -124,9 +124,13 @@ SELECT arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, None)')) as col_ts_ms, arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, None)')) as col_ts_us, arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, None)')) as col_ts_ns, + arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Second, Some("+08:00"))')) as col_tstz_s, + arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Millisecond, Some("+08:00"))')) as col_tstz_ms, + arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Microsecond, Some("+08:00"))')) as col_tstz_us, + arrow_typeof(arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 'Timestamp(Nanosecond, Some("+08:00"))')) as col_tstz_ns, arrow_typeof(arrow_cast('foo', 'Dictionary(Int32, Utf8)')) as col_dict ---- -Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float32 Float64 Utf8 LargeUtf8 Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Dictionary(Int32, Utf8) +Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float32 Float64 Utf8 LargeUtf8 Binary LargeBinary Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, None) Timestamp(Nanosecond, None) Timestamp(Second, Some("+08:00")) Timestamp(Millisecond, Some("+08:00")) Timestamp(Microsecond, Some("+08:00")) Timestamp(Nanosecond, Some("+08:00")) Dictionary(Int32, Utf8) diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index 9a2ac28c46ab4..a9ff805d0ca50 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -167,6 +167,34 @@ impl<'a> Parser<'a> { } } + /// Parses the next timezone + fn parse_timezone(&mut self, context: &str) -> Result> { + match self.next_token()? { + Token::None => Ok(None), + Token::Some => { + self.expect_token(Token::LParen)?; + let timezone = self.parse_double_quoted_string("Timezone")?; + self.expect_token(Token::RParen)?; + Ok(Some(timezone)) + } + tok => Err(make_error( + self.val, + &format!("finding Timezone for {context}, got {tok}"), + )), + } + } + + /// Parses the next double quoted string + fn parse_double_quoted_string(&mut self, context: &str) -> Result { + match self.next_token()? { + Token::DoubleQuotedString(s) => Ok(s), + tok => Err(make_error( + self.val, + &format!("finding double quoted string for {context}, got '{tok}'"), + )), + } + } + /// Parses the next integer value fn parse_i64(&mut self, context: &str) -> Result { match self.next_token()? { @@ -216,10 +244,7 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let time_unit = self.parse_time_unit("Timestamp")?; self.expect_token(Token::Comma)?; - // TODO Support timezones other than None - self.expect_token(Token::None)?; - let timezone = None; - + let timezone = self.parse_timezone("Timestamp")?; self.expect_token(Token::RParen)?; Ok(DataType::Timestamp(time_unit, timezone)) } @@ -382,8 +407,8 @@ impl<'a> Tokenizer<'a> { } } - // if it started with a number, try parsing it as an integer if let Some(c) = self.word.chars().next() { + // if it started with a number, try parsing it as an integer if c == '-' || c.is_numeric() { let val: i64 = self.word.parse().map_err(|e| { make_error( @@ -393,6 +418,28 @@ impl<'a> Tokenizer<'a> { })?; return Ok(Token::Integer(val)); } + // if it started with a double quote `"`, try parsing it as a double quoted string + else if c == '"' { + let len = self.word.chars().count(); + + // to verify it's double quoted + if let Some(last_c) = self.word.chars().last() { + if last_c != '"' || len < 2 { + return Err(make_error( + self.val, + &format!("parsing {} as double quoted string", self.word), + )); + } + } + + let val: String = self.word.parse().map_err(|e| { + make_error( + self.val, + &format!("parsing {} as double quoted string: {e}", self.word), + ) + })?; + return Ok(Token::DoubleQuotedString(val[1..len - 1].to_owned())); + } } // figure out what the word was @@ -442,6 +489,7 @@ impl<'a> Tokenizer<'a> { "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime), "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano), + "Some" => Token::Some, "None" => Token::None, _ => { @@ -504,8 +552,10 @@ enum Token { LParen, RParen, Comma, + Some, None, Integer(i64), + DoubleQuotedString(String), } impl Display for Token { @@ -522,12 +572,14 @@ impl Display for Token { Token::LParen => write!(f, "("), Token::RParen => write!(f, ")"), Token::Comma => write!(f, ","), + Token::Some => write!(f, "Some"), Token::None => write!(f, "None"), Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), Token::Decimal128 => write!(f, "Decimal128"), Token::Decimal256 => write!(f, "Decimal256"), Token::Dictionary => write!(f, "Dictionary"), Token::Integer(v) => write!(f, "Integer({v})"), + Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), } } } @@ -580,8 +632,15 @@ mod test { DataType::Timestamp(TimeUnit::Millisecond, None), DataType::Timestamp(TimeUnit::Microsecond, None), DataType::Timestamp(TimeUnit::Nanosecond, None), - // TODO support timezones - //DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + // we can't cover all possible timezones, here we only test utc and +08:00 + DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Second, Some("+00:00".into())), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+08:00".into())), + DataType::Timestamp(TimeUnit::Microsecond, Some("+08:00".into())), + DataType::Timestamp(TimeUnit::Millisecond, Some("+08:00".into())), + DataType::Timestamp(TimeUnit::Second, Some("+08:00".into())), DataType::Date32, DataType::Date64, DataType::Time32(TimeUnit::Second), @@ -673,10 +732,13 @@ mod test { ("", "Error finding next token"), ("null", "Unsupported type 'null'"), ("Nu", "Unsupported type 'Nu'"), - // TODO support timezones ( - r#"Timestamp(Nanosecond, Some("UTC"))"#, - "Error unrecognized word: Some", + r#"Timestamp(Nanosecond, Some(+00:00))"#, + "Error unrecognized word: +00:00", + ), + ( + r#"Timestamp(Nanosecond, Some("+00:00))"#, + r#"Error parsing "+00:00 as double quoted string"#, ), ("Timestamp(Nanosecond, ", "Error finding next token"), ( From 4eb4d5356911fc6b00941b49d6330f09400c77f2 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 7 Apr 2023 21:10:21 +0800 Subject: [PATCH 2/5] add more error handling for parsing double quoted string --- datafusion/sql/src/expr/arrow_cast.rs | 30 ++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index a9ff805d0ca50..6ec1dd2846758 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -427,18 +427,34 @@ impl<'a> Tokenizer<'a> { if last_c != '"' || len < 2 { return Err(make_error( self.val, - &format!("parsing {} as double quoted string", self.word), + &format!("parsing {} as double quoted string: last char must be \"", self.word), )); } } + if len == 2 { + return Err(make_error( + self.val, + &format!("parsing {} as double quoted string: empty string isn't supported", self.word), + )) + } + let val: String = self.word.parse().map_err(|e| { make_error( self.val, &format!("parsing {} as double quoted string: {e}", self.word), ) })?; - return Ok(Token::DoubleQuotedString(val[1..len - 1].to_owned())); + + let s = val[1..len - 1].to_string(); + if s.contains("\"") { + return Err(make_error( + self.val, + &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word), + )) + } + + return Ok(Token::DoubleQuotedString(s)); } } @@ -738,7 +754,15 @@ mod test { ), ( r#"Timestamp(Nanosecond, Some("+00:00))"#, - r#"Error parsing "+00:00 as double quoted string"#, + r#"parsing "+00:00 as double quoted string: last char must be ""#, + ), + ( + r#"Timestamp(Nanosecond, Some(""))"#, + r#"parsing "" as double quoted string: empty string isn't supported"#, + ), + ( + r#"Timestamp(Nanosecond, Some("+00:00""))"#, + r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#, ), ("Timestamp(Nanosecond, ", "Error finding next token"), ( From 341f5bc1a46f1f5be70575b52c141a8da600fa67 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 7 Apr 2023 21:14:06 +0800 Subject: [PATCH 3/5] git fmt --- datafusion/sql/src/expr/arrow_cast.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index 6ec1dd2846758..3009085d985c9 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -436,7 +436,7 @@ impl<'a> Tokenizer<'a> { return Err(make_error( self.val, &format!("parsing {} as double quoted string: empty string isn't supported", self.word), - )) + )); } let val: String = self.word.parse().map_err(|e| { @@ -451,7 +451,7 @@ impl<'a> Tokenizer<'a> { return Err(make_error( self.val, &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word), - )) + )); } return Ok(Token::DoubleQuotedString(s)); From a926ee6383512b5eab6c88c1d041623f7ad1a5fa Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 7 Apr 2023 21:22:38 +0800 Subject: [PATCH 4/5] add test cases in .slt --- .../sqllogictests/test_files/arrow_typeof.slt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt index 89b051ae4716e..47425f4148f2a 100644 --- a/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt +++ b/datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt @@ -307,3 +307,25 @@ select arrow_cast(interval '30 minutes', 'Duration(Second)'); query error DataFusion error: Error during planning: Cannot automatically convert Utf8 to Duration\(Second\) select arrow_cast('30 minutes', 'Duration(Second)'); + + +## Timestamptz + +query P +select arrow_cast(timestamp '2000-01-01T00:00:00', 'Timestamp(Nanosecond, Some( "+00:00" ))'); +---- +2000-01-01T00:00:00Z + +query P +select arrow_cast(timestamp '2000-01-01T00:00:00', 'Timestamp(Nanosecond, Some( "+08:00" ))'); +---- +2000-01-01T08:00:00+08:00 + +# once https://github.com/apache/arrow-rs/issues/1936 fixed, please update this query +query P +select arrow_cast(timestamp '2000-01-01T00:00:00Z', 'Timestamp(Nanosecond, Some( "+08:00" ))'); +---- +2000-01-01T08:00:00+08:00 + +statement error Arrow error: Parser error: Invalid timezone "\+25:00": '\+25:00' is not a valid timezone +select arrow_cast(timestamp '2000-01-01T00:00:00', 'Timestamp(Nanosecond, Some( "+25:00" ))'); From 56a7b9eb762e4172ae0c0a045c61ec7a98eb51f0 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Fri, 7 Apr 2023 21:58:22 +0800 Subject: [PATCH 5/5] cargo clippy --- datafusion/sql/src/expr/arrow_cast.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index 3009085d985c9..91a05e60f6dc1 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -447,7 +447,7 @@ impl<'a> Tokenizer<'a> { })?; let s = val[1..len - 1].to_string(); - if s.contains("\"") { + if s.contains('"') { return Err(make_error( self.val, &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),