From db23134c85ab466ed208fc8c980fc8b0041abd41 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Fri, 1 May 2026 11:54:09 +0530 Subject: [PATCH 1/2] fix(spark): align parse_url empty FILE path --- .../spark/src/function/url/parse_url.rs | 46 ++++++++++++++++++- .../test_files/spark/url/parse_url.slt | 30 ++++++++++++ .../test_files/spark/url/try_parse_url.slt | 30 ++++++++++++ 3 files changed, 104 insertions(+), 2 deletions(-) diff --git a/datafusion/spark/src/function/url/parse_url.rs b/datafusion/spark/src/function/url/parse_url.rs index 2374d6c084b0d..cfd0bf201067f 100644 --- a/datafusion/spark/src/function/url/parse_url.rs +++ b/datafusion/spark/src/function/url/parse_url.rs @@ -77,7 +77,7 @@ impl ParseUrl { /// # Returns /// /// * `Ok(Some(String))` - The extracted URL component as a string - /// * `Ok(None)` - If the requested component doesn't exist or is empty + /// * `Ok(None)` - If the requested component doesn't exist /// * `Err(DataFusionError)` - If the URL is malformed and cannot be parsed fn parse(value: &str, part: &str, key: Option<&str>) -> Result> { let url: std::result::Result = Url::parse(value); @@ -136,7 +136,7 @@ impl ParseUrl { "REF" => url.fragment().map(String::from), "PROTOCOL" => Some(url.scheme().to_string()), "FILE" => { - let path = url.path(); + let path = Self::file_path(value, &url); match url.query() { Some(query) => Some(format!("{path}?{query}")), None => Some(path.to_string()), @@ -156,6 +156,26 @@ impl ParseUrl { _ => None, }) } + + fn file_path<'a>(value: &str, url: &'a Url) -> &'a str { + let path = url.path(); + if path == "/" && Self::absolute_url_has_empty_path(value) { + "" + } else { + path + } + } + + fn absolute_url_has_empty_path(value: &str) -> bool { + let Some(authority_start) = value.find("://").map(|index| index + 3) else { + return false; + }; + let after_authority = &value[authority_start..]; + match after_authority.find(['/', '?', '#']) { + None => true, + Some(index) => matches!(after_authority.as_bytes()[index], b'?' | b'#'), + } + } } impl ScalarUDFImpl for ParseUrl { @@ -388,6 +408,28 @@ mod tests { Ok(()) } + #[test] + fn test_parse_empty_path_file() -> Result<()> { + assert_eq!(ParseUrl::parse("", "PATH", None)?, Some("".to_string())); + assert_eq!( + ParseUrl::parse("http://example.com", "FILE", None)?, + Some("".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://example.com?foo=bar", "FILE", None)?, + Some("?foo=bar".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://example.com#fragment", "FILE", None)?, + Some("".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://example.com/?foo=bar", "FILE", None)?, + Some("/?foo=bar".to_string()) + ); + Ok(()) + } + #[test] fn test_parse_schemeless_url() -> Result<()> { // Spark's java.net.URI treats schemeless strings as relative URIs. diff --git a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt index 7a5051d50e2ce..263f664209a30 100644 --- a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt +++ b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt @@ -235,6 +235,36 @@ SELECT parse_url('https://example.com', 'PATH'); ---- (empty) +query T +SELECT parse_url('', 'PATH'); +---- +(empty) + +query T +SELECT parse_url('http://example.com', 'FILE'); +---- +(empty) + +query T +SELECT parse_url('http://example.com/', 'FILE'); +---- +/ + +query T +SELECT parse_url('http://example.com?foo=bar', 'FILE'); +---- +?foo=bar + +query T +SELECT parse_url('http://example.com#fragment', 'FILE'); +---- +(empty) + +query T +SELECT parse_url('http://example.com/?foo=bar', 'FILE'); +---- +/?foo=bar + query T SELECT parse_url('https://example.com', 'path'); ---- diff --git a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt index a0e42a16483f3..ff2a4fa3af124 100644 --- a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt +++ b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt @@ -186,6 +186,36 @@ SELECT try_parse_url('https://example.com', 'PATH'); ---- (empty) +query T +SELECT try_parse_url('', 'PATH'); +---- +(empty) + +query T +SELECT try_parse_url('http://example.com', 'FILE'); +---- +(empty) + +query T +SELECT try_parse_url('http://example.com/', 'FILE'); +---- +/ + +query T +SELECT try_parse_url('http://example.com?foo=bar', 'FILE'); +---- +?foo=bar + +query T +SELECT try_parse_url('http://example.com#fragment', 'FILE'); +---- +(empty) + +query T +SELECT try_parse_url('http://example.com/?foo=bar', 'FILE'); +---- +/?foo=bar + query T SELECT try_parse_url('https://example.com', 'path'); ---- From a07b3eb8d33087432add3c41de36184c178d1787 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Fri, 1 May 2026 12:25:05 +0530 Subject: [PATCH 2/2] align parse_url raw parts --- .../spark/src/function/url/parse_url.rs | 87 +++++++++++++++---- .../test_files/spark/url/parse_url.slt | 50 +++++++++++ .../test_files/spark/url/try_parse_url.slt | 50 +++++++++++ 3 files changed, 168 insertions(+), 19 deletions(-) diff --git a/datafusion/spark/src/function/url/parse_url.rs b/datafusion/spark/src/function/url/parse_url.rs index cfd0bf201067f..18f0bb1e0d78b 100644 --- a/datafusion/spark/src/function/url/parse_url.rs +++ b/datafusion/spark/src/function/url/parse_url.rs @@ -97,12 +97,7 @@ impl ParseUrl { "PATH" => Some(path.to_string()), "QUERY" => match key { None => query.map(String::from), - Some(key) => query.and_then(|q| { - q.split('&') - .filter_map(|pair| pair.split_once('=')) - .find(|(k, _)| *k == key) - .map(|(_, v)| v.to_string()) - }), + Some(key) => Self::query_value(query, key).map(String::from), }, "REF" => fragment.map(String::from), "FILE" => { @@ -122,21 +117,17 @@ impl ParseUrl { .map(|url| match part { "HOST" => url.host_str().map(String::from), "PATH" => { - let path: String = url.path().to_string(); - let path: String = if path == "/" { "".to_string() } else { path }; - Some(path) + let path = Self::path(value, &url); + Some(path.to_string()) } "QUERY" => match key { None => url.query().map(String::from), - Some(key) => url - .query_pairs() - .find(|(k, _)| k == key) - .map(|(_, v)| v.into_owned()), + Some(key) => Self::query_value(url.query(), key).map(String::from), }, "REF" => url.fragment().map(String::from), "PROTOCOL" => Some(url.scheme().to_string()), "FILE" => { - let path = Self::file_path(value, &url); + let path = Self::path(value, &url); match url.query() { Some(query) => Some(format!("{path}?{query}")), None => Some(path.to_string()), @@ -157,7 +148,7 @@ impl ParseUrl { }) } - fn file_path<'a>(value: &str, url: &'a Url) -> &'a str { + fn path<'a>(value: &str, url: &'a Url) -> &'a str { let path = url.path(); if path == "/" && Self::absolute_url_has_empty_path(value) { "" @@ -176,6 +167,16 @@ impl ParseUrl { Some(index) => matches!(after_authority.as_bytes()[index], b'?' | b'#'), } } + + fn query_value<'a>(query: Option<&'a str>, key: &str) -> Option<&'a str> { + query.and_then(|query| { + query + .split('&') + .filter_map(|pair| pair.split_once('=')) + .find(|(query_key, _)| *query_key == key) + .map(|(_, value)| value) + }) + } } impl ScalarUDFImpl for ParseUrl { @@ -402,9 +403,49 @@ mod tests { } #[test] - fn test_parse_path_root_is_empty_string() -> Result<()> { - let got = ParseUrl::parse("https://example.com/", "PATH", None)?; - assert_eq!(got, Some("".to_string())); + fn test_parse_path_empty_vs_root() -> Result<()> { + assert_eq!( + ParseUrl::parse("https://example.com", "PATH", None)?, + Some("".to_string()) + ); + assert_eq!( + ParseUrl::parse("https://example.com/", "PATH", None)?, + Some("/".to_string()) + ); + assert_eq!( + ParseUrl::parse("https://ex.com/dir%20/pa%20th.HTML", "PATH", None)?, + Some("/dir%20/pa%20th.HTML".to_string()) + ); + Ok(()) + } + + #[test] + fn test_parse_query_key_is_raw() -> Result<()> { + let url = "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two"; + assert_eq!( + ParseUrl::parse(url, "QUERY", None)?, + Some("query=x%20y&q2=2".to_string()) + ); + assert_eq!( + ParseUrl::parse(url, "QUERY", Some("query"))?, + Some("x%20y".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://ex.com?key=", "QUERY", Some("key"))?, + Some("".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://ex.com?keyonly", "QUERY", Some("keyonly"))?, + None + ); + assert_eq!( + ParseUrl::parse("http://ex.com?a=1&a=2", "QUERY", Some("a"))?, + Some("1".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://ex.com?a%20b=1", "QUERY", Some("a b"))?, + None + ); Ok(()) } @@ -427,6 +468,14 @@ mod tests { ParseUrl::parse("http://example.com/?foo=bar", "FILE", None)?, Some("/?foo=bar".to_string()) ); + assert_eq!( + ParseUrl::parse("http://ex.com/?", "FILE", None)?, + Some("/?".to_string()) + ); + assert_eq!( + ParseUrl::parse("http://ex.com?", "FILE", None)?, + Some("?".to_string()) + ); Ok(()) } @@ -524,7 +573,7 @@ mod tests { assert_eq!(out_sa.len(), 2); assert_eq!(out_sa.value(0), "example.com"); - assert_eq!(out_sa.value(1), ""); + assert_eq!(out_sa.value(1), "/"); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt index 263f664209a30..78030846930ff 100644 --- a/datafusion/sqllogictest/test_files/spark/url/parse_url.slt +++ b/datafusion/sqllogictest/test_files/spark/url/parse_url.slt @@ -235,6 +235,16 @@ SELECT parse_url('https://example.com', 'PATH'); ---- (empty) +query T +SELECT parse_url('https://example.com/', 'PATH'); +---- +/ + +query T +SELECT parse_url('https://ex.com/dir%20/pa%20th.HTML', 'PATH'); +---- +/dir%20/pa%20th.HTML + query T SELECT parse_url('', 'PATH'); ---- @@ -265,6 +275,46 @@ SELECT parse_url('http://example.com/?foo=bar', 'FILE'); ---- /?foo=bar +query T +SELECT parse_url('http://ex.com/?', 'FILE'); +---- +/? + +query T +SELECT parse_url('http://ex.com?', 'FILE'); +---- +? + +query T +SELECT parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY'); +---- +query=x%20y&q2=2 + +query T +SELECT parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY', 'query'); +---- +x%20y + +query T +SELECT parse_url('http://ex.com?key=', 'QUERY', 'key'); +---- +(empty) + +query T +SELECT parse_url('http://ex.com?keyonly', 'QUERY', 'keyonly'); +---- +NULL + +query T +SELECT parse_url('http://ex.com?a=1&a=2', 'QUERY', 'a'); +---- +1 + +query T +SELECT parse_url('http://ex.com?a%20b=1', 'QUERY', 'a b'); +---- +NULL + query T SELECT parse_url('https://example.com', 'path'); ---- diff --git a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt index ff2a4fa3af124..4b105f1450778 100644 --- a/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt +++ b/datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt @@ -186,6 +186,16 @@ SELECT try_parse_url('https://example.com', 'PATH'); ---- (empty) +query T +SELECT try_parse_url('https://example.com/', 'PATH'); +---- +/ + +query T +SELECT try_parse_url('https://ex.com/dir%20/pa%20th.HTML', 'PATH'); +---- +/dir%20/pa%20th.HTML + query T SELECT try_parse_url('', 'PATH'); ---- @@ -216,6 +226,46 @@ SELECT try_parse_url('http://example.com/?foo=bar', 'FILE'); ---- /?foo=bar +query T +SELECT try_parse_url('http://ex.com/?', 'FILE'); +---- +/? + +query T +SELECT try_parse_url('http://ex.com?', 'FILE'); +---- +? + +query T +SELECT try_parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY'); +---- +query=x%20y&q2=2 + +query T +SELECT try_parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY', 'query'); +---- +x%20y + +query T +SELECT try_parse_url('http://ex.com?key=', 'QUERY', 'key'); +---- +(empty) + +query T +SELECT try_parse_url('http://ex.com?keyonly', 'QUERY', 'keyonly'); +---- +NULL + +query T +SELECT try_parse_url('http://ex.com?a=1&a=2', 'QUERY', 'a'); +---- +1 + +query T +SELECT try_parse_url('http://ex.com?a%20b=1', 'QUERY', 'a b'); +---- +NULL + query T SELECT try_parse_url('https://example.com', 'path'); ----