Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 110 additions & 19 deletions datafusion/spark/src/function/url/parse_url.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ impl ParseUrl {
/// # Returns
///
/// * `Ok(Some(String))` - The extracted URL component as a string
/// * `Ok(None)` - If the requested component doesn't exist or is empty
/// * `Ok(None)` - If the requested component doesn't exist
/// * `Err(DataFusionError)` - If the URL is malformed and cannot be parsed
fn parse(value: &str, part: &str, key: Option<&str>) -> Result<Option<String>> {
let url: std::result::Result<Url, ParseError> = Url::parse(value);
Expand All @@ -97,12 +97,7 @@ impl ParseUrl {
"PATH" => Some(path.to_string()),
"QUERY" => match key {
None => query.map(String::from),
Some(key) => query.and_then(|q| {
q.split('&')
.filter_map(|pair| pair.split_once('='))
.find(|(k, _)| *k == key)
.map(|(_, v)| v.to_string())
}),
Some(key) => Self::query_value(query, key).map(String::from),
},
"REF" => fragment.map(String::from),
"FILE" => {
Expand All @@ -122,21 +117,17 @@ impl ParseUrl {
.map(|url| match part {
"HOST" => url.host_str().map(String::from),
"PATH" => {
let path: String = url.path().to_string();
let path: String = if path == "/" { "".to_string() } else { path };
Some(path)
let path = Self::path(value, &url);
Some(path.to_string())
}
"QUERY" => match key {
None => url.query().map(String::from),
Some(key) => url
.query_pairs()
.find(|(k, _)| k == key)
.map(|(_, v)| v.into_owned()),
Some(key) => Self::query_value(url.query(), key).map(String::from),
},
"REF" => url.fragment().map(String::from),
"PROTOCOL" => Some(url.scheme().to_string()),
"FILE" => {
let path = url.path();
let path = Self::path(value, &url);
match url.query() {
Some(query) => Some(format!("{path}?{query}")),
None => Some(path.to_string()),
Expand All @@ -156,6 +147,36 @@ impl ParseUrl {
_ => None,
})
}

/// Returns the path component to report for `url`.
///
/// `value` is the raw input string that `url` was parsed from. When the
/// parsed path is exactly `/` but the raw input carries no path after its
/// authority (see `absolute_url_has_empty_path`), an empty string is
/// returned instead, so that `https://example.com` and
/// `https://example.com/` yield different results.
fn path<'a>(value: &str, url: &'a Url) -> &'a str {
    match url.path() {
        // The guard only runs for a bare root path, so every other path is
        // returned without inspecting the raw input.
        "/" if Self::absolute_url_has_empty_path(value) => "",
        parsed => parsed,
    }
}

/// Reports whether the raw URL text `value` has nothing written as its path.
///
/// Only the text after the first `://` is inspected. The path counts as
/// empty when that remainder contains none of `/`, `?`, `#`, or when the
/// first of those delimiters is `?` or `#` (i.e. the query or fragment
/// starts before any `/`). Returns `false` when `value` contains no `://`.
fn absolute_url_has_empty_path(value: &str) -> bool {
    match value.split_once("://") {
        None => false,
        Some((_, after_scheme)) => {
            let first_delimiter = after_scheme
                .chars()
                .find(|c| matches!(c, '/' | '?' | '#'));
            // Anything other than a leading `/` means no path was written.
            !matches!(first_delimiter, Some('/'))
        }
    }
}

/// Looks up `key` in a raw query string and returns its value.
///
/// The query is split on `&` and each piece on its first `=`. Pieces
/// without an `=` are ignored. Keys are compared byte-for-byte with `key`
/// and the value is returned exactly as written (no percent-decoding is
/// applied here). When a key occurs more than once, the first value wins.
/// Returns `None` when `query` is `None` or no pair matches.
fn query_value<'a>(query: Option<&'a str>, key: &str) -> Option<&'a str> {
    query?.split('&').find_map(|pair| match pair.split_once('=') {
        Some((name, value)) if name == key => Some(value),
        _ => None,
    })
}
}

impl ScalarUDFImpl for ParseUrl {
Expand Down Expand Up @@ -382,9 +403,79 @@ mod tests {
}

#[test]
fn test_parse_path_root_is_empty_string() -> Result<()> {
let got = ParseUrl::parse("https://example.com/", "PATH", None)?;
assert_eq!(got, Some("".to_string()));
/// `PATH` distinguishes a missing path from an explicit root path and
/// returns other paths exactly as written.
fn test_parse_path_empty_vs_root() -> Result<()> {
    // (input URL, expected PATH component)
    let cases = [
        ("https://example.com", ""),
        ("https://example.com/", "/"),
        (
            "https://ex.com/dir%20/pa%20th.HTML",
            "/dir%20/pa%20th.HTML",
        ),
    ];
    for (input, expected) in cases {
        let got = ParseUrl::parse(input, "PATH", None)?;
        assert_eq!(got, Some(expected.to_string()));
    }
    Ok(())
}

/// `QUERY` returns the query string and individual values as written, and
/// matches lookup keys against the raw (still percent-encoded) key text.
#[test]
fn test_parse_query_key_is_raw() -> Result<()> {
    let full = "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two";
    // (input URL, lookup key, expected result)
    let cases: [(&str, Option<&str>, Option<&str>); 6] = [
        // No key: the whole query string comes back.
        (full, None, Some("query=x%20y&q2=2")),
        (full, Some("query"), Some("x%20y")),
        // A key with an empty value yields an empty string, not NULL.
        ("http://ex.com?key=", Some("key"), Some("")),
        // A key with no `=` is not treated as a match.
        ("http://ex.com?keyonly", Some("keyonly"), None),
        // Duplicate keys resolve to the first occurrence.
        ("http://ex.com?a=1&a=2", Some("a"), Some("1")),
        // A decoded key does not match its percent-encoded form.
        ("http://ex.com?a%20b=1", Some("a b"), None),
    ];
    for (input, key, expected) in cases {
        assert_eq!(
            ParseUrl::parse(input, "QUERY", key)?,
            expected.map(String::from)
        );
    }
    Ok(())
}

/// `PATH` of an empty input is empty, and `FILE` is built from the path as
/// written (possibly empty) followed by `?query` when a query is present.
#[test]
fn test_parse_empty_path_file() -> Result<()> {
    // (input URL, requested part, expected result)
    let cases = [
        ("", "PATH", ""),
        ("http://example.com", "FILE", ""),
        ("http://example.com?foo=bar", "FILE", "?foo=bar"),
        // The fragment is never part of FILE.
        ("http://example.com#fragment", "FILE", ""),
        ("http://example.com/?foo=bar", "FILE", "/?foo=bar"),
        // An empty query still contributes its `?` separator.
        ("http://ex.com/?", "FILE", "/?"),
        ("http://ex.com?", "FILE", "?"),
    ];
    for (input, part, expected) in cases {
        assert_eq!(
            ParseUrl::parse(input, part, None)?,
            Some(expected.to_string())
        );
    }
    Ok(())
}

Expand Down Expand Up @@ -482,7 +573,7 @@ mod tests {

assert_eq!(out_sa.len(), 2);
assert_eq!(out_sa.value(0), "example.com");
assert_eq!(out_sa.value(1), "");
assert_eq!(out_sa.value(1), "/");
Ok(())
}

Expand Down
80 changes: 80 additions & 0 deletions datafusion/sqllogictest/test_files/spark/url/parse_url.slt
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,86 @@ SELECT parse_url('https://example.com', 'PATH');
----
(empty)

query T
SELECT parse_url('https://example.com/', 'PATH');
----
/

query T
SELECT parse_url('https://ex.com/dir%20/pa%20th.HTML', 'PATH');
----
/dir%20/pa%20th.HTML

query T
SELECT parse_url('', 'PATH');
----
(empty)

query T
SELECT parse_url('http://example.com', 'FILE');
----
(empty)

query T
SELECT parse_url('http://example.com/', 'FILE');
----
/

query T
SELECT parse_url('http://example.com?foo=bar', 'FILE');
----
?foo=bar

query T
SELECT parse_url('http://example.com#fragment', 'FILE');
----
(empty)

query T
SELECT parse_url('http://example.com/?foo=bar', 'FILE');
----
/?foo=bar

query T
SELECT parse_url('http://ex.com/?', 'FILE');
----
/?

query T
SELECT parse_url('http://ex.com?', 'FILE');
----
?

query T
SELECT parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY');
----
query=x%20y&q2=2

query T
SELECT parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY', 'query');
----
x%20y

query T
SELECT parse_url('http://ex.com?key=', 'QUERY', 'key');
----
(empty)

query T
SELECT parse_url('http://ex.com?keyonly', 'QUERY', 'keyonly');
----
NULL

query T
SELECT parse_url('http://ex.com?a=1&a=2', 'QUERY', 'a');
----
1

query T
SELECT parse_url('http://ex.com?a%20b=1', 'QUERY', 'a b');
----
NULL

query T
SELECT parse_url('https://example.com', 'path');
----
Expand Down
80 changes: 80 additions & 0 deletions datafusion/sqllogictest/test_files/spark/url/try_parse_url.slt
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,86 @@ SELECT try_parse_url('https://example.com', 'PATH');
----
(empty)

query T
SELECT try_parse_url('https://example.com/', 'PATH');
----
/

query T
SELECT try_parse_url('https://ex.com/dir%20/pa%20th.HTML', 'PATH');
----
/dir%20/pa%20th.HTML

query T
SELECT try_parse_url('', 'PATH');
----
(empty)

query T
SELECT try_parse_url('http://example.com', 'FILE');
----
(empty)

query T
SELECT try_parse_url('http://example.com/', 'FILE');
----
/

query T
SELECT try_parse_url('http://example.com?foo=bar', 'FILE');
----
?foo=bar

query T
SELECT try_parse_url('http://example.com#fragment', 'FILE');
----
(empty)

query T
SELECT try_parse_url('http://example.com/?foo=bar', 'FILE');
----
/?foo=bar

query T
SELECT try_parse_url('http://ex.com/?', 'FILE');
----
/?

query T
SELECT try_parse_url('http://ex.com?', 'FILE');
----
?

query T
SELECT try_parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY');
----
query=x%20y&q2=2

query T
SELECT try_parse_url('https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two', 'QUERY', 'query');
----
x%20y

query T
SELECT try_parse_url('http://ex.com?key=', 'QUERY', 'key');
----
(empty)

query T
SELECT try_parse_url('http://ex.com?keyonly', 'QUERY', 'keyonly');
----
NULL

query T
SELECT try_parse_url('http://ex.com?a=1&a=2', 'QUERY', 'a');
----
1

query T
SELECT try_parse_url('http://ex.com?a%20b=1', 'QUERY', 'a b');
----
NULL

query T
SELECT try_parse_url('https://example.com', 'path');
----
Expand Down
Loading