From b486bce218aabcec48ca92809c4f5ae7a2762abd Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 4 Nov 2019 16:31:27 +0100 Subject: [PATCH 1/2] ARROW-7057: [C++] Add API to parse URI query strings --- cpp/src/arrow/util/uri.cc | 32 +++++++++ cpp/src/arrow/util/uri.h | 7 ++ cpp/src/arrow/util/uri_test.cc | 121 ++++++++++++++++----------------- 3 files changed, 98 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index 579d4c7f243..bfae492ae72 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -120,6 +120,38 @@ std::string Uri::path() const { return ss.str(); } +std::string Uri::query_string() const { return TextRangeToString(impl_->uri_.query); } + +Result>> Uri::query_items() const { + const auto& query = impl_->uri_.query; + UriQueryListA* query_list_start; + int item_count; + std::vector> items; + + if (query.first == nullptr) { + return items; + } + if (uriDissectQueryMallocA(&query_list_start, &item_count, query.first, + query.afterLast) != URI_SUCCESS) { + return Status::Invalid("Cannot parse query string: '", query_string(), "'"); + } + + items.reserve(item_count); + auto query_list = query_list_start; + while (query_list != nullptr) { + if (query_list->value != nullptr) { + items.emplace_back(query_list->key, query_list->value); + } else { + // XXX This doesn't allow differentiating between an empty value + // and a missing value, e.g. between "a&b=1" and "a=&b=1" + items.emplace_back(query_list->key, ""); + } + query_list = query_list->next; + } + uriFreeQueryListA(query_list_start); + return items; +} + const std::string& Uri::ToString() const { return impl_->string_rep_; } Status Uri::Parse(const std::string& uri_string) { diff --git a/cpp/src/arrow/util/uri.h b/cpp/src/arrow/util/uri.h index ce082ccc8e6..b64eb8c2ba6 100644 --- a/cpp/src/arrow/util/uri.h +++ b/cpp/src/arrow/util/uri.h @@ -20,7 +20,10 @@ #include #include #include +#include +#include +#include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/visibility.h" @@ -54,6 +57,10 @@ class ARROW_EXPORT Uri { int32_t port() const; /// The URI path component. std::string path() const; + /// The URI query string + std::string query_string() const; + /// The URI query items + Result>> query_items() const; /// Get the string representation of this URI. const std::string& ToString() const; diff --git a/cpp/src/arrow/util/uri_test.cc b/cpp/src/arrow/util/uri_test.cc index 34a7d24cf99..d06e3265867 100644 --- a/cpp/src/arrow/util/uri_test.cc +++ b/cpp/src/arrow/util/uri_test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -52,75 +53,71 @@ TEST(Uri, ParsePath) { Uri uri; + auto check_case = [&](std::string uri_string, std::string scheme, bool has_host, + std::string host, std::string path) -> void { + ASSERT_OK(uri.Parse(uri_string)); + ASSERT_EQ(uri.scheme(), scheme); + ASSERT_EQ(uri.has_host(), has_host); + ASSERT_EQ(uri.host(), host); + ASSERT_EQ(uri.path(), path); + }; + // Relative path - ASSERT_OK(uri.Parse("unix:tmp/flight.sock")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "tmp/flight.sock"); + check_case("unix:tmp/flight.sock", "unix", false, "", "tmp/flight.sock"); // Absolute path - ASSERT_OK(uri.Parse("unix:/tmp/flight.sock")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "/tmp/flight.sock"); - - ASSERT_OK(uri.Parse("unix://localhost/tmp/flight.sock")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_TRUE(uri.has_host()); - ASSERT_EQ(uri.host(), "localhost"); - ASSERT_EQ(uri.path(), "/tmp/flight.sock"); - - ASSERT_OK(uri.Parse("unix:///tmp/flight.sock")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_TRUE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "/tmp/flight.sock"); + check_case("unix:/tmp/flight.sock", "unix", false, "", "/tmp/flight.sock"); + check_case("unix://localhost/tmp/flight.sock", "unix", true, "localhost", + "/tmp/flight.sock"); + check_case("unix:///tmp/flight.sock", "unix", true, "", "/tmp/flight.sock"); // Empty path - ASSERT_OK(uri.Parse("unix:")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), ""); - - ASSERT_OK(uri.Parse("unix://localhost")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_TRUE(uri.has_host()); - ASSERT_EQ(uri.host(), "localhost"); - ASSERT_EQ(uri.path(), ""); + check_case("unix:", "unix", false, "", ""); + check_case("unix://localhost", "unix", true, "localhost", ""); // With trailing slash - ASSERT_OK(uri.Parse("unix:/")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "/"); - - ASSERT_OK(uri.Parse("unix:tmp/")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "tmp/"); - - ASSERT_OK(uri.Parse("unix://localhost/")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_TRUE(uri.has_host()); - ASSERT_EQ(uri.host(), "localhost"); - ASSERT_EQ(uri.path(), "/"); - - ASSERT_OK(uri.Parse("unix:/tmp/flight/")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_FALSE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "/tmp/flight/"); - - ASSERT_OK(uri.Parse("unix:///tmp/flight/")); - ASSERT_EQ(uri.scheme(), "unix"); - ASSERT_TRUE(uri.has_host()); - ASSERT_EQ(uri.host(), ""); - ASSERT_EQ(uri.path(), "/tmp/flight/"); + check_case("unix:/", "unix", false, "", "/"); + check_case("unix:tmp/", "unix", false, "", "tmp/"); + check_case("unix://localhost/", "unix", true, "localhost", "/"); + check_case("unix:/tmp/flight/", "unix", false, "", "/tmp/flight/"); + check_case("unix://localhost/tmp/flight/", "unix", true, "localhost", "/tmp/flight/"); + check_case("unix:///tmp/flight/", "unix", true, "", "/tmp/flight/"); + + // With query string + check_case("unix:?", "unix", false, "", ""); + check_case("unix:?foo", "unix", false, "", ""); + check_case("unix:?foo=bar", "unix", false, "", ""); + check_case("unix:/?", "unix", false, "", "/"); + check_case("unix:/?foo", "unix", false, "", "/"); + check_case("unix:/?foo=bar", "unix", false, "", "/"); + check_case("unix://localhost/tmp?", "unix", true, "localhost", "/tmp"); + check_case("unix://localhost/tmp?foo", "unix", true, "localhost", "/tmp"); + check_case("unix://localhost/tmp?foo=bar", "unix", true, "localhost", "/tmp"); +} + +TEST(Uri, ParseQuery) { + Uri uri; + + auto check_case = [&](std::string uri_string, std::string query_string, + std::vector> items) -> void { + ASSERT_OK(uri.Parse(uri_string)); + ASSERT_EQ(uri.query_string(), query_string); + auto result = uri.query_items(); + ASSERT_OK(result); + ASSERT_EQ(*result, items); + }; + + check_case("unix://localhost/tmp", "", {}); + check_case("unix://localhost/tmp?", "", {}); + check_case("unix://localhost/tmp?foo=bar", "foo=bar", {{"foo", "bar"}}); + check_case("unix:?foo=bar", "foo=bar", {{"foo", "bar"}}); + check_case("unix:?a=b&c=d", "a=b&c=d", {{"a", "b"}, {"c", "d"}}); + + // With escaped values + check_case("unix:?a=some+value&b=c", "a=some+value&b=c", + {{"a", "some value"}, {"b", "c"}}); + check_case("unix:?a=some%20value%2Fanother&b=c", "a=some%20value%2Fanother&b=c", + {{"a", "some value/another"}, {"b", "c"}}); } TEST(Uri, ParseHostPort) { From cd340b24fd7dd09429ed379f7489732a9863cb96 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 4 Nov 2019 19:09:41 +0100 Subject: [PATCH 2/2] Address review comments --- cpp/src/arrow/util/uri.cc | 12 +++++------- cpp/src/arrow/util/uri.h | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index bfae492ae72..706b6dff4c6 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -124,31 +124,29 @@ std::string Uri::query_string() const { return TextRangeToString(impl_->uri_.que Result>> Uri::query_items() const { const auto& query = impl_->uri_.query; - UriQueryListA* query_list_start; + UriQueryListA* query_list; int item_count; std::vector> items; if (query.first == nullptr) { return items; } - if (uriDissectQueryMallocA(&query_list_start, &item_count, query.first, - query.afterLast) != URI_SUCCESS) { + if (uriDissectQueryMallocA(&query_list, &item_count, query.first, query.afterLast) != + URI_SUCCESS) { return Status::Invalid("Cannot parse query string: '", query_string(), "'"); } + std::unique_ptr query_guard( + query_list, uriFreeQueryListA); items.reserve(item_count); - auto query_list = query_list_start; while (query_list != nullptr) { if (query_list->value != nullptr) { items.emplace_back(query_list->key, query_list->value); } else { - // XXX This doesn't allow differentiating between an empty value - // and a missing value, e.g. between "a&b=1" and "a=&b=1" items.emplace_back(query_list->key, ""); } query_list = query_list->next; } - uriFreeQueryListA(query_list_start); return items; } diff --git a/cpp/src/arrow/util/uri.h b/cpp/src/arrow/util/uri.h index b64eb8c2ba6..c7a8079c49f 100644 --- a/cpp/src/arrow/util/uri.h +++ b/cpp/src/arrow/util/uri.h @@ -60,6 +60,9 @@ class ARROW_EXPORT Uri { /// The URI query string std::string query_string() const; /// The URI query items + /// + /// Note this API doesn't allow differentiating between an empty value + /// and a missing value, such in "a&b=1" vs. "a=&b=1". Result>> query_items() const; /// Get the string representation of this URI.