From f33aba1712deebf73d4b12ffb560409059a89e61 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 2 Mar 2023 17:13:53 -0800 Subject: [PATCH 1/8] Add a PathFromUriOrPath method to help convert URIs to paths after you already have a filesystem --- cpp/src/arrow/filesystem/filesystem.cc | 49 ++++++++++++++++++++++++ cpp/src/arrow/filesystem/filesystem.h | 18 +++++++++ cpp/src/arrow/filesystem/gcsfs_test.cc | 10 ++++- cpp/src/arrow/filesystem/hdfs_test.cc | 2 + cpp/src/arrow/filesystem/localfs_test.cc | 23 +++++++++++ cpp/src/arrow/filesystem/s3fs_test.cc | 3 ++ 6 files changed, 103 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 73b94d38288..8e94c281907 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -738,8 +738,57 @@ Result> FileSystemFromUriReal(const Uri& uri, return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); } +Status CheckFileSystem(const FileSystem* filesystem, + const std::string& expected_type_name, const std::string& uri) { + if (filesystem && filesystem->type_name() != expected_type_name) { + return Status::Invalid("URI ", uri, " is not compatible with filesystem of type ", + filesystem->type_name()); + } + return Status::OK(); +} + } // namespace +Result PathFromUriOrPath(const FileSystem* filesystem, + const std::string& uri_string) { + if (internal::DetectAbsolutePath(uri_string)) { + RETURN_NOT_OK(CheckFileSystem(filesystem, "local", uri_string)); + // Normalize path separators + return ToSlashes(uri_string); + } + + ARROW_ASSIGN_OR_RAISE(auto uri, ParseFileSystemUri(uri_string)); + const auto scheme = uri.scheme(); + std::string path; + if (scheme == "file") { + RETURN_NOT_OK(CheckFileSystem(filesystem, "local", uri_string)); + RETURN_NOT_OK(LocalFileSystemOptions::FromUri(uri, &path)); + } else if (scheme == "gs" || scheme == "gcs") { + RETURN_NOT_OK(CheckFileSystem(filesystem, "gcs", uri_string)); +#ifdef ARROW_GCS + RETURN_NOT_OK(GcsOptions::FromUri(uri, &path)); +#else + return Status::NotImplemented("Got GCS URI but Arrow compiled without GCS support"); +#endif + } else if (scheme == "hdfs" || scheme == "viewfs") { + RETURN_NOT_OK(CheckFileSystem(filesystem, "hdfs", uri_string)); + path = uri.path(); + } else if (scheme == "s3") { + RETURN_NOT_OK(CheckFileSystem(filesystem, "s3", uri_string)); +#ifdef ARROW_S3 + RETURN_NOT_OK(S3Options::FromUri(uri, &path)); +#else + return Status::NotImplemented("Got S3 URI but Arrow compiled without S3 support"); +#endif + } else if (scheme == "mock") { + RETURN_NOT_OK(CheckFileSystem(filesystem, "mock", uri_string)); + path = std::string(RemoveLeadingSlash(uri.path())); + } else { + return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); + } + return path; +} + Result> FileSystemFromUri(const std::string& uri_string, std::string* out_path) { return FileSystemFromUri(uri_string, io::default_io_context(), out_path); diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 6dc18d7de84..7478fd49500 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -476,6 +476,24 @@ Result> FileSystemFromUri(const std::string& uri, const io::IOContext& io_context, std::string* out_path = NULLPTR); +/// \brief Ensure a URI (or path) is compatible with the given filesystem and return the +/// path +/// +/// \param filesystem A filesystem that should be capable of fetching the URI +/// \param uri A URI representing a resource in the given filesystem. +/// +/// This method will check to ensure the given filesystem is compatible with the +/// URI. If that behavior is not desired then NULLPTR can be specified for the +/// filesystem and that check will be skipped. +/// +/// uri can be an absolute path instead of a URI. In that case it will ensure the +/// filesystem (if supplied) is the local filesystem and will normalize the path's +/// file separators. +/// +/// \return The path inside the filesystem that is indicated by the URI. +Result PathFromUriOrPath(const FileSystem* filesystem, + const std::string& uri); + /// \brief Create a new FileSystem by URI /// /// Same as FileSystemFromUri, but in addition also recognize non-URIs diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 15d596dc0b8..fe3f7e724c5 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -1383,9 +1383,15 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) { TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { // Smoke test for FileSystemFromUri - ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(std::string("gs://anonymous@") + - PreexistingBucketPath())); + std::string path; + ASSERT_OK_AND_ASSIGN( + auto fs, + FileSystemFromUri(std::string("gs://anonymous@") + PreexistingBucketPath(), &path)); EXPECT_EQ(fs->type_name(), "gcs"); + EXPECT_EQ(path, PreexistingBucketName()); + ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(fs.get(), std::string("gs://anonymous@") + + PreexistingBucketPath())); + EXPECT_EQ(path, PreexistingBucketName()); ASSERT_OK_AND_ASSIGN(auto fs2, FileSystemFromUri(std::string("gcs://anonymous@") + PreexistingBucketPath())); EXPECT_EQ(fs2->type_name(), "gcs"); diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test.cc index b1020231beb..30703e84db8 100644 --- a/cpp/src/arrow/filesystem/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test.cc @@ -119,6 +119,8 @@ class TestHadoopFileSystem : public ::testing::Test, public HadoopFileSystemTest ARROW_LOG(INFO) << "!!! uri = " << ss.str(); ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "/"); + ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(uri_fs.get(), ss.str())); + ASSERT_EQ(path, "/"); // Sanity check ASSERT_OK(uri_fs->CreateDir("AB")); diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 33f75dd845a..6b51ef8188f 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -210,13 +210,24 @@ class TestLocalFS : public LocalFSTestMixin { ASSERT_EQ(path, expected_path); } + void CheckLocalPathFromUriOrPath(const std::string& uri, + const std::string& expected_path) { + ASSERT_OK_AND_ASSIGN(std::string actual_path, PathFromUriOrPath(nullptr, uri)); + ASSERT_EQ(actual_path, expected_path); + LocalFileSystem lfs; + ASSERT_OK_AND_ASSIGN(actual_path, PathFromUriOrPath(&lfs, uri)); + ASSERT_EQ(actual_path, expected_path); + } + // Like TestFileSystemFromUri, but with an arbitrary non-existing path void TestLocalUri(const std::string& uri, const std::string& expected_path) { CheckLocalUri(uri, expected_path, FSFromUri); + CheckLocalPathFromUriOrPath(uri, expected_path); } void TestLocalUriOrPath(const std::string& uri, const std::string& expected_path) { CheckLocalUri(uri, expected_path, FSFromUriOrPath); + CheckLocalPathFromUriOrPath(uri, expected_path); } void TestInvalidUri(const std::string& uri) { @@ -231,6 +242,14 @@ class TestLocalFS : public LocalFSTestMixin { return; // skip } ASSERT_RAISES(Invalid, FileSystemFromUriOrPath(uri)); + LocalFileSystem lfs; + ASSERT_RAISES(Invalid, PathFromUriOrPath(&lfs, uri)); + } + + void TestNonMatchingUri(const std::string& uri) { + // Legitimate URI for the wrong filesystem + LocalFileSystem lfs; + ASSERT_RAISES(Invalid, PathFromUriOrPath(&lfs, uri)); } void CheckConcreteFile(const std::string& path, int64_t expected_size) { @@ -360,6 +379,10 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoSchemeBackslashes) { this->TestInvalidUriOrPath("foo\\bar"); } +TYPED_TEST(TestLocalFS, MismatchedFilesystemPathFromUri) { + this->TestNonMatchingUri("s3://foo"); +} + TYPED_TEST(TestLocalFS, DirectoryMTime) { TimePoint t1 = CurrentTimePoint(); ASSERT_OK(this->fs_->CreateDir("AB/CD/EF")); diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 38df84bdede..88656fd6967 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -1143,6 +1143,9 @@ TEST_F(TestS3FS, FileSystemFromUri) { ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); + ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(fs.get(), ss.str())); + ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); + // Check the filesystem has the right connection parameters AssertFileInfo(fs.get(), path, FileType::File, 8); } From d8d7fc8d0196cced1ed81231b33ba75589a85aeb Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 2 Mar 2023 17:16:10 -0800 Subject: [PATCH 2/8] Add ARROW_EXPORT for Windows --- cpp/src/arrow/filesystem/filesystem.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 7478fd49500..86405992a6f 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -491,6 +491,7 @@ Result> FileSystemFromUri(const std::string& uri, /// file separators. /// /// \return The path inside the filesystem that is indicated by the URI. +ARROW_EXPORT Result PathFromUriOrPath(const FileSystem* filesystem, const std::string& uri); From 7758b09f0bd1e05aadc2c6c783ce32a1b4e6c5dc Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 2 May 2023 09:16:59 -0700 Subject: [PATCH 3/8] Moved PathFromUriOrPath into an abstract filesystem method --- cpp/src/arrow/filesystem/filesystem.cc | 79 ++++------------------- cpp/src/arrow/filesystem/filesystem.h | 38 +++++------ cpp/src/arrow/filesystem/gcsfs.cc | 25 +++++++ cpp/src/arrow/filesystem/gcsfs.h | 1 + cpp/src/arrow/filesystem/gcsfs_test.cc | 15 ++++- cpp/src/arrow/filesystem/hdfs.cc | 16 +++++ cpp/src/arrow/filesystem/hdfs.h | 1 + cpp/src/arrow/filesystem/hdfs_test.cc | 2 +- cpp/src/arrow/filesystem/localfs.cc | 57 +++++++--------- cpp/src/arrow/filesystem/localfs.h | 9 +-- cpp/src/arrow/filesystem/localfs_test.cc | 20 ++---- cpp/src/arrow/filesystem/mockfs.cc | 13 ++++ cpp/src/arrow/filesystem/mockfs.h | 1 + cpp/src/arrow/filesystem/s3fs.cc | 29 +++++++++ cpp/src/arrow/filesystem/s3fs.h | 1 + cpp/src/arrow/filesystem/s3fs_test.cc | 24 ++++++- cpp/src/arrow/filesystem/util_internal.cc | 45 +++++++++++++ cpp/src/arrow/filesystem/util_internal.h | 13 ++++ 18 files changed, 245 insertions(+), 144 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 8e94c281907..6e23a176734 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -60,6 +60,7 @@ using internal::ConcatAbstractPath; using internal::EnsureTrailingSlash; using internal::GetAbstractPathParent; using internal::kSep; +using internal::ParseFileSystemUri; using internal::RemoveLeadingSlash; using internal::RemoveTrailingSlash; using internal::ToSlashes; @@ -254,6 +255,10 @@ Result> FileSystem::OpenAppendStream( return OpenAppendStream(path, std::shared_ptr{}); } +Result FileSystem::PathFromUri(const std::string& uri_string) const { + return Status::NotImplemented("PathFromUri is not yet supported on this filesystem"); +} + ////////////////////////////////////////////////////////////////////////// // SubTreeFileSystem implementation @@ -484,6 +489,10 @@ Result> SubTreeFileSystem::OpenAppendStream( return base_fs_->OpenAppendStream(real_path, metadata); } +Result SubTreeFileSystem::PathFromUri(const std::string& uri_string) const { + return base_fs_->PathFromUri(uri_string); +} + ////////////////////////////////////////////////////////////////////////// // SlowFileSystem implementation @@ -505,6 +514,10 @@ SlowFileSystem::SlowFileSystem(std::shared_ptr base_fs, bool SlowFileSystem::Equals(const FileSystem& other) const { return this == &other; } +Result SlowFileSystem::PathFromUri(const std::string& uri_string) const { + return base_fs_->PathFromUri(uri_string); +} + Result SlowFileSystem::GetFileInfo(const std::string& path) { latencies_->Sleep(); return base_fs_->GetFileInfo(path); @@ -662,23 +675,6 @@ Status CopyFiles(const std::shared_ptr& source_fs, namespace { -Result ParseFileSystemUri(const std::string& uri_string) { - Uri uri; - auto status = uri.Parse(uri_string); - if (!status.ok()) { -#ifdef _WIN32 - // Could be a "file:..." URI with backslashes instead of regular slashes. - RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string))); - if (uri.scheme() != "file") { - return status; - } -#else - return status; -#endif - } - return std::move(uri); -} - Result> FileSystemFromUriReal(const Uri& uri, const std::string& uri_string, const io::IOContext& io_context, @@ -738,57 +734,8 @@ Result> FileSystemFromUriReal(const Uri& uri, return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); } -Status CheckFileSystem(const FileSystem* filesystem, - const std::string& expected_type_name, const std::string& uri) { - if (filesystem && filesystem->type_name() != expected_type_name) { - return Status::Invalid("URI ", uri, " is not compatible with filesystem of type ", - filesystem->type_name()); - } - return Status::OK(); -} - } // namespace -Result PathFromUriOrPath(const FileSystem* filesystem, - const std::string& uri_string) { - if (internal::DetectAbsolutePath(uri_string)) { - RETURN_NOT_OK(CheckFileSystem(filesystem, "local", uri_string)); - // Normalize path separators - return ToSlashes(uri_string); - } - - ARROW_ASSIGN_OR_RAISE(auto uri, ParseFileSystemUri(uri_string)); - const auto scheme = uri.scheme(); - std::string path; - if (scheme == "file") { - RETURN_NOT_OK(CheckFileSystem(filesystem, "local", uri_string)); - RETURN_NOT_OK(LocalFileSystemOptions::FromUri(uri, &path)); - } else if (scheme == "gs" || scheme == "gcs") { - RETURN_NOT_OK(CheckFileSystem(filesystem, "gcs", uri_string)); -#ifdef ARROW_GCS - RETURN_NOT_OK(GcsOptions::FromUri(uri, &path)); -#else - return Status::NotImplemented("Got GCS URI but Arrow compiled without GCS support"); -#endif - } else if (scheme == "hdfs" || scheme == "viewfs") { - RETURN_NOT_OK(CheckFileSystem(filesystem, "hdfs", uri_string)); - path = uri.path(); - } else if (scheme == "s3") { - RETURN_NOT_OK(CheckFileSystem(filesystem, "s3", uri_string)); -#ifdef ARROW_S3 - RETURN_NOT_OK(S3Options::FromUri(uri, &path)); -#else - return Status::NotImplemented("Got S3 URI but Arrow compiled without S3 support"); -#endif - } else if (scheme == "mock") { - RETURN_NOT_OK(CheckFileSystem(filesystem, "mock", uri_string)); - path = std::string(RemoveLeadingSlash(uri.path())); - } else { - return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); - } - return path; -} - Result> FileSystemFromUri(const std::string& uri_string, std::string* out_path) { return FileSystemFromUri(uri_string, io::default_io_context(), out_path); diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 86405992a6f..f0bc3277a5f 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -171,6 +171,23 @@ class ARROW_EXPORT FileSystem : public std::enable_shared_from_this /// may allow normalizing irregular path forms (such as Windows local paths). virtual Result NormalizePath(std::string path); + /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the + /// path + /// + /// \param uri_string A URI representing a resource in the given filesystem. + /// + /// This method will check to ensure the given filesystem is compatible with the + /// URI. This can be useful when the user provides both a URI and a filesystem or + /// when a user provides multiple URIs that should be compatible with the same + /// filesystem. + /// + /// uri can be an absolute path instead of a URI. In that case it will ensure the + /// filesystem (if supplied) is the local filesystem (or some custom filesystem that + /// is capable of reading local paths) and will normalize the path's file separators. + /// + /// \return The path inside the filesystem that is indicated by the URI. + virtual Result PathFromUri(const std::string& uri_string) const; + virtual bool Equals(const FileSystem& other) const = 0; virtual bool Equals(const std::shared_ptr& other) const { @@ -336,6 +353,7 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { std::shared_ptr base_fs() const { return base_fs_; } Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; bool Equals(const FileSystem& other) const override; @@ -410,6 +428,7 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem { std::string type_name() const override { return "slow"; } bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; using FileSystem::GetFileInfo; Result GetFileInfo(const std::string& path) override; @@ -476,25 +495,6 @@ Result> FileSystemFromUri(const std::string& uri, const io::IOContext& io_context, std::string* out_path = NULLPTR); -/// \brief Ensure a URI (or path) is compatible with the given filesystem and return the -/// path -/// -/// \param filesystem A filesystem that should be capable of fetching the URI -/// \param uri A URI representing a resource in the given filesystem. -/// -/// This method will check to ensure the given filesystem is compatible with the -/// URI. If that behavior is not desired then NULLPTR can be specified for the -/// filesystem and that check will be skipped. -/// -/// uri can be an absolute path instead of a URI. In that case it will ensure the -/// filesystem (if supplied) is the local filesystem and will normalize the path's -/// file separators. -/// -/// \return The path inside the filesystem that is indicated by the URI. -ARROW_EXPORT -Result PathFromUriOrPath(const FileSystem* filesystem, - const std::string& uri); - /// \brief Create a new FileSystem by URI /// /// Same as FileSystemFromUri, but in addition also recognize non-URIs diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index f063e31b5c5..ffe04f626fa 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -873,6 +873,31 @@ bool GcsFileSystem::Equals(const FileSystem& other) const { return impl_->options().Equals(fs.impl_->options()); } +Result GcsFileSystem::PathFromUri(const std::string& uri_string) const { + if (internal::DetectAbsolutePath(uri_string)) { + return Status::Invalid( + "The GCS filesystem is not capable of loading local paths. URIs must start " + "with gs:// or gcs://"); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (uri.scheme() != "gs" && uri.scheme() != "gcs") { + return Status::Invalid("GCS URIs must start with gs:// or gcs:// but received ", + uri_string); + } + std::string path; + ARROW_ASSIGN_OR_RAISE(GcsOptions parsed_options, GcsOptions::FromUri(uri, &path)); + const GcsOptions& existing_options = impl_->options(); + if (parsed_options.endpoint_override != existing_options.endpoint_override) { + return Status::Invalid("Provided URI specified endpoint '", + parsed_options.endpoint_override, + "' but existing filesystem is configured for endpoint '", + existing_options.endpoint_override, "'"); + } + return path; +} + Result GcsFileSystem::GetFileInfo(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); return impl_->GetFileInfo(p); diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index c3d03b5cb21..d4b919ec814 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -178,6 +178,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { const GcsOptions& options() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; Result GetFileInfo(const std::string& path) override; Result GetFileInfo(const FileSelector& select) override; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index fe3f7e724c5..e09d9c366bb 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -54,6 +54,7 @@ #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/test_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/util/future.h" #include "arrow/util/key_value_metadata.h" @@ -1389,12 +1390,22 @@ TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { FileSystemFromUri(std::string("gs://anonymous@") + PreexistingBucketPath(), &path)); EXPECT_EQ(fs->type_name(), "gcs"); EXPECT_EQ(path, PreexistingBucketName()); - ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(fs.get(), std::string("gs://anonymous@") + - PreexistingBucketPath())); + ASSERT_OK_AND_ASSIGN( + path, fs->PathFromUri(std::string("gs://anonymous@") + PreexistingBucketPath())); EXPECT_EQ(path, PreexistingBucketName()); ASSERT_OK_AND_ASSIGN(auto fs2, FileSystemFromUri(std::string("gcs://anonymous@") + PreexistingBucketPath())); EXPECT_EQ(fs2->type_name(), "gcs"); + ASSERT_THAT(fs->PathFromUri("/foo/bar"), + Raises(StatusCode::Invalid, testing::HasSubstr("URIs must start with"))); + ASSERT_THAT( + fs->PathFromUri("s3:///foo/bar"), + Raises(StatusCode::Invalid, testing::HasSubstr("GCS URIs must start with"))); + ASSERT_THAT( + fs->PathFromUri(std::string("gs://anonymous@") + PreexistingBucketPath() + + "?endpoint_override=foo"), + Raises(StatusCode::Invalid, + testing::HasSubstr("but existing filesystem is configured for endpoint"))); } } // namespace diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 8709ab45625..0094de30f96 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -473,6 +473,22 @@ bool HadoopFileSystem::Equals(const FileSystem& other) const { return options().Equals(hdfs.options()); } +Result HadoopFileSystem::PathFromUri(const std::string& uri_string) const { + if (internal::DetectAbsolutePath(uri_string)) { + return Status::Invalid( + "The HDFS filesystem is not capable of loading local paths. URIs must start " + "with hdfs:// or viewfs://"); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (uri.scheme() != "hdfs" && uri.scheme() != "viewfs") { + return Status::Invalid("HDFS URIs must start with hdfs:// or viewfs:// but received ", + uri_string); + } + return uri.path(); +} + Result> HadoopFileSystem::GetFileInfo(const FileSelector& select) { return impl_->GetFileInfo(select); } diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index bed0ac4c613..798aac0ea90 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -66,6 +66,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { std::string type_name() const override { return "hdfs"; } HdfsOptions options() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; /// \cond FALSE using FileSystem::GetFileInfo; diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test.cc index 30703e84db8..7ad9e6cd40e 100644 --- a/cpp/src/arrow/filesystem/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test.cc @@ -119,7 +119,7 @@ class TestHadoopFileSystem : public ::testing::Test, public HadoopFileSystemTest ARROW_LOG(INFO) << "!!! uri = " << ss.str(); ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "/"); - ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(uri_fs.get(), ss.str())); + ASSERT_OK_AND_ASSIGN(path, uri_fs->PathFromUri(ss.str())); ASSERT_EQ(path, "/"); // Sanity check diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 03b4ad3bc72..93815f7fe24 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -52,37 +52,6 @@ using ::arrow::internal::IOErrorFromWinError; using ::arrow::internal::NativePathString; using ::arrow::internal::PlatformFilename; -namespace internal { - -#ifdef _WIN32 -static bool IsDriveLetter(char c) { - // Can't use locale-dependent functions from the C/C++ stdlib - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); -} -#endif - -bool DetectAbsolutePath(const std::string& s) { - // Is it a /-prefixed local path? - if (s.length() >= 1 && s[0] == '/') { - return true; - } -#ifdef _WIN32 - // Is it a \-prefixed local path? - if (s.length() >= 1 && s[0] == '\\') { - return true; - } - // Does it start with a drive letter in addition to being /- or \-prefixed, - // e.g. "C:\..."? - if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && - IsDriveLetter(s[0])) { - return true; - } -#endif - return false; -} - -} // namespace internal - namespace { Status ValidatePath(std::string_view s) { @@ -92,6 +61,12 @@ Status ValidatePath(std::string_view s) { return Status::OK(); } +Result DoNormalizePath(std::string path) { + RETURN_NOT_OK(ValidatePath(path)); + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + return fn.ToString(); +} + #ifdef _WIN32 std::string NativeToString(const NativePathString& ns) { @@ -286,9 +261,23 @@ LocalFileSystem::LocalFileSystem(const LocalFileSystemOptions& options, LocalFileSystem::~LocalFileSystem() {} Result LocalFileSystem::NormalizePath(std::string path) { - RETURN_NOT_OK(ValidatePath(path)); - ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); - return fn.ToString(); + return DoNormalizePath(std::move(path)); +} + +Result LocalFileSystem::PathFromUri(const std::string& uri_string) const { + if (internal::DetectAbsolutePath(uri_string)) { + return DoNormalizePath(uri_string); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (scheme != "file") { + return Status::Invalid("Local URIs must begin with file:// but received ", + uri_string); + } + std::string path; + RETURN_NOT_OK(LocalFileSystemOptions::FromUri(uri, &path)); + return path; } bool LocalFileSystem::Equals(const FileSystem& other) const { diff --git a/cpp/src/arrow/filesystem/localfs.h b/cpp/src/arrow/filesystem/localfs.h index 75eaf314e4d..108530c2b27 100644 --- a/cpp/src/arrow/filesystem/localfs.h +++ b/cpp/src/arrow/filesystem/localfs.h @@ -82,6 +82,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem { std::string type_name() const override { return "local"; } Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; bool Equals(const FileSystem& other) const override; @@ -121,13 +122,5 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem { LocalFileSystemOptions options_; }; -namespace internal { - -// Return whether the string is detected as a local absolute path. -ARROW_EXPORT -bool DetectAbsolutePath(const std::string& s); - -} // namespace internal - } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 6b51ef8188f..7121df833c5 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -31,6 +31,7 @@ #include "arrow/filesystem/test_util.h" #include "arrow/filesystem/util_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/util/io_util.h" #include "arrow/util/uri.h" @@ -208,26 +209,17 @@ class TestLocalFS : public LocalFSTestMixin { ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path)); ASSERT_EQ(fs_->type_name(), "local"); ASSERT_EQ(path, expected_path); - } - - void CheckLocalPathFromUriOrPath(const std::string& uri, - const std::string& expected_path) { - ASSERT_OK_AND_ASSIGN(std::string actual_path, PathFromUriOrPath(nullptr, uri)); - ASSERT_EQ(actual_path, expected_path); - LocalFileSystem lfs; - ASSERT_OK_AND_ASSIGN(actual_path, PathFromUriOrPath(&lfs, uri)); - ASSERT_EQ(actual_path, expected_path); + ASSERT_OK_AND_ASSIGN(path, fs_->PathFromUri(uri)); + ASSERT_EQ(path, expected_path); } // Like TestFileSystemFromUri, but with an arbitrary non-existing path void TestLocalUri(const std::string& uri, const std::string& expected_path) { CheckLocalUri(uri, expected_path, FSFromUri); - CheckLocalPathFromUriOrPath(uri, expected_path); } void TestLocalUriOrPath(const std::string& uri, const std::string& expected_path) { CheckLocalUri(uri, expected_path, FSFromUriOrPath); - CheckLocalPathFromUriOrPath(uri, expected_path); } void TestInvalidUri(const std::string& uri) { @@ -243,13 +235,15 @@ class TestLocalFS : public LocalFSTestMixin { } ASSERT_RAISES(Invalid, FileSystemFromUriOrPath(uri)); LocalFileSystem lfs; - ASSERT_RAISES(Invalid, PathFromUriOrPath(&lfs, uri)); + ASSERT_RAISES(Invalid, lfs.PathFromUri(uri)); } void TestNonMatchingUri(const std::string& uri) { // Legitimate URI for the wrong filesystem LocalFileSystem lfs; - ASSERT_RAISES(Invalid, PathFromUriOrPath(&lfs, uri)); + ASSERT_THAT( + lfs.PathFromUri(uri), + Raises(StatusCode::Invalid, testing::HasSubstr("must begin with file://"))); } void CheckConcreteFile(const std::string& path, int64_t expected_size) { diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index 3bc6f4464eb..9cd775558d3 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -436,6 +436,19 @@ MockFileSystem::MockFileSystem(TimePoint current_time, const io::IOContext& io_c bool MockFileSystem::Equals(const FileSystem& other) const { return this == &other; } +Result MockFileSystem::PathFromUri(const std::string& uri_string) const { + if (internal::DetectAbsolutePath(uri_string)) { + return std::string(RemoveLeadingSlash(uri_string)); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (uri.scheme() != "mock") { + return Status::Invalid("Mock URIs must start with mock:// but received ", uri_string); + } + return std::string(RemoveLeadingSlash(uri.path())); +} + Status MockFileSystem::CreateDir(const std::string& path, bool recursive) { RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); diff --git a/cpp/src/arrow/filesystem/mockfs.h b/cpp/src/arrow/filesystem/mockfs.h index e12408f52ce..32d06e5910d 100644 --- a/cpp/src/arrow/filesystem/mockfs.h +++ b/cpp/src/arrow/filesystem/mockfs.h @@ -66,6 +66,7 @@ class ARROW_EXPORT MockFileSystem : public FileSystem { std::string type_name() const override { return "mock"; } bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; // XXX It's not very practical to have to explicitly declare inheritance // of default overrides. diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index a22d9c10bec..2a18fcbd813 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2235,6 +2235,35 @@ bool S3FileSystem::Equals(const FileSystem& other) const { return options().Equals(s3fs.options()); } +Result S3FileSystem::PathFromUri(const std::string& uri_string) const { + if (internal::DetectAbsolutePath(uri_string)) { + return Status::Invalid( + "The S3 filesystem is not capable of loading local paths. URIs must start " + "with s3://"); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (uri.scheme() != "s3") { + return Status::Invalid("S3 URIs must start with s3:// but received ", uri_string); + } + std::string path; + ARROW_ASSIGN_OR_RAISE(S3Options parsed_options, S3Options::FromUri(uri, &path)); + const S3Options& existing_options = impl_->options(); + if (parsed_options.endpoint_override != existing_options.endpoint_override) { + return Status::Invalid("Provided URI specified endpoint '", + parsed_options.endpoint_override, + "' but existing filesystem is configured for endpoint '", + existing_options.endpoint_override, "'"); + } + if (parsed_options.region != existing_options.region) { + return Status::Invalid("Provided URI specified region '", parsed_options.region, + "' but existing filesystem is configured for region '", + existing_options.region, "'"); + } + return path; +} + S3Options S3FileSystem::options() const { return impl_->options(); } std::string S3FileSystem::region() const { return impl_->region(); } diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 2bccecafe8e..0a7ca73ccb2 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -247,6 +247,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { std::string region() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; /// \cond FALSE using FileSystem::GetFileInfo; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 88656fd6967..7210e9602f1 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -58,6 +58,7 @@ #include "arrow/status.h" #include "arrow/testing/future_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/util/async_generator.h" #include "arrow/util/checked_cast.h" @@ -1143,9 +1144,30 @@ TEST_F(TestS3FS, FileSystemFromUri) { ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); - ASSERT_OK_AND_ASSIGN(path, PathFromUriOrPath(fs.get(), ss.str())); + ASSERT_OK_AND_ASSIGN(path, fs->PathFromUri(ss.str())); ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); + // Incorrect scheme + ASSERT_THAT(fs->PathFromUri("file:///@bucket/somedir/subdir/subfile"), + Raises(StatusCode::Invalid, testing::HasSubstr("must start with s3://"))); + + // Not a URI + ASSERT_THAT(fs->PathFromUri("/@bucket/somedir/subdir/subfile"), + Raises(StatusCode::Invalid, testing::HasSubstr("must start with s3://"))); + + // Correct scheme, wrong endpoint + ASSERT_THAT( + fs->PathFromUri("s3://@bucket/somedir/subdir/subfile"), + Raises(StatusCode::Invalid, + testing::HasSubstr("but existing filesystem is configured for endpoint"))); + + // Correct scheme & endpoint, wrong region + ss << "®ion=us-west-2"; + ASSERT_THAT( + fs->PathFromUri(ss.str()), + Raises(StatusCode::Invalid, + testing::HasSubstr("but existing filesystem is configured for region"))); + // Check the filesystem has the right connection parameters AssertFileInfo(fs.get(), path, FileType::File, 8); } diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 79e8503818c..49944def996 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -28,6 +28,7 @@ namespace arrow { using internal::StatusDetailFromErrno; +using internal::Uri; namespace fs { namespace internal { @@ -76,6 +77,50 @@ Status InvalidDeleteDirContents(std::string_view path) { "If you wish to delete the root directory's contents, call DeleteRootDirContents."); } +Result ParseFileSystemUri(const std::string& uri_string) { + Uri uri; + auto status = uri.Parse(uri_string); + if (!status.ok()) { +#ifdef _WIN32 + // Could be a "file:..." URI with backslashes instead of regular slashes. + RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string))); + if (uri.scheme() != "file") { + return status; + } +#else + return status; +#endif + } + return std::move(uri); +} + +#ifdef _WIN32 +static bool IsDriveLetter(char c) { + // Can't use locale-dependent functions from the C/C++ stdlib + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} +#endif + +bool DetectAbsolutePath(const std::string& s) { + // Is it a /-prefixed local path? + if (s.length() >= 1 && s[0] == '/') { + return true; + } +#ifdef _WIN32 + // Is it a \-prefixed local path? + if (s.length() >= 1 && s[0] == '\\') { + return true; + } + // Does it start with a drive letter in addition to being /- or \-prefixed, + // e.g. "C:\..."? + if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && + IsDriveLetter(s[0])) { + return true; + } +#endif + return false; +} + Result GlobFiles(const std::shared_ptr& filesystem, const std::string& glob) { // TODO: ARROW-17640 diff --git a/cpp/src/arrow/filesystem/util_internal.h b/cpp/src/arrow/filesystem/util_internal.h index cc16dbba106..c5c4b7b50e1 100644 --- a/cpp/src/arrow/filesystem/util_internal.h +++ b/cpp/src/arrow/filesystem/util_internal.h @@ -24,9 +24,11 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" +#include "arrow/util/uri.h" #include "arrow/util/visibility.h" namespace arrow { +using internal::Uri; namespace fs { namespace internal { @@ -50,6 +52,17 @@ Status NotAFile(std::string_view path); ARROW_EXPORT Status InvalidDeleteDirContents(std::string_view path); +/// \brief Parse the string as a URI +/// \param uri_string the string to parse +/// +/// This is the same as Uri::Parse except it tolerates Windows +/// file URIs that contain backslash instead of / +Result ParseFileSystemUri(const std::string& uri_string); + +/// \brief check if the string is a local absolute path +ARROW_EXPORT +bool DetectAbsolutePath(const std::string& s); + /// \brief Return files matching the glob pattern on the filesystem /// /// Globbing starts from the root of the filesystem. From 2f2024a5bf0949e327d45e18925b51c69e527940 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 2 May 2023 10:28:07 -0700 Subject: [PATCH 4/8] Simplified the URI parsing logic by combining into one helper method --- cpp/src/arrow/filesystem/filesystem.h | 7 ++- cpp/src/arrow/filesystem/gcsfs.cc | 25 ++------- cpp/src/arrow/filesystem/gcsfs_test.cc | 8 +-- cpp/src/arrow/filesystem/hdfs.cc | 16 ++---- cpp/src/arrow/filesystem/localfs.cc | 20 +++---- cpp/src/arrow/filesystem/localfs_test.cc | 13 +++-- cpp/src/arrow/filesystem/mockfs.cc | 15 ++---- cpp/src/arrow/filesystem/s3fs.cc | 28 +--------- cpp/src/arrow/filesystem/s3fs_test.cc | 18 ++----- cpp/src/arrow/filesystem/util_internal.cc | 63 +++++++++++++++++++++++ cpp/src/arrow/filesystem/util_internal.h | 23 +++++++++ 11 files changed, 124 insertions(+), 112 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index f0bc3277a5f..cfadaeb0ce1 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -181,10 +181,13 @@ class ARROW_EXPORT FileSystem : public std::enable_shared_from_this /// when a user provides multiple URIs that should be compatible with the same /// filesystem. /// - /// uri can be an absolute path instead of a URI. In that case it will ensure the - /// filesystem (if supplied) is the local filesystem (or some custom filesystem that + /// uri_string can be an absolute path instead of a URI. In that case it will ensure + /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that /// is capable of reading local paths) and will normalize the path's file separators. /// + /// Note, this method only checks to ensure the URI scheme is valid. It will not detect + /// inconsistencies like a mismatching region or endpoint override. + /// /// \return The path inside the filesystem that is indicated by the URI. virtual Result PathFromUri(const std::string& uri_string) const; diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index ffe04f626fa..6fc75589b0a 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -874,28 +874,9 @@ bool GcsFileSystem::Equals(const FileSystem& other) const { } Result GcsFileSystem::PathFromUri(const std::string& uri_string) const { - if (internal::DetectAbsolutePath(uri_string)) { - return Status::Invalid( - "The GCS filesystem is not capable of loading local paths. URIs must start " - "with gs:// or gcs://"); - } - Uri uri; - ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); - const auto scheme = uri.scheme(); - if (uri.scheme() != "gs" && uri.scheme() != "gcs") { - return Status::Invalid("GCS URIs must start with gs:// or gcs:// but received ", - uri_string); - } - std::string path; - ARROW_ASSIGN_OR_RAISE(GcsOptions parsed_options, GcsOptions::FromUri(uri, &path)); - const GcsOptions& existing_options = impl_->options(); - if (parsed_options.endpoint_override != existing_options.endpoint_override) { - return Status::Invalid("Provided URI specified endpoint '", - parsed_options.endpoint_override, - "' but existing filesystem is configured for endpoint '", - existing_options.endpoint_override, "'"); - } - return path; + return internal::PathFromUriHelper(uri_string, {"gs", "gcs"}, + /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kPrepend); } Result GcsFileSystem::GetFileInfo(const std::string& path) { diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index e09d9c366bb..9d5136b47c9 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -1397,15 +1397,11 @@ TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { PreexistingBucketPath())); EXPECT_EQ(fs2->type_name(), "gcs"); ASSERT_THAT(fs->PathFromUri("/foo/bar"), - Raises(StatusCode::Invalid, testing::HasSubstr("URIs must start with"))); + Raises(StatusCode::Invalid, testing::HasSubstr("Expected a URI"))); ASSERT_THAT( fs->PathFromUri("s3:///foo/bar"), - Raises(StatusCode::Invalid, testing::HasSubstr("GCS URIs must start with"))); - ASSERT_THAT( - fs->PathFromUri(std::string("gs://anonymous@") + PreexistingBucketPath() + - "?endpoint_override=foo"), Raises(StatusCode::Invalid, - testing::HasSubstr("but existing filesystem is configured for endpoint"))); + testing::HasSubstr("expected a URI with one of the schemes (gs, gcs)"))); } } // namespace diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 0094de30f96..b227aae65d9 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -474,19 +474,9 @@ bool HadoopFileSystem::Equals(const FileSystem& other) const { } Result HadoopFileSystem::PathFromUri(const std::string& uri_string) const { - if (internal::DetectAbsolutePath(uri_string)) { - return Status::Invalid( - "The HDFS filesystem is not capable of loading local paths. URIs must start " - "with hdfs:// or viewfs://"); - } - Uri uri; - ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); - const auto scheme = uri.scheme(); - if (uri.scheme() != "hdfs" && uri.scheme() != "viewfs") { - return Status::Invalid("HDFS URIs must start with hdfs:// or viewfs:// but received ", - uri_string); - } - return uri.path(); + return internal::PathFromUriHelper(uri_string, {"hdfs", "viewfs"}, + /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kIgnore); } Result> HadoopFileSystem::GetFileInfo(const FileSelector& select) { diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 93815f7fe24..0bb9187d498 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -265,19 +265,13 @@ Result LocalFileSystem::NormalizePath(std::string path) { } Result LocalFileSystem::PathFromUri(const std::string& uri_string) const { - if (internal::DetectAbsolutePath(uri_string)) { - return DoNormalizePath(uri_string); - } - Uri uri; - ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); - const auto scheme = uri.scheme(); - if (scheme != "file") { - return Status::Invalid("Local URIs must begin with file:// but received ", - uri_string); - } - std::string path; - RETURN_NOT_OK(LocalFileSystemOptions::FromUri(uri, &path)); - return path; +#ifdef _WIN32 + auto authority_handling = internal::AuthorityHandlingBehavior::kWindows; +#else + auto authority_handling = internal::AuthorityHandlingBehavior::kDisallow; +#endif + return internal::PathFromUriHelper(uri_string, {"file"}, /*accept_local_paths=*/true, + authority_handling); } bool LocalFileSystem::Equals(const FileSystem& other) const { diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 7121df833c5..035ab435b50 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -223,6 +223,7 @@ class TestLocalFS : public LocalFSTestMixin { } void TestInvalidUri(const std::string& uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } @@ -230,6 +231,7 @@ class TestLocalFS : public LocalFSTestMixin { } void TestInvalidUriOrPath(const std::string& uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } @@ -238,12 +240,11 @@ class TestLocalFS : public LocalFSTestMixin { ASSERT_RAISES(Invalid, lfs.PathFromUri(uri)); } - void TestNonMatchingUri(const std::string& uri) { + void TestInvalidPathFromUri(const std::string& uri, const std::string& expected_err) { // Legitimate URI for the wrong filesystem LocalFileSystem lfs; - ASSERT_THAT( - lfs.PathFromUri(uri), - Raises(StatusCode::Invalid, testing::HasSubstr("must begin with file://"))); + ASSERT_THAT(lfs.PathFromUri(uri), + Raises(StatusCode::Invalid, testing::HasSubstr(expected_err))); } void CheckConcreteFile(const std::string& path, int64_t expected_size) { @@ -333,6 +334,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriFile) { "//some server/some share/some path"); #else this->TestInvalidUri("file://server/share/foo/bar"); + this->TestInvalidUriOrPath("file://server/share/foo/bar"); #endif // Relative paths @@ -374,7 +376,8 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoSchemeBackslashes) { } TYPED_TEST(TestLocalFS, MismatchedFilesystemPathFromUri) { - this->TestNonMatchingUri("s3://foo"); + this->TestInvalidPathFromUri("s3://foo", + "expected a URI with one of the schemes (file)"); } TYPED_TEST(TestLocalFS, DirectoryMTime) { diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index 9cd775558d3..8eff8ecc2f8 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -437,16 +437,11 @@ MockFileSystem::MockFileSystem(TimePoint current_time, const io::IOContext& io_c bool MockFileSystem::Equals(const FileSystem& other) const { return this == &other; } Result MockFileSystem::PathFromUri(const std::string& uri_string) const { - if (internal::DetectAbsolutePath(uri_string)) { - return std::string(RemoveLeadingSlash(uri_string)); - } - Uri uri; - ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); - const auto scheme = uri.scheme(); - if (uri.scheme() != "mock") { - return Status::Invalid("Mock URIs must start with mock:// but received ", uri_string); - } - return std::string(RemoveLeadingSlash(uri.path())); + ARROW_ASSIGN_OR_RAISE( + std::string parsed_path, + internal::PathFromUriHelper(uri_string, {"mock"}, /*accept_local_paths=*/true, + internal::AuthorityHandlingBehavior::kDisallow)); + return std::string(internal::RemoveLeadingSlash(parsed_path)); } Status MockFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 2a18fcbd813..dc033b99580 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2236,32 +2236,8 @@ bool S3FileSystem::Equals(const FileSystem& other) const { } Result S3FileSystem::PathFromUri(const std::string& uri_string) const { - if (internal::DetectAbsolutePath(uri_string)) { - return Status::Invalid( - "The S3 filesystem is not capable of loading local paths. URIs must start " - "with s3://"); - } - Uri uri; - ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); - const auto scheme = uri.scheme(); - if (uri.scheme() != "s3") { - return Status::Invalid("S3 URIs must start with s3:// but received ", uri_string); - } - std::string path; - ARROW_ASSIGN_OR_RAISE(S3Options parsed_options, S3Options::FromUri(uri, &path)); - const S3Options& existing_options = impl_->options(); - if (parsed_options.endpoint_override != existing_options.endpoint_override) { - return Status::Invalid("Provided URI specified endpoint '", - parsed_options.endpoint_override, - "' but existing filesystem is configured for endpoint '", - existing_options.endpoint_override, "'"); - } - if (parsed_options.region != existing_options.region) { - return Status::Invalid("Provided URI specified region '", parsed_options.region, - "' but existing filesystem is configured for region '", - existing_options.region, "'"); - } - return path; + return internal::PathFromUriHelper(uri_string, {"s3"}, /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kPrepend); } S3Options S3FileSystem::options() const { return impl_->options(); } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 7210e9602f1..5b0287d9971 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -1149,24 +1149,12 @@ TEST_F(TestS3FS, FileSystemFromUri) { // Incorrect scheme ASSERT_THAT(fs->PathFromUri("file:///@bucket/somedir/subdir/subfile"), - Raises(StatusCode::Invalid, testing::HasSubstr("must start with s3://"))); + Raises(StatusCode::Invalid, + testing::HasSubstr("expected a URI with one of the schemes (s3)"))); // Not a URI ASSERT_THAT(fs->PathFromUri("/@bucket/somedir/subdir/subfile"), - Raises(StatusCode::Invalid, testing::HasSubstr("must start with s3://"))); - - // Correct scheme, wrong endpoint - ASSERT_THAT( - fs->PathFromUri("s3://@bucket/somedir/subdir/subfile"), - Raises(StatusCode::Invalid, - testing::HasSubstr("but existing filesystem is configured for endpoint"))); - - // Correct scheme & endpoint, wrong region - ss << "®ion=us-west-2"; - ASSERT_THAT( - fs->PathFromUri(ss.str()), - Raises(StatusCode::Invalid, - testing::HasSubstr("but existing filesystem is configured for region"))); + Raises(StatusCode::Invalid, testing::HasSubstr("Expected a URI"))); // Check the filesystem has the right connection parameters AssertFileInfo(fs.get(), path, FileType::File, 8); diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 49944def996..93fe9fd40ae 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -24,6 +24,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/io_util.h" +#include "arrow/util/string.h" namespace arrow { @@ -121,6 +122,68 @@ bool DetectAbsolutePath(const std::string& s) { return false; } +Result PathFromUriHelper(const std::string& uri_string, + std::vector supported_schemes, + bool accept_local_paths, + AuthorityHandlingBehavior authority_handling) { + if (internal::DetectAbsolutePath(uri_string)) { + if (accept_local_paths) { + return uri_string; + } + return Status::Invalid( + "The filesystem is not capable of loading local paths. Expected a URI but " + "received ", + uri_string); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (std::find(supported_schemes.begin(), supported_schemes.end(), scheme) == + supported_schemes.end()) { + std::string expected_schemes = + ::arrow::internal::JoinStrings(supported_schemes, ", "); + return Status::Invalid("The filesystem expected a URI with one of the schemes (", + expected_schemes, ") but received ", uri_string); + } + std::string host = uri.host(); + std::string path = uri.path(); + if (host.empty()) { + // Just a path, may be absolute or relative, only allow relative paths if local + if (path[0] == '/') { + return std::string(internal::RemoveTrailingSlash(path)); + } + if (accept_local_paths) { + return std::string(internal::RemoveTrailingSlash(path)); + } + return Status::Invalid("The filesystem does not support relative paths. Received ", + uri_string); + } + if (authority_handling == AuthorityHandlingBehavior::kDisallow) { + return Status::Invalid( + "The filesystem does not support the authority (host) component of a URI. " + "Received ", + uri_string); + } + if (path[0] != '/') { + // This should not be possible + return Status::Invalid( + "The provided URI has a host component but a relative path which is not " + "supported. " + "Received ", + uri_string); + } + switch (authority_handling) { + case AuthorityHandlingBehavior::kPrepend: + return std::string(internal::RemoveTrailingSlash(host + path)); + case AuthorityHandlingBehavior::kWindows: + return std::string(internal::RemoveTrailingSlash("//" + host + path)); + case AuthorityHandlingBehavior::kIgnore: + return std::string(internal::RemoveTrailingSlash(path)); + default: + return Status::Invalid("Unrecognized authority_handling value"); + } +} + Result GlobFiles(const std::shared_ptr& filesystem, const std::string& glob) { // TODO: ARROW-17640 diff --git a/cpp/src/arrow/filesystem/util_internal.h b/cpp/src/arrow/filesystem/util_internal.h index c5c4b7b50e1..29a51512d0a 100644 --- a/cpp/src/arrow/filesystem/util_internal.h +++ b/cpp/src/arrow/filesystem/util_internal.h @@ -63,6 +63,29 @@ Result ParseFileSystemUri(const std::string& uri_string); ARROW_EXPORT bool DetectAbsolutePath(const std::string& s); +/// \brief describes how to handle the authority (host) component of the URI +enum class AuthorityHandlingBehavior { + // Return an invalid status if the authority is non-empty + kDisallow = 0, + // Prepend the authority to the path (e.g. authority/some/path) + kPrepend = 1, + // Convert to a Windows style network path (e.g. //authority/some/path) + kWindows = 2, + // Ignore the authority and just use the path + kIgnore = 3 +}; + +/// \brief check to see if uri_string matches one of the supported schemes and return the +/// path component +/// \param uri_string a uri or local path to test and convert +/// \param supported_schemes the set of URI schemes that should be accepted +/// \param accept_local_paths if true, allow an absolute path +/// \return the path portion of the URI +Result PathFromUriHelper(const std::string& uri_string, + std::vector supported_schemes, + bool accept_local_paths, + AuthorityHandlingBehavior authority_handling); + /// \brief Return files matching the glob pattern on the filesystem /// /// Globbing starts from the root of the filesystem. From 2f5673da12eeac41263cbff0b7d0fcd4dbe5ca99 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 2 May 2023 15:04:08 -0700 Subject: [PATCH 5/8] Make sure to normalize slashes if the user passes in a path and not a URI --- cpp/src/arrow/filesystem/localfs_test.cc | 1 + cpp/src/arrow/filesystem/util_internal.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 035ab435b50..abf0db91807 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -202,6 +202,7 @@ class TestLocalFS : public LocalFSTestMixin { template void CheckLocalUri(const std::string& uri, const std::string& expected_path, FileSystemFromUriFunc&& fs_from_uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 93fe9fd40ae..50a953ce8e1 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -128,7 +128,8 @@ Result PathFromUriHelper(const std::string& uri_string, AuthorityHandlingBehavior authority_handling) { if (internal::DetectAbsolutePath(uri_string)) { if (accept_local_paths) { - return uri_string; + // Normalize the path and remove any trailing slash + return std::string(internal::RemoveTrailingSlash(ToSlashes(uri_string))); } return Status::Invalid( "The filesystem is not capable of loading local paths. Expected a URI but " From 7fe9d2b17fe351c8974c49bcfd3c39c8297e9fb4 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Tue, 2 May 2023 19:01:16 -0700 Subject: [PATCH 6/8] Change behavior to always remove trailing slash --- cpp/src/arrow/filesystem/filesystem.cc | 2 +- cpp/src/arrow/filesystem/localfs.cc | 4 ++-- cpp/src/arrow/filesystem/localfs_test.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 6e23a176734..ebc740dcbf2 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -759,7 +759,7 @@ Result> FileSystemFromUriOrPath( if (internal::DetectAbsolutePath(uri_string)) { // Normalize path separators if (out_path != nullptr) { - *out_path = ToSlashes(uri_string); + *out_path = std::string(RemoveTrailingSlash(ToSlashes(uri_string))); } return std::make_shared(); } diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 0bb9187d498..5387ad1054f 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -238,13 +238,13 @@ Result LocalFileSystemOptions::FromUri( #ifdef _WIN32 std::stringstream ss; ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path()); - *out_path = ss.str(); + *out_path = std::string(internal::RemoveTrailingSlash(ss.str())); #else return Status::Invalid("Unsupported hostname in non-Windows local URI: '", uri.ToString(), "'"); #endif } else { - *out_path = uri.path(); + *out_path = std::string(internal::RemoveTrailingSlash(uri.path())); } // TODO handle use_mmap option diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index abf0db91807..5cc963cea88 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -184,7 +184,7 @@ class TestLocalFS : public LocalFSTestMixin { } std::string path; ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path)); - ASSERT_EQ(path, local_path_); + ASSERT_EQ(path, std::string(RemoveTrailingSlash(local_path_))); // Test that the right location on disk is accessed CreateFile(fs_.get(), local_path_ + "abc", "some data"); @@ -352,7 +352,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoScheme) { this->TestLocalUriOrPath(this->path_formatter_("/foo/bar"), "/foo/bar"); #ifdef _WIN32 - this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar/"); + this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar"); #endif // Relative paths From 275549a0baa7cbb8158d90be20517a21e892e57e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 3 May 2023 08:39:39 -0700 Subject: [PATCH 7/8] Add an option to RemoveTrailingSlash to keep / as / and then set that option to true everywhere it is used as part of PathFromUri --- cpp/src/arrow/filesystem/filesystem.cc | 3 ++- cpp/src/arrow/filesystem/localfs.cc | 6 ++++-- cpp/src/arrow/filesystem/localfs_test.cc | 1 + cpp/src/arrow/filesystem/path_util.cc | 6 +++++- cpp/src/arrow/filesystem/path_util.h | 5 ++++- cpp/src/arrow/filesystem/util_internal.cc | 3 ++- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index ebc740dcbf2..6296dd8d85d 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -759,7 +759,8 @@ Result> FileSystemFromUriOrPath( if (internal::DetectAbsolutePath(uri_string)) { // Normalize path separators if (out_path != nullptr) { - *out_path = std::string(RemoveTrailingSlash(ToSlashes(uri_string))); + *out_path = + std::string(RemoveTrailingSlash(ToSlashes(uri_string), /*preserve_root=*/true)); } return std::make_shared(); } diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 5387ad1054f..e030014159c 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -238,13 +238,15 @@ Result LocalFileSystemOptions::FromUri( #ifdef _WIN32 std::stringstream ss; ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path()); - *out_path = std::string(internal::RemoveTrailingSlash(ss.str())); + *out_path = + std::string(internal::RemoveTrailingSlash(ss.str(), /*preserve_root=*/true)); #else return Status::Invalid("Unsupported hostname in non-Windows local URI: '", uri.ToString(), "'"); #endif } else { - *out_path = std::string(internal::RemoveTrailingSlash(uri.path())); + *out_path = + std::string(internal::RemoveTrailingSlash(uri.path(), /*preserve_root=*/true)); } // TODO handle use_mmap option diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 5cc963cea88..7ce2a569686 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -350,6 +350,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoScheme) { // Variations this->TestLocalUriOrPath(this->path_formatter_("/foo/bar"), "/foo/bar"); + this->TestLocalUriOrPath(this->path_formatter_("/"), "/"); #ifdef _WIN32 this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar"); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index ba4892a0ac9..6b2ea115cc8 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -129,7 +129,11 @@ std::string EnsureLeadingSlash(std::string_view v) { return std::string(v); } } -std::string_view RemoveTrailingSlash(std::string_view key) { +std::string_view RemoveTrailingSlash(std::string_view key, bool preserve_root) { + if (preserve_root && key.size() == 1) { + // If the user gives us "/" then don't return "" + return key; + } while (!key.empty() && key.back() == kSep) { key.remove_suffix(1); } diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 059827fb0a9..b821e793384 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,8 +69,11 @@ std::string_view RemoveLeadingSlash(std::string_view s); ARROW_EXPORT std::string EnsureTrailingSlash(std::string_view s); +/// \brief remove the forward slash (if any) from the given path +/// \param s the input path +/// \param preserve_root if true, allow a path of just "/" to remain unchanged ARROW_EXPORT -std::string_view RemoveTrailingSlash(std::string_view s); +std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = false); ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 50a953ce8e1..a2f34fb1bb1 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -129,7 +129,8 @@ Result PathFromUriHelper(const std::string& uri_string, if (internal::DetectAbsolutePath(uri_string)) { if (accept_local_paths) { // Normalize the path and remove any trailing slash - return std::string(internal::RemoveTrailingSlash(ToSlashes(uri_string))); + return std::string( + internal::RemoveTrailingSlash(ToSlashes(uri_string), /*preserve_root=*/true)); } return Status::Invalid( "The filesystem is not capable of loading local paths. Expected a URI but " From c89e9cada3cdda85ebd831c94653a5f9262acbbd Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 3 May 2023 10:34:34 -0700 Subject: [PATCH 8/8] Add a check on Windows for remove trailing slash to handle C:/ style paths. --- cpp/src/arrow/filesystem/path_util.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 6b2ea115cc8..e25e544f034 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -134,6 +134,12 @@ std::string_view RemoveTrailingSlash(std::string_view key, bool preserve_root) { // If the user gives us "/" then don't return "" return key; } +#ifdef _WIN32 + if (preserve_root && key.size() == 3 && key[1] == ':' && key[0] != '/') { + // If the user gives us C:/ then don't return C: + return key; + } +#endif while (!key.empty() && key.back() == kSep) { key.remove_suffix(1); }