diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 73b94d38288..6296dd8d85d 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -60,6 +60,7 @@ using internal::ConcatAbstractPath; using internal::EnsureTrailingSlash; using internal::GetAbstractPathParent; using internal::kSep; +using internal::ParseFileSystemUri; using internal::RemoveLeadingSlash; using internal::RemoveTrailingSlash; using internal::ToSlashes; @@ -254,6 +255,10 @@ Result> FileSystem::OpenAppendStream( return OpenAppendStream(path, std::shared_ptr{}); } +Result FileSystem::PathFromUri(const std::string& uri_string) const { + return Status::NotImplemented("PathFromUri is not yet supported on this filesystem"); +} + ////////////////////////////////////////////////////////////////////////// // SubTreeFileSystem implementation @@ -484,6 +489,10 @@ Result> SubTreeFileSystem::OpenAppendStream( return base_fs_->OpenAppendStream(real_path, metadata); } +Result SubTreeFileSystem::PathFromUri(const std::string& uri_string) const { + return base_fs_->PathFromUri(uri_string); +} + ////////////////////////////////////////////////////////////////////////// // SlowFileSystem implementation @@ -505,6 +514,10 @@ SlowFileSystem::SlowFileSystem(std::shared_ptr base_fs, bool SlowFileSystem::Equals(const FileSystem& other) const { return this == &other; } +Result SlowFileSystem::PathFromUri(const std::string& uri_string) const { + return base_fs_->PathFromUri(uri_string); +} + Result SlowFileSystem::GetFileInfo(const std::string& path) { latencies_->Sleep(); return base_fs_->GetFileInfo(path); @@ -662,23 +675,6 @@ Status CopyFiles(const std::shared_ptr& source_fs, namespace { -Result ParseFileSystemUri(const std::string& uri_string) { - Uri uri; - auto status = uri.Parse(uri_string); - if (!status.ok()) { -#ifdef _WIN32 - // Could be a "file:..." URI with backslashes instead of regular slashes. - RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string))); - if (uri.scheme() != "file") { - return status; - } -#else - return status; -#endif - } - return std::move(uri); -} - Result> FileSystemFromUriReal(const Uri& uri, const std::string& uri_string, const io::IOContext& io_context, @@ -763,7 +759,8 @@ Result> FileSystemFromUriOrPath( if (internal::DetectAbsolutePath(uri_string)) { // Normalize path separators if (out_path != nullptr) { - *out_path = ToSlashes(uri_string); + *out_path = + std::string(RemoveTrailingSlash(ToSlashes(uri_string), /*preserve_root=*/true)); } return std::make_shared(); } diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 6dc18d7de84..cfadaeb0ce1 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -171,6 +171,26 @@ class ARROW_EXPORT FileSystem : public std::enable_shared_from_this /// may allow normalizing irregular path forms (such as Windows local paths). virtual Result NormalizePath(std::string path); + /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the + /// path + /// + /// \param uri_string A URI representing a resource in the given filesystem. + /// + /// This method will check to ensure the given filesystem is compatible with the + /// URI. This can be useful when the user provides both a URI and a filesystem or + /// when a user provides multiple URIs that should be compatible with the same + /// filesystem. + /// + /// uri_string can be an absolute path instead of a URI. In that case it will ensure + /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that + /// is capable of reading local paths) and will normalize the path's file separators. + /// + /// Note, this method only checks to ensure the URI scheme is valid. It will not detect + /// inconsistencies like a mismatching region or endpoint override. + /// + /// \return The path inside the filesystem that is indicated by the URI. + virtual Result PathFromUri(const std::string& uri_string) const; + virtual bool Equals(const FileSystem& other) const = 0; virtual bool Equals(const std::shared_ptr& other) const { @@ -336,6 +356,7 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { std::shared_ptr base_fs() const { return base_fs_; } Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; bool Equals(const FileSystem& other) const override; @@ -410,6 +431,7 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem { std::string type_name() const override { return "slow"; } bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; using FileSystem::GetFileInfo; Result GetFileInfo(const std::string& path) override; diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index f063e31b5c5..6fc75589b0a 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -873,6 +873,12 @@ bool GcsFileSystem::Equals(const FileSystem& other) const { return impl_->options().Equals(fs.impl_->options()); } +Result GcsFileSystem::PathFromUri(const std::string& uri_string) const { + return internal::PathFromUriHelper(uri_string, {"gs", "gcs"}, + /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kPrepend); +} + Result GcsFileSystem::GetFileInfo(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); return impl_->GetFileInfo(p); diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index c3d03b5cb21..d4b919ec814 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -178,6 +178,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { const GcsOptions& options() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; Result GetFileInfo(const std::string& path) override; Result GetFileInfo(const FileSelector& select) override; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 15d596dc0b8..9d5136b47c9 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -54,6 +54,7 @@ #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/test_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/util/future.h" #include "arrow/util/key_value_metadata.h" @@ -1383,12 +1384,24 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) { TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { // Smoke test for FileSystemFromUri - ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(std::string("gs://anonymous@") + - PreexistingBucketPath())); + std::string path; + ASSERT_OK_AND_ASSIGN( + auto fs, + FileSystemFromUri(std::string("gs://anonymous@") + PreexistingBucketPath(), &path)); EXPECT_EQ(fs->type_name(), "gcs"); + EXPECT_EQ(path, PreexistingBucketName()); + ASSERT_OK_AND_ASSIGN( + path, fs->PathFromUri(std::string("gs://anonymous@") + PreexistingBucketPath())); + EXPECT_EQ(path, PreexistingBucketName()); ASSERT_OK_AND_ASSIGN(auto fs2, FileSystemFromUri(std::string("gcs://anonymous@") + PreexistingBucketPath())); EXPECT_EQ(fs2->type_name(), "gcs"); + ASSERT_THAT(fs->PathFromUri("/foo/bar"), + Raises(StatusCode::Invalid, testing::HasSubstr("Expected a URI"))); + ASSERT_THAT( + fs->PathFromUri("s3:///foo/bar"), + Raises(StatusCode::Invalid, + testing::HasSubstr("expected a URI with one of the schemes (gs, gcs)"))); } } // namespace diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 8709ab45625..b227aae65d9 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -473,6 +473,12 @@ bool HadoopFileSystem::Equals(const FileSystem& other) const { return options().Equals(hdfs.options()); } +Result HadoopFileSystem::PathFromUri(const std::string& uri_string) const { + return internal::PathFromUriHelper(uri_string, {"hdfs", "viewfs"}, + /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kIgnore); +} + Result> HadoopFileSystem::GetFileInfo(const FileSelector& select) { return impl_->GetFileInfo(select); } diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index bed0ac4c613..798aac0ea90 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -66,6 +66,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { std::string type_name() const override { return "hdfs"; } HdfsOptions options() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; /// \cond FALSE using FileSystem::GetFileInfo; diff --git a/cpp/src/arrow/filesystem/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test.cc index b1020231beb..7ad9e6cd40e 100644 --- a/cpp/src/arrow/filesystem/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test.cc @@ -119,6 +119,8 @@ class TestHadoopFileSystem : public ::testing::Test, public HadoopFileSystemTest ARROW_LOG(INFO) << "!!! uri = " << ss.str(); ASSERT_OK_AND_ASSIGN(uri_fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "/"); + ASSERT_OK_AND_ASSIGN(path, uri_fs->PathFromUri(ss.str())); + ASSERT_EQ(path, "/"); // Sanity check ASSERT_OK(uri_fs->CreateDir("AB")); diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 03b4ad3bc72..e030014159c 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -52,37 +52,6 @@ using ::arrow::internal::IOErrorFromWinError; using ::arrow::internal::NativePathString; using ::arrow::internal::PlatformFilename; -namespace internal { - -#ifdef _WIN32 -static bool IsDriveLetter(char c) { - // Can't use locale-dependent functions from the C/C++ stdlib - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); -} -#endif - -bool DetectAbsolutePath(const std::string& s) { - // Is it a /-prefixed local path? - if (s.length() >= 1 && s[0] == '/') { - return true; - } -#ifdef _WIN32 - // Is it a \-prefixed local path? - if (s.length() >= 1 && s[0] == '\\') { - return true; - } - // Does it start with a drive letter in addition to being /- or \-prefixed, - // e.g. "C:\..."? - if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && - IsDriveLetter(s[0])) { - return true; - } -#endif - return false; -} - -} // namespace internal - namespace { Status ValidatePath(std::string_view s) { @@ -92,6 +61,12 @@ Status ValidatePath(std::string_view s) { return Status::OK(); } +Result DoNormalizePath(std::string path) { + RETURN_NOT_OK(ValidatePath(path)); + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + return fn.ToString(); +} + #ifdef _WIN32 std::string NativeToString(const NativePathString& ns) { @@ -263,13 +238,15 @@ Result LocalFileSystemOptions::FromUri( #ifdef _WIN32 std::stringstream ss; ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path()); - *out_path = ss.str(); + *out_path = + std::string(internal::RemoveTrailingSlash(ss.str(), /*preserve_root=*/true)); #else return Status::Invalid("Unsupported hostname in non-Windows local URI: '", uri.ToString(), "'"); #endif } else { - *out_path = uri.path(); + *out_path = + std::string(internal::RemoveTrailingSlash(uri.path(), /*preserve_root=*/true)); } // TODO handle use_mmap option @@ -286,9 +263,17 @@ LocalFileSystem::LocalFileSystem(const LocalFileSystemOptions& options, LocalFileSystem::~LocalFileSystem() {} Result LocalFileSystem::NormalizePath(std::string path) { - RETURN_NOT_OK(ValidatePath(path)); - ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); - return fn.ToString(); + return DoNormalizePath(std::move(path)); +} + +Result LocalFileSystem::PathFromUri(const std::string& uri_string) const { +#ifdef _WIN32 + auto authority_handling = internal::AuthorityHandlingBehavior::kWindows; +#else + auto authority_handling = internal::AuthorityHandlingBehavior::kDisallow; +#endif + return internal::PathFromUriHelper(uri_string, {"file"}, /*accept_local_paths=*/true, + authority_handling); } bool LocalFileSystem::Equals(const FileSystem& other) const { diff --git a/cpp/src/arrow/filesystem/localfs.h b/cpp/src/arrow/filesystem/localfs.h index 75eaf314e4d..108530c2b27 100644 --- a/cpp/src/arrow/filesystem/localfs.h +++ b/cpp/src/arrow/filesystem/localfs.h @@ -82,6 +82,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem { std::string type_name() const override { return "local"; } Result NormalizePath(std::string path) override; + Result PathFromUri(const std::string& uri_string) const override; bool Equals(const FileSystem& other) const override; @@ -121,13 +122,5 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem { LocalFileSystemOptions options_; }; -namespace internal { - -// Return whether the string is detected as a local absolute path. -ARROW_EXPORT -bool DetectAbsolutePath(const std::string& s); - -} // namespace internal - } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 33f75dd845a..7ce2a569686 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -31,6 +31,7 @@ #include "arrow/filesystem/test_util.h" #include "arrow/filesystem/util_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/util/io_util.h" #include "arrow/util/uri.h" @@ -183,7 +184,7 @@ class TestLocalFS : public LocalFSTestMixin { } std::string path; ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path)); - ASSERT_EQ(path, local_path_); + ASSERT_EQ(path, std::string(RemoveTrailingSlash(local_path_))); // Test that the right location on disk is accessed CreateFile(fs_.get(), local_path_ + "abc", "some data"); @@ -201,6 +202,7 @@ class TestLocalFS : public LocalFSTestMixin { template void CheckLocalUri(const std::string& uri, const std::string& expected_path, FileSystemFromUriFunc&& fs_from_uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } @@ -208,6 +210,8 @@ class TestLocalFS : public LocalFSTestMixin { ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path)); ASSERT_EQ(fs_->type_name(), "local"); ASSERT_EQ(path, expected_path); + ASSERT_OK_AND_ASSIGN(path, fs_->PathFromUri(uri)); + ASSERT_EQ(path, expected_path); } // Like TestFileSystemFromUri, but with an arbitrary non-existing path @@ -220,6 +224,7 @@ class TestLocalFS : public LocalFSTestMixin { } void TestInvalidUri(const std::string& uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } @@ -227,10 +232,20 @@ class TestLocalFS : public LocalFSTestMixin { } void TestInvalidUriOrPath(const std::string& uri) { + ARROW_SCOPED_TRACE("uri = ", uri); if (!path_formatter_.supports_uri()) { return; // skip } ASSERT_RAISES(Invalid, FileSystemFromUriOrPath(uri)); + LocalFileSystem lfs; + ASSERT_RAISES(Invalid, lfs.PathFromUri(uri)); + } + + void TestInvalidPathFromUri(const std::string& uri, const std::string& expected_err) { + // Legitimate URI for the wrong filesystem + LocalFileSystem lfs; + ASSERT_THAT(lfs.PathFromUri(uri), + Raises(StatusCode::Invalid, testing::HasSubstr(expected_err))); } void CheckConcreteFile(const std::string& path, int64_t expected_size) { @@ -320,6 +335,7 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriFile) { "//some server/some share/some path"); #else this->TestInvalidUri("file://server/share/foo/bar"); + this->TestInvalidUriOrPath("file://server/share/foo/bar"); #endif // Relative paths @@ -334,9 +350,10 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoScheme) { // Variations this->TestLocalUriOrPath(this->path_formatter_("/foo/bar"), "/foo/bar"); + this->TestLocalUriOrPath(this->path_formatter_("/"), "/"); #ifdef _WIN32 - this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar/"); + this->TestLocalUriOrPath(this->path_formatter_("C:/foo/bar/"), "C:/foo/bar"); #endif // Relative paths @@ -360,6 +377,11 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriNoSchemeBackslashes) { this->TestInvalidUriOrPath("foo\\bar"); } +TYPED_TEST(TestLocalFS, MismatchedFilesystemPathFromUri) { + this->TestInvalidPathFromUri("s3://foo", + "expected a URI with one of the schemes (file)"); +} + TYPED_TEST(TestLocalFS, DirectoryMTime) { TimePoint t1 = CurrentTimePoint(); ASSERT_OK(this->fs_->CreateDir("AB/CD/EF")); diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index 3bc6f4464eb..8eff8ecc2f8 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -436,6 +436,14 @@ MockFileSystem::MockFileSystem(TimePoint current_time, const io::IOContext& io_c bool MockFileSystem::Equals(const FileSystem& other) const { return this == &other; } +Result MockFileSystem::PathFromUri(const std::string& uri_string) const { + ARROW_ASSIGN_OR_RAISE( + std::string parsed_path, + internal::PathFromUriHelper(uri_string, {"mock"}, /*accept_local_paths=*/true, + internal::AuthorityHandlingBehavior::kDisallow)); + return std::string(internal::RemoveLeadingSlash(parsed_path)); +} + Status MockFileSystem::CreateDir(const std::string& path, bool recursive) { RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); diff --git a/cpp/src/arrow/filesystem/mockfs.h b/cpp/src/arrow/filesystem/mockfs.h index e12408f52ce..32d06e5910d 100644 --- a/cpp/src/arrow/filesystem/mockfs.h +++ b/cpp/src/arrow/filesystem/mockfs.h @@ -66,6 +66,7 @@ class ARROW_EXPORT MockFileSystem : public FileSystem { std::string type_name() const override { return "mock"; } bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; // XXX It's not very practical to have to explicitly declare inheritance // of default overrides. diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index ba4892a0ac9..e25e544f034 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -129,7 +129,17 @@ std::string EnsureLeadingSlash(std::string_view v) { return std::string(v); } } -std::string_view RemoveTrailingSlash(std::string_view key) { +std::string_view RemoveTrailingSlash(std::string_view key, bool preserve_root) { + if (preserve_root && key.size() == 1) { + // If the user gives us "/" then don't return "" + return key; + } +#ifdef _WIN32 + if (preserve_root && key.size() == 3 && key[1] == ':' && key[0] != '/') { + // If the user gives us C:/ then don't return C: + return key; + } +#endif while (!key.empty() && key.back() == kSep) { key.remove_suffix(1); } diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 059827fb0a9..b821e793384 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,8 +69,11 @@ std::string_view RemoveLeadingSlash(std::string_view s); ARROW_EXPORT std::string EnsureTrailingSlash(std::string_view s); +/// \brief remove the forward slash (if any) from the given path +/// \param s the input path +/// \param preserve_root if true, allow a path of just "/" to remain unchanged ARROW_EXPORT -std::string_view RemoveTrailingSlash(std::string_view s); +std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = false); ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index a22d9c10bec..dc033b99580 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2235,6 +2235,11 @@ bool S3FileSystem::Equals(const FileSystem& other) const { return options().Equals(s3fs.options()); } +Result S3FileSystem::PathFromUri(const std::string& uri_string) const { + return internal::PathFromUriHelper(uri_string, {"s3"}, /*accept_local_paths=*/false, + internal::AuthorityHandlingBehavior::kPrepend); +} + S3Options S3FileSystem::options() const { return impl_->options(); } std::string S3FileSystem::region() const { return impl_->region(); } diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 2bccecafe8e..0a7ca73ccb2 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -247,6 +247,7 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { std::string region() const; bool Equals(const FileSystem& other) const override; + Result PathFromUri(const std::string& uri_string) const override; /// \cond FALSE using FileSystem::GetFileInfo; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 38df84bdede..5b0287d9971 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -58,6 +58,7 @@ #include "arrow/status.h" #include "arrow/testing/future_util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/util/async_generator.h" #include "arrow/util/checked_cast.h" @@ -1143,6 +1144,18 @@ TEST_F(TestS3FS, FileSystemFromUri) { ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(ss.str(), &path)); ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); + ASSERT_OK_AND_ASSIGN(path, fs->PathFromUri(ss.str())); + ASSERT_EQ(path, "bucket/somedir/subdir/subfile"); + + // Incorrect scheme + ASSERT_THAT(fs->PathFromUri("file:///@bucket/somedir/subdir/subfile"), + Raises(StatusCode::Invalid, + testing::HasSubstr("expected a URI with one of the schemes (s3)"))); + + // Not a URI + ASSERT_THAT(fs->PathFromUri("/@bucket/somedir/subdir/subfile"), + Raises(StatusCode::Invalid, testing::HasSubstr("Expected a URI"))); + // Check the filesystem has the right connection parameters AssertFileInfo(fs.get(), path, FileType::File, 8); } diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 79e8503818c..a2f34fb1bb1 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -24,10 +24,12 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/io_util.h" +#include "arrow/util/string.h" namespace arrow { using internal::StatusDetailFromErrno; +using internal::Uri; namespace fs { namespace internal { @@ -76,6 +78,114 @@ Status InvalidDeleteDirContents(std::string_view path) { "If you wish to delete the root directory's contents, call DeleteRootDirContents."); } +Result ParseFileSystemUri(const std::string& uri_string) { + Uri uri; + auto status = uri.Parse(uri_string); + if (!status.ok()) { +#ifdef _WIN32 + // Could be a "file:..." URI with backslashes instead of regular slashes. + RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string))); + if (uri.scheme() != "file") { + return status; + } +#else + return status; +#endif + } + return std::move(uri); +} + +#ifdef _WIN32 +static bool IsDriveLetter(char c) { + // Can't use locale-dependent functions from the C/C++ stdlib + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} +#endif + +bool DetectAbsolutePath(const std::string& s) { + // Is it a /-prefixed local path? + if (s.length() >= 1 && s[0] == '/') { + return true; + } +#ifdef _WIN32 + // Is it a \-prefixed local path? + if (s.length() >= 1 && s[0] == '\\') { + return true; + } + // Does it start with a drive letter in addition to being /- or \-prefixed, + // e.g. "C:\..."? + if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && + IsDriveLetter(s[0])) { + return true; + } +#endif + return false; +} + +Result PathFromUriHelper(const std::string& uri_string, + std::vector supported_schemes, + bool accept_local_paths, + AuthorityHandlingBehavior authority_handling) { + if (internal::DetectAbsolutePath(uri_string)) { + if (accept_local_paths) { + // Normalize the path and remove any trailing slash + return std::string( + internal::RemoveTrailingSlash(ToSlashes(uri_string), /*preserve_root=*/true)); + } + return Status::Invalid( + "The filesystem is not capable of loading local paths. Expected a URI but " + "received ", + uri_string); + } + Uri uri; + ARROW_RETURN_NOT_OK(uri.Parse(uri_string)); + const auto scheme = uri.scheme(); + if (std::find(supported_schemes.begin(), supported_schemes.end(), scheme) == + supported_schemes.end()) { + std::string expected_schemes = + ::arrow::internal::JoinStrings(supported_schemes, ", "); + return Status::Invalid("The filesystem expected a URI with one of the schemes (", + expected_schemes, ") but received ", uri_string); + } + std::string host = uri.host(); + std::string path = uri.path(); + if (host.empty()) { + // Just a path, may be absolute or relative, only allow relative paths if local + if (path[0] == '/') { + return std::string(internal::RemoveTrailingSlash(path)); + } + if (accept_local_paths) { + return std::string(internal::RemoveTrailingSlash(path)); + } + return Status::Invalid("The filesystem does not support relative paths. Received ", + uri_string); + } + if (authority_handling == AuthorityHandlingBehavior::kDisallow) { + return Status::Invalid( + "The filesystem does not support the authority (host) component of a URI. " + "Received ", + uri_string); + } + if (path[0] != '/') { + // This should not be possible + return Status::Invalid( + "The provided URI has a host component but a relative path which is not " + "supported. " + "Received ", + uri_string); + } + switch (authority_handling) { + case AuthorityHandlingBehavior::kPrepend: + return std::string(internal::RemoveTrailingSlash(host + path)); + case AuthorityHandlingBehavior::kWindows: + return std::string(internal::RemoveTrailingSlash("//" + host + path)); + case AuthorityHandlingBehavior::kIgnore: + return std::string(internal::RemoveTrailingSlash(path)); + default: + return Status::Invalid("Unrecognized authority_handling value"); + } +} + Result GlobFiles(const std::shared_ptr& filesystem, const std::string& glob) { // TODO: ARROW-17640 diff --git a/cpp/src/arrow/filesystem/util_internal.h b/cpp/src/arrow/filesystem/util_internal.h index cc16dbba106..29a51512d0a 100644 --- a/cpp/src/arrow/filesystem/util_internal.h +++ b/cpp/src/arrow/filesystem/util_internal.h @@ -24,9 +24,11 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" +#include "arrow/util/uri.h" #include "arrow/util/visibility.h" namespace arrow { +using internal::Uri; namespace fs { namespace internal { @@ -50,6 +52,40 @@ Status NotAFile(std::string_view path); ARROW_EXPORT Status InvalidDeleteDirContents(std::string_view path); +/// \brief Parse the string as a URI +/// \param uri_string the string to parse +/// +/// This is the same as Uri::Parse except it tolerates Windows +/// file URIs that contain backslash instead of / +Result ParseFileSystemUri(const std::string& uri_string); + +/// \brief check if the string is a local absolute path +ARROW_EXPORT +bool DetectAbsolutePath(const std::string& s); + +/// \brief describes how to handle the authority (host) component of the URI +enum class AuthorityHandlingBehavior { + // Return an invalid status if the authority is non-empty + kDisallow = 0, + // Prepend the authority to the path (e.g. authority/some/path) + kPrepend = 1, + // Convert to a Windows style network path (e.g. //authority/some/path) + kWindows = 2, + // Ignore the authority and just use the path + kIgnore = 3 +}; + +/// \brief check to see if uri_string matches one of the supported schemes and return the +/// path component +/// \param uri_string a uri or local path to test and convert +/// \param supported_schemes the set of URI schemes that should be accepted +/// \param accept_local_paths if true, allow an absolute path +/// \return the path portion of the URI +Result PathFromUriHelper(const std::string& uri_string, + std::vector supported_schemes, + bool accept_local_paths, + AuthorityHandlingBehavior authority_handling); + /// \brief Return files matching the glob pattern on the filesystem /// /// Globbing starts from the root of the filesystem.