From 37f7c6df7264dd1b14609a311ab957c998a3726f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 14:25:17 -0700 Subject: [PATCH 1/7] chore: add failing test --- cpp/src/arrow/filesystem/test_util.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 3e197e110b1..66641819c54 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -899,6 +899,14 @@ void GenericFileSystemTest::TestOpenOutputStream(FileSystem* fs) { ASSERT_RAISES(Invalid, stream->Write("x")); // Stream is closed + // Trailing slash ignored + ASSERT_OK_AND_ASSIGN(stream, fs->OpenOutputStream("CD/ghi/")); + ASSERT_OK(stream->Write("new data")); + ASSERT_OK(stream->Close()); + AssertAllDirs(fs, {"CD"}); + AssertAllFiles(fs, {"CD/ghi", "abc"}); + AssertFileContents(fs, "CD/ghi", "new data"); + // Storing metadata along file auto metadata = KeyValueMetadata::Make({"Content-Type", "Content-Language"}, {"x-arrow/filesystem-test", "fr_FR"}); From 753ed3dcfe5510f1f6df380f2b9f98f535236e77 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 14:46:06 -0700 Subject: [PATCH 2/7] fix: drop slash only for openoutputstream --- cpp/src/arrow/filesystem/gcsfs.cc | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 82d2439a607..531cb064da9 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -71,22 +71,28 @@ struct GcsPath { std::string bucket; std::string object; - static Result FromString(const std::string& s) { - if (internal::IsLikelyUri(s)) { + static Result FromString(const std::string& s, + bool is_definitely_file = false) { + // If we know it's definitely a file, we should remove the trailing slash + const auto src = + is_definitely_file ? internal::RemoveTrailingSlash(s) : util::string_view(s); + + if (internal::IsLikelyUri(src)) { return Status::Invalid( - "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); + "Expected a GCS object path of the form 'bucket/key...', got a URI: '", src, + "'"); } - auto const first_sep = s.find_first_of(internal::kSep); + auto const first_sep = src.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", s, "')"); + return Status::Invalid("Path cannot start with a separator ('", src, "')"); } if (first_sep == std::string::npos) { return GcsPath{s, internal::RemoveTrailingSlash(s).to_string(), ""}; } GcsPath path; - path.full_path = s; - path.bucket = s.substr(0, first_sep); - path.object = s.substr(first_sep + 1); + path.full_path = src.to_string(); + path.bucket = src.substr(0, first_sep).to_string(); + path.object = src.substr(first_sep + 1).to_string(); return path; } @@ -941,7 +947,7 @@ Result> GcsFileSystem::OpenInputFile( Result> GcsFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); return impl_->OpenOutputStream(p, metadata); } From e6d900de7e67007d5e86e84a7442c6706adf9b6b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 18:42:37 -0700 Subject: [PATCH 3/7] fix: allow rejection of trailing slash too --- cpp/src/arrow/filesystem/test_util.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 66641819c54..d40c210464f 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -899,13 +899,18 @@ void GenericFileSystemTest::TestOpenOutputStream(FileSystem* fs) { ASSERT_RAISES(Invalid, stream->Write("x")); // Stream is closed - // Trailing slash ignored - ASSERT_OK_AND_ASSIGN(stream, fs->OpenOutputStream("CD/ghi/")); - ASSERT_OK(stream->Write("new data")); - ASSERT_OK(stream->Close()); - AssertAllDirs(fs, {"CD"}); - AssertAllFiles(fs, {"CD/ghi", "abc"}); - AssertFileContents(fs, "CD/ghi", "new data"); + // Trailing slash ignored or rejected + auto maybe_stream = fs->OpenOutputStream("CD/ghi/"); + if (maybe_stream.ok()) { + ASSERT_OK_AND_ASSIGN(stream, fs->OpenOutputStream("CD/ghi/")); + ASSERT_OK(stream->Write("new data")); + ASSERT_OK(stream->Close()); + AssertAllDirs(fs, {"CD"}); + AssertAllFiles(fs, {"CD/ghi", "abc"}); + AssertFileContents(fs, "CD/ghi", "new data"); + } else { + ASSERT_RAISES(IOError, maybe_stream); + } // Storing metadata along file auto metadata = KeyValueMetadata::Make({"Content-Type", "Content-Language"}, From df2480aae03e89e142147d828c481992cab18264 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 11 Jul 2022 20:48:32 -0700 Subject: [PATCH 4/7] fix: Also test input stream and file --- cpp/src/arrow/filesystem/gcsfs.cc | 10 ++++++---- cpp/src/arrow/filesystem/test_util.cc | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 531cb064da9..8e706908fea 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -893,7 +893,7 @@ Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) Result> GcsFileSystem::OpenInputStream( const std::string& path) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset()); } @@ -903,13 +903,14 @@ Result> GcsFileSystem::OpenInputStream( return Status::IOError("Cannot open directory '", info.path(), "' as an input stream"); } - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); + ARROW_ASSIGN_OR_RAISE(auto p, + GcsPath::FromString(info.path(), /*is_definitely_file=*/true)); return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset()); } Result> GcsFileSystem::OpenInputFile( const std::string& path) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); auto metadata = impl_->GetObjectMetadata(p); ARROW_GCS_RETURN_NOT_OK(metadata.status()); auto impl = impl_; @@ -930,7 +931,8 @@ Result> GcsFileSystem::OpenInputFile( return Status::IOError("Cannot open directory '", info.path(), "' as an input stream"); } - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); + ARROW_ASSIGN_OR_RAISE(auto p, + GcsPath::FromString(info.path(), /*is_definitely_file=*/true)); auto metadata = impl_->GetObjectMetadata(p); ARROW_GCS_RETURN_NOT_OK(metadata.status()); auto impl = impl_; diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index d40c210464f..9d001cc8499 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -987,6 +987,17 @@ void GenericFileSystemTest::TestOpenInputStream(FileSystem* fs) { ASSERT_OK(stream->Close()); ASSERT_RAISES(Invalid, stream->Read(1)); // Stream is closed + // Either ignore or reject trailing slash on a file + auto maybe_stream = fs->OpenInputStream("AB/abc/"); + if (maybe_stream.ok()) { + ASSERT_OK_AND_ASSIGN(stream, maybe_stream); + ASSERT_OK_AND_ASSIGN(buffer, stream->Read(4)); + AssertBufferEqual(*buffer, "some"); + ASSERT_OK(stream->Close()); + } else { + ASSERT_RAISES(IOError, maybe_stream); + } + // File does not exist AssertRaisesWithErrno(ENOENT, fs->OpenInputStream("AB/def")); AssertRaisesWithErrno(ENOENT, fs->OpenInputStream("def")); @@ -1057,6 +1068,17 @@ void GenericFileSystemTest::TestOpenInputFile(FileSystem* fs) { ASSERT_OK(file->Close()); ASSERT_RAISES(Invalid, file->ReadAt(1, 1)); // Stream is closed + // Either ignore or reject trailing slash on a file + auto maybe_file = fs->OpenInputFile("AB/abc/"); + if (maybe_file.ok()) { + ASSERT_OK_AND_ASSIGN(file, maybe_file); + ASSERT_OK_AND_ASSIGN(buffer, file->ReadAt(5, 6)); + AssertBufferEqual(*buffer, "other "); + ASSERT_OK(file->Close()); + } else { + ASSERT_RAISES(IOError, maybe_file); + } + // File does not exist AssertRaisesWithErrno(ENOENT, fs->OpenInputFile("AB/def")); AssertRaisesWithErrno(ENOENT, fs->OpenInputFile("def")); From 470d43f5563b4b91431ad713dce37e2202cc89c2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 12 Jul 2022 13:10:11 -0700 Subject: [PATCH 5/7] fix: always reject trailing slashes for file paths --- cpp/src/arrow/filesystem/gcsfs.cc | 39 ++++++++--------- cpp/src/arrow/filesystem/mockfs.cc | 4 ++ cpp/src/arrow/filesystem/path_util.cc | 8 ++++ cpp/src/arrow/filesystem/path_util.h | 3 ++ cpp/src/arrow/filesystem/s3fs.cc | 3 ++ cpp/src/arrow/filesystem/test_util.cc | 62 +++++++++++++-------------- r/R/io.R | 6 +-- 7 files changed, 69 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 8e706908fea..da7b856be47 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -71,28 +71,22 @@ struct GcsPath { std::string bucket; std::string object; - static Result FromString(const std::string& s, - bool is_definitely_file = false) { - // If we know it's definitely a file, we should remove the trailing slash - const auto src = - is_definitely_file ? internal::RemoveTrailingSlash(s) : util::string_view(s); - - if (internal::IsLikelyUri(src)) { + static Result FromString(const std::string& s) { + if (internal::IsLikelyUri(s)) { return Status::Invalid( - "Expected a GCS object path of the form 'bucket/key...', got a URI: '", src, - "'"); + "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); } - auto const first_sep = src.find_first_of(internal::kSep); + auto const first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", src, "')"); + return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { return GcsPath{s, internal::RemoveTrailingSlash(s).to_string(), ""}; } GcsPath path; - path.full_path = src.to_string(); - path.bucket = src.substr(0, first_sep).to_string(); - path.object = src.substr(first_sep + 1).to_string(); + path.full_path = s; + path.bucket = s.substr(0, first_sep); + path.object = s.substr(first_sep + 1); return path; } @@ -893,7 +887,8 @@ Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) Result> GcsFileSystem::OpenInputStream( const std::string& path) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset()); } @@ -903,14 +898,15 @@ Result> GcsFileSystem::OpenInputStream( return Status::IOError("Cannot open directory '", info.path(), "' as an input stream"); } - ARROW_ASSIGN_OR_RAISE(auto p, - GcsPath::FromString(info.path(), /*is_definitely_file=*/true)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path())); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); return impl_->OpenInputStream(p, gcs::Generation(), gcs::ReadFromOffset()); } Result> GcsFileSystem::OpenInputFile( const std::string& path) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); auto metadata = impl_->GetObjectMetadata(p); ARROW_GCS_RETURN_NOT_OK(metadata.status()); auto impl = impl_; @@ -931,8 +927,8 @@ Result> GcsFileSystem::OpenInputFile( return Status::IOError("Cannot open directory '", info.path(), "' as an input stream"); } - ARROW_ASSIGN_OR_RAISE(auto p, - GcsPath::FromString(info.path(), /*is_definitely_file=*/true)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path())); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); auto metadata = impl_->GetObjectMetadata(p); ARROW_GCS_RETURN_NOT_OK(metadata.status()); auto impl = impl_; @@ -949,7 +945,8 @@ Result> GcsFileSystem::OpenInputFile( Result> GcsFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path, /*is_definitely_file=*/true)); + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); + ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); return impl_->OpenOutputStream(p, metadata); } diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index 31dba27326b..f8d9aefbfe8 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -382,6 +382,7 @@ class MockFileSystem::Impl { Result> OpenOutputStream( const std::string& path, bool append, const std::shared_ptr& metadata) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -412,6 +413,7 @@ class MockFileSystem::Impl { } Result> OpenInputReader(const std::string& path) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -719,6 +721,7 @@ Result> MockFileSystem::OpenInputFile( Result> MockFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); @@ -727,6 +730,7 @@ Result> MockFileSystem::OpenOutputStream( Result> MockFileSystem::OpenAppendStream( const std::string& path, const std::shared_ptr& metadata) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 04484dd5bda..ddba968b85a 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -19,6 +19,7 @@ #include #include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/util_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/logging.h" @@ -139,6 +140,13 @@ util::string_view RemoveLeadingSlash(util::string_view key) { return key; } +Status AssertNoTrailingSlash(const std::string& key) { + if (key.back() == '/') { + return NotAFile(key); + } + return Status::OK(); +} + Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { if (base.empty() || base.front() != kSep) { diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 75745ee44ce..7e77cda1e0c 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -72,6 +72,9 @@ std::string EnsureTrailingSlash(util::string_view s); ARROW_EXPORT util::string_view RemoveTrailingSlash(util::string_view s); +ARROW_EXPORT +Status AssertNoTrailingSlash(const std::string& s); + ARROW_EXPORT bool IsAncestorOf(util::string_view ancestor, util::string_view descendant); diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 7a3df543506..dd3973ba771 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -2169,6 +2169,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_this> OpenInputFile(const std::string& s, S3FileSystem* fs) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); @@ -2179,6 +2180,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_this> OpenInputFile(const FileInfo& info, S3FileSystem* fs) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path())); if (info.type() == FileType::NotFound) { return ::arrow::fs::internal::PathNotFound(info.path()); } @@ -2537,6 +2539,7 @@ Result> S3FileSystem::OpenInputFile( Result> S3FileSystem::OpenOutputStream( const std::string& s, const std::shared_ptr& metadata) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 9d001cc8499..d72386a64a9 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -899,18 +899,8 @@ void GenericFileSystemTest::TestOpenOutputStream(FileSystem* fs) { ASSERT_RAISES(Invalid, stream->Write("x")); // Stream is closed - // Trailing slash ignored or rejected - auto maybe_stream = fs->OpenOutputStream("CD/ghi/"); - if (maybe_stream.ok()) { - ASSERT_OK_AND_ASSIGN(stream, fs->OpenOutputStream("CD/ghi/")); - ASSERT_OK(stream->Write("new data")); - ASSERT_OK(stream->Close()); - AssertAllDirs(fs, {"CD"}); - AssertAllFiles(fs, {"CD/ghi", "abc"}); - AssertFileContents(fs, "CD/ghi", "new data"); - } else { - ASSERT_RAISES(IOError, maybe_stream); - } + // Trailing slash rejected + ASSERT_RAISES(IOError, fs->OpenOutputStream("CD/ghi/")); // Storing metadata along file auto metadata = KeyValueMetadata::Make({"Content-Type", "Content-Language"}, @@ -944,6 +934,9 @@ void GenericFileSystemTest::TestOpenAppendStream(FileSystem* fs) { std::shared_ptr stream; + // Trailing slash rejected + ASSERT_RAISES(IOError, fs->OpenAppendStream("abc/")); + if (allow_append_to_new_file()) { ASSERT_OK_AND_ASSIGN(stream, fs->OpenAppendStream("abc")); } else { @@ -987,16 +980,8 @@ void GenericFileSystemTest::TestOpenInputStream(FileSystem* fs) { ASSERT_OK(stream->Close()); ASSERT_RAISES(Invalid, stream->Read(1)); // Stream is closed - // Either ignore or reject trailing slash on a file - auto maybe_stream = fs->OpenInputStream("AB/abc/"); - if (maybe_stream.ok()) { - ASSERT_OK_AND_ASSIGN(stream, maybe_stream); - ASSERT_OK_AND_ASSIGN(buffer, stream->Read(4)); - AssertBufferEqual(*buffer, "some"); - ASSERT_OK(stream->Close()); - } else { - ASSERT_RAISES(IOError, maybe_stream); - } + // Trailing slash rejected + ASSERT_RAISES(IOError, fs->OpenInputStream("AB/abc/")); // File does not exist AssertRaisesWithErrno(ENOENT, fs->OpenInputStream("AB/def")); @@ -1033,6 +1018,15 @@ void GenericFileSystemTest::TestOpenInputStreamWithFileInfo(FileSystem* fs) { info.set_type(FileType::Unknown); AssertRaisesWithErrno(ENOENT, fs->OpenInputStream(info)); + // Trailing slash rejected + auto maybe_info = fs->GetFileInfo("AB/abc/"); + if (maybe_info.ok()) { + ASSERT_OK_AND_ASSIGN(info, maybe_info); + ASSERT_RAISES(IOError, fs->OpenInputStream(info)); + } else { + ASSERT_RAISES(IOError, maybe_info); + } + // Cannot open directory ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo("AB")); ASSERT_RAISES(IOError, fs->OpenInputStream(info)); @@ -1068,16 +1062,8 @@ void GenericFileSystemTest::TestOpenInputFile(FileSystem* fs) { ASSERT_OK(file->Close()); ASSERT_RAISES(Invalid, file->ReadAt(1, 1)); // Stream is closed - // Either ignore or reject trailing slash on a file - auto maybe_file = fs->OpenInputFile("AB/abc/"); - if (maybe_file.ok()) { - ASSERT_OK_AND_ASSIGN(file, maybe_file); - ASSERT_OK_AND_ASSIGN(buffer, file->ReadAt(5, 6)); - AssertBufferEqual(*buffer, "other "); - ASSERT_OK(file->Close()); - } else { - ASSERT_RAISES(IOError, maybe_file); - } + // Trailing slash rejected + ASSERT_RAISES(IOError, fs->OpenInputFile("AB/abc/")); // File does not exist AssertRaisesWithErrno(ENOENT, fs->OpenInputFile("AB/def")); @@ -1102,6 +1088,9 @@ void GenericFileSystemTest::TestOpenInputFileAsync(FileSystem* fs) { // File does not exist AssertRaisesWithErrno(ENOENT, fs->OpenInputFileAsync("AB/def").result()); + + // Trailing slash rejected + ASSERT_RAISES(IOError, fs->OpenInputFileAsync("AB/abc/").result()); } void GenericFileSystemTest::TestOpenInputFileWithFileInfo(FileSystem* fs) { @@ -1131,6 +1120,15 @@ void GenericFileSystemTest::TestOpenInputFileWithFileInfo(FileSystem* fs) { info.set_type(FileType::Unknown); AssertRaisesWithErrno(ENOENT, fs->OpenInputFile(info)); + // Trailing slash rejected + auto maybe_info = fs->GetFileInfo("AB/abc/"); + if (maybe_info.ok()) { + ASSERT_OK_AND_ASSIGN(info, maybe_info); + ASSERT_RAISES(IOError, fs->OpenInputFile(info)); + } else { + ASSERT_RAISES(IOError, maybe_info); + } + // Cannot open directory ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo("AB")); ASSERT_RAISES(IOError, fs->OpenInputFile(info)); diff --git a/r/R/io.R b/r/R/io.R index 8e72187b431..d184d5861da 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -241,7 +241,7 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) { make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) { if (inherits(file, "SubTreeFileSystem")) { filesystem <- file$base_fs - file <- file$base_path + file <- gsub("/$", "", file$base_path) } if (is.string(file)) { if (is_url(file)) { @@ -303,7 +303,7 @@ make_output_stream <- function(x, filesystem = NULL, compression = NULL) { if (inherits(x, "SubTreeFileSystem")) { filesystem <- x$base_fs - x <- x$base_path + x <- gsub("/$", "", x$base_path) } else if (is_url(x)) { fs_and_path <- FileSystem$from_uri(x) filesystem <- fs_and_path$fs @@ -317,7 +317,7 @@ make_output_stream <- function(x, filesystem = NULL, compression = NULL) { assert_that(is.string(x)) if (is.null(filesystem) && is_compressed(compression)) { - CompressedOutputStream$create(x) ##compressed local + CompressedOutputStream$create(x) ## compressed local } else if (is.null(filesystem) && !is_compressed(compression)) { FileOutputStream$create(x) ## uncompressed local } else if (!is.null(filesystem) && is_compressed(compression)) { From fc8dbea2e04da33c350c88defeb42fc9a50b476e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 12 Jul 2022 13:33:34 -0700 Subject: [PATCH 6/7] chore: move some internal APIs to use string_view --- cpp/src/arrow/filesystem/mockfs.cc | 3 +-- cpp/src/arrow/filesystem/path_util.cc | 2 +- cpp/src/arrow/filesystem/path_util.h | 2 +- cpp/src/arrow/filesystem/util_internal.cc | 8 ++++---- cpp/src/arrow/filesystem/util_internal.h | 9 +++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index f8d9aefbfe8..778b70e7159 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -721,8 +721,7 @@ Result> MockFileSystem::OpenInputFile( Result> MockFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - - RETURN_NOT_OK(ValidatePath(path)); + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenOutputStream(path, /*append=*/false, metadata); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index ddba968b85a..1afc3b2a89b 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -140,7 +140,7 @@ util::string_view RemoveLeadingSlash(util::string_view key) { return key; } -Status AssertNoTrailingSlash(const std::string& key) { +Status AssertNoTrailingSlash(util::string_view key) { if (key.back() == '/') { return NotAFile(key); } diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 7e77cda1e0c..d4083d3b5c9 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -73,7 +73,7 @@ ARROW_EXPORT util::string_view RemoveTrailingSlash(util::string_view s); ARROW_EXPORT -Status AssertNoTrailingSlash(const std::string& s); +Status AssertNoTrailingSlash(util::string_view s); ARROW_EXPORT bool IsAncestorOf(util::string_view ancestor, util::string_view descendant); diff --git a/cpp/src/arrow/filesystem/util_internal.cc b/cpp/src/arrow/filesystem/util_internal.cc index 4851fccd55d..0d2ad709026 100644 --- a/cpp/src/arrow/filesystem/util_internal.cc +++ b/cpp/src/arrow/filesystem/util_internal.cc @@ -56,21 +56,21 @@ Status CopyStream(const std::shared_ptr& src, return Status::OK(); } -Status PathNotFound(const std::string& path) { +Status PathNotFound(util::string_view path) { return Status::IOError("Path does not exist '", path, "'") .WithDetail(StatusDetailFromErrno(ENOENT)); } -Status NotADir(const std::string& path) { +Status NotADir(util::string_view path) { return Status::IOError("Not a directory: '", path, "'") .WithDetail(StatusDetailFromErrno(ENOTDIR)); } -Status NotAFile(const std::string& path) { +Status NotAFile(util::string_view path) { return Status::IOError("Not a regular file: '", path, "'"); } -Status InvalidDeleteDirContents(const std::string& path) { +Status InvalidDeleteDirContents(util::string_view path) { return Status::Invalid( "DeleteDirContents called on invalid path '", path, "'. ", "If you wish to delete the root directory's contents, call DeleteRootDirContents."); diff --git a/cpp/src/arrow/filesystem/util_internal.h b/cpp/src/arrow/filesystem/util_internal.h index 80a29feb850..75a2d3a2ef5 100644 --- a/cpp/src/arrow/filesystem/util_internal.h +++ b/cpp/src/arrow/filesystem/util_internal.h @@ -23,6 +23,7 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" +#include "arrow/util/string_view.h" #include "arrow/util/visibility.h" namespace arrow { @@ -38,16 +39,16 @@ Status CopyStream(const std::shared_ptr& src, const io::IOContext& io_context); ARROW_EXPORT -Status PathNotFound(const std::string& path); +Status PathNotFound(util::string_view path); ARROW_EXPORT -Status NotADir(const std::string& path); +Status NotADir(util::string_view path); ARROW_EXPORT -Status NotAFile(const std::string& path); +Status NotAFile(util::string_view path); ARROW_EXPORT -Status InvalidDeleteDirContents(const std::string& path); +Status InvalidDeleteDirContents(util::string_view path); /// \brief Return files matching the glob pattern on the filesystem /// From fdcb5b7f0cb4aae44a75c886f44753e371d4bb82 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 12 Jul 2022 14:38:50 -0700 Subject: [PATCH 7/7] chore: explain in R code why we need to drop trailing slash --- cpp/src/arrow/filesystem/mockfs.cc | 2 +- r/R/io.R | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index 778b70e7159..69e49b32043 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -721,7 +721,7 @@ Result> MockFileSystem::OpenInputFile( Result> MockFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - RETURN_NOT_OK(ValidatePath(path)); + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenOutputStream(path, /*append=*/false, metadata); diff --git a/r/R/io.R b/r/R/io.R index d184d5861da..82e3847df5e 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -241,7 +241,9 @@ mmap_open <- function(path, mode = c("read", "write", "readwrite")) { make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem = NULL) { if (inherits(file, "SubTreeFileSystem")) { filesystem <- file$base_fs - file <- gsub("/$", "", file$base_path) + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names + # with trailing slashes, so we need to remove it here. + file <- sub("/$", "", file$base_path) } if (is.string(file)) { if (is_url(file)) { @@ -303,7 +305,9 @@ make_output_stream <- function(x, filesystem = NULL, compression = NULL) { if (inherits(x, "SubTreeFileSystem")) { filesystem <- x$base_fs - x <- gsub("/$", "", x$base_path) + # SubTreeFileSystem adds a slash to base_path, but filesystems will reject file names + # with trailing slashes, so we need to remove it here. + x <- sub("/$", "", x$base_path) } else if (is_url(x)) { fs_and_path <- FileSystem$from_uri(x) filesystem <- fs_and_path$fs @@ -333,7 +337,7 @@ detect_compression <- function(path) { } # Remove any trailing slashes, which FileSystem$from_uri may add - path <- gsub("/$", "", path) + path <- sub("/$", "", path) switch(tools::file_ext(path), bz2 = "bz2",