Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 79 additions & 15 deletions cpp/src/arrow/filesystem/gcsfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ namespace {

namespace gcs = google::cloud::storage;

auto constexpr kSep = '/';
// Change the default upload buffer size. In general, sending larger buffers is more
// efficient with GCS, as each buffer requires a roundtrip to the service. With formatted
// output (when using `operator<<`), keeping a larger buffer in memory before uploading
Expand All @@ -49,18 +48,17 @@ struct GcsPath {
std::string object;

// Parses `s` into a GcsPath by splitting it at the first separator: the text before the
// separator becomes the bucket and the text after it becomes the object.
//
// Returns Status::Invalid when `s` starts with a separator. When `s` contains no
// separator at all, the object component is left empty.
//
// NOTE(review): this span looks like a rendered diff that interleaves removed and added
// lines (`first_sep` is declared twice, the no-separator branch has two returns, and
// each GcsPath field is assigned twice) — confirm which variant is current.
static Result<GcsPath> FromString(const std::string& s) {
const auto src = internal::RemoveTrailingSlash(s);
auto const first_sep = src.find_first_of(kSep);
auto const first_sep = s.find_first_of(internal::kSep);
if (first_sep == 0) {
return Status::Invalid("Path cannot start with a separator ('", s, "')");
}
// No separator anywhere: the whole string names a bucket and there is no object.
if (first_sep == std::string::npos) {
return GcsPath{std::string(src), std::string(src), ""};
return GcsPath{s, internal::RemoveTrailingSlash(s).to_string(), ""};
}
// Split around the first separator; the separator itself belongs to neither part.
GcsPath path;
path.full_path = std::string(src);
path.bucket = std::string(src.substr(0, first_sep));
path.object = std::string(src.substr(first_sep + 1));
path.full_path = s;
path.bucket = s.substr(0, first_sep);
path.object = s.substr(first_sep + 1);
return path;
}

Expand Down Expand Up @@ -275,12 +273,73 @@ class GcsFileSystem::Impl {
const GcsOptions& options() const { return options_; }

// Builds a FileInfo for `path` by querying GCS metadata and forwarding the resulting
// status to GetFileInfoImpl.
//
// A path with an empty object component is looked up as a bucket and reported as a
// directory. Otherwise the object's metadata is fetched, and (in the last return below)
// an object name ending in '/' is reported as a directory, anything else as a file.
//
// NOTE(review): this span looks like a rendered diff that interleaves removed and added
// lines (two `if` statements with opposite conditions, the first one never closed, and
// repeated `meta` declarations) — confirm which variant is current.
Result<FileInfo> GetFileInfo(const GcsPath& path) {
if (!path.object.empty()) {
auto meta = client_.GetObjectMetadata(path.bucket, path.object);
return GetFileInfoImpl(path, std::move(meta).status(), FileType::File);
if (path.object.empty()) {
auto meta = client_.GetBucketMetadata(path.bucket);
return GetFileInfoImpl(path, std::move(meta).status(), FileType::Directory);
}
auto meta = client_.GetBucketMetadata(path.bucket);
return GetFileInfoImpl(path, std::move(meta).status(), FileType::Directory);
auto meta = client_.GetObjectMetadata(path.bucket, path.object);
return GetFileInfoImpl(
path, std::move(meta).status(),
path.object.back() == '/' ? FileType::Directory : FileType::File);
}

// GCS does not have directories or folders. But folders can be emulated (with some
// limitations) using marker objects. That and listing with prefixes creates the
// illusion of folders.
// Inserts an empty object into `bucket` that acts as the marker for directory `name`.
//
// The marker name is `name` passed through internal::EnsureTrailingSlash, and the object
// is tagged with the custom metadata entry "arrow/gcsfs" = "directory". Returns the
// status of the insert request.
google::cloud::Status CreateDirMarker(const std::string& bucket,
                                      util::string_view name) {
  // Normalize the name so every marker is spelled the same way.
  const auto marker_name = internal::EnsureTrailingSlash(name);
  // Tag the object so it can be recognized as a marker written by this class.
  auto marker_metadata =
      gcs::ObjectMetadata().upsert_metadata("arrow/gcsfs", "directory");
  // The marker carries no payload, hence the empty contents.
  auto inserted = client_.InsertObject(bucket, marker_name, std::string(),
                                       gcs::WithObjectMetadata(marker_metadata));
  return std::move(inserted).status();
}

// Creates the directory marker for `object` in `bucket`, plus a marker for each of its
// parent prefixes, creating the bucket first when it is missing.
//
// Returns an OK status on success; a marker or bucket that already exists counts as
// success. Any other GCS error is returned to the caller as soon as it is seen.
google::cloud::Status CreateDirMarkerRecursive(const std::string& bucket,
                                               const std::string& object) {
  using GcsCode = google::cloud::StatusCode;
  auto get_parent = [](std::string const& path) {
    return std::move(internal::GetAbstractPathParent(path).first);
  };
  // Creates the bucket, treating "already exists" as success.
  auto ensure_bucket = [this, &bucket]() -> google::cloud::Status {
    auto created = client_.CreateBucket(bucket, gcs::BucketMetadata()).status();
    if (created.ok() || created.code() == GcsCode::kAlreadyExists) return {};
    return created;
  };

  // A bucket-only path has no marker object to insert; making sure the bucket exists is
  // all there is to do.
  if (object.empty()) return ensure_bucket();

  // Maybe counterintuitively we create the markers from the most nested and up. Because
  // GCS does not have directories creating `a/b/c` will succeed, even if `a/` or `a/b/`
  // does not exist. In the common case, where `a/b/` may already exist, it is more
  // efficient to just create `a/b/c/` and then find out that `a/b/` was already there.
  // In the case where none exists, it does not matter which order we follow.
  auto status = CreateDirMarker(bucket, object);
  if (status.code() == GcsCode::kAlreadyExists) return {};
  if (status.code() == GcsCode::kNotFound) {
    // Missing bucket, create it first ...
    status = ensure_bucket();
    if (!status.ok()) return status;
    // ... then retry the marker: the first attempt failed, so without this retry the
    // requested directory itself would never be created.
    status = CreateDirMarker(bucket, object);
    if (status.code() == GcsCode::kAlreadyExists) return {};
  }
  // Any other failure (permissions, invalid name, ...) must reach the caller instead of
  // being silently replaced by the outcome of the parent loop below.
  if (!status.ok()) return status;

  for (auto parent = get_parent(object); !parent.empty(); parent = get_parent(parent)) {
    status = CreateDirMarker(bucket, parent);
    // An existing parent marker means the walk up the path can stop here.
    if (status.code() == GcsCode::kAlreadyExists) break;
    if (!status.ok()) return status;
  }
  return {};
}

// Non-recursive directory creation. A path with an object component gets a single
// directory marker; a bucket-only path creates the bucket. The resulting GCS status is
// converted to an Arrow Status.
Status CreateDir(const GcsPath& p) {
  if (!p.object.empty()) {
    return internal::ToArrowStatus(CreateDirMarker(p.bucket, p.object));
  }
  // No object component: the "directory" is the bucket itself.
  auto created = client_.CreateBucket(p.bucket, gcs::BucketMetadata());
  return internal::ToArrowStatus(std::move(created).status());
}

// Recursive directory creation: delegates to CreateDirMarkerRecursive and converts the
// resulting GCS status to an Arrow Status.
Status CreateDirRecursive(const GcsPath& p) {
  const google::cloud::Status outcome = CreateDirMarkerRecursive(p.bucket, p.object);
  return internal::ToArrowStatus(outcome);
}

Status DeleteFile(const GcsPath& p) {
Expand Down Expand Up @@ -332,12 +391,15 @@ class GcsFileSystem::Impl {
// Converts the outcome of a GCS metadata request into a FileInfo for `path`.
//
// - An OK `status` yields a FileInfo of the requested `type`.
// - A kNotFound `status` yields a FileInfo with FileType::NotFound.
// - Any other `status` is returned as an Arrow error via internal::ToArrowStatus.
//
// NOTE(review): this span looks like a rendered diff that interleaves removed and added
// lines (each branch has two consecutive returns, one using `path.full_path` and one
// using `canonical`) — confirm which variant is current.
static Result<FileInfo> GetFileInfoImpl(const GcsPath& path,
const google::cloud::Status& status,
FileType type) {
// For directories the reported path is passed through internal::EnsureTrailingSlash;
// for every other type the full path is used unchanged.
const auto& canonical = type == FileType::Directory
? internal::EnsureTrailingSlash(path.full_path)
: path.full_path;
if (status.ok()) {
return FileInfo(path.full_path, type);
return FileInfo(canonical, type);
}
using ::google::cloud::StatusCode;
// A missing bucket/object is not an error here: it is reported as a NotFound entry.
if (status.code() == StatusCode::kNotFound) {
return FileInfo(path.full_path, FileType::NotFound);
return FileInfo(canonical, FileType::NotFound);
}
return internal::ToArrowStatus(status);
}
Expand Down Expand Up @@ -373,7 +435,9 @@ Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) {
}

// Creates the directory named by `path`. The path is parsed with GcsPath::FromString
// (a parse failure is returned to the caller), then the work is delegated to the
// implementation's CreateDir when `recursive` is false and to CreateDirRecursive
// otherwise.
//
// NOTE(review): this span looks like a rendered diff; the leading
// `Status::NotImplemented` return appears to be the removed line and, as written, makes
// the statements after it unreachable — confirm which variant is current.
Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path));
if (!recursive) return impl_->CreateDir(p);
return impl_->CreateDirRecursive(p);
}

Status GcsFileSystem::DeleteDir(const std::string& path) {
Expand Down
52 changes: 36 additions & 16 deletions cpp/src/arrow/filesystem/gcsfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,43 @@ struct ARROW_EXPORT GcsOptions {
bool Equals(const GcsOptions& other) const;
};

// - TODO(ARROW-1231) - review this documentation before closing the bug.
/// \brief GCS-backed FileSystem implementation.
///
/// Some implementation notes:
/// - TODO(ARROW-1231) - review all the notes once completed.
/// - buckets are treated as top-level directories on a "root".
/// - GCS buckets are in a global namespace, only one bucket
/// named `foo` exists in Google Cloud.
/// - Creating new top-level directories is implemented by creating
/// a bucket, this may be a slower operation than usual.
/// - A principal (service account, user, etc) can only list the
/// buckets for a single project, but can access the buckets
/// for many projects. It is possible that listing "all"
/// the buckets returns fewer buckets than you have access to.
/// - GCS does not have directories, they are emulated in this
/// library by listing objects with a common prefix.
/// - In general, GCS has much higher latency than local filesystems.
/// The throughput of GCS is comparable to the throughput of
/// a local file system.
/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
/// storage system for any amount of data. The main abstractions in GCS are buckets and
/// objects. A bucket is a namespace for objects, buckets can store any number of objects,
/// tens of millions and even billions is not uncommon. Each object contains a single
/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single
/// version of each object, but versioning can be enabled. Versioning is important because
/// objects are immutable, once created one cannot append data to the object or modify the
/// object data in any way.
///
/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
/// named `foo` no other customer can create a bucket with the same name. Note that a
/// principal (a user or service account) may only list the buckets they are entitled to,
/// and then only within a project. It is not possible to list "all" the buckets.
///
/// Within each bucket, objects are in a flat namespace. GCS does not have folders or
/// directories. However, following some conventions it is possible to emulate
/// directories. To this end, this class follows these conventions:
///
/// - All buckets are treated as directories at the "root"
/// - Creating a root directory results in a new bucket being created, this may be slower
/// than most GCS operations.
/// - Any object with a name ending with a slash (`/`) character is treated as a
/// directory.
/// - The class creates marker objects for a directory, using a trailing slash in the
/// marker names. For debugging purposes, the metadata of these marker objects indicate
/// that they are markers created by this class. The class does not rely on this
/// annotation.
/// - GCS can list all the objects with a given prefix, this is used to emulate listing
/// of directories.
/// - In object lists GCS can summarize all the objects with a common prefix as a single
/// entry, this is used to emulate non-recursive lists. Note that GCS list time is
/// proportional to the number of objects in the prefix. Listing recursively takes
/// almost the same time as non-recursive lists.
///
class ARROW_EXPORT GcsFileSystem : public FileSystem {
public:
~GcsFileSystem() override = default;
Expand All @@ -75,6 +94,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem {

Status DeleteDirContents(const std::string& path) override;

/// This is not implemented in GcsFileSystem, as it would be too dangerous.
Status DeleteRootDirContents() override;

Status DeleteFile(const std::string& path) override;
Expand Down
Loading