From c2b37350967833499dd2af888b79c2f197fda3fc Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 18 Mar 2025 10:28:12 +0100 Subject: [PATCH 01/27] First remove only ObjectType --- cpp/src/arrow/io/hdfs.cc | 3 ++- cpp/src/arrow/io/hdfs.h | 10 +++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index c092a1ff7bc..d2b4f4fd89d 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -29,6 +29,7 @@ #include #include "arrow/buffer.h" +#include "arrow/filesystem/type_fwd.h" #include "arrow/io/hdfs.h" #include "arrow/io/hdfs_internal.h" #include "arrow/io/interfaces.h" @@ -340,7 +341,7 @@ Result HdfsOutputStream::Tell() const { return impl_->Tell(); } // TODO(wesm): this could throw std::bad_alloc in the course of copying strings // into the path info object static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { - out->kind = input->mKind == kObjectKindFile ? ObjectType::FILE : ObjectType::DIRECTORY; + out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File : arrow::fs::FileType::Directory; out->name = std::string(input->mName); out->owner = std::string(input->mOwner); out->group = std::string(input->mGroup); diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 46038070ae4..e90032c347e 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/filesystem/type_fwd.h" #include "arrow/io/interfaces.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -38,16 +39,11 @@ namespace io { class HdfsReadableFile; class HdfsOutputStream; -/// DEPRECATED. Use the FileSystem API in arrow::fs instead. -struct ObjectType { - enum type { FILE, DIRECTORY }; -}; - /// DEPRECATED. Use the FileSystem API in arrow::fs instead. 
struct ARROW_EXPORT FileStatistics { /// Size of file, -1 if finding length is unsupported int64_t size; - ObjectType::type kind; + arrow::fs::FileType kind; }; class ARROW_EXPORT FileSystem { @@ -67,7 +63,7 @@ class ARROW_EXPORT FileSystem { }; struct HdfsPathInfo { - ObjectType::type kind; + arrow::fs::FileType kind; std::string name; std::string owner; From e59f0d21600a9fb3d9a414b3ab8a1a58caa27fc4 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 1 Apr 2025 11:54:41 +0200 Subject: [PATCH 02/27] Remove FileStatistics --- cpp/src/arrow/io/hdfs.cc | 10 +++++----- cpp/src/arrow/io/hdfs.h | 12 +++--------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index d2b4f4fd89d..6303019eba6 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -457,12 +457,12 @@ class HadoopFileSystem::HadoopFileSystemImpl { return Status::OK(); } - Status Stat(const std::string& path, FileStatistics* stat) { + Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) { HdfsPathInfo info; RETURN_NOT_OK(GetPathInfo(path, &info)); - stat->size = info.size; - stat->kind = info.kind; + file_info->set_size(info.size); + file_info->set_type(info.kind); return Status::OK(); } @@ -627,8 +627,8 @@ Status HadoopFileSystem::GetPathInfo(const std::string& path, HdfsPathInfo* info return impl_->GetPathInfo(path, info); } -Status HadoopFileSystem::Stat(const std::string& path, FileStatistics* stat) { - return impl_->Stat(path, stat); +Status HadoopFileSystem::Stat(const std::string& path, arrow::fs::FileInfo* file_info) { + return impl_->Stat(path, file_info); } Status HadoopFileSystem::GetCapacity(int64_t* nbytes) { diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index e90032c347e..25fba7756ee 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/type_fwd.h" #include "arrow/io/interfaces.h" #include "arrow/util/macros.h" @@ -39,13 +40,6 @@ namespace io { class HdfsReadableFile; class HdfsOutputStream; -/// DEPRECATED. Use the FileSystem API in arrow::fs instead. 
-struct ARROW_EXPORT FileStatistics { - /// Size of file, -1 if finding length is unsupported - int64_t size; - arrow::fs::FileType kind; -}; - class ARROW_EXPORT FileSystem { public: virtual ~FileSystem() = default; @@ -59,7 +53,7 @@ class ARROW_EXPORT FileSystem { virtual Status Rename(const std::string& src, const std::string& dst) = 0; - virtual Status Stat(const std::string& path, FileStatistics* stat) = 0; + virtual Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) = 0; }; struct HdfsPathInfo { @@ -173,7 +167,7 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { Status Move(const std::string& src, const std::string& dst); - Status Stat(const std::string& path, FileStatistics* stat) override; + Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) override; // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory From 938322a9dd441193c03c98ffd005e2fc735b99af Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 1 Apr 2025 12:27:03 +0200 Subject: [PATCH 03/27] Run linter --- cpp/src/arrow/io/hdfs.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6303019eba6..cc55dd8d143 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -341,7 +341,8 @@ Result HdfsOutputStream::Tell() const { return impl_->Tell(); } // TODO(wesm): this could throw std::bad_alloc in the course of copying strings // into the path info object static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { - out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File : arrow::fs::FileType::Directory; + out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File + : arrow::fs::FileType::Directory; out->name = std::string(input->mName); out->owner = std::string(input->mOwner); out->group = std::string(input->mGroup); From 55321ad0add6b8314f62d392bd970eb9f7d940e8 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 1 Apr 2025 13:35:04 +0200 Subject: [PATCH 04/27] Update tests and filesystem/hdfs.cc --- cpp/src/arrow/filesystem/hdfs.cc | 10 +++++----- cpp/src/arrow/io/hdfs_test.cc | 11 ++++++----- python/pyarrow/includes/libarrow.pxd | 12 ++---------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index d59b2a342d7..151c2525e0b 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -194,7 +194,7 @@ class HadoopFileSystem::Impl { // Check existence of path, and that it's a directory io::HdfsPathInfo info; RETURN_NOT_OK(client_->GetPathInfo(path, &info)); - if (info.kind != io::ObjectType::DIRECTORY) { + if (info.kind != FileType::Directory) { return Status::IOError("Cannot ", action, " directory '", path, "': not a directory"); } @@ -275,10 +275,10 @@ class HadoopFileSystem::Impl { std::shared_ptr<::arrow::io::HadoopFileSystem> client_; void PathInfoToFileInfo(const io::HdfsPathInfo& info, FileInfo* out) { - if (info.kind == io::ObjectType::DIRECTORY) { + if (info.kind == FileType::Directory) { out->set_type(FileType::Directory); out->set_size(kNoSize); - } else if (info.kind == io::ObjectType::FILE) { + } else if (info.kind == FileType::File) { out->set_type(FileType::File); out->set_size(info.size); } @@ -297,12 +297,12 @@ class HadoopFileSystem::Impl { bool IsDirectory(const std::string& path) { io::HdfsPathInfo info; - return GetPathInfo(path, &info) && info.kind == io::ObjectType::DIRECTORY; + return GetPathInfo(path, &info) && info.kind == 
FileType::Directory; } bool IsFile(const std::string& path) { io::HdfsPathInfo info; - return GetPathInfo(path, &info) && info.kind == io::ObjectType::FILE; + return GetPathInfo(path, &info) && info.kind == FileType::File; } bool GetPathInfo(const std::string& path, io::HdfsPathInfo* info) { diff --git a/cpp/src/arrow/io/hdfs_test.cc b/cpp/src/arrow/io/hdfs_test.cc index 4f0a280b768..e0389473ea9 100644 --- a/cpp/src/arrow/io/hdfs_test.cc +++ b/cpp/src/arrow/io/hdfs_test.cc @@ -30,6 +30,7 @@ #include #include "arrow/buffer.h" +#include "arrow/filesystem/type_fwd.h" #include "arrow/io/hdfs.h" #include "arrow/io/hdfs_internal.h" #include "arrow/io/interfaces.h" @@ -209,7 +210,7 @@ TEST_F(TestHadoopFileSystem, GetPathInfo) { // Directory info ASSERT_OK(this->client_->GetPathInfo(this->scratch_dir_, &info)); - ASSERT_EQ(ObjectType::DIRECTORY, info.kind); + ASSERT_EQ(arrow::fs::FileType::Directory, info.kind); ASSERT_EQ(this->HdfsAbsPath(this->scratch_dir_), info.name); ASSERT_EQ(this->conf_.user, info.owner); @@ -224,7 +225,7 @@ TEST_F(TestHadoopFileSystem, GetPathInfo) { ASSERT_OK(this->WriteDummyFile(path, buffer.data(), size)); ASSERT_OK(this->client_->GetPathInfo(path, &info)); - ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(arrow::fs::FileType::File, info.kind); ASSERT_EQ(this->HdfsAbsPath(path), info.name); ASSERT_EQ(this->conf_.user, info.owner); ASSERT_EQ(size, info.size); @@ -293,13 +294,13 @@ TEST_F(TestHadoopFileSystem, ListDirectory) { for (size_t i = 0; i < listing.size(); ++i) { const HdfsPathInfo& info = listing[i]; if (info.name == this->HdfsAbsPath(p1)) { - ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(arrow::fs::FileType::File, info.kind); ASSERT_EQ(size, info.size); } else if (info.name == this->HdfsAbsPath(p2)) { - ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(arrow::fs::FileType::File, info.kind); ASSERT_EQ(size / 2, info.size); } else if (info.name == this->HdfsAbsPath(d1)) { - ASSERT_EQ(ObjectType::DIRECTORY, info.kind); + ASSERT_EQ(arrow::fs::FileType::Directory, info.kind); } else { FAIL() << "Unexpected path: " << info.name; } diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 39dc3a77d98..30df6b2d307 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1585,10 +1585,6 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: FileMode_WRITE" arrow::io::FileMode::WRITE" FileMode_READWRITE" arrow::io::FileMode::READWRITE" - cdef enum ObjectType" arrow::io::ObjectType::type": - ObjectType_FILE" arrow::io::ObjectType::FILE" - ObjectType_DIRECTORY" arrow::io::ObjectType::DIRECTORY" - cdef cppclass CIOContext" arrow::io::IOContext": CIOContext() CIOContext(CStopToken) @@ -1599,10 +1595,6 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: int GetIOThreadPoolCapacity() CStatus SetIOThreadPoolCapacity(int threads) - cdef cppclass FileStatistics: - int64_t size - ObjectType kind - cdef cppclass FileInterface: CStatus Close() CResult[int64_t] Tell() @@ -1671,7 +1663,7 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: pass cdef cppclass CIOFileSystem" arrow::io::FileSystem": - CStatus Stat(const c_string& path, FileStatistics* stat) + CStatus Stat(const c_string& path, FileInfo* stat) cdef cppclass FileOutputStream(COutputStream): @staticmethod @@ -1774,7 +1766,7 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: HdfsDriver driver cdef cppclass HdfsPathInfo: - ObjectType kind + FileType kind c_string name c_string owner 
c_string group From 3279a5439b33f56f284ef24068b0f088a80d427b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 1 Apr 2025 14:19:46 +0200 Subject: [PATCH 05/27] Move definitions from libarrow.pxd to libarrow_fs.pxd --- python/pyarrow/includes/libarrow.pxd | 51 ------------------------ python/pyarrow/includes/libarrow_fs.pxd | 52 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 30df6b2d307..932b3d9548f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1662,9 +1662,6 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: WritableFile): pass - cdef cppclass CIOFileSystem" arrow::io::FileSystem": - CStatus Stat(const c_string& path, FileInfo* stat) - cdef cppclass FileOutputStream(COutputStream): @staticmethod CResult[shared_ptr[COutputStream]] Open(const c_string& path) @@ -1765,60 +1762,12 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: unordered_map[c_string, c_string] extra_conf HdfsDriver driver - cdef cppclass HdfsPathInfo: - FileType kind - c_string name - c_string owner - c_string group - int32_t last_modified_time - int32_t last_access_time - int64_t size - int16_t replication - int64_t block_size - int16_t permissions - cdef cppclass HdfsReadableFile(CRandomAccessFile): pass cdef cppclass HdfsOutputStream(COutputStream): pass - cdef cppclass CIOHadoopFileSystem \ - "arrow::io::HadoopFileSystem"(CIOFileSystem): - @staticmethod - CStatus Connect(const HdfsConnectionConfig* config, - shared_ptr[CIOHadoopFileSystem]* client) - - CStatus MakeDirectory(const c_string& path) - - CStatus Delete(const c_string& path, c_bool recursive) - - CStatus Disconnect() - - c_bool Exists(const c_string& path) - - CStatus Chmod(const c_string& path, int mode) - CStatus Chown(const c_string& path, const char* owner, - const char* group) - - CStatus GetCapacity(int64_t* nbytes) - CStatus GetUsed(int64_t* nbytes) - - CStatus ListDirectory(const c_string& path, - vector[HdfsPathInfo]* listing) - - CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) - - CStatus Rename(const c_string& src, const c_string& dst) - - CStatus OpenReadable(const c_string& path, - shared_ptr[HdfsReadableFile]* handle) - - CStatus OpenWritable(const c_string& path, c_bool append, - int32_t buffer_size, int16_t replication, - int64_t default_block_size, - shared_ptr[HdfsOutputStream]* handle) - cdef cppclass CBufferReader \ " arrow::io::BufferReader"(CRandomAccessFile): CBufferReader(const shared_ptr[CBuffer]& buffer) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index af01c47c8c7..6ee014ce309 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -362,3 +362,55 @@ cdef extern from "arrow/python/filesystem.h" namespace "arrow::py::fs" nogil: CPyFileSystemVtable vtable) PyObject* handler() + +cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: + cdef cppclass CIOFileSystem" arrow::io::FileSystem": + CStatus Stat(const c_string& path, CFileInfo* stat) + + cdef cppclass HdfsPathInfo: + CFileType kind + c_string name + c_string owner + c_string group + int32_t last_modified_time + int32_t last_access_time + int64_t size + int16_t replication + int64_t block_size + int16_t permissions + + CStatus ListDirectory(const c_string& path, + vector[HdfsPathInfo]* listing) + + CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) + 
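+ # Legacy arrow::io HDFS declarations, kept for pyarrow's deprecated
+ # HDFS wrappers; new code should use pyarrow.fs.HadoopFileSystem instead.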
+ cdef cppclass CIOHadoopFileSystem \ + "arrow::io::HadoopFileSystem"(CIOFileSystem): + @staticmethod + CStatus Connect(const HdfsConnectionConfig* config, + shared_ptr[CIOHadoopFileSystem]* client) + + CStatus MakeDirectory(const c_string& path) + + CStatus Delete(const c_string& path, c_bool recursive) + + CStatus Disconnect() + + c_bool Exists(const c_string& path) + + CStatus Chmod(const c_string& path, int mode) + CStatus Chown(const c_string& path, const char* owner, + const char* group) + + CStatus GetCapacity(int64_t* nbytes) + CStatus GetUsed(int64_t* nbytes) + + CStatus Rename(const c_string& src, const c_string& dst) + + CStatus OpenReadable(const c_string& path, + shared_ptr[HdfsReadableFile]* handle) + + CStatus OpenWritable(const c_string& path, c_bool append, + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + shared_ptr[HdfsOutputStream]* handle) From 7b0afcacf4c336add319b84e8392aad0704cf1be Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 8 Apr 2025 15:18:23 +0200 Subject: [PATCH 06/27] Move hdfs code from io to filesystem --- cpp/src/arrow/CMakeLists.txt | 5 +- cpp/src/arrow/filesystem/CMakeLists.txt | 8 + cpp/src/arrow/filesystem/api.h | 3 + cpp/src/arrow/filesystem/hdfs.cc | 4 +- cpp/src/arrow/filesystem/hdfs.h | 2 +- cpp/src/arrow/filesystem/hdfs_internal.cc | 506 ++++++++++++ .../arrow/{io => filesystem}/hdfs_internal.h | 0 cpp/src/arrow/filesystem/hdfs_io.cc | 720 ++++++++++++++++++ .../arrow/{io/hdfs.h => filesystem/hdfs_io.h} | 0 .../hdfs_test_io.cc} | 54 +- cpp/src/arrow/io/CMakeLists.txt | 10 - cpp/src/arrow/io/api.h | 1 - cpp/src/arrow/meson.build | 2 - python/pyarrow/__init__.py | 10 +- python/pyarrow/_hdfs.pyx | 13 + python/pyarrow/includes/libarrow.pxd | 24 - python/pyarrow/includes/libarrow_fs.pxd | 126 +-- python/pyarrow/io.pxi | 14 +- 18 files changed, 1364 insertions(+), 138 deletions(-) create mode 100644 cpp/src/arrow/filesystem/hdfs_internal.cc rename cpp/src/arrow/{io => filesystem}/hdfs_internal.h (100%) create mode 100644 cpp/src/arrow/filesystem/hdfs_io.cc rename cpp/src/arrow/{io/hdfs.h => filesystem/hdfs_io.h} (100%) rename cpp/src/arrow/{io/hdfs_test.cc => filesystem/hdfs_test_io.cc} (91%) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9acd7083437..bfa3770b4bd 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -408,15 +408,12 @@ arrow_add_object_library(ARROW_IO io/caching.cc io/compressed.cc io/file.cc - io/hdfs.cc - io/hdfs_internal.cc io/interfaces.cc io/memory.cc io/slow.cc io/stdio.cc io/transform.cc) foreach(ARROW_IO_TARGET ${ARROW_IO_TARGETS}) - target_link_libraries(${ARROW_IO_TARGET} PRIVATE arrow::hadoop) if(NOT MSVC) target_link_libraries(${ARROW_IO_TARGET} PRIVATE ${CMAKE_DL_LIBS}) endif() @@ -906,7 +903,7 @@ if(ARROW_FILESYSTEM) PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() if(ARROW_HDFS) - list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc) + list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc filesystem/hdfs_io.cc filesystem/hdfs_internal.cc) endif() if(ARROW_S3) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/s3fs.cc) diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 5250ed2a887..32007b4653c 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -143,4 +143,12 @@ endif() if(ARROW_HDFS) add_arrow_test(hdfs_test EXTRA_LABELS filesystem) + add_arrow_test(hdfs_test_io + NO_VALGRIND + EXTRA_LABELS + filesystem + EXTRA_LINK_LIBS + arrow::hadoop + 
+ Boost::filesystem
+ Boost::system)
 endif()
diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h
index 7211ad5c2cc..703368bb475 100644
--- a/cpp/src/arrow/filesystem/api.h
+++ b/cpp/src/arrow/filesystem/api.h
@@ -32,3 +32,6 @@
 #ifdef ARROW_S3
 # include "arrow/filesystem/s3fs.h" // IWYU pragma: export
 #endif
+#ifdef ARROW_HDFS
+# include "arrow/filesystem/hdfs_io.h" // IWYU pragma: export
+#endif
diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc
index 151c2525e0b..810e1fbdb8b 100644
--- a/cpp/src/arrow/filesystem/hdfs.cc
+++ b/cpp/src/arrow/filesystem/hdfs.cc
@@ -21,10 +21,10 @@
 #include

 #include "arrow/filesystem/hdfs.h"
+#include "arrow/filesystem/hdfs_internal.h"
+#include "arrow/filesystem/hdfs_io.h"
 #include "arrow/filesystem/path_util.h"
 #include "arrow/filesystem/util_internal.h"
-#include "arrow/io/hdfs.h"
-#include "arrow/io/hdfs_internal.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h
index 25604a39e3a..f330da92f0e 100644
--- a/cpp/src/arrow/filesystem/hdfs.h
+++ b/cpp/src/arrow/filesystem/hdfs.h
@@ -22,7 +22,7 @@
 #include

 #include "arrow/filesystem/filesystem.h"
-#include "arrow/io/hdfs.h"
+#include "arrow/filesystem/hdfs_io.h"
 #include "arrow/util/uri.h"

 namespace arrow::fs {
diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc
new file mode 100644
index 00000000000..a4e6b79a5a1
--- /dev/null
+++ b/cpp/src/arrow/filesystem/hdfs_internal.cc
@@ -0,0 +1,506 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This shim interface to libhdfs (for runtime shared library loading) has been
+// adapted from the SFrame project, released under the ASF-compatible 3-clause
+// BSD license
+//
+// Using this requires having the $JAVA_HOME and $HADOOP_HOME environment
+// variables set, so that libjvm and libhdfs can be located easily
+
+// Copyright (C) 2015 Dato, Inc.
+// All rights reserved.
+//
+// This software may be modified and distributed under the terms
+// of the BSD license. See the LICENSE file for details.
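+//
+// NOTE: the shim resolves libjvm and libhdfs symbols at runtime via
+// arrow::internal::LoadDynamicLibrary and GetSymbolAs (arrow/util/io_util.h),
+// so Arrow builds do not link against a particular Hadoop installation.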
+ +#include "arrow/filesystem/hdfs_internal.h" + +#include +#include +#include +#include // IWYU pragma: keep +#include +#include +#include +#include "arrow/util/basic_decimal.h" + +#ifndef _WIN32 +# include +#endif + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::GetEnvVarNative; +using internal::PlatformFilename; + +namespace io::internal { + +namespace { + +template +Status SetSymbol(void* handle, char const* name, T** symbol) { + if (*symbol != nullptr) return Status::OK(); + + auto maybe_symbol = ::arrow::internal::GetSymbolAs(handle, name); + if (Required || maybe_symbol.ok()) { + ARROW_ASSIGN_OR_RAISE(*symbol, maybe_symbol); + } + return Status::OK(); +} + +#define GET_SYMBOL_REQUIRED(SHIM, SYMBOL_NAME) \ + RETURN_NOT_OK(SetSymbol(SHIM->handle, #SYMBOL_NAME, &SHIM->SYMBOL_NAME)); + +#define GET_SYMBOL(SHIM, SYMBOL_NAME) \ + DCHECK_OK(SetSymbol(SHIM->handle, #SYMBOL_NAME, &SHIM->SYMBOL_NAME)); + +Result> MakeFilenameVector( + const std::vector& names) { + std::vector filenames(names.size()); + for (size_t i = 0; i < names.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(filenames[i], PlatformFilename::FromString(names[i])); + } + return filenames; +} + +void AppendEnvVarFilename(const char* var_name, + std::vector* filenames) { + auto maybe_env_var = GetEnvVarNative(var_name); + if (maybe_env_var.ok()) { + filenames->emplace_back(std::move(*maybe_env_var)); + } +} + +void AppendEnvVarFilename(const char* var_name, const char* suffix, + std::vector* filenames) { + auto maybe_env_var = GetEnvVarNative(var_name); + if (maybe_env_var.ok()) { + auto maybe_env_var_with_suffix = + PlatformFilename(std::move(*maybe_env_var)).Join(suffix); + if (maybe_env_var_with_suffix.ok()) { + filenames->emplace_back(std::move(*maybe_env_var_with_suffix)); + } + } +} + +void InsertEnvVarFilename(const char* var_name, + std::vector* filenames) { + auto maybe_env_var = GetEnvVarNative(var_name); + if (maybe_env_var.ok()) { + filenames->emplace(filenames->begin(), PlatformFilename(std::move(*maybe_env_var))); + } +} + +Result> get_potential_libhdfs_paths() { + std::vector potential_paths; + std::string file_name; + +// OS-specific file name +#ifdef _WIN32 + file_name = "hdfs.dll"; +#elif __APPLE__ + file_name = "libhdfs.dylib"; +#else + file_name = "libhdfs.so"; +#endif + + // Common paths + ARROW_ASSIGN_OR_RAISE(auto search_paths, MakeFilenameVector({"", "."})); + + // Path from environment variable + AppendEnvVarFilename("HADOOP_HOME", "lib/native", &search_paths); + AppendEnvVarFilename("ARROW_LIBHDFS_DIR", &search_paths); + + // All paths with file name + for (const auto& path : search_paths) { + ARROW_ASSIGN_OR_RAISE(auto full_path, path.Join(file_name)); + potential_paths.push_back(std::move(full_path)); + } + + return potential_paths; +} + +Result> get_potential_libjvm_paths() { + std::vector potential_paths; + + std::vector search_prefixes; + std::vector search_suffixes; + std::string file_name; + +// From heuristics +#ifdef _WIN32 + ARROW_ASSIGN_OR_RAISE(search_prefixes, MakeFilenameVector({""})); + ARROW_ASSIGN_OR_RAISE(search_suffixes, + MakeFilenameVector({"/jre/bin/server", "/bin/server"})); + file_name = "jvm.dll"; +#elif __APPLE__ + ARROW_ASSIGN_OR_RAISE(search_prefixes, MakeFilenameVector({""})); + ARROW_ASSIGN_OR_RAISE(search_suffixes, + MakeFilenameVector({"/jre/lib/server", "/lib/server"})); + file_name = "libjvm.dylib"; + +// SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now 
we are +// expecting users to set an environment variable +#else +# if defined(__aarch64__) + const std::string prefix_arch{"arm64"}; + const std::string suffix_arch{"aarch64"}; +# else + const std::string prefix_arch{"amd64"}; + const std::string suffix_arch{"amd64"}; +# endif + ARROW_ASSIGN_OR_RAISE( + search_prefixes, + MakeFilenameVector({ + "/usr/lib/jvm/default-java", // ubuntu / debian distros + "/usr/lib/jvm/java", // rhel6 + "/usr/lib/jvm", // centos6 + "/usr/lib64/jvm", // opensuse 13 + "/usr/local/lib/jvm/default-java", // alt ubuntu / debian distros + "/usr/local/lib/jvm/java", // alt rhel6 + "/usr/local/lib/jvm", // alt centos6 + "/usr/local/lib64/jvm", // alt opensuse 13 + "/usr/local/lib/jvm/java-8-openjdk-" + + prefix_arch, // alt ubuntu / debian distros + "/usr/lib/jvm/java-8-openjdk-" + prefix_arch, // alt ubuntu / debian distros + "/usr/local/lib/jvm/java-7-openjdk-" + + prefix_arch, // alt ubuntu / debian distros + "/usr/lib/jvm/java-7-openjdk-" + prefix_arch, // alt ubuntu / debian distros + "/usr/local/lib/jvm/java-6-openjdk-" + + prefix_arch, // alt ubuntu / debian distros + "/usr/lib/jvm/java-6-openjdk-" + prefix_arch, // alt ubuntu / debian distros + "/usr/lib/jvm/java-7-oracle", // alt ubuntu + "/usr/lib/jvm/java-8-oracle", // alt ubuntu + "/usr/lib/jvm/java-6-oracle", // alt ubuntu + "/usr/local/lib/jvm/java-7-oracle", // alt ubuntu + "/usr/local/lib/jvm/java-8-oracle", // alt ubuntu + "/usr/local/lib/jvm/java-6-oracle", // alt ubuntu + "/usr/lib/jvm/default", // alt centos + "/usr/java/latest" // alt centos + })); + ARROW_ASSIGN_OR_RAISE( + search_suffixes, + MakeFilenameVector({"", "/lib/server", "/jre/lib/" + suffix_arch + "/server", + "/lib/" + suffix_arch + "/server"})); + file_name = "libjvm.so"; +#endif + + // From direct environment variable + InsertEnvVarFilename("JAVA_HOME", &search_prefixes); + + // Generate cross product between search_prefixes, search_suffixes, and file_name + for (auto& prefix : search_prefixes) { + for (auto& suffix : search_suffixes) { + ARROW_ASSIGN_OR_RAISE(auto path, prefix.Join(suffix).Join(file_name)); + potential_paths.push_back(std::move(path)); + } + } + + return potential_paths; +} + +Result try_dlopen(const std::vector& potential_paths, + std::string name) { + std::string error_message = "Unable to load " + name; + + for (const auto& p : potential_paths) { + auto maybe_handle = arrow::internal::LoadDynamicLibrary(p); + if (maybe_handle.ok()) return *maybe_handle; + error_message += "\n" + maybe_handle.status().message(); + } + + return Status(StatusCode::IOError, std::move(error_message)); +} + +LibHdfsShim libhdfs_shim; + +} // namespace + +Status LibHdfsShim::GetRequiredSymbols() { + GET_SYMBOL_REQUIRED(this, hdfsNewBuilder); + GET_SYMBOL_REQUIRED(this, hdfsBuilderSetNameNode); + GET_SYMBOL_REQUIRED(this, hdfsBuilderSetNameNodePort); + GET_SYMBOL_REQUIRED(this, hdfsBuilderSetUserName); + GET_SYMBOL_REQUIRED(this, hdfsBuilderSetKerbTicketCachePath); + GET_SYMBOL_REQUIRED(this, hdfsBuilderSetForceNewInstance); + GET_SYMBOL_REQUIRED(this, hdfsBuilderConfSetStr); + GET_SYMBOL_REQUIRED(this, hdfsBuilderConnect); + GET_SYMBOL_REQUIRED(this, hdfsCreateDirectory); + GET_SYMBOL_REQUIRED(this, hdfsDelete); + GET_SYMBOL_REQUIRED(this, hdfsDisconnect); + GET_SYMBOL_REQUIRED(this, hdfsExists); + GET_SYMBOL_REQUIRED(this, hdfsFreeFileInfo); + GET_SYMBOL_REQUIRED(this, hdfsGetCapacity); + GET_SYMBOL_REQUIRED(this, hdfsGetUsed); + GET_SYMBOL_REQUIRED(this, hdfsGetPathInfo); + GET_SYMBOL_REQUIRED(this, hdfsListDirectory); + 
GET_SYMBOL_REQUIRED(this, hdfsChown); + GET_SYMBOL_REQUIRED(this, hdfsChmod); + + // File methods + GET_SYMBOL_REQUIRED(this, hdfsCloseFile); + GET_SYMBOL_REQUIRED(this, hdfsFlush); + GET_SYMBOL_REQUIRED(this, hdfsOpenFile); + GET_SYMBOL_REQUIRED(this, hdfsRead); + GET_SYMBOL_REQUIRED(this, hdfsSeek); + GET_SYMBOL_REQUIRED(this, hdfsTell); + GET_SYMBOL_REQUIRED(this, hdfsWrite); + + return Status::OK(); +} + +Status ConnectLibHdfs(LibHdfsShim** driver) { + static std::mutex lock; + std::lock_guard guard(lock); + + LibHdfsShim* shim = &libhdfs_shim; + + static bool shim_attempted = false; + if (!shim_attempted) { + shim_attempted = true; + + shim->Initialize(); + + ARROW_ASSIGN_OR_RAISE(auto libjvm_potential_paths, get_potential_libjvm_paths()); + RETURN_NOT_OK(try_dlopen(libjvm_potential_paths, "libjvm")); + + ARROW_ASSIGN_OR_RAISE(auto libhdfs_potential_paths, get_potential_libhdfs_paths()); + ARROW_ASSIGN_OR_RAISE(shim->handle, try_dlopen(libhdfs_potential_paths, "libhdfs")); + } else if (shim->handle == nullptr) { + return Status::IOError("Prior attempt to load libhdfs failed"); + } + + *driver = shim; + return shim->GetRequiredSymbols(); +} + +/////////////////////////////////////////////////////////////////////////// +// HDFS thin wrapper methods + +hdfsBuilder* LibHdfsShim::NewBuilder() { return this->hdfsNewBuilder(); } + +void LibHdfsShim::BuilderSetNameNode(hdfsBuilder* bld, const char* nn) { + this->hdfsBuilderSetNameNode(bld, nn); +} + +void LibHdfsShim::BuilderSetNameNodePort(hdfsBuilder* bld, tPort port) { + this->hdfsBuilderSetNameNodePort(bld, port); +} + +void LibHdfsShim::BuilderSetUserName(hdfsBuilder* bld, const char* userName) { + this->hdfsBuilderSetUserName(bld, userName); +} + +void LibHdfsShim::BuilderSetKerbTicketCachePath(hdfsBuilder* bld, + const char* kerbTicketCachePath) { + this->hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath); +} + +void LibHdfsShim::BuilderSetForceNewInstance(hdfsBuilder* bld) { + this->hdfsBuilderSetForceNewInstance(bld); +} + +hdfsFS LibHdfsShim::BuilderConnect(hdfsBuilder* bld) { + return this->hdfsBuilderConnect(bld); +} + +int LibHdfsShim::BuilderConfSetStr(hdfsBuilder* bld, const char* key, const char* val) { + return this->hdfsBuilderConfSetStr(bld, key, val); +} + +int LibHdfsShim::Disconnect(hdfsFS fs) { return this->hdfsDisconnect(fs); } + +hdfsFile LibHdfsShim::OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, + short replication, tSize blocksize) { // NOLINT + return this->hdfsOpenFile(fs, path, flags, bufferSize, replication, blocksize); +} + +int LibHdfsShim::CloseFile(hdfsFS fs, hdfsFile file) { + return this->hdfsCloseFile(fs, file); +} + +int LibHdfsShim::Exists(hdfsFS fs, const char* path) { + return this->hdfsExists(fs, path); +} + +int LibHdfsShim::Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos) { + return this->hdfsSeek(fs, file, desiredPos); +} + +tOffset LibHdfsShim::Tell(hdfsFS fs, hdfsFile file) { return this->hdfsTell(fs, file); } + +tSize LibHdfsShim::Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length) { + return this->hdfsRead(fs, file, buffer, length); +} + +bool LibHdfsShim::HasPread() { + GET_SYMBOL(this, hdfsPread); + return this->hdfsPread != nullptr; +} + +tSize LibHdfsShim::Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, + tSize length) { + GET_SYMBOL(this, hdfsPread); + DCHECK(this->hdfsPread); + return this->hdfsPread(fs, file, position, buffer, length); +} + +tSize LibHdfsShim::Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length) { + 
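  // hdfsWrite is a required symbol, resolved eagerly by GetRequiredSymbols(),
  // so no availability check is needed here.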
return this->hdfsWrite(fs, file, buffer, length); +} + +int LibHdfsShim::Flush(hdfsFS fs, hdfsFile file) { return this->hdfsFlush(fs, file); } + +int LibHdfsShim::Available(hdfsFS fs, hdfsFile file) { + GET_SYMBOL(this, hdfsAvailable); + if (this->hdfsAvailable) { + return this->hdfsAvailable(fs, file); + } else { + return 0; + } +} + +int LibHdfsShim::Copy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) { + GET_SYMBOL(this, hdfsCopy); + if (this->hdfsCopy) { + return this->hdfsCopy(srcFS, src, dstFS, dst); + } else { + return 0; + } +} + +int LibHdfsShim::Move(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) { + GET_SYMBOL(this, hdfsMove); + if (this->hdfsMove) { + return this->hdfsMove(srcFS, src, dstFS, dst); + } else { + return 0; + } +} + +int LibHdfsShim::Delete(hdfsFS fs, const char* path, int recursive) { + return this->hdfsDelete(fs, path, recursive); +} + +int LibHdfsShim::Rename(hdfsFS fs, const char* oldPath, const char* newPath) { + GET_SYMBOL(this, hdfsRename); + if (this->hdfsRename) { + return this->hdfsRename(fs, oldPath, newPath); + } else { + return 0; + } +} + +char* LibHdfsShim::GetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) { + GET_SYMBOL(this, hdfsGetWorkingDirectory); + if (this->hdfsGetWorkingDirectory) { + return this->hdfsGetWorkingDirectory(fs, buffer, bufferSize); + } else { + return nullptr; + } +} + +int LibHdfsShim::SetWorkingDirectory(hdfsFS fs, const char* path) { + GET_SYMBOL(this, hdfsSetWorkingDirectory); + if (this->hdfsSetWorkingDirectory) { + return this->hdfsSetWorkingDirectory(fs, path); + } else { + return 0; + } +} + +int LibHdfsShim::MakeDirectory(hdfsFS fs, const char* path) { + return this->hdfsCreateDirectory(fs, path); +} + +int LibHdfsShim::SetReplication(hdfsFS fs, const char* path, int16_t replication) { + GET_SYMBOL(this, hdfsSetReplication); + if (this->hdfsSetReplication) { + return this->hdfsSetReplication(fs, path, replication); + } else { + return 0; + } +} + +hdfsFileInfo* LibHdfsShim::ListDirectory(hdfsFS fs, const char* path, int* numEntries) { + return this->hdfsListDirectory(fs, path, numEntries); +} + +hdfsFileInfo* LibHdfsShim::GetPathInfo(hdfsFS fs, const char* path) { + return this->hdfsGetPathInfo(fs, path); +} + +void LibHdfsShim::FreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries) { + this->hdfsFreeFileInfo(hdfsFileInfo, numEntries); +} + +char*** LibHdfsShim::GetHosts(hdfsFS fs, const char* path, tOffset start, + tOffset length) { + GET_SYMBOL(this, hdfsGetHosts); + if (this->hdfsGetHosts) { + return this->hdfsGetHosts(fs, path, start, length); + } else { + return nullptr; + } +} + +void LibHdfsShim::FreeHosts(char*** blockHosts) { + GET_SYMBOL(this, hdfsFreeHosts); + if (this->hdfsFreeHosts) { + this->hdfsFreeHosts(blockHosts); + } +} + +tOffset LibHdfsShim::GetDefaultBlockSize(hdfsFS fs) { + GET_SYMBOL(this, hdfsGetDefaultBlockSize); + if (this->hdfsGetDefaultBlockSize) { + return this->hdfsGetDefaultBlockSize(fs); + } else { + return 0; + } +} + +tOffset LibHdfsShim::GetCapacity(hdfsFS fs) { return this->hdfsGetCapacity(fs); } + +tOffset LibHdfsShim::GetUsed(hdfsFS fs) { return this->hdfsGetUsed(fs); } + +int LibHdfsShim::Chown(hdfsFS fs, const char* path, const char* owner, + const char* group) { + return this->hdfsChown(fs, path, owner, group); +} + +int LibHdfsShim::Chmod(hdfsFS fs, const char* path, short mode) { // NOLINT + return this->hdfsChmod(fs, path, mode); +} + +int LibHdfsShim::Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime) { + 
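  // hdfsUtime is optional in some libhdfs builds: resolve it lazily and
  // treat a missing symbol as a no-op.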
GET_SYMBOL(this, hdfsUtime); + if (this->hdfsUtime) { + return this->hdfsUtime(fs, path, mtime, atime); + } else { + return 0; + } +} + +} // namespace io::internal +} // namespace arrow diff --git a/cpp/src/arrow/io/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h similarity index 100% rename from cpp/src/arrow/io/hdfs_internal.h rename to cpp/src/arrow/filesystem/hdfs_internal.h diff --git a/cpp/src/arrow/filesystem/hdfs_io.cc b/cpp/src/arrow/filesystem/hdfs_io.cc new file mode 100644 index 00000000000..cc55dd8d143 --- /dev/null +++ b/cpp/src/arrow/filesystem/hdfs_io.cc @@ -0,0 +1,720 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/filesystem/type_fwd.h" +#include "arrow/io/hdfs.h" +#include "arrow/io/hdfs_internal.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging_internal.h" + +using std::size_t; + +namespace arrow { + +using internal::IOErrorFromErrno; + +namespace io { + +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return IOErrorFromErrno(errno, "HDFS ", WHAT, " failed"); \ + } \ + } while (0) + +static constexpr int kDefaultHdfsBufferSize = 1 << 16; + +// ---------------------------------------------------------------------- +// File reading + +class HdfsAnyFileImpl { + public: + void set_members(const std::string& path, internal::LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle) { + path_ = path; + driver_ = driver; + fs_ = fs; + file_ = handle; + is_open_ = true; + } + + Status Seek(int64_t position) { + RETURN_NOT_OK(CheckClosed()); + int ret = driver_->Seek(fs_, file_, position); + CHECK_FAILURE(ret, "seek"); + return Status::OK(); + } + + Result Tell() { + RETURN_NOT_OK(CheckClosed()); + int64_t ret = driver_->Tell(fs_, file_); + CHECK_FAILURE(ret, "tell"); + return ret; + } + + bool is_open() const { return is_open_; } + + protected: + Status CheckClosed() { + if (!is_open_) { + return Status::Invalid("Operation on closed HDFS file"); + } + return Status::OK(); + } + + std::string path_; + + internal::LibHdfsShim* driver_; + + // For threadsafety + std::mutex lock_; + + // These are pointers in libhdfs, so OK to copy + hdfsFS fs_; + hdfsFile file_; + + bool is_open_; +}; + +namespace { + +Status GetPathInfoFailed(const std::string& path) { + return IOErrorFromErrno(errno, "Calling GetPathInfo for '", path, "' failed"); +} + +} // namespace + +// Private implementation for read-only files +class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { + public: + 
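  // Buffers returned by Read/ReadAt are allocated from the memory pool of the
  // IOContext supplied to HdfsReadableFile.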
explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} + + Status Close() { + if (is_open_) { + // is_open_ must be set to false in the beginning, because the destructor + // attempts to close the stream again, and if the first close fails, then + // the error doesn't get propagated properly and the second close + // initiated by the destructor raises a segfault + is_open_ = false; + int ret = driver_->CloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + } + return Status::OK(); + } + + bool closed() const { return !is_open_; } + + Result ReadAt(int64_t position, int64_t nbytes, uint8_t* buffer) { + RETURN_NOT_OK(CheckClosed()); + if (!driver_->HasPread()) { + std::lock_guard guard(lock_); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, buffer); + } + + constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); + int64_t total_bytes = 0; + while (nbytes > 0) { + const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); + tSize ret = + driver_->Pread(fs_, file_, static_cast(position), buffer, block_size); + CHECK_FAILURE(ret, "read"); + DCHECK_LE(ret, block_size); + if (ret == 0) { + break; // EOF + } + buffer += ret; + total_bytes += ret; + position += ret; + nbytes -= ret; + } + return total_bytes; + } + + Result> ReadAt(int64_t position, int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + ReadAt(position, nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + buffer->ZeroPadding(); + } + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buffer)); + } + + Result Read(int64_t nbytes, void* buffer) { + RETURN_NOT_OK(CheckClosed()); + + int64_t total_bytes = 0; + while (total_bytes < nbytes) { + tSize ret = driver_->Read( + fs_, file_, reinterpret_cast(buffer) + total_bytes, + static_cast(std::min(buffer_size_, nbytes - total_bytes))); + CHECK_FAILURE(ret, "read"); + total_bytes += ret; + if (ret == 0) { + break; + } + } + return total_bytes; + } + + Result> Read(int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + } + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buffer)); + } + + Result GetSize() { + RETURN_NOT_OK(CheckClosed()); + + hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); + if (entry == nullptr) { + return GetPathInfoFailed(path_); + } + int64_t size = entry->mSize; + driver_->FreeFileInfo(entry, 1); + return size; + } + + void set_memory_pool(MemoryPool* pool) { pool_ = pool; } + + void set_buffer_size(int32_t buffer_size) { buffer_size_ = buffer_size; } + + private: + MemoryPool* pool_; + int32_t buffer_size_; +}; + +HdfsReadableFile::HdfsReadableFile(const io::IOContext& io_context) { + impl_.reset(new HdfsReadableFileImpl(io_context.pool())); +} + +HdfsReadableFile::~HdfsReadableFile() { + ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsReadableFile"); +} + +Status HdfsReadableFile::Close() { return impl_->Close(); } + +bool HdfsReadableFile::closed() const { return impl_->closed(); } + +Result HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, void* buffer) { + return 
impl_->ReadAt(position, nbytes, reinterpret_cast(buffer)); +} + +Result> HdfsReadableFile::ReadAt(int64_t position, + int64_t nbytes) { + return impl_->ReadAt(position, nbytes); +} + +Result HdfsReadableFile::Read(int64_t nbytes, void* buffer) { + return impl_->Read(nbytes, buffer); +} + +Result> HdfsReadableFile::Read(int64_t nbytes) { + return impl_->Read(nbytes); +} + +Result HdfsReadableFile::GetSize() { return impl_->GetSize(); } + +Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); } + +Result HdfsReadableFile::Tell() const { return impl_->Tell(); } + +// ---------------------------------------------------------------------- +// File writing + +// Private implementation for writable-only files +class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { + public: + HdfsOutputStreamImpl() {} + + Status Close() { + if (is_open_) { + // is_open_ must be set to false in the beginning, because the destructor + // attempts to close the stream again, and if the first close fails, then + // the error doesn't get propagated properly and the second close + // initiated by the destructor raises a segfault + is_open_ = false; + RETURN_NOT_OK(FlushInternal()); + int ret = driver_->CloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + } + return Status::OK(); + } + + bool closed() const { return !is_open_; } + + Status Flush() { + RETURN_NOT_OK(CheckClosed()); + + return FlushInternal(); + } + + Status Write(const uint8_t* buffer, int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); + + std::lock_guard guard(lock_); + while (nbytes > 0) { + const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); + tSize ret = driver_->Write(fs_, file_, buffer, block_size); + CHECK_FAILURE(ret, "Write"); + DCHECK_LE(ret, block_size); + buffer += ret; + nbytes -= ret; + } + return Status::OK(); + } + + protected: + Status FlushInternal() { + int ret = driver_->Flush(fs_, file_); + CHECK_FAILURE(ret, "Flush"); + return Status::OK(); + } +}; + +HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } + +HdfsOutputStream::~HdfsOutputStream() { + ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsOutputStream"); +} + +Status HdfsOutputStream::Close() { return impl_->Close(); } + +bool HdfsOutputStream::closed() const { return impl_->closed(); } + +Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes) { + return impl_->Write(reinterpret_cast(buffer), nbytes); +} + +Status HdfsOutputStream::Flush() { return impl_->Flush(); } + +Result HdfsOutputStream::Tell() const { return impl_->Tell(); } + +// ---------------------------------------------------------------------- +// HDFS client + +// TODO(wesm): this could throw std::bad_alloc in the course of copying strings +// into the path info object +static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { + out->kind = input->mKind == kObjectKindFile ? 
arrow::fs::FileType::File + : arrow::fs::FileType::Directory; + out->name = std::string(input->mName); + out->owner = std::string(input->mOwner); + out->group = std::string(input->mGroup); + + out->last_access_time = static_cast(input->mLastAccess); + out->last_modified_time = static_cast(input->mLastMod); + out->size = static_cast(input->mSize); + + out->replication = input->mReplication; + out->block_size = input->mBlockSize; + + out->permissions = input->mPermissions; +} + +// Private implementation +class HadoopFileSystem::HadoopFileSystemImpl { + public: + HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} + + Status Connect(const HdfsConnectionConfig* config) { + RETURN_NOT_OK(ConnectLibHdfs(&driver_)); + + // connect to HDFS with the builder object + hdfsBuilder* builder = driver_->NewBuilder(); + if (!config->host.empty()) { + driver_->BuilderSetNameNode(builder, config->host.c_str()); + } + driver_->BuilderSetNameNodePort(builder, static_cast(config->port)); + if (!config->user.empty()) { + driver_->BuilderSetUserName(builder, config->user.c_str()); + } + if (!config->kerb_ticket.empty()) { + driver_->BuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); + } + + for (const auto& kv : config->extra_conf) { + int ret = driver_->BuilderConfSetStr(builder, kv.first.c_str(), kv.second.c_str()); + CHECK_FAILURE(ret, "confsetstr"); + } + + driver_->BuilderSetForceNewInstance(builder); + fs_ = driver_->BuilderConnect(builder); + + if (fs_ == nullptr) { + return Status::IOError("HDFS connection failed"); + } + namenode_host_ = config->host; + port_ = config->port; + user_ = config->user; + kerb_ticket_ = config->kerb_ticket; + + return Status::OK(); + } + + Status MakeDirectory(const std::string& path) { + int ret = driver_->MakeDirectory(fs_, path.c_str()); + CHECK_FAILURE(ret, "create directory"); + return Status::OK(); + } + + Status Delete(const std::string& path, bool recursive) { + int ret = driver_->Delete(fs_, path.c_str(), static_cast(recursive)); + CHECK_FAILURE(ret, "delete"); + return Status::OK(); + } + + Status Disconnect() { + int ret = driver_->Disconnect(fs_); + CHECK_FAILURE(ret, "hdfsFS::Disconnect"); + return Status::OK(); + } + + bool Exists(const std::string& path) { + // hdfsExists does not distinguish between RPC failure and the file not + // existing + int ret = driver_->Exists(fs_, path.c_str()); + return ret == 0; + } + + Status GetCapacity(int64_t* nbytes) { + tOffset ret = driver_->GetCapacity(fs_); + CHECK_FAILURE(ret, "GetCapacity"); + *nbytes = ret; + return Status::OK(); + } + + Status GetUsed(int64_t* nbytes) { + tOffset ret = driver_->GetUsed(fs_); + CHECK_FAILURE(ret, "GetUsed"); + *nbytes = ret; + return Status::OK(); + } + + Status GetWorkingDirectory(std::string* out) { + char buffer[2048]; + if (driver_->GetWorkingDirectory(fs_, buffer, sizeof(buffer) - 1) == nullptr) { + return IOErrorFromErrno(errno, "HDFS GetWorkingDirectory failed"); + } + *out = buffer; + return Status::OK(); + } + + Status GetPathInfo(const std::string& path, HdfsPathInfo* info) { + hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); + + if (entry == nullptr) { + return GetPathInfoFailed(path); + } + + SetPathInfo(entry, info); + driver_->FreeFileInfo(entry, 1); + + return Status::OK(); + } + + Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) { + HdfsPathInfo info; + RETURN_NOT_OK(GetPathInfo(path, &info)); + + file_info->set_size(info.size); + file_info->set_type(info.kind); + return Status::OK(); + } + + Status 
GetChildren(const std::string& path, std::vector* listing) { + std::vector detailed_listing; + RETURN_NOT_OK(ListDirectory(path, &detailed_listing)); + for (const auto& info : detailed_listing) { + listing->push_back(info.name); + } + return Status::OK(); + } + + Status ListDirectory(const std::string& path, std::vector* listing) { + int num_entries = 0; + errno = 0; + hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries); + + if (entries == nullptr) { + // If the directory is empty, entries is NULL but errno is 0. Non-zero + // errno indicates error + // + // Note: errno is thread-local + // + // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set + // errno 2/ENOENT for empty directories. To be more robust to this we + // double check this case + if ((errno == 0) || (errno == ENOENT && Exists(path))) { + num_entries = 0; + } else { + return IOErrorFromErrno(errno, "HDFS list directory failed"); + } + } + + // Allocate additional space for elements + int vec_offset = static_cast(listing->size()); + listing->resize(vec_offset + num_entries); + + for (int i = 0; i < num_entries; ++i) { + SetPathInfo(entries + i, &(*listing)[vec_offset + i]); + } + + // Free libhdfs file info + driver_->FreeFileInfo(entries, num_entries); + + return Status::OK(); + } + + Status OpenReadable(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, + std::shared_ptr* file) { + errno = 0; + hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); + + if (handle == nullptr) { + return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); + } + + // std::make_shared does not work with private ctors + *file = std::shared_ptr(new HdfsReadableFile(io_context)); + (*file)->impl_->set_members(path, driver_, fs_, handle); + (*file)->impl_->set_buffer_size(buffer_size); + + return Status::OK(); + } + + Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file) { + int flags = O_WRONLY; + if (append) flags |= O_APPEND; + + errno = 0; + hdfsFile handle = + driver_->OpenFile(fs_, path.c_str(), flags, buffer_size, replication, + static_cast(default_block_size)); + + if (handle == nullptr) { + return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); + } + + // std::make_shared does not work with private ctors + *file = std::shared_ptr(new HdfsOutputStream()); + (*file)->impl_->set_members(path, driver_, fs_, handle); + + return Status::OK(); + } + + Status Rename(const std::string& src, const std::string& dst) { + int ret = driver_->Rename(fs_, src.c_str(), dst.c_str()); + CHECK_FAILURE(ret, "Rename"); + return Status::OK(); + } + + Status Copy(const std::string& src, const std::string& dst) { + int ret = driver_->Copy(fs_, src.c_str(), fs_, dst.c_str()); + CHECK_FAILURE(ret, "Rename"); + return Status::OK(); + } + + Status Move(const std::string& src, const std::string& dst) { + int ret = driver_->Move(fs_, src.c_str(), fs_, dst.c_str()); + CHECK_FAILURE(ret, "Rename"); + return Status::OK(); + } + + Status Chmod(const std::string& path, int mode) { + int ret = driver_->Chmod(fs_, path.c_str(), static_cast(mode)); // NOLINT + CHECK_FAILURE(ret, "Chmod"); + return Status::OK(); + } + + Status Chown(const std::string& path, const char* owner, const char* group) { + int ret = driver_->Chown(fs_, path.c_str(), owner, group); + CHECK_FAILURE(ret, "Chown"); + return Status::OK(); + } + + private: + 
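  // Non-owning pointer to the process-wide libhdfs shim initialized by
  // ConnectLibHdfs().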
internal::LibHdfsShim* driver_; + + std::string namenode_host_; + std::string user_; + int port_; + std::string kerb_ticket_; + + hdfsFS fs_; +}; + +// ---------------------------------------------------------------------- +// Public API for HDFSClient + +HadoopFileSystem::HadoopFileSystem() { impl_.reset(new HadoopFileSystemImpl()); } + +HadoopFileSystem::~HadoopFileSystem() {} + +Status HadoopFileSystem::Connect(const HdfsConnectionConfig* config, + std::shared_ptr* fs) { + // ctor is private, make_shared will not work + *fs = std::shared_ptr(new HadoopFileSystem()); + + RETURN_NOT_OK((*fs)->impl_->Connect(config)); + return Status::OK(); +} + +Status HadoopFileSystem::MakeDirectory(const std::string& path) { + return impl_->MakeDirectory(path); +} + +Status HadoopFileSystem::Delete(const std::string& path, bool recursive) { + return impl_->Delete(path, recursive); +} + +Status HadoopFileSystem::DeleteDirectory(const std::string& path) { + return Delete(path, true); +} + +Status HadoopFileSystem::Disconnect() { return impl_->Disconnect(); } + +bool HadoopFileSystem::Exists(const std::string& path) { return impl_->Exists(path); } + +Status HadoopFileSystem::GetPathInfo(const std::string& path, HdfsPathInfo* info) { + return impl_->GetPathInfo(path, info); +} + +Status HadoopFileSystem::Stat(const std::string& path, arrow::fs::FileInfo* file_info) { + return impl_->Stat(path, file_info); +} + +Status HadoopFileSystem::GetCapacity(int64_t* nbytes) { + return impl_->GetCapacity(nbytes); +} + +Status HadoopFileSystem::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); } + +Status HadoopFileSystem::GetWorkingDirectory(std::string* out) { + return impl_->GetWorkingDirectory(out); +} + +Status HadoopFileSystem::GetChildren(const std::string& path, + std::vector* listing) { + return impl_->GetChildren(path, listing); +} + +Status HadoopFileSystem::ListDirectory(const std::string& path, + std::vector* listing) { + return impl_->ListDirectory(path, listing); +} + +Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, + std::shared_ptr* file) { + return impl_->OpenReadable(path, buffer_size, io::default_io_context(), file); +} + +Status HadoopFileSystem::OpenReadable(const std::string& path, + std::shared_ptr* file) { + return OpenReadable(path, kDefaultHdfsBufferSize, io::default_io_context(), file); +} + +Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, + std::shared_ptr* file) { + return impl_->OpenReadable(path, buffer_size, io_context, file); +} + +Status HadoopFileSystem::OpenReadable(const std::string& path, + const io::IOContext& io_context, + std::shared_ptr* file) { + return OpenReadable(path, kDefaultHdfsBufferSize, io_context, file); +} + +Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + std::shared_ptr* file) { + return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, + file); +} + +Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, + std::shared_ptr* file) { + return OpenWritable(path, append, 0, 0, 0, file); +} + +Status HadoopFileSystem::Chmod(const std::string& path, int mode) { + return impl_->Chmod(path, mode); +} + +Status HadoopFileSystem::Chown(const std::string& path, const char* owner, + const char* group) { + return impl_->Chown(path, owner, group); +} + +Status HadoopFileSystem::Rename(const std::string& src, 
const std::string& dst) { + return impl_->Rename(src, dst); +} + +Status HadoopFileSystem::Copy(const std::string& src, const std::string& dst) { + return impl_->Copy(src, dst); +} + +Status HadoopFileSystem::Move(const std::string& src, const std::string& dst) { + return impl_->Move(src, dst); +} + +// ---------------------------------------------------------------------- +// Allow public API users to check whether we are set up correctly + +Status HaveLibHdfs() { + internal::LibHdfsShim* driver; + return internal::ConnectLibHdfs(&driver); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/filesystem/hdfs_io.h similarity index 100% rename from cpp/src/arrow/io/hdfs.h rename to cpp/src/arrow/filesystem/hdfs_io.h diff --git a/cpp/src/arrow/io/hdfs_test.cc b/cpp/src/arrow/filesystem/hdfs_test_io.cc similarity index 91% rename from cpp/src/arrow/io/hdfs_test.cc rename to cpp/src/arrow/filesystem/hdfs_test_io.cc index e0389473ea9..125716a02dc 100644 --- a/cpp/src/arrow/io/hdfs_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_test_io.cc @@ -31,15 +31,15 @@ #include "arrow/buffer.h" #include "arrow/filesystem/type_fwd.h" -#include "arrow/io/hdfs.h" -#include "arrow/io/hdfs_internal.h" +#include "arrow/filesystem/hdfs_internal.h" +#include "arrow/filesystem/hdfs_io.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" namespace arrow { -namespace io { +namespace fs { std::vector RandomData(int64_t size) { std::vector buffer(size); @@ -59,7 +59,7 @@ class TestHadoopFileSystem : public ::testing::Test { Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, bool append = false, int buffer_size = 0, int16_t replication = 0, int default_block_size = 0) { - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(client_->OpenWritable(path, append, buffer_size, replication, default_block_size, &file)); @@ -83,7 +83,7 @@ class TestHadoopFileSystem : public ::testing::Test { // Set up shared state between unit tests void SetUp() { - internal::LibHdfsShim* driver_shim; + io::internal::LibHdfsShim* driver_shim; client_ = nullptr; scratch_dir_ = @@ -119,7 +119,7 @@ class TestHadoopFileSystem : public ::testing::Test { conf_.user = user; conf_.port = port == nullptr ? 
20500 : atoi(port); - ASSERT_OK(HadoopFileSystem::Connect(&conf_, &client_)); + ASSERT_OK(io::HadoopFileSystem::Connect(&conf_, &client_)); } void TearDown() { @@ -131,12 +131,12 @@ class TestHadoopFileSystem : public ::testing::Test { } } - HdfsConnectionConfig conf_; + io::HdfsConnectionConfig conf_; bool loaded_driver_; // Resources shared amongst unit tests std::string scratch_dir_; - std::shared_ptr client_; + std::shared_ptr client_; }; #define SKIP_IF_NO_DRIVER() \ @@ -147,8 +147,8 @@ class TestHadoopFileSystem : public ::testing::Test { TEST_F(TestHadoopFileSystem, ConnectsAgain) { SKIP_IF_NO_DRIVER(); - std::shared_ptr client; - ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client)); + std::shared_ptr client; + ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client)); ASSERT_OK(client->Disconnect()); } @@ -157,14 +157,14 @@ TEST_F(TestHadoopFileSystem, MultipleClients) { ASSERT_OK(this->MakeScratchDir()); - std::shared_ptr client1; - std::shared_ptr client2; - ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client1)); - ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client2)); + std::shared_ptr client1; + std::shared_ptr client2; + ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client1)); + ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client2)); ASSERT_OK(client1->Disconnect()); // client2 continues to function after equivalent client1 has shutdown - std::vector listing; + std::vector listing; ASSERT_OK(client2->ListDirectory(this->scratch_dir_, &listing)); ASSERT_OK(client2->Disconnect()); } @@ -180,7 +180,7 @@ TEST_F(TestHadoopFileSystem, MakeDirectory) { ASSERT_OK(this->client_->MakeDirectory(path)); ASSERT_TRUE(this->client_->Exists(path)); - std::vector listing; + std::vector listing; ARROW_EXPECT_OK(this->client_->ListDirectory(path, &listing)); ASSERT_EQ(0, listing.size()); ARROW_EXPECT_OK(this->client_->Delete(path, true)); @@ -204,7 +204,7 @@ TEST_F(TestHadoopFileSystem, GetCapacityUsed) { TEST_F(TestHadoopFileSystem, GetPathInfo) { SKIP_IF_NO_DRIVER(); - HdfsPathInfo info; + io::HdfsPathInfo info; ASSERT_OK(this->MakeScratchDir()); @@ -238,7 +238,7 @@ TEST_F(TestHadoopFileSystem, GetPathInfoNotExist) { ASSERT_OK(this->MakeScratchDir()); auto path = this->ScratchPath("path-does-not-exist"); - HdfsPathInfo info; + io::HdfsPathInfo info; Status s = this->client_->GetPathInfo(path, &info); ASSERT_TRUE(s.IsIOError()); @@ -262,7 +262,7 @@ TEST_F(TestHadoopFileSystem, AppendToFile) { // now append ASSERT_OK(this->WriteDummyFile(path, buffer.data(), size, true)); - HdfsPathInfo info; + io::HdfsPathInfo info; ASSERT_OK(this->client_->GetPathInfo(path, &info)); ASSERT_EQ(size * 2, info.size); } @@ -282,7 +282,7 @@ TEST_F(TestHadoopFileSystem, ListDirectory) { ASSERT_OK(this->WriteDummyFile(p2, data.data(), size / 2)); ASSERT_OK(this->client_->MakeDirectory(d1)); - std::vector listing; + std::vector listing; ASSERT_OK(this->client_->ListDirectory(this->scratch_dir_, &listing)); // Do it again, appends! 
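The next hunk updates the kind assertions: with ObjectType gone, HdfsPathInfo::kind carries an arrow::fs::FileType and is compared against its enumerators directly. A minimal sketch of the same pattern outside the tests; DescribePath, client and path are illustrative names, not part of this diff:

    #include <string>

    #include "arrow/filesystem/hdfs_io.h"
    #include "arrow/filesystem/type_fwd.h"
    #include "arrow/status.h"

    // Hypothetical helper: branch on the FileType reported for a path.
    arrow::Status DescribePath(arrow::io::HadoopFileSystem* client,
                               const std::string& path) {
      arrow::io::HdfsPathInfo info;
      ARROW_RETURN_NOT_OK(client->GetPathInfo(path, &info));
      if (info.kind == arrow::fs::FileType::Directory) {
        // e.g. list children instead of reading bytes
      }
      return arrow::Status::OK();
    }
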
@@ -292,7 +292,7 @@ TEST_F(TestHadoopFileSystem, ListDirectory) { // Argh, well, shouldn't expect the listing to be in any particular order for (size_t i = 0; i < listing.size(); ++i) { - const HdfsPathInfo& info = listing[i]; + const io::HdfsPathInfo& info = listing[i]; if (info.name == this->HdfsAbsPath(p1)) { ASSERT_EQ(arrow::fs::FileType::File, info.kind); ASSERT_EQ(size, info.size); @@ -318,7 +318,7 @@ TEST_F(TestHadoopFileSystem, ReadableMethods) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); // Test GetSize -- move this into its own unit test if ever needed @@ -355,7 +355,7 @@ TEST_F(TestHadoopFileSystem, LargeFile) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); ASSERT_FALSE(file->closed()); @@ -366,7 +366,7 @@ TEST_F(TestHadoopFileSystem, LargeFile) { ASSERT_EQ(0, std::memcmp(buffer->data(), data.data(), size)); // explicit buffer size - std::shared_ptr file2; + std::shared_ptr file2; ASSERT_OK(this->client_->OpenReadable(path, 1 << 18, &file2)); ASSERT_OK_AND_ASSIGN(auto buffer2, AllocateBuffer(size)); @@ -404,7 +404,7 @@ TEST_F(TestHadoopFileSystem, ChmodChown) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - HdfsPathInfo info; + io::HdfsPathInfo info; ASSERT_OK(this->client_->Chmod(path, mode)); ASSERT_OK(this->client_->GetPathInfo(path, &info)); ASSERT_EQ(mode, info.permissions); @@ -427,7 +427,7 @@ TEST_F(TestHadoopFileSystem, ThreadSafety) { ASSERT_OK(this->WriteDummyFile(src_path, reinterpret_cast(data.c_str()), static_cast(data.size()))); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(src_path, &file)); std::atomic correct_count(0); @@ -463,5 +463,5 @@ TEST_F(TestHadoopFileSystem, ThreadSafety) { ASSERT_EQ(niter * 4, correct_count); } -} // namespace io +} // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 0398422b053..647f54c9b7f 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -21,16 +21,6 @@ add_arrow_test(buffered_test PREFIX "arrow-io") add_arrow_test(compressed_test PREFIX "arrow-io") add_arrow_test(file_test PREFIX "arrow-io") - -if(ARROW_HDFS) - add_arrow_test(hdfs_test - NO_VALGRIND - PREFIX - "arrow-io" - EXTRA_LINK_LIBS - arrow::hadoop) -endif() - add_arrow_test(memory_test PREFIX "arrow-io") add_arrow_benchmark(file_benchmark PREFIX "arrow-io") diff --git a/cpp/src/arrow/io/api.h b/cpp/src/arrow/io/api.h index d55b2c2d55a..3bfde6de452 100644 --- a/cpp/src/arrow/io/api.h +++ b/cpp/src/arrow/io/api.h @@ -20,6 +20,5 @@ #include "arrow/io/buffered.h" #include "arrow/io/compressed.h" #include "arrow/io/file.h" -#include "arrow/io/hdfs.h" #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 33a9c6ed40c..19d78d1a841 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -84,8 +84,6 @@ arrow_components = { 'io/caching.cc', 'io/compressed.cc', 'io/file.cc', - 'io/hdfs.cc', - 'io/hdfs_internal.cc', 'io/interfaces.cc', 'io/memory.cc', 'io/slow.cc', diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index da2fe966475..c99f480287e 100644 --- 
a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -110,6 +110,13 @@ def _filesystem_is_available(fs): else: return True +def have_libhdfs(): + try: + from pyarrow._hdfs import _have_libhdfs # noqa + return _have_libhdfs() + except ImportError: + return False + def show_info(): """ @@ -260,8 +267,7 @@ def print_entry(label, value): BufferReader, BufferOutputStream, OSFile, MemoryMappedFile, memory_map, create_memory_map, MockOutputStream, - input_stream, output_stream, - have_libhdfs) + input_stream, output_stream) from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, concat_arrays, concat_tables, TableGroupBy, diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx index 8a9fddee3dd..b03de2fc17e 100644 --- a/python/pyarrow/_hdfs.pyx +++ b/python/pyarrow/_hdfs.pyx @@ -23,9 +23,22 @@ from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem from pyarrow.lib import frombytes, tobytes +from pyarrow.lib cimport check_status from pyarrow.util import _stringify_path +def _have_libhdfs(): + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ + try: + with nogil: + check_status(HaveLibHdfs()) + return True + except Exception: + return False + + cdef class HadoopFileSystem(FileSystem): """ HDFS backed FileSystem implementation diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 932b3d9548f..c359a4a84c7 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1744,30 +1744,6 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: CTransformInputStreamVTable vtable, object method_arg) - # ---------------------------------------------------------------------- - # HDFS - - CStatus HaveLibHdfs() - CStatus HaveLibHdfs3() - - cdef enum HdfsDriver" arrow::io::HdfsDriver": - HdfsDriver_LIBHDFS" arrow::io::HdfsDriver::LIBHDFS" - HdfsDriver_LIBHDFS3" arrow::io::HdfsDriver::LIBHDFS3" - - cdef cppclass HdfsConnectionConfig: - c_string host - int port - c_string user - c_string kerb_ticket - unordered_map[c_string, c_string] extra_conf - HdfsDriver driver - - cdef cppclass HdfsReadableFile(CRandomAccessFile): - pass - - cdef cppclass HdfsOutputStream(COutputStream): - pass - cdef cppclass CBufferReader \ " arrow::io::BufferReader"(CRandomAccessFile): CBufferReader(const shared_ptr[CBuffer]& buffer) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 6ee014ce309..b67880a88f1 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -306,6 +306,80 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: const CIOContext& io_context, int64_t chunk_size, c_bool use_threads) +cdef extern from "arrow/filesystem/hdfs_io.h" namespace "arrow::io" nogil: + + cdef cppclass CIOFileSystem" arrow::io::FileSystem": + CStatus Stat(const c_string& path, CFileInfo* stat) + + CStatus HaveLibHdfs() + CStatus HaveLibHdfs3() + + cdef enum HdfsDriver" arrow::io::HdfsDriver": + HdfsDriver_LIBHDFS" arrow::io::HdfsDriver::LIBHDFS" + HdfsDriver_LIBHDFS3" arrow::io::HdfsDriver::LIBHDFS3" + + cdef cppclass HdfsConnectionConfig: + c_string host + int port + c_string user + c_string kerb_ticket + unordered_map[c_string, c_string] extra_conf + HdfsDriver driver + + cdef cppclass HdfsPathInfo: + CFileType kind + c_string name + c_string owner + c_string group + int32_t last_modified_time + int32_t last_access_time + int64_t size + int16_t replication + int64_t 
block_size + int16_t permissions + + cdef cppclass HdfsReadableFile(CRandomAccessFile): + pass + + cdef cppclass HdfsOutputStream(COutputStream): + pass + + cdef cppclass CIOHadoopFileSystem \ + "arrow::io::HadoopFileSystem"(CIOFileSystem): + @staticmethod + CStatus Connect(const HdfsConnectionConfig* config, + shared_ptr[CIOHadoopFileSystem]* client) + + CStatus MakeDirectory(const c_string& path) + + CStatus Delete(const c_string& path, c_bool recursive) + + CStatus Disconnect() + + c_bool Exists(const c_string& path) + + CStatus Chmod(const c_string& path, int mode) + CStatus Chown(const c_string& path, const char* owner, + const char* group) + + CStatus GetCapacity(int64_t* nbytes) + CStatus GetUsed(int64_t* nbytes) + + CStatus ListDirectory(const c_string& path, + vector[HdfsPathInfo]* listing) + + CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) + + CStatus Rename(const c_string& src, const c_string& dst) + + CStatus OpenReadable(const c_string& path, + shared_ptr[HdfsReadableFile]* handle) + + CStatus OpenWritable(const c_string& path, c_bool append, + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + shared_ptr[HdfsOutputStream]* handle) + # Callbacks for implementing Python filesystems # Use typedef to emulate syntax for std::function @@ -362,55 +436,3 @@ cdef extern from "arrow/python/filesystem.h" namespace "arrow::py::fs" nogil: CPyFileSystemVtable vtable) PyObject* handler() - -cdef extern from "arrow/io/api.h" namespace "arrow::io" nogil: - cdef cppclass CIOFileSystem" arrow::io::FileSystem": - CStatus Stat(const c_string& path, CFileInfo* stat) - - cdef cppclass HdfsPathInfo: - CFileType kind - c_string name - c_string owner - c_string group - int32_t last_modified_time - int32_t last_access_time - int64_t size - int16_t replication - int64_t block_size - int16_t permissions - - CStatus ListDirectory(const c_string& path, - vector[HdfsPathInfo]* listing) - - CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) - - cdef cppclass CIOHadoopFileSystem \ - "arrow::io::HadoopFileSystem"(CIOFileSystem): - @staticmethod - CStatus Connect(const HdfsConnectionConfig* config, - shared_ptr[CIOHadoopFileSystem]* client) - - CStatus MakeDirectory(const c_string& path) - - CStatus Delete(const c_string& path, c_bool recursive) - - CStatus Disconnect() - - c_bool Exists(const c_string& path) - - CStatus Chmod(const c_string& path, int mode) - CStatus Chown(const c_string& path, const char* owner, - const char* group) - - CStatus GetCapacity(int64_t* nbytes) - CStatus GetUsed(int64_t* nbytes) - - CStatus Rename(const c_string& src, const c_string& dst) - - CStatus OpenReadable(const c_string& path, - shared_ptr[HdfsReadableFile]* handle) - - CStatus OpenWritable(const c_string& path, c_bool append, - int32_t buffer_size, int16_t replication, - int64_t default_block_size, - shared_ptr[HdfsOutputStream]* handle) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index fd2d4df42cc..71a980632b9 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -32,7 +32,7 @@ import warnings from io import BufferedIOBase, IOBase, TextIOBase, UnsupportedOperation from queue import Queue, Empty as QueueEmpty -from pyarrow.lib cimport check_status, HaveLibHdfs +from pyarrow.lib cimport check_status from pyarrow.util import _is_path_like, _stringify_path @@ -46,18 +46,6 @@ cdef extern from "Python.h": char *v, Py_ssize_t len) except NULL -def have_libhdfs(): - """ - Return true if HDFS (HadoopFileSystem) library is set up correctly. 
- """ - try: - with nogil: - check_status(HaveLibHdfs()) - return True - except Exception: - return False - - def io_thread_count(): """ Return the number of threads to use for I/O operations. From 07f2aaea19002eb4b7d637dfd4f3d81166f514ec Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 8 Apr 2025 15:31:27 +0200 Subject: [PATCH 07/27] Fix a small mess after merge conflict resolve --- cpp/src/arrow/filesystem/hdfs_internal.cc | 2 +- cpp/src/arrow/io/hdfs.cc | 720 ---------------------- cpp/src/arrow/io/hdfs_internal.cc | 506 --------------- 3 files changed, 1 insertion(+), 1227 deletions(-) delete mode 100644 cpp/src/arrow/io/hdfs.cc delete mode 100644 cpp/src/arrow/io/hdfs_internal.cc diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index a4e6b79a5a1..16a18dc6dc7 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -46,7 +46,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/io_util.h" -#include "arrow/util/logging.h" +#include "arrow/util/logging_internal.h" namespace arrow { diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc deleted file mode 100644 index cc55dd8d143..00000000000 --- a/cpp/src/arrow/io/hdfs.cc +++ /dev/null @@ -1,720 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/filesystem/type_fwd.h" -#include "arrow/io/hdfs.h" -#include "arrow/io/hdfs_internal.h" -#include "arrow/io/interfaces.h" -#include "arrow/memory_pool.h" -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/io_util.h" -#include "arrow/util/logging_internal.h" - -using std::size_t; - -namespace arrow { - -using internal::IOErrorFromErrno; - -namespace io { - -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - return IOErrorFromErrno(errno, "HDFS ", WHAT, " failed"); \ - } \ - } while (0) - -static constexpr int kDefaultHdfsBufferSize = 1 << 16; - -// ---------------------------------------------------------------------- -// File reading - -class HdfsAnyFileImpl { - public: - void set_members(const std::string& path, internal::LibHdfsShim* driver, hdfsFS fs, - hdfsFile handle) { - path_ = path; - driver_ = driver; - fs_ = fs; - file_ = handle; - is_open_ = true; - } - - Status Seek(int64_t position) { - RETURN_NOT_OK(CheckClosed()); - int ret = driver_->Seek(fs_, file_, position); - CHECK_FAILURE(ret, "seek"); - return Status::OK(); - } - - Result Tell() { - RETURN_NOT_OK(CheckClosed()); - int64_t ret = driver_->Tell(fs_, file_); - CHECK_FAILURE(ret, "tell"); - return ret; - } - - bool is_open() const { return is_open_; } - - protected: - Status CheckClosed() { - if (!is_open_) { - return Status::Invalid("Operation on closed HDFS file"); - } - return Status::OK(); - } - - std::string path_; - - internal::LibHdfsShim* driver_; - - // For threadsafety - std::mutex lock_; - - // These are pointers in libhdfs, so OK to copy - hdfsFS fs_; - hdfsFile file_; - - bool is_open_; -}; - -namespace { - -Status GetPathInfoFailed(const std::string& path) { - return IOErrorFromErrno(errno, "Calling GetPathInfo for '", path, "' failed"); -} - -} // namespace - -// Private implementation for read-only files -class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { - public: - explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} - - Status Close() { - if (is_open_) { - // is_open_ must be set to false in the beginning, because the destructor - // attempts to close the stream again, and if the first close fails, then - // the error doesn't get propagated properly and the second close - // initiated by the destructor raises a segfault - is_open_ = false; - int ret = driver_->CloseFile(fs_, file_); - CHECK_FAILURE(ret, "CloseFile"); - } - return Status::OK(); - } - - bool closed() const { return !is_open_; } - - Result ReadAt(int64_t position, int64_t nbytes, uint8_t* buffer) { - RETURN_NOT_OK(CheckClosed()); - if (!driver_->HasPread()) { - std::lock_guard guard(lock_); - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, buffer); - } - - constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); - int64_t total_bytes = 0; - while (nbytes > 0) { - const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); - tSize ret = - driver_->Pread(fs_, file_, static_cast(position), buffer, block_size); - CHECK_FAILURE(ret, "read"); - DCHECK_LE(ret, block_size); - if (ret == 0) { - break; // EOF - } - buffer += ret; - total_bytes += ret; - position += ret; - nbytes -= ret; - } - return total_bytes; - } - - Result> ReadAt(int64_t position, int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, 
pool_)); - ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, - ReadAt(position, nbytes, buffer->mutable_data())); - if (bytes_read < nbytes) { - RETURN_NOT_OK(buffer->Resize(bytes_read)); - buffer->ZeroPadding(); - } - // R build with openSUSE155 requires an explicit shared_ptr construction - return std::shared_ptr(std::move(buffer)); - } - - Result Read(int64_t nbytes, void* buffer) { - RETURN_NOT_OK(CheckClosed()); - - int64_t total_bytes = 0; - while (total_bytes < nbytes) { - tSize ret = driver_->Read( - fs_, file_, reinterpret_cast(buffer) + total_bytes, - static_cast(std::min(buffer_size_, nbytes - total_bytes))); - CHECK_FAILURE(ret, "read"); - total_bytes += ret; - if (ret == 0) { - break; - } - } - return total_bytes; - } - - Result> Read(int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); - ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data())); - if (bytes_read < nbytes) { - RETURN_NOT_OK(buffer->Resize(bytes_read)); - } - // R build with openSUSE155 requires an explicit shared_ptr construction - return std::shared_ptr(std::move(buffer)); - } - - Result GetSize() { - RETURN_NOT_OK(CheckClosed()); - - hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); - if (entry == nullptr) { - return GetPathInfoFailed(path_); - } - int64_t size = entry->mSize; - driver_->FreeFileInfo(entry, 1); - return size; - } - - void set_memory_pool(MemoryPool* pool) { pool_ = pool; } - - void set_buffer_size(int32_t buffer_size) { buffer_size_ = buffer_size; } - - private: - MemoryPool* pool_; - int32_t buffer_size_; -}; - -HdfsReadableFile::HdfsReadableFile(const io::IOContext& io_context) { - impl_.reset(new HdfsReadableFileImpl(io_context.pool())); -} - -HdfsReadableFile::~HdfsReadableFile() { - ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsReadableFile"); -} - -Status HdfsReadableFile::Close() { return impl_->Close(); } - -bool HdfsReadableFile::closed() const { return impl_->closed(); } - -Result HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, void* buffer) { - return impl_->ReadAt(position, nbytes, reinterpret_cast(buffer)); -} - -Result> HdfsReadableFile::ReadAt(int64_t position, - int64_t nbytes) { - return impl_->ReadAt(position, nbytes); -} - -Result HdfsReadableFile::Read(int64_t nbytes, void* buffer) { - return impl_->Read(nbytes, buffer); -} - -Result> HdfsReadableFile::Read(int64_t nbytes) { - return impl_->Read(nbytes); -} - -Result HdfsReadableFile::GetSize() { return impl_->GetSize(); } - -Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); } - -Result HdfsReadableFile::Tell() const { return impl_->Tell(); } - -// ---------------------------------------------------------------------- -// File writing - -// Private implementation for writable-only files -class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { - public: - HdfsOutputStreamImpl() {} - - Status Close() { - if (is_open_) { - // is_open_ must be set to false in the beginning, because the destructor - // attempts to close the stream again, and if the first close fails, then - // the error doesn't get propagated properly and the second close - // initiated by the destructor raises a segfault - is_open_ = false; - RETURN_NOT_OK(FlushInternal()); - int ret = driver_->CloseFile(fs_, file_); - CHECK_FAILURE(ret, "CloseFile"); - } - return Status::OK(); - } - - bool closed() const { return !is_open_; } - - Status Flush() { - RETURN_NOT_OK(CheckClosed()); - - return 
FlushInternal(); - } - - Status Write(const uint8_t* buffer, int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); - - std::lock_guard guard(lock_); - while (nbytes > 0) { - const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); - tSize ret = driver_->Write(fs_, file_, buffer, block_size); - CHECK_FAILURE(ret, "Write"); - DCHECK_LE(ret, block_size); - buffer += ret; - nbytes -= ret; - } - return Status::OK(); - } - - protected: - Status FlushInternal() { - int ret = driver_->Flush(fs_, file_); - CHECK_FAILURE(ret, "Flush"); - return Status::OK(); - } -}; - -HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } - -HdfsOutputStream::~HdfsOutputStream() { - ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsOutputStream"); -} - -Status HdfsOutputStream::Close() { return impl_->Close(); } - -bool HdfsOutputStream::closed() const { return impl_->closed(); } - -Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes) { - return impl_->Write(reinterpret_cast(buffer), nbytes); -} - -Status HdfsOutputStream::Flush() { return impl_->Flush(); } - -Result HdfsOutputStream::Tell() const { return impl_->Tell(); } - -// ---------------------------------------------------------------------- -// HDFS client - -// TODO(wesm): this could throw std::bad_alloc in the course of copying strings -// into the path info object -static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { - out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File - : arrow::fs::FileType::Directory; - out->name = std::string(input->mName); - out->owner = std::string(input->mOwner); - out->group = std::string(input->mGroup); - - out->last_access_time = static_cast(input->mLastAccess); - out->last_modified_time = static_cast(input->mLastMod); - out->size = static_cast(input->mSize); - - out->replication = input->mReplication; - out->block_size = input->mBlockSize; - - out->permissions = input->mPermissions; -} - -// Private implementation -class HadoopFileSystem::HadoopFileSystemImpl { - public: - HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} - - Status Connect(const HdfsConnectionConfig* config) { - RETURN_NOT_OK(ConnectLibHdfs(&driver_)); - - // connect to HDFS with the builder object - hdfsBuilder* builder = driver_->NewBuilder(); - if (!config->host.empty()) { - driver_->BuilderSetNameNode(builder, config->host.c_str()); - } - driver_->BuilderSetNameNodePort(builder, static_cast(config->port)); - if (!config->user.empty()) { - driver_->BuilderSetUserName(builder, config->user.c_str()); - } - if (!config->kerb_ticket.empty()) { - driver_->BuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); - } - - for (const auto& kv : config->extra_conf) { - int ret = driver_->BuilderConfSetStr(builder, kv.first.c_str(), kv.second.c_str()); - CHECK_FAILURE(ret, "confsetstr"); - } - - driver_->BuilderSetForceNewInstance(builder); - fs_ = driver_->BuilderConnect(builder); - - if (fs_ == nullptr) { - return Status::IOError("HDFS connection failed"); - } - namenode_host_ = config->host; - port_ = config->port; - user_ = config->user; - kerb_ticket_ = config->kerb_ticket; - - return Status::OK(); - } - - Status MakeDirectory(const std::string& path) { - int ret = driver_->MakeDirectory(fs_, path.c_str()); - CHECK_FAILURE(ret, "create directory"); - return Status::OK(); - } - - Status Delete(const std::string& path, bool recursive) { - int ret = driver_->Delete(fs_, 
path.c_str(), static_cast(recursive)); - CHECK_FAILURE(ret, "delete"); - return Status::OK(); - } - - Status Disconnect() { - int ret = driver_->Disconnect(fs_); - CHECK_FAILURE(ret, "hdfsFS::Disconnect"); - return Status::OK(); - } - - bool Exists(const std::string& path) { - // hdfsExists does not distinguish between RPC failure and the file not - // existing - int ret = driver_->Exists(fs_, path.c_str()); - return ret == 0; - } - - Status GetCapacity(int64_t* nbytes) { - tOffset ret = driver_->GetCapacity(fs_); - CHECK_FAILURE(ret, "GetCapacity"); - *nbytes = ret; - return Status::OK(); - } - - Status GetUsed(int64_t* nbytes) { - tOffset ret = driver_->GetUsed(fs_); - CHECK_FAILURE(ret, "GetUsed"); - *nbytes = ret; - return Status::OK(); - } - - Status GetWorkingDirectory(std::string* out) { - char buffer[2048]; - if (driver_->GetWorkingDirectory(fs_, buffer, sizeof(buffer) - 1) == nullptr) { - return IOErrorFromErrno(errno, "HDFS GetWorkingDirectory failed"); - } - *out = buffer; - return Status::OK(); - } - - Status GetPathInfo(const std::string& path, HdfsPathInfo* info) { - hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); - - if (entry == nullptr) { - return GetPathInfoFailed(path); - } - - SetPathInfo(entry, info); - driver_->FreeFileInfo(entry, 1); - - return Status::OK(); - } - - Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) { - HdfsPathInfo info; - RETURN_NOT_OK(GetPathInfo(path, &info)); - - file_info->set_size(info.size); - file_info->set_type(info.kind); - return Status::OK(); - } - - Status GetChildren(const std::string& path, std::vector* listing) { - std::vector detailed_listing; - RETURN_NOT_OK(ListDirectory(path, &detailed_listing)); - for (const auto& info : detailed_listing) { - listing->push_back(info.name); - } - return Status::OK(); - } - - Status ListDirectory(const std::string& path, std::vector* listing) { - int num_entries = 0; - errno = 0; - hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries); - - if (entries == nullptr) { - // If the directory is empty, entries is NULL but errno is 0. Non-zero - // errno indicates error - // - // Note: errno is thread-local - // - // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set - // errno 2/ENOENT for empty directories. 
To be more robust to this we - // double check this case - if ((errno == 0) || (errno == ENOENT && Exists(path))) { - num_entries = 0; - } else { - return IOErrorFromErrno(errno, "HDFS list directory failed"); - } - } - - // Allocate additional space for elements - int vec_offset = static_cast(listing->size()); - listing->resize(vec_offset + num_entries); - - for (int i = 0; i < num_entries; ++i) { - SetPathInfo(entries + i, &(*listing)[vec_offset + i]); - } - - // Free libhdfs file info - driver_->FreeFileInfo(entries, num_entries); - - return Status::OK(); - } - - Status OpenReadable(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, - std::shared_ptr* file) { - errno = 0; - hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); - - if (handle == nullptr) { - return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); - } - - // std::make_shared does not work with private ctors - *file = std::shared_ptr(new HdfsReadableFile(io_context)); - (*file)->impl_->set_members(path, driver_, fs_, handle); - (*file)->impl_->set_buffer_size(buffer_size); - - return Status::OK(); - } - - Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { - int flags = O_WRONLY; - if (append) flags |= O_APPEND; - - errno = 0; - hdfsFile handle = - driver_->OpenFile(fs_, path.c_str(), flags, buffer_size, replication, - static_cast(default_block_size)); - - if (handle == nullptr) { - return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); - } - - // std::make_shared does not work with private ctors - *file = std::shared_ptr(new HdfsOutputStream()); - (*file)->impl_->set_members(path, driver_, fs_, handle); - - return Status::OK(); - } - - Status Rename(const std::string& src, const std::string& dst) { - int ret = driver_->Rename(fs_, src.c_str(), dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Copy(const std::string& src, const std::string& dst) { - int ret = driver_->Copy(fs_, src.c_str(), fs_, dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Move(const std::string& src, const std::string& dst) { - int ret = driver_->Move(fs_, src.c_str(), fs_, dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Chmod(const std::string& path, int mode) { - int ret = driver_->Chmod(fs_, path.c_str(), static_cast(mode)); // NOLINT - CHECK_FAILURE(ret, "Chmod"); - return Status::OK(); - } - - Status Chown(const std::string& path, const char* owner, const char* group) { - int ret = driver_->Chown(fs_, path.c_str(), owner, group); - CHECK_FAILURE(ret, "Chown"); - return Status::OK(); - } - - private: - internal::LibHdfsShim* driver_; - - std::string namenode_host_; - std::string user_; - int port_; - std::string kerb_ticket_; - - hdfsFS fs_; -}; - -// ---------------------------------------------------------------------- -// Public API for HDFSClient - -HadoopFileSystem::HadoopFileSystem() { impl_.reset(new HadoopFileSystemImpl()); } - -HadoopFileSystem::~HadoopFileSystem() {} - -Status HadoopFileSystem::Connect(const HdfsConnectionConfig* config, - std::shared_ptr* fs) { - // ctor is private, make_shared will not work - *fs = std::shared_ptr(new HadoopFileSystem()); - - RETURN_NOT_OK((*fs)->impl_->Connect(config)); - return Status::OK(); -} - -Status HadoopFileSystem::MakeDirectory(const std::string& path) { - return impl_->MakeDirectory(path); 
-} - -Status HadoopFileSystem::Delete(const std::string& path, bool recursive) { - return impl_->Delete(path, recursive); -} - -Status HadoopFileSystem::DeleteDirectory(const std::string& path) { - return Delete(path, true); -} - -Status HadoopFileSystem::Disconnect() { return impl_->Disconnect(); } - -bool HadoopFileSystem::Exists(const std::string& path) { return impl_->Exists(path); } - -Status HadoopFileSystem::GetPathInfo(const std::string& path, HdfsPathInfo* info) { - return impl_->GetPathInfo(path, info); -} - -Status HadoopFileSystem::Stat(const std::string& path, arrow::fs::FileInfo* file_info) { - return impl_->Stat(path, file_info); -} - -Status HadoopFileSystem::GetCapacity(int64_t* nbytes) { - return impl_->GetCapacity(nbytes); -} - -Status HadoopFileSystem::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); } - -Status HadoopFileSystem::GetWorkingDirectory(std::string* out) { - return impl_->GetWorkingDirectory(out); -} - -Status HadoopFileSystem::GetChildren(const std::string& path, - std::vector* listing) { - return impl_->GetChildren(path, listing); -} - -Status HadoopFileSystem::ListDirectory(const std::string& path, - std::vector* listing) { - return impl_->ListDirectory(path, listing); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { - return impl_->OpenReadable(path, buffer_size, io::default_io_context(), file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, - std::shared_ptr* file) { - return OpenReadable(path, kDefaultHdfsBufferSize, io::default_io_context(), file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, - std::shared_ptr* file) { - return impl_->OpenReadable(path, buffer_size, io_context, file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, - const io::IOContext& io_context, - std::shared_ptr* file) { - return OpenReadable(path, kDefaultHdfsBufferSize, io_context, file); -} - -Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, - int32_t buffer_size, int16_t replication, - int64_t default_block_size, - std::shared_ptr* file) { - return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, - file); -} - -Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, - std::shared_ptr* file) { - return OpenWritable(path, append, 0, 0, 0, file); -} - -Status HadoopFileSystem::Chmod(const std::string& path, int mode) { - return impl_->Chmod(path, mode); -} - -Status HadoopFileSystem::Chown(const std::string& path, const char* owner, - const char* group) { - return impl_->Chown(path, owner, group); -} - -Status HadoopFileSystem::Rename(const std::string& src, const std::string& dst) { - return impl_->Rename(src, dst); -} - -Status HadoopFileSystem::Copy(const std::string& src, const std::string& dst) { - return impl_->Copy(src, dst); -} - -Status HadoopFileSystem::Move(const std::string& src, const std::string& dst) { - return impl_->Move(src, dst); -} - -// ---------------------------------------------------------------------- -// Allow public API users to check whether we are set up correctly - -Status HaveLibHdfs() { - internal::LibHdfsShim* driver; - return internal::ConnectLibHdfs(&driver); -} - -} // namespace io -} // namespace arrow diff --git a/cpp/src/arrow/io/hdfs_internal.cc b/cpp/src/arrow/io/hdfs_internal.cc deleted file mode 100644 index 4a88b9a6be6..00000000000 --- 
a/cpp/src/arrow/io/hdfs_internal.cc +++ /dev/null @@ -1,506 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This shim interface to libhdfs (for runtime shared library loading) has been -// adapted from the SFrame project, released under the ASF-compatible 3-clause -// BSD license -// -// Using this required having the $JAVA_HOME and $HADOOP_HOME environment -// variables set, so that libjvm and libhdfs can be located easily - -// Copyright (C) 2015 Dato, Inc. -// All rights reserved. -// -// This software may be modified and distributed under the terms -// of the BSD license. See the LICENSE file for details. - -#include "arrow/io/hdfs_internal.h" - -#include -#include -#include -#include // IWYU pragma: keep -#include -#include -#include -#include "arrow/util/basic_decimal.h" - -#ifndef _WIN32 -# include -#endif - -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/io_util.h" -#include "arrow/util/logging_internal.h" - -namespace arrow { - -using internal::GetEnvVarNative; -using internal::PlatformFilename; - -namespace io::internal { - -namespace { - -template -Status SetSymbol(void* handle, char const* name, T** symbol) { - if (*symbol != nullptr) return Status::OK(); - - auto maybe_symbol = ::arrow::internal::GetSymbolAs(handle, name); - if (Required || maybe_symbol.ok()) { - ARROW_ASSIGN_OR_RAISE(*symbol, maybe_symbol); - } - return Status::OK(); -} - -#define GET_SYMBOL_REQUIRED(SHIM, SYMBOL_NAME) \ - RETURN_NOT_OK(SetSymbol(SHIM->handle, #SYMBOL_NAME, &SHIM->SYMBOL_NAME)); - -#define GET_SYMBOL(SHIM, SYMBOL_NAME) \ - DCHECK_OK(SetSymbol(SHIM->handle, #SYMBOL_NAME, &SHIM->SYMBOL_NAME)); - -Result> MakeFilenameVector( - const std::vector& names) { - std::vector filenames(names.size()); - for (size_t i = 0; i < names.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(filenames[i], PlatformFilename::FromString(names[i])); - } - return filenames; -} - -void AppendEnvVarFilename(const char* var_name, - std::vector* filenames) { - auto maybe_env_var = GetEnvVarNative(var_name); - if (maybe_env_var.ok()) { - filenames->emplace_back(std::move(*maybe_env_var)); - } -} - -void AppendEnvVarFilename(const char* var_name, const char* suffix, - std::vector* filenames) { - auto maybe_env_var = GetEnvVarNative(var_name); - if (maybe_env_var.ok()) { - auto maybe_env_var_with_suffix = - PlatformFilename(std::move(*maybe_env_var)).Join(suffix); - if (maybe_env_var_with_suffix.ok()) { - filenames->emplace_back(std::move(*maybe_env_var_with_suffix)); - } - } -} - -void InsertEnvVarFilename(const char* var_name, - std::vector* filenames) { - auto maybe_env_var = GetEnvVarNative(var_name); - if (maybe_env_var.ok()) { - filenames->emplace(filenames->begin(), PlatformFilename(std::move(*maybe_env_var))); - } -} - -Result> 
get_potential_libhdfs_paths() { - std::vector potential_paths; - std::string file_name; - -// OS-specific file name -#ifdef _WIN32 - file_name = "hdfs.dll"; -#elif __APPLE__ - file_name = "libhdfs.dylib"; -#else - file_name = "libhdfs.so"; -#endif - - // Common paths - ARROW_ASSIGN_OR_RAISE(auto search_paths, MakeFilenameVector({"", "."})); - - // Path from environment variable - AppendEnvVarFilename("HADOOP_HOME", "lib/native", &search_paths); - AppendEnvVarFilename("ARROW_LIBHDFS_DIR", &search_paths); - - // All paths with file name - for (const auto& path : search_paths) { - ARROW_ASSIGN_OR_RAISE(auto full_path, path.Join(file_name)); - potential_paths.push_back(std::move(full_path)); - } - - return potential_paths; -} - -Result> get_potential_libjvm_paths() { - std::vector potential_paths; - - std::vector search_prefixes; - std::vector search_suffixes; - std::string file_name; - -// From heuristics -#ifdef _WIN32 - ARROW_ASSIGN_OR_RAISE(search_prefixes, MakeFilenameVector({""})); - ARROW_ASSIGN_OR_RAISE(search_suffixes, - MakeFilenameVector({"/jre/bin/server", "/bin/server"})); - file_name = "jvm.dll"; -#elif __APPLE__ - ARROW_ASSIGN_OR_RAISE(search_prefixes, MakeFilenameVector({""})); - ARROW_ASSIGN_OR_RAISE(search_suffixes, - MakeFilenameVector({"/jre/lib/server", "/lib/server"})); - file_name = "libjvm.dylib"; - -// SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are -// expecting users to set an environment variable -#else -# if defined(__aarch64__) - const std::string prefix_arch{"arm64"}; - const std::string suffix_arch{"aarch64"}; -# else - const std::string prefix_arch{"amd64"}; - const std::string suffix_arch{"amd64"}; -# endif - ARROW_ASSIGN_OR_RAISE( - search_prefixes, - MakeFilenameVector({ - "/usr/lib/jvm/default-java", // ubuntu / debian distros - "/usr/lib/jvm/java", // rhel6 - "/usr/lib/jvm", // centos6 - "/usr/lib64/jvm", // opensuse 13 - "/usr/local/lib/jvm/default-java", // alt ubuntu / debian distros - "/usr/local/lib/jvm/java", // alt rhel6 - "/usr/local/lib/jvm", // alt centos6 - "/usr/local/lib64/jvm", // alt opensuse 13 - "/usr/local/lib/jvm/java-8-openjdk-" + - prefix_arch, // alt ubuntu / debian distros - "/usr/lib/jvm/java-8-openjdk-" + prefix_arch, // alt ubuntu / debian distros - "/usr/local/lib/jvm/java-7-openjdk-" + - prefix_arch, // alt ubuntu / debian distros - "/usr/lib/jvm/java-7-openjdk-" + prefix_arch, // alt ubuntu / debian distros - "/usr/local/lib/jvm/java-6-openjdk-" + - prefix_arch, // alt ubuntu / debian distros - "/usr/lib/jvm/java-6-openjdk-" + prefix_arch, // alt ubuntu / debian distros - "/usr/lib/jvm/java-7-oracle", // alt ubuntu - "/usr/lib/jvm/java-8-oracle", // alt ubuntu - "/usr/lib/jvm/java-6-oracle", // alt ubuntu - "/usr/local/lib/jvm/java-7-oracle", // alt ubuntu - "/usr/local/lib/jvm/java-8-oracle", // alt ubuntu - "/usr/local/lib/jvm/java-6-oracle", // alt ubuntu - "/usr/lib/jvm/default", // alt centos - "/usr/java/latest" // alt centos - })); - ARROW_ASSIGN_OR_RAISE( - search_suffixes, - MakeFilenameVector({"", "/lib/server", "/jre/lib/" + suffix_arch + "/server", - "/lib/" + suffix_arch + "/server"})); - file_name = "libjvm.so"; -#endif - - // From direct environment variable - InsertEnvVarFilename("JAVA_HOME", &search_prefixes); - - // Generate cross product between search_prefixes, search_suffixes, and file_name - for (auto& prefix : search_prefixes) { - for (auto& suffix : search_suffixes) { - ARROW_ASSIGN_OR_RAISE(auto path, prefix.Join(suffix).Join(file_name)); - 
potential_paths.push_back(std::move(path)); - } - } - - return potential_paths; -} - -Result try_dlopen(const std::vector& potential_paths, - std::string name) { - std::string error_message = "Unable to load " + name; - - for (const auto& p : potential_paths) { - auto maybe_handle = arrow::internal::LoadDynamicLibrary(p); - if (maybe_handle.ok()) return *maybe_handle; - error_message += "\n" + maybe_handle.status().message(); - } - - return Status(StatusCode::IOError, std::move(error_message)); -} - -LibHdfsShim libhdfs_shim; - -} // namespace - -Status LibHdfsShim::GetRequiredSymbols() { - GET_SYMBOL_REQUIRED(this, hdfsNewBuilder); - GET_SYMBOL_REQUIRED(this, hdfsBuilderSetNameNode); - GET_SYMBOL_REQUIRED(this, hdfsBuilderSetNameNodePort); - GET_SYMBOL_REQUIRED(this, hdfsBuilderSetUserName); - GET_SYMBOL_REQUIRED(this, hdfsBuilderSetKerbTicketCachePath); - GET_SYMBOL_REQUIRED(this, hdfsBuilderSetForceNewInstance); - GET_SYMBOL_REQUIRED(this, hdfsBuilderConfSetStr); - GET_SYMBOL_REQUIRED(this, hdfsBuilderConnect); - GET_SYMBOL_REQUIRED(this, hdfsCreateDirectory); - GET_SYMBOL_REQUIRED(this, hdfsDelete); - GET_SYMBOL_REQUIRED(this, hdfsDisconnect); - GET_SYMBOL_REQUIRED(this, hdfsExists); - GET_SYMBOL_REQUIRED(this, hdfsFreeFileInfo); - GET_SYMBOL_REQUIRED(this, hdfsGetCapacity); - GET_SYMBOL_REQUIRED(this, hdfsGetUsed); - GET_SYMBOL_REQUIRED(this, hdfsGetPathInfo); - GET_SYMBOL_REQUIRED(this, hdfsListDirectory); - GET_SYMBOL_REQUIRED(this, hdfsChown); - GET_SYMBOL_REQUIRED(this, hdfsChmod); - - // File methods - GET_SYMBOL_REQUIRED(this, hdfsCloseFile); - GET_SYMBOL_REQUIRED(this, hdfsFlush); - GET_SYMBOL_REQUIRED(this, hdfsOpenFile); - GET_SYMBOL_REQUIRED(this, hdfsRead); - GET_SYMBOL_REQUIRED(this, hdfsSeek); - GET_SYMBOL_REQUIRED(this, hdfsTell); - GET_SYMBOL_REQUIRED(this, hdfsWrite); - - return Status::OK(); -} - -Status ConnectLibHdfs(LibHdfsShim** driver) { - static std::mutex lock; - std::lock_guard guard(lock); - - LibHdfsShim* shim = &libhdfs_shim; - - static bool shim_attempted = false; - if (!shim_attempted) { - shim_attempted = true; - - shim->Initialize(); - - ARROW_ASSIGN_OR_RAISE(auto libjvm_potential_paths, get_potential_libjvm_paths()); - RETURN_NOT_OK(try_dlopen(libjvm_potential_paths, "libjvm")); - - ARROW_ASSIGN_OR_RAISE(auto libhdfs_potential_paths, get_potential_libhdfs_paths()); - ARROW_ASSIGN_OR_RAISE(shim->handle, try_dlopen(libhdfs_potential_paths, "libhdfs")); - } else if (shim->handle == nullptr) { - return Status::IOError("Prior attempt to load libhdfs failed"); - } - - *driver = shim; - return shim->GetRequiredSymbols(); -} - -/////////////////////////////////////////////////////////////////////////// -// HDFS thin wrapper methods - -hdfsBuilder* LibHdfsShim::NewBuilder() { return this->hdfsNewBuilder(); } - -void LibHdfsShim::BuilderSetNameNode(hdfsBuilder* bld, const char* nn) { - this->hdfsBuilderSetNameNode(bld, nn); -} - -void LibHdfsShim::BuilderSetNameNodePort(hdfsBuilder* bld, tPort port) { - this->hdfsBuilderSetNameNodePort(bld, port); -} - -void LibHdfsShim::BuilderSetUserName(hdfsBuilder* bld, const char* userName) { - this->hdfsBuilderSetUserName(bld, userName); -} - -void LibHdfsShim::BuilderSetKerbTicketCachePath(hdfsBuilder* bld, - const char* kerbTicketCachePath) { - this->hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath); -} - -void LibHdfsShim::BuilderSetForceNewInstance(hdfsBuilder* bld) { - this->hdfsBuilderSetForceNewInstance(bld); -} - -hdfsFS LibHdfsShim::BuilderConnect(hdfsBuilder* bld) { - return 
this->hdfsBuilderConnect(bld); -} - -int LibHdfsShim::BuilderConfSetStr(hdfsBuilder* bld, const char* key, const char* val) { - return this->hdfsBuilderConfSetStr(bld, key, val); -} - -int LibHdfsShim::Disconnect(hdfsFS fs) { return this->hdfsDisconnect(fs); } - -hdfsFile LibHdfsShim::OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize) { // NOLINT - return this->hdfsOpenFile(fs, path, flags, bufferSize, replication, blocksize); -} - -int LibHdfsShim::CloseFile(hdfsFS fs, hdfsFile file) { - return this->hdfsCloseFile(fs, file); -} - -int LibHdfsShim::Exists(hdfsFS fs, const char* path) { - return this->hdfsExists(fs, path); -} - -int LibHdfsShim::Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos) { - return this->hdfsSeek(fs, file, desiredPos); -} - -tOffset LibHdfsShim::Tell(hdfsFS fs, hdfsFile file) { return this->hdfsTell(fs, file); } - -tSize LibHdfsShim::Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length) { - return this->hdfsRead(fs, file, buffer, length); -} - -bool LibHdfsShim::HasPread() { - GET_SYMBOL(this, hdfsPread); - return this->hdfsPread != nullptr; -} - -tSize LibHdfsShim::Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, - tSize length) { - GET_SYMBOL(this, hdfsPread); - DCHECK(this->hdfsPread); - return this->hdfsPread(fs, file, position, buffer, length); -} - -tSize LibHdfsShim::Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length) { - return this->hdfsWrite(fs, file, buffer, length); -} - -int LibHdfsShim::Flush(hdfsFS fs, hdfsFile file) { return this->hdfsFlush(fs, file); } - -int LibHdfsShim::Available(hdfsFS fs, hdfsFile file) { - GET_SYMBOL(this, hdfsAvailable); - if (this->hdfsAvailable) { - return this->hdfsAvailable(fs, file); - } else { - return 0; - } -} - -int LibHdfsShim::Copy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) { - GET_SYMBOL(this, hdfsCopy); - if (this->hdfsCopy) { - return this->hdfsCopy(srcFS, src, dstFS, dst); - } else { - return 0; - } -} - -int LibHdfsShim::Move(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) { - GET_SYMBOL(this, hdfsMove); - if (this->hdfsMove) { - return this->hdfsMove(srcFS, src, dstFS, dst); - } else { - return 0; - } -} - -int LibHdfsShim::Delete(hdfsFS fs, const char* path, int recursive) { - return this->hdfsDelete(fs, path, recursive); -} - -int LibHdfsShim::Rename(hdfsFS fs, const char* oldPath, const char* newPath) { - GET_SYMBOL(this, hdfsRename); - if (this->hdfsRename) { - return this->hdfsRename(fs, oldPath, newPath); - } else { - return 0; - } -} - -char* LibHdfsShim::GetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) { - GET_SYMBOL(this, hdfsGetWorkingDirectory); - if (this->hdfsGetWorkingDirectory) { - return this->hdfsGetWorkingDirectory(fs, buffer, bufferSize); - } else { - return nullptr; - } -} - -int LibHdfsShim::SetWorkingDirectory(hdfsFS fs, const char* path) { - GET_SYMBOL(this, hdfsSetWorkingDirectory); - if (this->hdfsSetWorkingDirectory) { - return this->hdfsSetWorkingDirectory(fs, path); - } else { - return 0; - } -} - -int LibHdfsShim::MakeDirectory(hdfsFS fs, const char* path) { - return this->hdfsCreateDirectory(fs, path); -} - -int LibHdfsShim::SetReplication(hdfsFS fs, const char* path, int16_t replication) { - GET_SYMBOL(this, hdfsSetReplication); - if (this->hdfsSetReplication) { - return this->hdfsSetReplication(fs, path, replication); - } else { - return 0; - } -} - -hdfsFileInfo* LibHdfsShim::ListDirectory(hdfsFS fs, const char* path, int* numEntries) { - return 
this->hdfsListDirectory(fs, path, numEntries); -} - -hdfsFileInfo* LibHdfsShim::GetPathInfo(hdfsFS fs, const char* path) { - return this->hdfsGetPathInfo(fs, path); -} - -void LibHdfsShim::FreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries) { - this->hdfsFreeFileInfo(hdfsFileInfo, numEntries); -} - -char*** LibHdfsShim::GetHosts(hdfsFS fs, const char* path, tOffset start, - tOffset length) { - GET_SYMBOL(this, hdfsGetHosts); - if (this->hdfsGetHosts) { - return this->hdfsGetHosts(fs, path, start, length); - } else { - return nullptr; - } -} - -void LibHdfsShim::FreeHosts(char*** blockHosts) { - GET_SYMBOL(this, hdfsFreeHosts); - if (this->hdfsFreeHosts) { - this->hdfsFreeHosts(blockHosts); - } -} - -tOffset LibHdfsShim::GetDefaultBlockSize(hdfsFS fs) { - GET_SYMBOL(this, hdfsGetDefaultBlockSize); - if (this->hdfsGetDefaultBlockSize) { - return this->hdfsGetDefaultBlockSize(fs); - } else { - return 0; - } -} - -tOffset LibHdfsShim::GetCapacity(hdfsFS fs) { return this->hdfsGetCapacity(fs); } - -tOffset LibHdfsShim::GetUsed(hdfsFS fs) { return this->hdfsGetUsed(fs); } - -int LibHdfsShim::Chown(hdfsFS fs, const char* path, const char* owner, - const char* group) { - return this->hdfsChown(fs, path, owner, group); -} - -int LibHdfsShim::Chmod(hdfsFS fs, const char* path, short mode) { // NOLINT - return this->hdfsChmod(fs, path, mode); -} - -int LibHdfsShim::Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime) { - GET_SYMBOL(this, hdfsUtime); - if (this->hdfsUtime) { - return this->hdfsUtime(fs, path, mtime, atime); - } else { - return 0; - } -} - -} // namespace io::internal -} // namespace arrow From 65dac4a389104b4886b1b9a9c63cde92d30ae718 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 9 Apr 2025 09:07:14 +0200 Subject: [PATCH 08/27] Fix linter errors and revert changes in the test namespace --- cpp/src/arrow/CMakeLists.txt | 6 ++- cpp/src/arrow/filesystem/hdfs.h | 2 +- cpp/src/arrow/filesystem/hdfs_io.cc | 4 +- cpp/src/arrow/filesystem/hdfs_test_io.cc | 52 ++++++++++++------------ 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bfa3770b4bd..c79ecdb841a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -903,7 +903,11 @@ if(ARROW_FILESYSTEM) PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() if(ARROW_HDFS) - list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc filesystem/hdfs_io.cc filesystem/hdfs_internal.cc) + list(APPEND + ARROW_FILESYSTEM_SRCS + filesystem/hdfs.cc + filesystem/hdfs_io.cc + filesystem/hdfs_internal.cc) endif() if(ARROW_S3) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/s3fs.cc) diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index f330da92f0e..41ef6629b27 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -57,7 +57,7 @@ struct ARROW_EXPORT HdfsOptions { /// HDFS-backed FileSystem implementation. /// /// implementation notes: -/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs. +/// - This is a wrapper of arrow/filesystem/hdfs_io, so we can use FileSystem API to handle hdfs. 
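That wrapper note is the user-facing contract, so a sketch of what "use FileSystem API to handle hdfs" looks like in practice may help. HdfsOptions appears in the hunk above; ConfigureEndPoint, Make and the endpoint values are assumed from the fs layer for illustration, not introduced by this diff:

    #include <memory>
    #include <string>

    #include "arrow/filesystem/hdfs.h"
    #include "arrow/result.h"

    arrow::Status StatViaFileSystemApi(const std::string& path) {
      arrow::fs::HdfsOptions options;
      options.ConfigureEndPoint("localhost", 8020);  // assumed endpoint
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::fs::HadoopFileSystem> fs,
                            arrow::fs::HadoopFileSystem::Make(options));
      ARROW_ASSIGN_OR_RAISE(arrow::fs::FileInfo info, fs->GetFileInfo(path));
      if (info.type() == arrow::fs::FileType::NotFound) {
        return arrow::Status::IOError("No such path: ", path);
      }
      return arrow::Status::OK();
    }
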
class ARROW_EXPORT HadoopFileSystem : public FileSystem { public: ~HadoopFileSystem() override; diff --git a/cpp/src/arrow/filesystem/hdfs_io.cc b/cpp/src/arrow/filesystem/hdfs_io.cc index cc55dd8d143..20df3b7e5be 100644 --- a/cpp/src/arrow/filesystem/hdfs_io.cc +++ b/cpp/src/arrow/filesystem/hdfs_io.cc @@ -30,8 +30,8 @@ #include "arrow/buffer.h" #include "arrow/filesystem/type_fwd.h" -#include "arrow/io/hdfs.h" -#include "arrow/io/hdfs_internal.h" +#include "arrow/filesystem/hdfs_io.h" +#include "arrow/filesystem/hdfs_internal.h" #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/filesystem/hdfs_test_io.cc b/cpp/src/arrow/filesystem/hdfs_test_io.cc index 125716a02dc..52278b524eb 100644 --- a/cpp/src/arrow/filesystem/hdfs_test_io.cc +++ b/cpp/src/arrow/filesystem/hdfs_test_io.cc @@ -30,16 +30,16 @@ #include #include "arrow/buffer.h" -#include "arrow/filesystem/type_fwd.h" #include "arrow/filesystem/hdfs_internal.h" #include "arrow/filesystem/hdfs_io.h" +#include "arrow/filesystem/type_fwd.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" namespace arrow { -namespace fs { +namespace io { std::vector RandomData(int64_t size) { std::vector buffer(size); @@ -59,7 +59,7 @@ class TestHadoopFileSystem : public ::testing::Test { Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, bool append = false, int buffer_size = 0, int16_t replication = 0, int default_block_size = 0) { - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(client_->OpenWritable(path, append, buffer_size, replication, default_block_size, &file)); @@ -83,7 +83,7 @@ class TestHadoopFileSystem : public ::testing::Test { // Set up shared state between unit tests void SetUp() { - io::internal::LibHdfsShim* driver_shim; + internal::LibHdfsShim* driver_shim; client_ = nullptr; scratch_dir_ = @@ -119,7 +119,7 @@ class TestHadoopFileSystem : public ::testing::Test { conf_.user = user; conf_.port = port == nullptr ? 
20500 : atoi(port); - ASSERT_OK(io::HadoopFileSystem::Connect(&conf_, &client_)); + ASSERT_OK(HadoopFileSystem::Connect(&conf_, &client_)); } void TearDown() { @@ -131,12 +131,12 @@ class TestHadoopFileSystem : public ::testing::Test { } } - io::HdfsConnectionConfig conf_; + HdfsConnectionConfig conf_; bool loaded_driver_; // Resources shared amongst unit tests std::string scratch_dir_; - std::shared_ptr client_; + std::shared_ptr client_; }; #define SKIP_IF_NO_DRIVER() \ @@ -147,8 +147,8 @@ class TestHadoopFileSystem : public ::testing::Test { TEST_F(TestHadoopFileSystem, ConnectsAgain) { SKIP_IF_NO_DRIVER(); - std::shared_ptr client; - ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client)); + std::shared_ptr client; + ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client)); ASSERT_OK(client->Disconnect()); } @@ -157,14 +157,14 @@ TEST_F(TestHadoopFileSystem, MultipleClients) { ASSERT_OK(this->MakeScratchDir()); - std::shared_ptr client1; - std::shared_ptr client2; - ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client1)); - ASSERT_OK(io::HadoopFileSystem::Connect(&this->conf_, &client2)); + std::shared_ptr client1; + std::shared_ptr client2; + ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client1)); + ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client2)); ASSERT_OK(client1->Disconnect()); // client2 continues to function after equivalent client1 has shutdown - std::vector listing; + std::vector listing; ASSERT_OK(client2->ListDirectory(this->scratch_dir_, &listing)); ASSERT_OK(client2->Disconnect()); } @@ -180,7 +180,7 @@ TEST_F(TestHadoopFileSystem, MakeDirectory) { ASSERT_OK(this->client_->MakeDirectory(path)); ASSERT_TRUE(this->client_->Exists(path)); - std::vector listing; + std::vector listing; ARROW_EXPECT_OK(this->client_->ListDirectory(path, &listing)); ASSERT_EQ(0, listing.size()); ARROW_EXPECT_OK(this->client_->Delete(path, true)); @@ -204,7 +204,7 @@ TEST_F(TestHadoopFileSystem, GetCapacityUsed) { TEST_F(TestHadoopFileSystem, GetPathInfo) { SKIP_IF_NO_DRIVER(); - io::HdfsPathInfo info; + HdfsPathInfo info; ASSERT_OK(this->MakeScratchDir()); @@ -238,7 +238,7 @@ TEST_F(TestHadoopFileSystem, GetPathInfoNotExist) { ASSERT_OK(this->MakeScratchDir()); auto path = this->ScratchPath("path-does-not-exist"); - io::HdfsPathInfo info; + HdfsPathInfo info; Status s = this->client_->GetPathInfo(path, &info); ASSERT_TRUE(s.IsIOError()); @@ -262,7 +262,7 @@ TEST_F(TestHadoopFileSystem, AppendToFile) { // now append ASSERT_OK(this->WriteDummyFile(path, buffer.data(), size, true)); - io::HdfsPathInfo info; + HdfsPathInfo info; ASSERT_OK(this->client_->GetPathInfo(path, &info)); ASSERT_EQ(size * 2, info.size); } @@ -282,7 +282,7 @@ TEST_F(TestHadoopFileSystem, ListDirectory) { ASSERT_OK(this->WriteDummyFile(p2, data.data(), size / 2)); ASSERT_OK(this->client_->MakeDirectory(d1)); - std::vector listing; + std::vector listing; ASSERT_OK(this->client_->ListDirectory(this->scratch_dir_, &listing)); // Do it again, appends! 
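The "Do it again, appends!" comment above captures real API behavior: the legacy ListDirectory resizes the output vector from its current size, so repeated calls accumulate entries rather than replacing them. A minimal sketch of reusing one listing vector; ListTwice, client and dir are illustrative names:

    #include <string>
    #include <vector>

    #include "arrow/filesystem/hdfs_io.h"
    #include "arrow/status.h"

    arrow::Status ListTwice(arrow::io::HadoopFileSystem* client,
                            const std::string& dir) {
      std::vector<arrow::io::HdfsPathInfo> listing;
      ARROW_RETURN_NOT_OK(client->ListDirectory(dir, &listing));
      listing.clear();  // ListDirectory appends, so clear for a fresh snapshot
      return client->ListDirectory(dir, &listing);
    }
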
@@ -292,7 +292,7 @@ TEST_F(TestHadoopFileSystem, ListDirectory) { // Argh, well, shouldn't expect the listing to be in any particular order for (size_t i = 0; i < listing.size(); ++i) { - const io::HdfsPathInfo& info = listing[i]; + const HdfsPathInfo& info = listing[i]; if (info.name == this->HdfsAbsPath(p1)) { ASSERT_EQ(arrow::fs::FileType::File, info.kind); ASSERT_EQ(size, info.size); @@ -318,7 +318,7 @@ TEST_F(TestHadoopFileSystem, ReadableMethods) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); // Test GetSize -- move this into its own unit test if ever needed @@ -355,7 +355,7 @@ TEST_F(TestHadoopFileSystem, LargeFile) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); ASSERT_FALSE(file->closed()); @@ -366,7 +366,7 @@ TEST_F(TestHadoopFileSystem, LargeFile) { ASSERT_EQ(0, std::memcmp(buffer->data(), data.data(), size)); // explicit buffer size - std::shared_ptr file2; + std::shared_ptr file2; ASSERT_OK(this->client_->OpenReadable(path, 1 << 18, &file2)); ASSERT_OK_AND_ASSIGN(auto buffer2, AllocateBuffer(size)); @@ -404,7 +404,7 @@ TEST_F(TestHadoopFileSystem, ChmodChown) { std::vector data = RandomData(size); ASSERT_OK(this->WriteDummyFile(path, data.data(), size)); - io::HdfsPathInfo info; + HdfsPathInfo info; ASSERT_OK(this->client_->Chmod(path, mode)); ASSERT_OK(this->client_->GetPathInfo(path, &info)); ASSERT_EQ(mode, info.permissions); @@ -427,7 +427,7 @@ TEST_F(TestHadoopFileSystem, ThreadSafety) { ASSERT_OK(this->WriteDummyFile(src_path, reinterpret_cast(data.c_str()), static_cast(data.size()))); - std::shared_ptr file; + std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(src_path, &file)); std::atomic correct_count(0); @@ -463,5 +463,5 @@ TEST_F(TestHadoopFileSystem, ThreadSafety) { ASSERT_EQ(niter * 4, correct_count); } -} // namespace fs +} // namespace io } // namespace arrow From 61c40632d54ac1fe1c7b92e99bf436574b796986 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 9 Apr 2025 09:09:47 +0200 Subject: [PATCH 09/27] Add docstrings to python have_libhdfs method --- python/pyarrow/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index c99f480287e..f20276d6470 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -111,6 +111,9 @@ def _filesystem_is_available(fs): return True def have_libhdfs(): + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ try: from pyarrow._hdfs import _have_libhdfs # noqa return _have_libhdfs() From 8dc8745cd84008a7bed6b310de0429864d3a7794 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 9 Apr 2025 09:17:48 +0200 Subject: [PATCH 10/27] More linter fixes --- cpp/src/arrow/filesystem/hdfs.h | 3 ++- cpp/src/arrow/filesystem/hdfs_io.cc | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index 41ef6629b27..f633dd34951 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -57,7 +57,8 @@ struct ARROW_EXPORT HdfsOptions { /// HDFS-backed FileSystem implementation. /// /// implementation notes: -/// - This is a wrapper of arrow/filesystem/hdfs_io, so we can use FileSystem API to handle hdfs. 
+/// - This is a wrapper of arrow/filesystem/hdfs_io, so we can use FileSystem API to +/// handle hdfs. class ARROW_EXPORT HadoopFileSystem : public FileSystem { public: ~HadoopFileSystem() override; diff --git a/cpp/src/arrow/filesystem/hdfs_io.cc b/cpp/src/arrow/filesystem/hdfs_io.cc index 20df3b7e5be..2c341fcf1cc 100644 --- a/cpp/src/arrow/filesystem/hdfs_io.cc +++ b/cpp/src/arrow/filesystem/hdfs_io.cc @@ -29,9 +29,9 @@ #include #include "arrow/buffer.h" -#include "arrow/filesystem/type_fwd.h" -#include "arrow/filesystem/hdfs_io.h" #include "arrow/filesystem/hdfs_internal.h" +#include "arrow/filesystem/hdfs_io.h" +#include "arrow/filesystem/type_fwd.h" #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" #include "arrow/result.h" From a51a0599dc55ea05c550ef24126bc0bacd12ebff Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 9 Apr 2025 10:04:56 +0200 Subject: [PATCH 11/27] And Python linter --- python/pyarrow/__init__.py | 1 + python/pyarrow/includes/libarrow_fs.pxd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index f20276d6470..82e20b5e847 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -110,6 +110,7 @@ def _filesystem_is_available(fs): else: return True + def have_libhdfs(): """ Return true if HDFS (HadoopFileSystem) library is set up correctly. diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index b67880a88f1..5011b90ed79 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -366,7 +366,7 @@ cdef extern from "arrow/filesystem/hdfs_io.h" namespace "arrow::io" nogil: CStatus GetUsed(int64_t* nbytes) CStatus ListDirectory(const c_string& path, - vector[HdfsPathInfo]* listing) + vector[HdfsPathInfo]* listing) CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) From ed1b17f113d32850eda3e5c6cdeb6bd661c52f80 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 5 May 2025 14:22:19 +0200 Subject: [PATCH 12/27] Remove target_link_libraries for ARROW_IO --- cpp/src/arrow/CMakeLists.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index c79ecdb841a..d1b378c927a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -413,11 +413,6 @@ arrow_add_object_library(ARROW_IO io/slow.cc io/stdio.cc io/transform.cc) -foreach(ARROW_IO_TARGET ${ARROW_IO_TARGETS}) - if(NOT MSVC) - target_link_libraries(${ARROW_IO_TARGET} PRIVATE ${CMAKE_DL_LIBS}) - endif() -endforeach() set(ARROW_MEMORY_POOL_SRCS memory_pool.cc) if(ARROW_JEMALLOC) From 6009c5a202a1934ce3c2458d8ee7c88f5d62c2d3 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 5 May 2025 14:33:42 +0200 Subject: [PATCH 13/27] Rename hdfs_io_test --- cpp/src/arrow/filesystem/CMakeLists.txt | 2 +- cpp/src/arrow/filesystem/{hdfs_test_io.cc => hdfs_io_test.cc} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/arrow/filesystem/{hdfs_test_io.cc => hdfs_io_test.cc} (100%) diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 32007b4653c..05133955f53 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -143,7 +143,7 @@ endif() if(ARROW_HDFS) add_arrow_test(hdfs_test EXTRA_LABELS filesystem) - add_arrow_test(hdfs_test_io + add_arrow_test(hdfs_io_test NO_VALGRIND EXTRA_LABELS filesystem diff --git a/cpp/src/arrow/filesystem/hdfs_test_io.cc 
b/cpp/src/arrow/filesystem/hdfs_io_test.cc similarity index 100% rename from cpp/src/arrow/filesystem/hdfs_test_io.cc rename to cpp/src/arrow/filesystem/hdfs_io_test.cc From 0433bc1e4ab4778225cbc3c8e4456d11fc52ad2b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 5 May 2025 14:37:47 +0200 Subject: [PATCH 14/27] Remove hdfs_io include in filesystem/api.h --- cpp/src/arrow/filesystem/api.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h index 703368bb475..7211ad5c2cc 100644 --- a/cpp/src/arrow/filesystem/api.h +++ b/cpp/src/arrow/filesystem/api.h @@ -32,6 +32,3 @@ #ifdef ARROW_S3 # include "arrow/filesystem/s3fs.h" // IWYU pragma: export #endif -#ifdef ARROW_HDFS -# include "arrow/filesystem/hdfs_io.h" // IWYU pragma: export -#endif From 6621fc9e5fc111dbafe64bbc6698cafbb07589f7 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 12 May 2025 11:10:37 +0200 Subject: [PATCH 15/27] Merge internal hdfs classes --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/filesystem/CMakeLists.txt | 2 +- cpp/src/arrow/filesystem/hdfs.cc | 433 ++++++++++- cpp/src/arrow/filesystem/hdfs.h | 100 ++- cpp/src/arrow/filesystem/hdfs_internal.cc | 321 ++++++++ cpp/src/arrow/filesystem/hdfs_internal.h | 112 +++ ...{hdfs_io_test.cc => hdfs_internal_test.cc} | 70 +- cpp/src/arrow/filesystem/hdfs_io.cc | 720 ------------------ cpp/src/arrow/filesystem/hdfs_io.h | 274 ------- python/pyarrow/includes/libarrow_fs.pxd | 61 -- 10 files changed, 968 insertions(+), 1126 deletions(-) rename cpp/src/arrow/filesystem/{hdfs_io_test.cc => hdfs_internal_test.cc} (87%) delete mode 100644 cpp/src/arrow/filesystem/hdfs_io.cc delete mode 100644 cpp/src/arrow/filesystem/hdfs_io.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d1b378c927a..4a88b906d9e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -901,7 +901,6 @@ if(ARROW_FILESYSTEM) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc - filesystem/hdfs_io.cc filesystem/hdfs_internal.cc) endif() if(ARROW_S3) diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 05133955f53..32f210b7fb0 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -143,7 +143,7 @@ endif() if(ARROW_HDFS) add_arrow_test(hdfs_test EXTRA_LABELS filesystem) - add_arrow_test(hdfs_io_test + add_arrow_test(hdfs_internal_test NO_VALGRIND EXTRA_LABELS filesystem diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 810e1fbdb8b..bd91cc67fde 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -22,11 +22,9 @@ #include "arrow/filesystem/hdfs.h" #include "arrow/filesystem/hdfs_internal.h" -#include "arrow/filesystem/hdfs_io.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/value_parsing.h" #include "arrow/util/windows_fixup.h" @@ -35,14 +33,57 @@ namespace arrow { using internal::ErrnoFromStatus; using internal::ParseValue; +using io::internal::HdfsOutputStream; +using io::internal::HdfsPathInfo; +using io::internal::MakeOutputStream; +using io::internal::MakeReadableFile; using util::Uri; namespace fs { +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return IOErrorFromErrno(errno, "HDFS ", WHAT, " failed"); \ + } \ + } while (0) + +static 
constexpr int kDefaultHdfsBufferSize = 1 << 16; + using internal::GetAbstractPathParent; using internal::MakeAbstractPathRelative; using internal::RemoveLeadingSlash; +// ---------------------------------------------------------------------- +// HDFS client + +// TODO(wesm): this could throw std::bad_alloc in the course of copying strings +// into the path info object +static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { + out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File + : arrow::fs::FileType::Directory; + out->name = std::string(input->mName); + out->owner = std::string(input->mOwner); + out->group = std::string(input->mGroup); + + out->last_access_time = static_cast(input->mLastAccess); + out->last_modified_time = static_cast(input->mLastMod); + out->size = static_cast(input->mSize); + + out->replication = input->mReplication; + out->block_size = input->mBlockSize; + + out->permissions = input->mPermissions; +} + +namespace { + +Status GetPathInfoFailed(const std::string& path) { + return IOErrorFromErrno(errno, "Calling GetPathInfo for '", path, "' failed"); +} + +} // namespace + class HadoopFileSystem::Impl { public: Impl(HdfsOptions options, const io::IOContext& io_context) @@ -51,15 +92,50 @@ class HadoopFileSystem::Impl { ~Impl() { ARROW_WARN_NOT_OK(Close(), "Failed to disconnect hdfs client"); } Status Init() { - io::internal::LibHdfsShim* driver_shim; - RETURN_NOT_OK(ConnectLibHdfs(&driver_shim)); - RETURN_NOT_OK(io::HadoopFileSystem::Connect(&options_.connection_config, &client_)); + RETURN_NOT_OK(ConnectLibHdfs(&driver_)); + // RETURN_NOT_OK(io::HadoopFileSystem::Connect(&options_.connection_config, + // &client_)); + + if (!driver_) { + return Status::Invalid("Failed to initialize HDFS driver"); + } + + const HdfsConnectionConfig* config = &options_.connection_config; + + // connect to HDFS with the builder object + hdfsBuilder* builder = driver_->NewBuilder(); + if (!config->host.empty()) { + driver_->BuilderSetNameNode(builder, config->host.c_str()); + } + driver_->BuilderSetNameNodePort(builder, static_cast(config->port)); + if (!config->user.empty()) { + driver_->BuilderSetUserName(builder, config->user.c_str()); + } + if (!config->kerb_ticket.empty()) { + driver_->BuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); + } + + for (const auto& kv : config->extra_conf) { + int ret = driver_->BuilderConfSetStr(builder, kv.first.c_str(), kv.second.c_str()); + CHECK_FAILURE(ret, "confsetstr"); + } + + driver_->BuilderSetForceNewInstance(builder); + client_ = driver_->BuilderConnect(builder); + + if (client_ == nullptr) { + return Status::IOError("HDFS connection failed"); + } + return Status::OK(); } Status Close() { if (client_) { - RETURN_NOT_OK(client_->Disconnect()); + // RETURN_NOT_OK(client_->Disconnect()); + int ret = driver_->Disconnect(client_); + CHECK_FAILURE(ret, "hdfsFS::Disconnect"); + return Status::OK(); } return Status::OK(); } @@ -76,8 +152,8 @@ class HadoopFileSystem::Impl { return Status::Invalid("GetFileInfo must not be passed a URI, got: ", path); } FileInfo info; - io::HdfsPathInfo path_info; - auto status = client_->GetPathInfo(path, &path_info); + HdfsPathInfo path_info; + auto status = GetPathInfoStatus(path, &path_info); info.set_path(path); if (status.IsIOError()) { info.set_type(FileType::NotFound); @@ -88,11 +164,18 @@ class HadoopFileSystem::Impl { return info; } + bool Exists(const std::string& path) { + // hdfsExists does not distinguish between RPC failure and the file not + // existing 
+    int ret = driver_->Exists(client_, path.c_str());
+    return ret == 0;
+  }
+
   Status StatSelector(const std::string& wd, const std::string& path,
                       const FileSelector& select, int nesting_depth,
                       std::vector<FileInfo>* out) {
-    std::vector<io::HdfsPathInfo> children;
-    Status st = client_->ListDirectory(path, &children);
+    std::vector<HdfsPathInfo> children;
+    Status st = ListDirectory(path, &children);
     if (!st.ok()) {
       if (select.allow_not_found) {
         ARROW_ASSIGN_OR_RAISE(auto info, GetFileInfo(path));
@@ -129,6 +212,15 @@ class HadoopFileSystem::Impl {
     return Status::OK();
   }
 
+  Status GetWorkingDirectory(std::string* out) {
+    char buffer[2048];
+    if (driver_->GetWorkingDirectory(client_, buffer, sizeof(buffer) - 1) == nullptr) {
+      return IOErrorFromErrno(errno, "HDFS GetWorkingDirectory failed");
+    }
+    *out = buffer;
+    return Status::OK();
+  }
+
   Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) {
     // See GetFileInfo(const std::string&) above.
     if (select.base_dir.substr(0, 5) == "hdfs:") {
@@ -143,7 +235,7 @@ class HadoopFileSystem::Impl {
     // If select.base_dir is absolute, we need to trim the "URI authority"
     // portion of the working directory.
     std::string wd;
-    RETURN_NOT_OK(client_->GetWorkingDirectory(&wd));
+    RETURN_NOT_OK(GetWorkingDirectory(&wd));
 
     if (!select.base_dir.empty() && select.base_dir.front() == '/') {
       // base_dir is absolute, only keep the URI authority portion.
@@ -186,14 +278,44 @@ class HadoopFileSystem::Impl {
                              "': parent is not a directory");
       }
     }
-    RETURN_NOT_OK(client_->MakeDirectory(path));
+    RETURN_NOT_OK(MakeDirectory(path));
+    return Status::OK();
+  }
+
+  Status MakeDirectory(const std::string& path) {
+    if (IsDirectory(path)) {
+      return Status::OK();
+    }
+    int ret = driver_->MakeDirectory(client_, path.c_str());
+    CHECK_FAILURE(ret, "create directory");
+    return Status::OK();
+  }
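All of the thin wrappers in this hunk follow the same libhdfs convention: a failing call returns -1 (or a null pointer) and leaves the cause in the thread-local errno, which the CHECK_FAILURE macro defined at the top of this file converts into an arrow::Status. As a standalone sketch, this is what that conversion amounts to with the macro expanded by hand (CheckHdfsCall is an illustrative name, not part of the patch):

    #include <cerrno>

    #include "arrow/status.h"
    #include "arrow/util/io_util.h"  // arrow::internal::IOErrorFromErrno

    // Sketch: the check that CHECK_FAILURE(ret, "delete") performs, as a function.
    arrow::Status CheckHdfsCall(int return_value, const char* what) {
      if (return_value == -1) {
        // errno was set by the failing libhdfs call on the current thread
        return arrow::internal::IOErrorFromErrno(errno, "HDFS ", what, " failed");
      }
      return arrow::Status::OK();
    }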
+
+  Status GetPathInfoStatus(const std::string& path, HdfsPathInfo* info) {
+    hdfsFileInfo* entry = driver_->GetPathInfo(client_, path.c_str());
+
+    if (entry == nullptr) {
+      return GetPathInfoFailed(path);
+    }
+
+    SetPathInfo(entry, info);
+    driver_->FreeFileInfo(entry, 1);
+
+    return Status::OK();
+  }
+
+  Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) {
+    HdfsPathInfo info;
+    RETURN_NOT_OK(GetPathInfoStatus(path, &info));
+
+    file_info->set_size(info.size);
+    file_info->set_type(info.kind);
     return Status::OK();
   }
 
   Status CheckForDirectory(const std::string& path, const char* action) {
     // Check existence of path, and that it's a directory
-    io::HdfsPathInfo info;
-    RETURN_NOT_OK(client_->GetPathInfo(path, &info));
+    HdfsPathInfo info;
+    RETURN_NOT_OK(GetPathInfoStatus(path, &info));
     if (info.kind != FileType::Directory) {
       return Status::IOError("Cannot ", action, " directory '", path,
                              "': not a directory");
@@ -201,9 +323,57 @@ class HadoopFileSystem::Impl {
     return Status::OK();
   }
 
+  Status GetChildren(const std::string& path, std::vector<std::string>* listing) {
+    std::vector<HdfsPathInfo> detailed_listing;
+    RETURN_NOT_OK(ListDirectory(path, &detailed_listing));
+    for (const auto& info : detailed_listing) {
+      listing->push_back(info.name);
+    }
+    return Status::OK();
+  }
+
+  Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing) {
+    int num_entries = 0;
+    errno = 0;
+    hdfsFileInfo* entries = driver_->ListDirectory(client_, path.c_str(), &num_entries);
+
+    if (entries == nullptr) {
+      // If the directory is empty, entries is NULL but errno is 0. Non-zero
+      // errno indicates error
+      //
+      // Note: errno is thread-local
+      //
+      // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set
+      // errno 2/ENOENT for empty directories. To be more robust to this we
+      // double check this case
+      if ((errno == 0) || (errno == ENOENT && Exists(path))) {
+        num_entries = 0;
+      } else {
+        return IOErrorFromErrno(errno, "HDFS list directory failed");
+      }
+    }
+
+    // Allocate additional space for elements
+    int vec_offset = static_cast<int>(listing->size());
+    listing->resize(vec_offset + num_entries);
+
+    for (int i = 0; i < num_entries; ++i) {
+      SetPathInfo(entries + i, &(*listing)[vec_offset + i]);
+    }
+
+    // Free libhdfs file info
+    driver_->FreeFileInfo(entries, num_entries);
+
+    return Status::OK();
+  }
+
+  Status DeleteDirectory(const std::string& path) {
+    return Delete(path, /*recursive=*/true);
+  }
+
   Status DeleteDir(const std::string& path) {
     RETURN_NOT_OK(CheckForDirectory(path, "delete"));
-    return client_->DeleteDirectory(path);
+    return DeleteDirectory(path);
   }
 
   Status DeleteDirContents(const std::string& path, bool missing_dir_ok) {
@@ -216,9 +386,9 @@ class HadoopFileSystem::Impl {
     }
 
     std::vector<std::string> file_list;
-    RETURN_NOT_OK(client_->GetChildren(path, &file_list));
+    RETURN_NOT_OK(GetChildren(path, &file_list));
     for (const auto& file : file_list) {
-      RETURN_NOT_OK(client_->Delete(file, /*recursive=*/true));
+      RETURN_NOT_OK(Delete(file, /*recursive=*/true));
     }
     return Status::OK();
   }
@@ -227,35 +397,124 @@ class HadoopFileSystem::Impl {
     if (IsDirectory(path)) {
       return Status::IOError("path is a directory");
     }
-    RETURN_NOT_OK(client_->Delete(path));
+    RETURN_NOT_OK(Delete(path, /*recursive=*/false));
+    return Status::OK();
+  }
+
+  Status Delete(const std::string& path, bool recursive) {
+    int ret = driver_->Delete(client_, path.c_str(), static_cast<int>(recursive));
+    CHECK_FAILURE(ret, "delete");
+    return Status::OK();
+  }
+
+  Status Rename(const std::string& src, const std::string& dst) {
+    int ret = driver_->Rename(client_, src.c_str(), dst.c_str());
+    CHECK_FAILURE(ret, "Rename");
     return Status::OK();
   }
 
   Status Move(const std::string& src, const std::string& dest) {
-    auto st = client_->Rename(src, dest);
+    auto st = Rename(src, dest);
     if (st.IsIOError() && IsFile(src) && IsFile(dest)) {
       // Allow file -> file clobber
-      RETURN_NOT_OK(client_->Delete(dest));
-      st = client_->Rename(src, dest);
+      RETURN_NOT_OK(Delete(dest, /*recursive=*/false));
+      st = Rename(src, dest);
     }
     return st;
   }
 
+  Status Copy(const std::string& src, const std::string& dst) {
+    int ret = driver_->Copy(client_, src.c_str(), client_, dst.c_str());
+    CHECK_FAILURE(ret, "Copy");
+    return Status::OK();
+  }
+
   Status CopyFile(const std::string& src, const std::string& dest) {
-    return client_->Copy(src, dest);
+    return Copy(src, dest);
+  }
+
+  Status Chmod(const std::string& path, int mode) {
+    int ret = driver_->Chmod(client_, path.c_str(), static_cast<short>(mode));  // NOLINT
+    CHECK_FAILURE(ret, "Chmod");
+    return Status::OK();
+  }
+
+  Status Chown(const std::string& path, const char* owner, const char* group) {
+    int ret = driver_->Chown(client_, path.c_str(), owner, group);
+    CHECK_FAILURE(ret, "Chown");
+    return Status::OK();
+  }
+
+  Status OpenReadable(const std::string& path, int32_t buffer_size,
+                      const io::IOContext& io_context,
+                      std::shared_ptr<io::internal::HdfsReadableFile>* file) {
+    errno = 0;
+    hdfsFile handle =
+        driver_->OpenFile(client_, path.c_str(), O_RDONLY, buffer_size, 0, 0);
+
+    if (handle == nullptr) {
+      return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed");
+    }
+
+    // std::make_shared does not work with private ctors
+    RETURN_NOT_OK(
+        MakeReadableFile(path, buffer_size, io_context, driver_, client_, handle, file));
+
+    return Status::OK();
+  }
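For orientation before the overload list that follows: user code does not call these methods directly; it goes through the generic arrow::fs::FileSystem interface, whose OpenInputStream/OpenInputFile/OpenOutputStream entry points (shown further down) dispatch into the Impl open calls above. A hedged usage sketch, with host, port, and path purely illustrative:

    #include "arrow/filesystem/hdfs.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status ReadOneBuffer() {
      arrow::fs::HdfsOptions options;
      options.connection_config.host = "namenode.example.com";  // illustrative
      options.connection_config.port = 8020;                    // illustrative
      ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::HadoopFileSystem::Make(options));
      // Resolves to Impl::OpenReadable() with the default 64 KiB buffer size
      ARROW_ASSIGN_OR_RAISE(auto stream, fs->OpenInputStream("/tmp/example.bin"));
      ARROW_ASSIGN_OR_RAISE(auto buffer, stream->Read(1024));  // first KiB
      return stream->Close();
    }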
+
+  Status OpenReadable(const std::string& path, int32_t buffer_size,
+                      std::shared_ptr<io::internal::HdfsReadableFile>* file) {
+    return OpenReadable(path, buffer_size, io::default_io_context(), file);
+  }
+
+  Status OpenReadable(const std::string& path,
+                      std::shared_ptr<io::internal::HdfsReadableFile>* file) {
+    return OpenReadable(path, kDefaultHdfsBufferSize, io::default_io_context(), file);
+  }
+
+  Status OpenReadable(const std::string& path, const io::IOContext& io_context,
+                      std::shared_ptr<io::internal::HdfsReadableFile>* file) {
+    return OpenReadable(path, kDefaultHdfsBufferSize, io_context, file);
+  }
+
+  Status OpenWritable(const std::string& path, bool append, int32_t buffer_size,
+                      int16_t replication, int64_t default_block_size,
+                      std::shared_ptr<HdfsOutputStream>* file) {
+    int flags = O_WRONLY;
+    if (append) flags |= O_APPEND;
+
+    errno = 0;
+    hdfsFile handle =
+        driver_->OpenFile(client_, path.c_str(), flags, buffer_size, replication,
+                          static_cast<tSize>(default_block_size));
+
+    if (handle == nullptr) {
+      return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed");
+    }
+
+    // std::make_shared does not work with private ctors
+    RETURN_NOT_OK(MakeOutputStream(path, buffer_size, driver_, client_, handle, file));
+
+    return Status::OK();
+  }
+
+  Status OpenWritable(const std::string& path, bool append,
+                      std::shared_ptr<HdfsOutputStream>* file) {
+    return OpenWritable(path, append, 0, 0, 0, file);
+  }
 
   Result<std::shared_ptr<io::InputStream>> OpenInputStream(const std::string& path) {
     ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
-    std::shared_ptr<io::HdfsReadableFile> file;
-    RETURN_NOT_OK(client_->OpenReadable(path, io_context_, &file));
+    std::shared_ptr<io::internal::HdfsReadableFile> file;
+    RETURN_NOT_OK(OpenReadable(path, io_context_, &file));
     return file;
   }
 
   Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(const std::string& path) {
     ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
-    std::shared_ptr<io::HdfsReadableFile> file;
-    RETURN_NOT_OK(client_->OpenReadable(path, io_context_, &file));
+    std::shared_ptr<io::internal::HdfsReadableFile> file;
+    RETURN_NOT_OK(OpenReadable(path, io_context_, &file));
     return file;
   }
 
@@ -269,12 +528,27 @@ class HadoopFileSystem::Impl {
     return OpenOutputStreamGeneric(path, append);
   }
 
+  Status GetCapacity(int64_t* nbytes) {
+    tOffset ret = driver_->GetCapacity(client_);
+    CHECK_FAILURE(ret, "GetCapacity");
+    *nbytes = ret;
+    return Status::OK();
+  }
+
+  Status GetUsed(int64_t* nbytes) {
+    tOffset ret = driver_->GetUsed(client_);
+    CHECK_FAILURE(ret, "GetUsed");
+    *nbytes = ret;
+    return Status::OK();
+  }
+
  protected:
+  hdfsFS client_;
   const HdfsOptions options_;
   const io::IOContext io_context_;
-  std::shared_ptr<::arrow::io::HadoopFileSystem> client_;
+  io::internal::LibHdfsShim* driver_;
 
-  void PathInfoToFileInfo(const io::HdfsPathInfo& info, FileInfo* out) {
+  void PathInfoToFileInfo(const HdfsPathInfo& info, FileInfo* out) {
     if (info.kind == FileType::Directory) {
       out->set_type(FileType::Directory);
       out->set_size(kNoSize);
@@ -288,25 +562,24 @@ class HadoopFileSystem::Impl {
   Result<std::shared_ptr<io::OutputStream>> OpenOutputStreamGeneric(
       const std::string& path, bool append) {
     ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path));
-    std::shared_ptr<io::HdfsOutputStream> stream;
-    RETURN_NOT_OK(client_->OpenWritable(path, append, options_.buffer_size,
-                                        options_.replication, options_.default_block_size,
-                                        &stream));
+    std::shared_ptr<HdfsOutputStream> stream;
+    RETURN_NOT_OK(OpenWritable(path, append, options_.buffer_size, options_.replication,
+                               options_.default_block_size, &stream));
     return stream;
   }
 
   bool IsDirectory(const std::string& path) {
-    io::HdfsPathInfo info;
+    HdfsPathInfo info;
    return GetPathInfo(path, &info) && info.kind == FileType::Directory;
   }
 
   bool IsFile(const std::string& path) {
-    io::HdfsPathInfo info;
+    HdfsPathInfo info;
    return GetPathInfo(path, &info) &&
info.kind == FileType::File; } - bool GetPathInfo(const std::string& path, io::HdfsPathInfo* info) { - return client_->GetPathInfo(path, info).ok(); + bool GetPathInfo(const std::string& path, HdfsPathInfo* info) { + return GetPathInfoStatus(path, info).ok(); } TimePoint ToTimePoint(int secs) { @@ -534,5 +807,93 @@ Result> HadoopFileSystem::OpenAppendStream( return impl_->OpenAppendStream(path); } +Status HadoopFileSystem::MakeDirectory(const std::string& path) { + return impl_->MakeDirectory(path); +} + +Status HadoopFileSystem::Delete(const std::string& path, bool recursive) { + return impl_->Delete(path, recursive); +} + +bool HadoopFileSystem::Exists(const std::string& path) { return impl_->Exists(path); } + +Status HadoopFileSystem::GetPathInfoStatus(const std::string& path, HdfsPathInfo* info) { + return impl_->GetPathInfoStatus(path, info); +} + +Status HadoopFileSystem::GetCapacity(int64_t* nbytes) { + return impl_->GetCapacity(nbytes); +} + +Status HadoopFileSystem::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); } + +Status HadoopFileSystem::ListDirectory(const std::string& path, + std::vector* listing) { + return impl_->ListDirectory(path, listing); +} + +Status HadoopFileSystem::Chmod(const std::string& path, int mode) { + return impl_->Chmod(path, mode); +} + +Status HadoopFileSystem::Chown(const std::string& path, const char* owner, + const char* group) { + return impl_->Chown(path, owner, group); +} + +Status HadoopFileSystem::Rename(const std::string& src, const std::string& dst) { + return impl_->Rename(src, dst); +} + +Status HadoopFileSystem::Copy(const std::string& src, const std::string& dst) { + return impl_->Copy(src, dst); +} + +Status HadoopFileSystem::OpenReadable( + const std::string& path, int32_t buffer_size, + std::shared_ptr* file) { + return impl_->OpenReadable(path, buffer_size, file); +}; + +Status HadoopFileSystem::OpenReadable( + const std::string& path, int32_t buffer_size, const io::IOContext& io_context, + std::shared_ptr* file) { + return impl_->OpenReadable(path, buffer_size, io_context, file); +}; + +Status HadoopFileSystem::OpenReadable( + const std::string& path, + std::shared_ptr* file) { + return impl_->OpenReadable(path, file); +}; + +Status HadoopFileSystem::OpenReadable( + const std::string& path, const io::IOContext& io_context, + std::shared_ptr* file) { + return impl_->OpenReadable(path, io_context, file); +}; + +Status HadoopFileSystem::OpenWritable( + const std::string& path, bool append, int32_t buffer_size, int16_t replication, + int64_t default_block_size, + std::shared_ptr* file) { + return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, + file); +}; + +Status HadoopFileSystem::OpenWritable( + const std::string& path, bool append, + std::shared_ptr* file) { + return impl_->OpenWritable(path, append, file); +}; + +// ---------------------------------------------------------------------- +// Allow public API users to check whether we are set up correctly + +Status HaveLibHdfs() { + io::internal::LibHdfsShim* driver; + return io::internal::ConnectLibHdfs(&driver); +} + } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index f633dd34951..0985f4cc56c 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -21,19 +21,41 @@ #include #include +// #include +// #include + #include "arrow/filesystem/filesystem.h" -#include "arrow/filesystem/hdfs_io.h" #include "arrow/util/uri.h" +// #include 
"arrow/filesystem/type_fwd.h" +// #include "arrow/io/interfaces.h" +// #include "arrow/util/macros.h" +// #include "arrow/util/visibility.h" + +namespace arrow::io::internal { +class HdfsReadableFile; +class HdfsOutputStream; + +struct HdfsPathInfo; +} // namespace arrow::io::internal + namespace arrow::fs { +struct HdfsConnectionConfig { + std::string host; + int port; + std::string user; + std::string kerb_ticket; + std::unordered_map extra_conf; +}; + /// Options for the HDFS implementation. struct ARROW_EXPORT HdfsOptions { HdfsOptions() = default; ~HdfsOptions() = default; /// Hdfs configuration options, contains host, port, driver - io::HdfsConnectionConfig connection_config; + HdfsConnectionConfig connection_config; /// Used by Hdfs OpenWritable Interface. int32_t buffer_size = 0; @@ -55,10 +77,6 @@ struct ARROW_EXPORT HdfsOptions { }; /// HDFS-backed FileSystem implementation. -/// -/// implementation notes: -/// - This is a wrapper of arrow/filesystem/hdfs_io, so we can use FileSystem API to -/// handle hdfs. class ARROW_EXPORT HadoopFileSystem : public FileSystem { public: ~HadoopFileSystem() override; @@ -89,10 +107,49 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { Status DeleteFile(const std::string& path) override; + Status MakeDirectory(const std::string& path); + + bool Exists(const std::string& path); + + Status GetPathInfoStatus(const std::string& path, io::internal::HdfsPathInfo* info); + + Status ListDirectory(const std::string& path, + std::vector* listing); + + // Delete file or directory + // @param path absolute path to data + // @param recursive if path is a directory, delete contents as well + // @returns error status on failure + Status Delete(const std::string& path, bool recursive = false); + Status Move(const std::string& src, const std::string& dest) override; Status CopyFile(const std::string& src, const std::string& dest) override; + // Move file or directory from source path to destination path within the + // current filesystem + Status Rename(const std::string& src, const std::string& dst); + + Status Copy(const std::string& src, const std::string& dst); + + Status GetCapacity(int64_t* nbytes); + + Status GetUsed(int64_t* nbytes); + + /// Change + /// + /// @param path file path to change + /// @param owner pass null for no change + /// @param group pass null for no change + Status Chown(const std::string& path, const char* owner, const char* group); + + /// Change path permissions + /// + /// \param path Absolute path in file system + /// \param mode Mode bitset + /// \return Status + Status Chmod(const std::string& path, int mode); + Result> OpenInputStream( const std::string& path) override; Result> OpenInputFile( @@ -108,6 +165,35 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { static Result> Make( const HdfsOptions& options, const io::IOContext& = io::default_io_context()); + // Open an HDFS file in READ mode. Returns error + // status if the file is not found. 
+ // + // @param path complete file path + Status OpenReadable(const std::string& path, int32_t buffer_size, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, const io::IOContext& io_context, + std::shared_ptr* file); + + // FileMode::WRITE options + // @param path complete file path + // @param buffer_size 0 by default + // @param replication 0 by default + // @param default_block_size 0 by default + Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file); + + Status OpenWritable(const std::string& path, bool append, + std::shared_ptr* file); + protected: HadoopFileSystem(const HdfsOptions& options, const io::IOContext&); @@ -115,4 +201,6 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { std::unique_ptr impl_; }; +ARROW_EXPORT Status HaveLibHdfs(); + } // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index 16a18dc6dc7..2d6bbe01e13 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -39,24 +39,44 @@ #include #include "arrow/util/basic_decimal.h" +// #include +// #include +// #include +// #include +// #include +// #include + #ifndef _WIN32 # include #endif +#include "arrow/buffer.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/io_util.h" #include "arrow/util/logging_internal.h" +// #include "arrow/filesystem/type_fwd.h" +// #include "arrow/io/interfaces.h" +// #include "arrow/memory_pool.h" + namespace arrow { using internal::GetEnvVarNative; +using internal::IOErrorFromErrno; using internal::PlatformFilename; namespace io::internal { namespace { +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + return IOErrorFromErrno(errno, "HDFS ", WHAT, " failed"); \ + } \ + } while (0) + template Status SetSymbol(void* handle, char const* name, T** symbol) { if (*symbol != nullptr) return Status::OK(); @@ -502,5 +522,306 @@ int LibHdfsShim::Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime) { } } +// ---------------------------------------------------------------------- +// File reading + +class HdfsAnyFileImpl { + public: + void set_members(const std::string& path, internal::LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle) { + path_ = path; + driver_ = driver; + fs_ = fs; + file_ = handle; + is_open_ = true; + } + + Status Seek(int64_t position) { + RETURN_NOT_OK(CheckClosed()); + int ret = driver_->Seek(fs_, file_, position); + CHECK_FAILURE(ret, "seek"); + return Status::OK(); + } + + Result Tell() { + RETURN_NOT_OK(CheckClosed()); + int64_t ret = driver_->Tell(fs_, file_); + CHECK_FAILURE(ret, "tell"); + return ret; + } + + bool is_open() const { return is_open_; } + + protected: + Status CheckClosed() { + if (!is_open_) { + return Status::Invalid("Operation on closed HDFS file"); + } + return Status::OK(); + } + + std::string path_; + + internal::LibHdfsShim* driver_; + + // For threadsafety + std::mutex lock_; + + // These are pointers in libhdfs, so OK to copy + hdfsFS fs_; + hdfsFile file_; + + bool is_open_; +}; + +namespace { + +Status GetPathInfoFailed(const std::string& path) { + return IOErrorFromErrno(errno, "Calling GetPathInfo for '", path, "' failed"); 
+} + +} // namespace + +// Private implementation for read-only files +class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { + public: + explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} + + Status Close() { + if (is_open_) { + // is_open_ must be set to false in the beginning, because the destructor + // attempts to close the stream again, and if the first close fails, then + // the error doesn't get propagated properly and the second close + // initiated by the destructor raises a segfault + is_open_ = false; + int ret = driver_->CloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + } + return Status::OK(); + } + + bool closed() const { return !is_open_; } + + Result ReadAt(int64_t position, int64_t nbytes, uint8_t* buffer) { + RETURN_NOT_OK(CheckClosed()); + if (!driver_->HasPread()) { + std::lock_guard guard(lock_); + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, buffer); + } + + constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); + int64_t total_bytes = 0; + while (nbytes > 0) { + const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); + tSize ret = + driver_->Pread(fs_, file_, static_cast(position), buffer, block_size); + CHECK_FAILURE(ret, "read"); + DCHECK_LE(ret, block_size); + if (ret == 0) { + break; // EOF + } + buffer += ret; + total_bytes += ret; + position += ret; + nbytes -= ret; + } + return total_bytes; + } + + Result> ReadAt(int64_t position, int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + ReadAt(position, nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + buffer->ZeroPadding(); + } + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buffer)); + } + + Result Read(int64_t nbytes, void* buffer) { + RETURN_NOT_OK(CheckClosed()); + + int64_t total_bytes = 0; + while (total_bytes < nbytes) { + tSize ret = driver_->Read( + fs_, file_, reinterpret_cast(buffer) + total_bytes, + static_cast(std::min(buffer_size_, nbytes - total_bytes))); + CHECK_FAILURE(ret, "read"); + total_bytes += ret; + if (ret == 0) { + break; + } + } + return total_bytes; + } + + Result> Read(int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data())); + if (bytes_read < nbytes) { + RETURN_NOT_OK(buffer->Resize(bytes_read)); + } + // R build with openSUSE155 requires an explicit shared_ptr construction + return std::shared_ptr(std::move(buffer)); + } + + Result GetSize() { + RETURN_NOT_OK(CheckClosed()); + + hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); + if (entry == nullptr) { + return GetPathInfoFailed(path_); + } + int64_t size = entry->mSize; + driver_->FreeFileInfo(entry, 1); + return size; + } + + void set_memory_pool(MemoryPool* pool) { pool_ = pool; } + + void set_buffer_size(int32_t buffer_size) { buffer_size_ = buffer_size; } + + private: + MemoryPool* pool_; + int32_t buffer_size_; +}; + +HdfsReadableFile::HdfsReadableFile(const io::IOContext& io_context) { + impl_.reset(new HdfsReadableFileImpl(io_context.pool())); +} + +HdfsReadableFile::~HdfsReadableFile() { + ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsReadableFile"); +} + +Status HdfsReadableFile::Close() { return impl_->Close(); } + +bool 
HdfsReadableFile::closed() const { return impl_->closed(); } + +Result HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, void* buffer) { + return impl_->ReadAt(position, nbytes, reinterpret_cast(buffer)); +} + +Result> HdfsReadableFile::ReadAt(int64_t position, + int64_t nbytes) { + return impl_->ReadAt(position, nbytes); +} + +Result HdfsReadableFile::Read(int64_t nbytes, void* buffer) { + return impl_->Read(nbytes, buffer); +} + +Result> HdfsReadableFile::Read(int64_t nbytes) { + return impl_->Read(nbytes); +} + +Result HdfsReadableFile::GetSize() { return impl_->GetSize(); } + +Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); } + +Result HdfsReadableFile::Tell() const { return impl_->Tell(); } + +// ---------------------------------------------------------------------- +// File writing + +// Private implementation for writable-only files +class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { + public: + HdfsOutputStreamImpl() {} + + Status Close() { + if (is_open_) { + // is_open_ must be set to false in the beginning, because the destructor + // attempts to close the stream again, and if the first close fails, then + // the error doesn't get propagated properly and the second close + // initiated by the destructor raises a segfault + is_open_ = false; + RETURN_NOT_OK(FlushInternal()); + int ret = driver_->CloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + } + return Status::OK(); + } + + bool closed() const { return !is_open_; } + + Status Flush() { + RETURN_NOT_OK(CheckClosed()); + + return FlushInternal(); + } + + Status Write(const uint8_t* buffer, int64_t nbytes) { + RETURN_NOT_OK(CheckClosed()); + + constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); + + std::lock_guard guard(lock_); + while (nbytes > 0) { + const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); + tSize ret = driver_->Write(fs_, file_, buffer, block_size); + CHECK_FAILURE(ret, "Write"); + DCHECK_LE(ret, block_size); + buffer += ret; + nbytes -= ret; + } + return Status::OK(); + } + + protected: + Status FlushInternal() { + int ret = driver_->Flush(fs_, file_); + CHECK_FAILURE(ret, "Flush"); + return Status::OK(); + } +}; + +HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } + +HdfsOutputStream::~HdfsOutputStream() { + ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsOutputStream"); +} + +Status HdfsOutputStream::Close() { return impl_->Close(); } + +bool HdfsOutputStream::closed() const { return impl_->closed(); } + +Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes) { + return impl_->Write(reinterpret_cast(buffer), nbytes); +} + +Status HdfsOutputStream::Flush() { return impl_->Flush(); } + +Result HdfsOutputStream::Tell() const { return impl_->Tell(); } + +// Factory function to construct a HdfsReadableFile for use with the public API. 
+Status MakeReadableFile(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, internal::LibHdfsShim* driver, + hdfsFS fs, hdfsFile file, + std::shared_ptr* out) { + // Must use `new` due to private constructor + std::shared_ptr file_obj(new HdfsReadableFile(io_context)); + file_obj->impl_->set_members(path, driver, fs, file); + file_obj->impl_->set_buffer_size(buffer_size); + *out = std::move(file_obj); + return Status::OK(); +} + +Status MakeOutputStream(const std::string& path, int32_t buffer_size, LibHdfsShim* driver, + hdfsFS fs, hdfsFile file, + std::shared_ptr* out) { + std::shared_ptr stream(new HdfsOutputStream()); + stream->impl_->set_members(path, driver, fs, file); + *out = std::move(stream); + return Status::OK(); +} + } // namespace io::internal } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h index 4b6b4884c00..116b37a697b 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.h +++ b/cpp/src/arrow/filesystem/hdfs_internal.h @@ -22,9 +22,22 @@ #include +// #include +// #include +// #include +// #include +// #include + +#include "arrow/filesystem/filesystem.h" +#include "arrow/filesystem/hdfs.h" +#include "arrow/filesystem/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/io_util.h" #include "arrow/util/visibility.h" #include "arrow/util/windows_compatibility.h" // IWYU pragma: keep +// #include "arrow/util/macros.h" + using std::size_t; struct hdfsBuilder; @@ -33,8 +46,13 @@ namespace arrow { class Status; +using internal::IOErrorFromErrno; + namespace io::internal { +class HdfsReadableFile; +class HdfsOutputStream; + // NOTE(wesm): cpplint does not like use of short and other imprecise C types struct LibHdfsShim { void* handle; @@ -210,5 +228,99 @@ struct LibHdfsShim { // TODO(wesm): Remove these exports when we are linking statically ARROW_EXPORT Status ConnectLibHdfs(LibHdfsShim** driver); +struct HdfsPathInfo { + arrow::fs::FileType kind; + + std::string name; + std::string owner; + std::string group; + + // Access times in UNIX timestamps (seconds) + int64_t size; + int64_t block_size; + + int32_t last_modified_time; + int32_t last_access_time; + + int16_t replication; + int16_t permissions; +}; + +class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { + public: + ~HdfsReadableFile() override; + + Status Close() override; + + bool closed() const override; + + // NOTE: If you wish to read a particular range of a file in a multithreaded + // context, you may prefer to use ReadAt to avoid locking issues + Result Read(int64_t nbytes, void* out) override; + Result> Read(int64_t nbytes) override; + Result ReadAt(int64_t position, int64_t nbytes, void* out) override; + Result> ReadAt(int64_t position, int64_t nbytes) override; + + Status Seek(int64_t position) override; + Result Tell() const override; + Result GetSize() override; + + private: + explicit HdfsReadableFile(const io::IOContext&); + + class ARROW_NO_EXPORT HdfsReadableFileImpl; + std::unique_ptr impl_; + + friend class arrow::fs::HadoopFileSystem; + + friend Status MakeReadableFile(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, LibHdfsShim* driver, + hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); +}; + +// Naming this file OutputStream because it does not support seeking (like the +// WritableFile interface) +class ARROW_EXPORT HdfsOutputStream : public OutputStream { + public: + ~HdfsOutputStream() 
override; + + Status Close() override; + + bool closed() const override; + + using OutputStream::Write; + Status Write(const void* buffer, int64_t nbytes) override; + + Status Flush() override; + + Result Tell() const override; + + private: + class ARROW_NO_EXPORT HdfsOutputStreamImpl; + std::unique_ptr impl_; + + friend class arrow::fs::HadoopFileSystem; + + HdfsOutputStream(); + + friend Status MakeOutputStream(const std::string& path, int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); +}; + +ARROW_EXPORT Status MakeReadableFile(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, LibHdfsShim* driver, + hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + +ARROW_EXPORT Status MakeOutputStream(const std::string& path, int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + } // namespace io::internal } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_io_test.cc b/cpp/src/arrow/filesystem/hdfs_internal_test.cc similarity index 87% rename from cpp/src/arrow/filesystem/hdfs_io_test.cc rename to cpp/src/arrow/filesystem/hdfs_internal_test.cc index 52278b524eb..bd1460e3543 100644 --- a/cpp/src/arrow/filesystem/hdfs_io_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal_test.cc @@ -31,7 +31,6 @@ #include "arrow/buffer.h" #include "arrow/filesystem/hdfs_internal.h" -#include "arrow/filesystem/hdfs_io.h" #include "arrow/filesystem/type_fwd.h" #include "arrow/io/interfaces.h" #include "arrow/status.h" @@ -39,6 +38,12 @@ #include "arrow/testing/util.h" namespace arrow { + +using arrow::fs::HadoopFileSystem; +using arrow::fs::HdfsOptions; +using io::internal::HdfsPathInfo; +using io::internal::HdfsReadableFile; + namespace io { std::vector RandomData(int64_t size) { @@ -59,13 +64,13 @@ class TestHadoopFileSystem : public ::testing::Test { Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, bool append = false, int buffer_size = 0, int16_t replication = 0, int default_block_size = 0) { - std::shared_ptr file; - RETURN_NOT_OK(client_->OpenWritable(path, append, buffer_size, replication, - default_block_size, &file)); - - RETURN_NOT_OK(file->Write(buffer, size)); - RETURN_NOT_OK(file->Close()); + { + std::shared_ptr file; + RETURN_NOT_OK(client_->OpenWritable(path, append, buffer_size, replication, + default_block_size, &file)); + RETURN_NOT_OK(file->Write(buffer, size)); + } return Status::OK(); } @@ -119,7 +124,10 @@ class TestHadoopFileSystem : public ::testing::Test { conf_.user = user; conf_.port = port == nullptr ? 
20500 : atoi(port);
-    ASSERT_OK(HadoopFileSystem::Connect(&conf_, &client_));
+    HdfsOptions options;
+    options.connection_config = conf_;
+    io::IOContext io_context = io::default_io_context();
+    ASSERT_OK_AND_ASSIGN(client_, HadoopFileSystem::Make(options, io_context));
   }
 
   void TearDown() {
@@ -127,11 +135,10 @@ class TestHadoopFileSystem : public ::testing::Test {
       if (client_->Exists(scratch_dir_)) {
        ARROW_EXPECT_OK(client_->Delete(scratch_dir_, true));
       }
-      ARROW_EXPECT_OK(client_->Disconnect());
     }
   }
 
-  HdfsConnectionConfig conf_;
+  arrow::fs::HdfsConnectionConfig conf_;
   bool loaded_driver_;
 
   // Resources shared amongst unit tests
@@ -148,8 +155,13 @@
 TEST_F(TestHadoopFileSystem, ConnectsAgain) {
   SKIP_IF_NO_DRIVER();
 
   std::shared_ptr<HadoopFileSystem> client;
-  ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client));
-  ASSERT_OK(client->Disconnect());
+
+  {
+    HdfsOptions options;
+    options.connection_config = this->conf_;
+    io::IOContext io_context = io::default_io_context();
+    // Assign to the local client declared above, not the client_ member,
+    // so the fixture connection from SetUp() stays intact.
+    ASSERT_OK_AND_ASSIGN(client, HadoopFileSystem::Make(options, io_context));
+  }
 }
 
 TEST_F(TestHadoopFileSystem, MultipleClients) {
@@ -159,14 +171,18 @@ TEST_F(TestHadoopFileSystem, MultipleClients) {
 
   ASSERT_OK(this->MakeScratchDir());
 
-  std::shared_ptr<HadoopFileSystem> client1;
-  std::shared_ptr<HadoopFileSystem> client2;
-  ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client1));
-  ASSERT_OK(HadoopFileSystem::Connect(&this->conf_, &client2));
-  ASSERT_OK(client1->Disconnect());
 
-  // client2 continues to function after equivalent client1 has shutdown
-  std::vector<HdfsPathInfo> listing;
-  ASSERT_OK(client2->ListDirectory(this->scratch_dir_, &listing));
-  ASSERT_OK(client2->Disconnect());
+  std::shared_ptr<HadoopFileSystem> client1;
+  std::shared_ptr<HadoopFileSystem> client2;
+
+  io::IOContext io_context = io::default_io_context();
+  HdfsOptions options;
+  options.connection_config = this->conf_;
+  { ASSERT_OK_AND_ASSIGN(client1, HadoopFileSystem::Make(options, io_context)); }
+  {
+    ASSERT_OK_AND_ASSIGN(client2, HadoopFileSystem::Make(options, io_context));
+
+    // Release the first client so its destructor disconnects it
+    client1.reset();
+
+    // client2 continues to function after equivalent client1 has shutdown
+    std::vector<HdfsPathInfo> listing;
+    ASSERT_OK(client2->ListDirectory(this->scratch_dir_, &listing));
+  }
 }
 
 TEST_F(TestHadoopFileSystem, MakeDirectory) {
@@ -201,7 +217,7 @@ TEST_F(TestHadoopFileSystem, GetCapacityUsed) {
   ASSERT_LT(0, nbytes);
 }
 
-TEST_F(TestHadoopFileSystem, GetPathInfo) {
+TEST_F(TestHadoopFileSystem, GetPathInfoStatus) {
   SKIP_IF_NO_DRIVER();
 
   HdfsPathInfo info;
@@ -209,7 +225,7 @@
   ASSERT_OK(this->MakeScratchDir());
 
   // Directory info
-  ASSERT_OK(this->client_->GetPathInfo(this->scratch_dir_, &info));
+  ASSERT_OK(this->client_->GetPathInfoStatus(this->scratch_dir_, &info));
   ASSERT_EQ(arrow::fs::FileType::Directory, info.kind);
   ASSERT_EQ(this->HdfsAbsPath(this->scratch_dir_), info.name);
   ASSERT_EQ(this->conf_.user, info.owner);
@@ -223,7 +239,7 @@
   std::vector<uint8_t> buffer = RandomData(size);
   ASSERT_OK(this->WriteDummyFile(path, buffer.data(), size));
-  ASSERT_OK(this->client_->GetPathInfo(path, &info));
+  ASSERT_OK(this->client_->GetPathInfoStatus(path, &info));
 
   ASSERT_EQ(arrow::fs::FileType::File, info.kind);
   ASSERT_EQ(this->HdfsAbsPath(path), info.name);
@@ -231,7 +247,7 @@
   ASSERT_EQ(size, info.size);
 }
 
-TEST_F(TestHadoopFileSystem, GetPathInfoNotExist) {
+TEST_F(TestHadoopFileSystem, GetPathInfoStatusNotExist) {
   // ARROW-2919: Test that the error message is reasonable
   SKIP_IF_NO_DRIVER();
 
   ASSERT_OK(this->MakeScratchDir());
 
   auto path = this->ScratchPath("path-does-not-exist");
 
   HdfsPathInfo info;
-  Status s =
this->client_->GetPathInfo(path, &info); + Status s = this->client_->GetPathInfoStatus(path, &info); ASSERT_TRUE(s.IsIOError()); const std::string error_message = s.ToString(); @@ -263,7 +279,7 @@ TEST_F(TestHadoopFileSystem, AppendToFile) { ASSERT_OK(this->WriteDummyFile(path, buffer.data(), size, true)); HdfsPathInfo info; - ASSERT_OK(this->client_->GetPathInfo(path, &info)); + ASSERT_OK(this->client_->GetPathInfoStatus(path, &info)); ASSERT_EQ(size * 2, info.size); } @@ -406,13 +422,13 @@ TEST_F(TestHadoopFileSystem, ChmodChown) { HdfsPathInfo info; ASSERT_OK(this->client_->Chmod(path, mode)); - ASSERT_OK(this->client_->GetPathInfo(path, &info)); + ASSERT_OK(this->client_->GetPathInfoStatus(path, &info)); ASSERT_EQ(mode, info.permissions); std::string owner = "hadoop"; std::string group = "hadoop"; ASSERT_OK(this->client_->Chown(path, owner.c_str(), group.c_str())); - ASSERT_OK(this->client_->GetPathInfo(path, &info)); + ASSERT_OK(this->client_->GetPathInfoStatus(path, &info)); ASSERT_EQ("hadoop", info.owner); ASSERT_EQ("hadoop", info.group); } diff --git a/cpp/src/arrow/filesystem/hdfs_io.cc b/cpp/src/arrow/filesystem/hdfs_io.cc deleted file mode 100644 index 2c341fcf1cc..00000000000 --- a/cpp/src/arrow/filesystem/hdfs_io.cc +++ /dev/null @@ -1,720 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/filesystem/hdfs_internal.h" -#include "arrow/filesystem/hdfs_io.h" -#include "arrow/filesystem/type_fwd.h" -#include "arrow/io/interfaces.h" -#include "arrow/memory_pool.h" -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/io_util.h" -#include "arrow/util/logging_internal.h" - -using std::size_t; - -namespace arrow { - -using internal::IOErrorFromErrno; - -namespace io { - -#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ - do { \ - if (RETURN_VALUE == -1) { \ - return IOErrorFromErrno(errno, "HDFS ", WHAT, " failed"); \ - } \ - } while (0) - -static constexpr int kDefaultHdfsBufferSize = 1 << 16; - -// ---------------------------------------------------------------------- -// File reading - -class HdfsAnyFileImpl { - public: - void set_members(const std::string& path, internal::LibHdfsShim* driver, hdfsFS fs, - hdfsFile handle) { - path_ = path; - driver_ = driver; - fs_ = fs; - file_ = handle; - is_open_ = true; - } - - Status Seek(int64_t position) { - RETURN_NOT_OK(CheckClosed()); - int ret = driver_->Seek(fs_, file_, position); - CHECK_FAILURE(ret, "seek"); - return Status::OK(); - } - - Result Tell() { - RETURN_NOT_OK(CheckClosed()); - int64_t ret = driver_->Tell(fs_, file_); - CHECK_FAILURE(ret, "tell"); - return ret; - } - - bool is_open() const { return is_open_; } - - protected: - Status CheckClosed() { - if (!is_open_) { - return Status::Invalid("Operation on closed HDFS file"); - } - return Status::OK(); - } - - std::string path_; - - internal::LibHdfsShim* driver_; - - // For threadsafety - std::mutex lock_; - - // These are pointers in libhdfs, so OK to copy - hdfsFS fs_; - hdfsFile file_; - - bool is_open_; -}; - -namespace { - -Status GetPathInfoFailed(const std::string& path) { - return IOErrorFromErrno(errno, "Calling GetPathInfo for '", path, "' failed"); -} - -} // namespace - -// Private implementation for read-only files -class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { - public: - explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} - - Status Close() { - if (is_open_) { - // is_open_ must be set to false in the beginning, because the destructor - // attempts to close the stream again, and if the first close fails, then - // the error doesn't get propagated properly and the second close - // initiated by the destructor raises a segfault - is_open_ = false; - int ret = driver_->CloseFile(fs_, file_); - CHECK_FAILURE(ret, "CloseFile"); - } - return Status::OK(); - } - - bool closed() const { return !is_open_; } - - Result ReadAt(int64_t position, int64_t nbytes, uint8_t* buffer) { - RETURN_NOT_OK(CheckClosed()); - if (!driver_->HasPread()) { - std::lock_guard guard(lock_); - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, buffer); - } - - constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); - int64_t total_bytes = 0; - while (nbytes > 0) { - const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); - tSize ret = - driver_->Pread(fs_, file_, static_cast(position), buffer, block_size); - CHECK_FAILURE(ret, "read"); - DCHECK_LE(ret, block_size); - if (ret == 0) { - break; // EOF - } - buffer += ret; - total_bytes += ret; - position += ret; - nbytes -= ret; - } - return total_bytes; - } - - Result> ReadAt(int64_t position, int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - ARROW_ASSIGN_OR_RAISE(auto buffer, 
AllocateResizableBuffer(nbytes, pool_)); - ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, - ReadAt(position, nbytes, buffer->mutable_data())); - if (bytes_read < nbytes) { - RETURN_NOT_OK(buffer->Resize(bytes_read)); - buffer->ZeroPadding(); - } - // R build with openSUSE155 requires an explicit shared_ptr construction - return std::shared_ptr(std::move(buffer)); - } - - Result Read(int64_t nbytes, void* buffer) { - RETURN_NOT_OK(CheckClosed()); - - int64_t total_bytes = 0; - while (total_bytes < nbytes) { - tSize ret = driver_->Read( - fs_, file_, reinterpret_cast(buffer) + total_bytes, - static_cast(std::min(buffer_size_, nbytes - total_bytes))); - CHECK_FAILURE(ret, "read"); - total_bytes += ret; - if (ret == 0) { - break; - } - } - return total_bytes; - } - - Result> Read(int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, pool_)); - ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data())); - if (bytes_read < nbytes) { - RETURN_NOT_OK(buffer->Resize(bytes_read)); - } - // R build with openSUSE155 requires an explicit shared_ptr construction - return std::shared_ptr(std::move(buffer)); - } - - Result GetSize() { - RETURN_NOT_OK(CheckClosed()); - - hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path_.c_str()); - if (entry == nullptr) { - return GetPathInfoFailed(path_); - } - int64_t size = entry->mSize; - driver_->FreeFileInfo(entry, 1); - return size; - } - - void set_memory_pool(MemoryPool* pool) { pool_ = pool; } - - void set_buffer_size(int32_t buffer_size) { buffer_size_ = buffer_size; } - - private: - MemoryPool* pool_; - int32_t buffer_size_; -}; - -HdfsReadableFile::HdfsReadableFile(const io::IOContext& io_context) { - impl_.reset(new HdfsReadableFileImpl(io_context.pool())); -} - -HdfsReadableFile::~HdfsReadableFile() { - ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsReadableFile"); -} - -Status HdfsReadableFile::Close() { return impl_->Close(); } - -bool HdfsReadableFile::closed() const { return impl_->closed(); } - -Result HdfsReadableFile::ReadAt(int64_t position, int64_t nbytes, void* buffer) { - return impl_->ReadAt(position, nbytes, reinterpret_cast(buffer)); -} - -Result> HdfsReadableFile::ReadAt(int64_t position, - int64_t nbytes) { - return impl_->ReadAt(position, nbytes); -} - -Result HdfsReadableFile::Read(int64_t nbytes, void* buffer) { - return impl_->Read(nbytes, buffer); -} - -Result> HdfsReadableFile::Read(int64_t nbytes) { - return impl_->Read(nbytes); -} - -Result HdfsReadableFile::GetSize() { return impl_->GetSize(); } - -Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); } - -Result HdfsReadableFile::Tell() const { return impl_->Tell(); } - -// ---------------------------------------------------------------------- -// File writing - -// Private implementation for writable-only files -class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { - public: - HdfsOutputStreamImpl() {} - - Status Close() { - if (is_open_) { - // is_open_ must be set to false in the beginning, because the destructor - // attempts to close the stream again, and if the first close fails, then - // the error doesn't get propagated properly and the second close - // initiated by the destructor raises a segfault - is_open_ = false; - RETURN_NOT_OK(FlushInternal()); - int ret = driver_->CloseFile(fs_, file_); - CHECK_FAILURE(ret, "CloseFile"); - } - return Status::OK(); - } - - bool closed() const { return !is_open_; } - - Status Flush() { - 
RETURN_NOT_OK(CheckClosed()); - - return FlushInternal(); - } - - Status Write(const uint8_t* buffer, int64_t nbytes) { - RETURN_NOT_OK(CheckClosed()); - - constexpr int64_t kMaxBlockSize = std::numeric_limits::max(); - - std::lock_guard guard(lock_); - while (nbytes > 0) { - const auto block_size = static_cast(std::min(kMaxBlockSize, nbytes)); - tSize ret = driver_->Write(fs_, file_, buffer, block_size); - CHECK_FAILURE(ret, "Write"); - DCHECK_LE(ret, block_size); - buffer += ret; - nbytes -= ret; - } - return Status::OK(); - } - - protected: - Status FlushInternal() { - int ret = driver_->Flush(fs_, file_); - CHECK_FAILURE(ret, "Flush"); - return Status::OK(); - } -}; - -HdfsOutputStream::HdfsOutputStream() { impl_.reset(new HdfsOutputStreamImpl()); } - -HdfsOutputStream::~HdfsOutputStream() { - ARROW_WARN_NOT_OK(impl_->Close(), "Failed to close HdfsOutputStream"); -} - -Status HdfsOutputStream::Close() { return impl_->Close(); } - -bool HdfsOutputStream::closed() const { return impl_->closed(); } - -Status HdfsOutputStream::Write(const void* buffer, int64_t nbytes) { - return impl_->Write(reinterpret_cast(buffer), nbytes); -} - -Status HdfsOutputStream::Flush() { return impl_->Flush(); } - -Result HdfsOutputStream::Tell() const { return impl_->Tell(); } - -// ---------------------------------------------------------------------- -// HDFS client - -// TODO(wesm): this could throw std::bad_alloc in the course of copying strings -// into the path info object -static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { - out->kind = input->mKind == kObjectKindFile ? arrow::fs::FileType::File - : arrow::fs::FileType::Directory; - out->name = std::string(input->mName); - out->owner = std::string(input->mOwner); - out->group = std::string(input->mGroup); - - out->last_access_time = static_cast(input->mLastAccess); - out->last_modified_time = static_cast(input->mLastMod); - out->size = static_cast(input->mSize); - - out->replication = input->mReplication; - out->block_size = input->mBlockSize; - - out->permissions = input->mPermissions; -} - -// Private implementation -class HadoopFileSystem::HadoopFileSystemImpl { - public: - HadoopFileSystemImpl() : driver_(NULLPTR), port_(0), fs_(NULLPTR) {} - - Status Connect(const HdfsConnectionConfig* config) { - RETURN_NOT_OK(ConnectLibHdfs(&driver_)); - - // connect to HDFS with the builder object - hdfsBuilder* builder = driver_->NewBuilder(); - if (!config->host.empty()) { - driver_->BuilderSetNameNode(builder, config->host.c_str()); - } - driver_->BuilderSetNameNodePort(builder, static_cast(config->port)); - if (!config->user.empty()) { - driver_->BuilderSetUserName(builder, config->user.c_str()); - } - if (!config->kerb_ticket.empty()) { - driver_->BuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); - } - - for (const auto& kv : config->extra_conf) { - int ret = driver_->BuilderConfSetStr(builder, kv.first.c_str(), kv.second.c_str()); - CHECK_FAILURE(ret, "confsetstr"); - } - - driver_->BuilderSetForceNewInstance(builder); - fs_ = driver_->BuilderConnect(builder); - - if (fs_ == nullptr) { - return Status::IOError("HDFS connection failed"); - } - namenode_host_ = config->host; - port_ = config->port; - user_ = config->user; - kerb_ticket_ = config->kerb_ticket; - - return Status::OK(); - } - - Status MakeDirectory(const std::string& path) { - int ret = driver_->MakeDirectory(fs_, path.c_str()); - CHECK_FAILURE(ret, "create directory"); - return Status::OK(); - } - - Status Delete(const std::string& path, bool recursive) 
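  // Underneath this is libhdfs' `int hdfsDelete(hdfsFS fs, const char* path,
  // int recursive)`: the bool below is widened to int, and a non-zero flag
  // removes a directory together with all of its contents.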
{ - int ret = driver_->Delete(fs_, path.c_str(), static_cast(recursive)); - CHECK_FAILURE(ret, "delete"); - return Status::OK(); - } - - Status Disconnect() { - int ret = driver_->Disconnect(fs_); - CHECK_FAILURE(ret, "hdfsFS::Disconnect"); - return Status::OK(); - } - - bool Exists(const std::string& path) { - // hdfsExists does not distinguish between RPC failure and the file not - // existing - int ret = driver_->Exists(fs_, path.c_str()); - return ret == 0; - } - - Status GetCapacity(int64_t* nbytes) { - tOffset ret = driver_->GetCapacity(fs_); - CHECK_FAILURE(ret, "GetCapacity"); - *nbytes = ret; - return Status::OK(); - } - - Status GetUsed(int64_t* nbytes) { - tOffset ret = driver_->GetUsed(fs_); - CHECK_FAILURE(ret, "GetUsed"); - *nbytes = ret; - return Status::OK(); - } - - Status GetWorkingDirectory(std::string* out) { - char buffer[2048]; - if (driver_->GetWorkingDirectory(fs_, buffer, sizeof(buffer) - 1) == nullptr) { - return IOErrorFromErrno(errno, "HDFS GetWorkingDirectory failed"); - } - *out = buffer; - return Status::OK(); - } - - Status GetPathInfo(const std::string& path, HdfsPathInfo* info) { - hdfsFileInfo* entry = driver_->GetPathInfo(fs_, path.c_str()); - - if (entry == nullptr) { - return GetPathInfoFailed(path); - } - - SetPathInfo(entry, info); - driver_->FreeFileInfo(entry, 1); - - return Status::OK(); - } - - Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) { - HdfsPathInfo info; - RETURN_NOT_OK(GetPathInfo(path, &info)); - - file_info->set_size(info.size); - file_info->set_type(info.kind); - return Status::OK(); - } - - Status GetChildren(const std::string& path, std::vector* listing) { - std::vector detailed_listing; - RETURN_NOT_OK(ListDirectory(path, &detailed_listing)); - for (const auto& info : detailed_listing) { - listing->push_back(info.name); - } - return Status::OK(); - } - - Status ListDirectory(const std::string& path, std::vector* listing) { - int num_entries = 0; - errno = 0; - hdfsFileInfo* entries = driver_->ListDirectory(fs_, path.c_str(), &num_entries); - - if (entries == nullptr) { - // If the directory is empty, entries is NULL but errno is 0. Non-zero - // errno indicates error - // - // Note: errno is thread-local - // - // XXX(wesm): ARROW-2300; we found with Hadoop 2.6 that libhdfs would set - // errno 2/ENOENT for empty directories. 
To be more robust to this we - // double check this case - if ((errno == 0) || (errno == ENOENT && Exists(path))) { - num_entries = 0; - } else { - return IOErrorFromErrno(errno, "HDFS list directory failed"); - } - } - - // Allocate additional space for elements - int vec_offset = static_cast(listing->size()); - listing->resize(vec_offset + num_entries); - - for (int i = 0; i < num_entries; ++i) { - SetPathInfo(entries + i, &(*listing)[vec_offset + i]); - } - - // Free libhdfs file info - driver_->FreeFileInfo(entries, num_entries); - - return Status::OK(); - } - - Status OpenReadable(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, - std::shared_ptr* file) { - errno = 0; - hdfsFile handle = driver_->OpenFile(fs_, path.c_str(), O_RDONLY, buffer_size, 0, 0); - - if (handle == nullptr) { - return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); - } - - // std::make_shared does not work with private ctors - *file = std::shared_ptr(new HdfsReadableFile(io_context)); - (*file)->impl_->set_members(path, driver_, fs_, handle); - (*file)->impl_->set_buffer_size(buffer_size); - - return Status::OK(); - } - - Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { - int flags = O_WRONLY; - if (append) flags |= O_APPEND; - - errno = 0; - hdfsFile handle = - driver_->OpenFile(fs_, path.c_str(), flags, buffer_size, replication, - static_cast(default_block_size)); - - if (handle == nullptr) { - return IOErrorFromErrno(errno, "Opening HDFS file '", path, "' failed"); - } - - // std::make_shared does not work with private ctors - *file = std::shared_ptr(new HdfsOutputStream()); - (*file)->impl_->set_members(path, driver_, fs_, handle); - - return Status::OK(); - } - - Status Rename(const std::string& src, const std::string& dst) { - int ret = driver_->Rename(fs_, src.c_str(), dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Copy(const std::string& src, const std::string& dst) { - int ret = driver_->Copy(fs_, src.c_str(), fs_, dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Move(const std::string& src, const std::string& dst) { - int ret = driver_->Move(fs_, src.c_str(), fs_, dst.c_str()); - CHECK_FAILURE(ret, "Rename"); - return Status::OK(); - } - - Status Chmod(const std::string& path, int mode) { - int ret = driver_->Chmod(fs_, path.c_str(), static_cast(mode)); // NOLINT - CHECK_FAILURE(ret, "Chmod"); - return Status::OK(); - } - - Status Chown(const std::string& path, const char* owner, const char* group) { - int ret = driver_->Chown(fs_, path.c_str(), owner, group); - CHECK_FAILURE(ret, "Chown"); - return Status::OK(); - } - - private: - internal::LibHdfsShim* driver_; - - std::string namenode_host_; - std::string user_; - int port_; - std::string kerb_ticket_; - - hdfsFS fs_; -}; - -// ---------------------------------------------------------------------- -// Public API for HDFSClient - -HadoopFileSystem::HadoopFileSystem() { impl_.reset(new HadoopFileSystemImpl()); } - -HadoopFileSystem::~HadoopFileSystem() {} - -Status HadoopFileSystem::Connect(const HdfsConnectionConfig* config, - std::shared_ptr* fs) { - // ctor is private, make_shared will not work - *fs = std::shared_ptr(new HadoopFileSystem()); - - RETURN_NOT_OK((*fs)->impl_->Connect(config)); - return Status::OK(); -} - -Status HadoopFileSystem::MakeDirectory(const std::string& path) { - return impl_->MakeDirectory(path); 
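  // From here down, the public HadoopFileSystem surface is plain pimpl
  // forwarding: each method is a one-liner into HadoopFileSystemImpl, keeping
  // hdfsFS/hdfsFile and the libhdfs shim types out of the public header.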
-} - -Status HadoopFileSystem::Delete(const std::string& path, bool recursive) { - return impl_->Delete(path, recursive); -} - -Status HadoopFileSystem::DeleteDirectory(const std::string& path) { - return Delete(path, true); -} - -Status HadoopFileSystem::Disconnect() { return impl_->Disconnect(); } - -bool HadoopFileSystem::Exists(const std::string& path) { return impl_->Exists(path); } - -Status HadoopFileSystem::GetPathInfo(const std::string& path, HdfsPathInfo* info) { - return impl_->GetPathInfo(path, info); -} - -Status HadoopFileSystem::Stat(const std::string& path, arrow::fs::FileInfo* file_info) { - return impl_->Stat(path, file_info); -} - -Status HadoopFileSystem::GetCapacity(int64_t* nbytes) { - return impl_->GetCapacity(nbytes); -} - -Status HadoopFileSystem::GetUsed(int64_t* nbytes) { return impl_->GetUsed(nbytes); } - -Status HadoopFileSystem::GetWorkingDirectory(std::string* out) { - return impl_->GetWorkingDirectory(out); -} - -Status HadoopFileSystem::GetChildren(const std::string& path, - std::vector* listing) { - return impl_->GetChildren(path, listing); -} - -Status HadoopFileSystem::ListDirectory(const std::string& path, - std::vector* listing) { - return impl_->ListDirectory(path, listing); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { - return impl_->OpenReadable(path, buffer_size, io::default_io_context(), file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, - std::shared_ptr* file) { - return OpenReadable(path, kDefaultHdfsBufferSize, io::default_io_context(), file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, - std::shared_ptr* file) { - return impl_->OpenReadable(path, buffer_size, io_context, file); -} - -Status HadoopFileSystem::OpenReadable(const std::string& path, - const io::IOContext& io_context, - std::shared_ptr* file) { - return OpenReadable(path, kDefaultHdfsBufferSize, io_context, file); -} - -Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, - int32_t buffer_size, int16_t replication, - int64_t default_block_size, - std::shared_ptr* file) { - return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, - file); -} - -Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, - std::shared_ptr* file) { - return OpenWritable(path, append, 0, 0, 0, file); -} - -Status HadoopFileSystem::Chmod(const std::string& path, int mode) { - return impl_->Chmod(path, mode); -} - -Status HadoopFileSystem::Chown(const std::string& path, const char* owner, - const char* group) { - return impl_->Chown(path, owner, group); -} - -Status HadoopFileSystem::Rename(const std::string& src, const std::string& dst) { - return impl_->Rename(src, dst); -} - -Status HadoopFileSystem::Copy(const std::string& src, const std::string& dst) { - return impl_->Copy(src, dst); -} - -Status HadoopFileSystem::Move(const std::string& src, const std::string& dst) { - return impl_->Move(src, dst); -} - -// ---------------------------------------------------------------------- -// Allow public API users to check whether we are set up correctly - -Status HaveLibHdfs() { - internal::LibHdfsShim* driver; - return internal::ConnectLibHdfs(&driver); -} - -} // namespace io -} // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_io.h b/cpp/src/arrow/filesystem/hdfs_io.h deleted file mode 100644 index 25fba7756ee..00000000000 --- 
a/cpp/src/arrow/filesystem/hdfs_io.h +++ /dev/null @@ -1,274 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "arrow/filesystem/filesystem.h" -#include "arrow/filesystem/type_fwd.h" -#include "arrow/io/interfaces.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -namespace io { - -class HdfsReadableFile; -class HdfsOutputStream; - -class ARROW_EXPORT FileSystem { - public: - virtual ~FileSystem() = default; - - virtual Status MakeDirectory(const std::string& path) = 0; - - virtual Status DeleteDirectory(const std::string& path) = 0; - - virtual Status GetChildren(const std::string& path, - std::vector* listing) = 0; - - virtual Status Rename(const std::string& src, const std::string& dst) = 0; - - virtual Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) = 0; -}; - -struct HdfsPathInfo { - arrow::fs::FileType kind; - - std::string name; - std::string owner; - std::string group; - - // Access times in UNIX timestamps (seconds) - int64_t size; - int64_t block_size; - - int32_t last_modified_time; - int32_t last_access_time; - - int16_t replication; - int16_t permissions; -}; - -struct HdfsConnectionConfig { - std::string host; - int port; - std::string user; - std::string kerb_ticket; - std::unordered_map extra_conf; -}; - -class ARROW_EXPORT HadoopFileSystem : public FileSystem { - public: - ~HadoopFileSystem() override; - - // Connect to an HDFS cluster given a configuration - // - // @param config (in): configuration for connecting - // @param fs (out): the created client - // @returns Status - static Status Connect(const HdfsConnectionConfig* config, - std::shared_ptr* fs); - - // Create directory and all parents - // - // @param path (in): absolute HDFS path - // @returns Status - Status MakeDirectory(const std::string& path) override; - - // Delete file or directory - // @param path absolute path to data - // @param recursive if path is a directory, delete contents as well - // @returns error status on failure - Status Delete(const std::string& path, bool recursive = false); - - Status DeleteDirectory(const std::string& path) override; - - // Disconnect from cluster - // - // @returns Status - Status Disconnect(); - - // @param path (in): absolute HDFS path - // @returns bool, true if the path exists, false if not (or on error) - bool Exists(const std::string& path); - - // @param path (in): absolute HDFS path - // @param info (out) - // @returns Status - Status GetPathInfo(const std::string& path, HdfsPathInfo* info); - - // @param nbytes (out): total capacity of the filesystem - // @returns Status - Status GetCapacity(int64_t* nbytes); - 
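  // The Status-plus-out-parameter style used throughout this deprecated class
  // is what the arrow::fs rewrite retires; a Result-returning sketch of the
  // declaration above (hypothetical, for comparison only) would be:
  //   Result<int64_t> GetCapacity();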
- // @param nbytes (out): total bytes used of the filesystem - // @returns Status - Status GetUsed(int64_t* nbytes); - - Status GetChildren(const std::string& path, std::vector* listing) override; - - /// List directory contents - /// - /// If path is a relative path, returned values will be absolute paths or URIs - /// starting from the current working directory. - Status ListDirectory(const std::string& path, std::vector* listing); - - /// Return the filesystem's current working directory. - /// - /// The working directory is the base path for all relative paths given to - /// other APIs. - /// NOTE: this actually returns a URI. - Status GetWorkingDirectory(std::string* out); - - /// Change - /// - /// @param path file path to change - /// @param owner pass null for no change - /// @param group pass null for no change - Status Chown(const std::string& path, const char* owner, const char* group); - - /// Change path permissions - /// - /// \param path Absolute path in file system - /// \param mode Mode bitset - /// \return Status - Status Chmod(const std::string& path, int mode); - - // Move file or directory from source path to destination path within the - // current filesystem - Status Rename(const std::string& src, const std::string& dst) override; - - Status Copy(const std::string& src, const std::string& dst); - - Status Move(const std::string& src, const std::string& dst); - - Status Stat(const std::string& path, arrow::fs::FileInfo* file_info) override; - - // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory - - // Open an HDFS file in READ mode. Returns error - // status if the file is not found. - // - // @param path complete file path - Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file); - - Status OpenReadable(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, - std::shared_ptr* file); - - Status OpenReadable(const std::string& path, std::shared_ptr* file); - - Status OpenReadable(const std::string& path, const io::IOContext& io_context, - std::shared_ptr* file); - - // FileMode::WRITE options - // @param path complete file path - // @param buffer_size 0 by default - // @param replication 0 by default - // @param default_block_size 0 by default - Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file); - - Status OpenWritable(const std::string& path, bool append, - std::shared_ptr* file); - - private: - friend class HdfsReadableFile; - friend class HdfsOutputStream; - - class ARROW_NO_EXPORT HadoopFileSystemImpl; - std::unique_ptr impl_; - - HadoopFileSystem(); - ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem); -}; - -class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { - public: - ~HdfsReadableFile() override; - - Status Close() override; - - bool closed() const override; - - // NOTE: If you wish to read a particular range of a file in a multithreaded - // context, you may prefer to use ReadAt to avoid locking issues - Result Read(int64_t nbytes, void* out) override; - Result> Read(int64_t nbytes) override; - Result ReadAt(int64_t position, int64_t nbytes, void* out) override; - Result> ReadAt(int64_t position, int64_t nbytes) override; - - Status Seek(int64_t position) override; - Result Tell() const override; - Result GetSize() override; - - private: - explicit HdfsReadableFile(const io::IOContext&); - - class ARROW_NO_EXPORT HdfsReadableFileImpl; - std::unique_ptr impl_; - - friend class 
HadoopFileSystem::HadoopFileSystemImpl; - - ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); -}; - -// Naming this file OutputStream because it does not support seeking (like the -// WritableFile interface) -class ARROW_EXPORT HdfsOutputStream : public OutputStream { - public: - ~HdfsOutputStream() override; - - Status Close() override; - - bool closed() const override; - - using OutputStream::Write; - Status Write(const void* buffer, int64_t nbytes) override; - - Status Flush() override; - - Result Tell() const override; - - private: - class ARROW_NO_EXPORT HdfsOutputStreamImpl; - std::unique_ptr impl_; - - friend class HadoopFileSystem::HadoopFileSystemImpl; - - HdfsOutputStream(); - - ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); -}; - -ARROW_EXPORT Status HaveLibHdfs(); - -} // namespace io -} // namespace arrow diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 5011b90ed79..147ab969e63 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -306,13 +306,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: const CIOContext& io_context, int64_t chunk_size, c_bool use_threads) -cdef extern from "arrow/filesystem/hdfs_io.h" namespace "arrow::io" nogil: - - cdef cppclass CIOFileSystem" arrow::io::FileSystem": - CStatus Stat(const c_string& path, CFileInfo* stat) - CStatus HaveLibHdfs() - CStatus HaveLibHdfs3() cdef enum HdfsDriver" arrow::io::HdfsDriver": HdfsDriver_LIBHDFS" arrow::io::HdfsDriver::LIBHDFS" @@ -326,61 +320,6 @@ cdef extern from "arrow/filesystem/hdfs_io.h" namespace "arrow::io" nogil: unordered_map[c_string, c_string] extra_conf HdfsDriver driver - cdef cppclass HdfsPathInfo: - CFileType kind - c_string name - c_string owner - c_string group - int32_t last_modified_time - int32_t last_access_time - int64_t size - int16_t replication - int64_t block_size - int16_t permissions - - cdef cppclass HdfsReadableFile(CRandomAccessFile): - pass - - cdef cppclass HdfsOutputStream(COutputStream): - pass - - cdef cppclass CIOHadoopFileSystem \ - "arrow::io::HadoopFileSystem"(CIOFileSystem): - @staticmethod - CStatus Connect(const HdfsConnectionConfig* config, - shared_ptr[CIOHadoopFileSystem]* client) - - CStatus MakeDirectory(const c_string& path) - - CStatus Delete(const c_string& path, c_bool recursive) - - CStatus Disconnect() - - c_bool Exists(const c_string& path) - - CStatus Chmod(const c_string& path, int mode) - CStatus Chown(const c_string& path, const char* owner, - const char* group) - - CStatus GetCapacity(int64_t* nbytes) - CStatus GetUsed(int64_t* nbytes) - - CStatus ListDirectory(const c_string& path, - vector[HdfsPathInfo]* listing) - - CStatus GetPathInfo(const c_string& path, HdfsPathInfo* info) - - CStatus Rename(const c_string& src, const c_string& dst) - - CStatus OpenReadable(const c_string& path, - shared_ptr[HdfsReadableFile]* handle) - - CStatus OpenWritable(const c_string& path, c_bool append, - int32_t buffer_size, int16_t replication, - int64_t default_block_size, - shared_ptr[HdfsOutputStream]* handle) - - # Callbacks for implementing Python filesystems # Use typedef to emulate syntax for std::function ctypedef void CallbackGetTypeName(object, c_string*) From dc1770838515040ebf2fabcdb30debb89b349fb3 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 09:47:56 +0200 Subject: [PATCH 16/27] Remove commented code --- cpp/src/arrow/filesystem/hdfs.cc | 3 --- cpp/src/arrow/filesystem/hdfs.h | 8 -------- 
cpp/src/arrow/filesystem/hdfs_internal.cc | 11 ----------- cpp/src/arrow/filesystem/hdfs_internal.h | 8 -------- 4 files changed, 30 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index bd91cc67fde..0d0069b00eb 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -93,8 +93,6 @@ class HadoopFileSystem::Impl { Status Init() { RETURN_NOT_OK(ConnectLibHdfs(&driver_)); - // RETURN_NOT_OK(io::HadoopFileSystem::Connect(&options_.connection_config, - // &client_)); if (!driver_) { return Status::Invalid("Failed to initialize HDFS driver"); @@ -132,7 +130,6 @@ class HadoopFileSystem::Impl { Status Close() { if (client_) { - // RETURN_NOT_OK(client_->Disconnect()); int ret = driver_->Disconnect(client_); CHECK_FAILURE(ret, "hdfsFS::Disconnect"); return Status::OK(); diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index 0985f4cc56c..8ae68a722ff 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -21,17 +21,9 @@ #include #include -// #include -// #include - #include "arrow/filesystem/filesystem.h" #include "arrow/util/uri.h" -// #include "arrow/filesystem/type_fwd.h" -// #include "arrow/io/interfaces.h" -// #include "arrow/util/macros.h" -// #include "arrow/util/visibility.h" - namespace arrow::io::internal { class HdfsReadableFile; class HdfsOutputStream; diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index 2d6bbe01e13..5a88d3da430 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -39,13 +39,6 @@ #include #include "arrow/util/basic_decimal.h" -// #include -// #include -// #include -// #include -// #include -// #include - #ifndef _WIN32 # include #endif @@ -56,10 +49,6 @@ #include "arrow/util/io_util.h" #include "arrow/util/logging_internal.h" -// #include "arrow/filesystem/type_fwd.h" -// #include "arrow/io/interfaces.h" -// #include "arrow/memory_pool.h" - namespace arrow { using internal::GetEnvVarNative; diff --git a/cpp/src/arrow/filesystem/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h index 116b37a697b..af87ad0d14f 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.h +++ b/cpp/src/arrow/filesystem/hdfs_internal.h @@ -22,12 +22,6 @@ #include -// #include -// #include -// #include -// #include -// #include - #include "arrow/filesystem/filesystem.h" #include "arrow/filesystem/hdfs.h" #include "arrow/filesystem/type_fwd.h" @@ -36,8 +30,6 @@ #include "arrow/util/visibility.h" #include "arrow/util/windows_compatibility.h" // IWYU pragma: keep -// #include "arrow/util/macros.h" - using std::size_t; struct hdfsBuilder; From 722c670f731f090efc02d500c9b7bf8dad6a15a8 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 09:50:10 +0200 Subject: [PATCH 17/27] Fix linter error in CMakeLists --- cpp/src/arrow/CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4a88b906d9e..63c9bab0859 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -898,10 +898,7 @@ if(ARROW_FILESYSTEM) PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() if(ARROW_HDFS) - list(APPEND - ARROW_FILESYSTEM_SRCS - filesystem/hdfs.cc - filesystem/hdfs_internal.cc) + list(APPEND ARROW_FILESYSTEM_SRCS filesystem/hdfs.cc filesystem/hdfs_internal.cc) endif() if(ARROW_S3) list(APPEND ARROW_FILESYSTEM_SRCS filesystem/s3fs.cc) From 
866903ba6c4da4fcb71df2daab9645958bc3f92b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 10:24:46 +0200 Subject: [PATCH 18/27] Add missing linter fix --- cpp/src/arrow/filesystem/hdfs.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 0d0069b00eb..255c79f44d9 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -850,25 +850,25 @@ Status HadoopFileSystem::OpenReadable( const std::string& path, int32_t buffer_size, std::shared_ptr* file) { return impl_->OpenReadable(path, buffer_size, file); -}; +} Status HadoopFileSystem::OpenReadable( const std::string& path, int32_t buffer_size, const io::IOContext& io_context, std::shared_ptr* file) { return impl_->OpenReadable(path, buffer_size, io_context, file); -}; +} Status HadoopFileSystem::OpenReadable( const std::string& path, std::shared_ptr* file) { return impl_->OpenReadable(path, file); -}; +} Status HadoopFileSystem::OpenReadable( const std::string& path, const io::IOContext& io_context, std::shared_ptr* file) { return impl_->OpenReadable(path, io_context, file); -}; +} Status HadoopFileSystem::OpenWritable( const std::string& path, bool append, int32_t buffer_size, int16_t replication, @@ -876,13 +876,13 @@ Status HadoopFileSystem::OpenWritable( std::shared_ptr* file) { return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, file); -}; +} Status HadoopFileSystem::OpenWritable( const std::string& path, bool append, std::shared_ptr* file) { return impl_->OpenWritable(path, append, file); -}; +} // ---------------------------------------------------------------------- // Allow public API users to check whether we are set up correctly From acb844c698423515a7254263ec0adb8cb9c07a5b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 11:23:50 +0200 Subject: [PATCH 19/27] Make MSVC happy --- cpp/src/arrow/filesystem/hdfs_internal.h | 26 +++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h index af87ad0d14f..fcf6efb112c 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.h +++ b/cpp/src/arrow/filesystem/hdfs_internal.h @@ -238,6 +238,11 @@ struct HdfsPathInfo { int16_t permissions; }; +ARROW_EXPORT Status MakeReadableFile(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, LibHdfsShim* driver, + hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { public: ~HdfsReadableFile() override; @@ -264,7 +269,6 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { std::unique_ptr impl_; friend class arrow::fs::HadoopFileSystem; - friend Status MakeReadableFile(const std::string& path, int32_t buffer_size, const io::IOContext& io_context, LibHdfsShim* driver, hdfsFS fs, hdfsFile file, @@ -273,6 +277,10 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); }; +ARROW_EXPORT Status MakeOutputStream(const std::string& path, int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, hdfsFile file, + std::shared_ptr* out); + // Naming this file OutputStream because it does not support seeking (like the // WritableFile interface) class ARROW_EXPORT HdfsOutputStream : public OutputStream { @@ -295,24 +303,14 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream { std::unique_ptr impl_; 
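  // A plausible reading of the MSVC failure this hunk addresses: naming a
  // not-yet-declared function in a friend declaration injects a declaration
  // into the enclosing namespace, which MSVC can then reject as conflicting
  // with the later ARROW_EXPORT (dllexport) declaration. Declaring the
  // exported factories ahead of the classes, as this patch does, removes the
  // ordering problem. (An inference: the commit message itself gives no
  // details.)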
friend class arrow::fs::HadoopFileSystem; + friend Status MakeOutputStream(const std::string& path, int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, hdfsFile file, + std::shared_ptr* out); HdfsOutputStream(); - friend Status MakeOutputStream(const std::string& path, int32_t buffer_size, - LibHdfsShim* driver, hdfsFS fs, hdfsFile file, - std::shared_ptr* out); - ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); }; -ARROW_EXPORT Status MakeReadableFile(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, LibHdfsShim* driver, - hdfsFS fs, hdfsFile file, - std::shared_ptr* out); - -ARROW_EXPORT Status MakeOutputStream(const std::string& path, int32_t buffer_size, - LibHdfsShim* driver, hdfsFS fs, hdfsFile file, - std::shared_ptr* out); - } // namespace io::internal } // namespace arrow From f4d5118ad2ab92b3778d4f27b4fdc1f131cedf2f Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 16:28:50 +0200 Subject: [PATCH 20/27] Change friend Status Make* methods to static Result Make --- cpp/src/arrow/filesystem/hdfs.cc | 52 +++++++++++------------ cpp/src/arrow/filesystem/hdfs_internal.cc | 43 +++++++++---------- cpp/src/arrow/filesystem/hdfs_internal.h | 27 +++++------- 3 files changed, 58 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 255c79f44d9..55a72a17c7a 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -24,6 +24,7 @@ #include "arrow/filesystem/hdfs_internal.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" +#include "arrow/result.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/value_parsing.h" @@ -35,8 +36,7 @@ using internal::ErrnoFromStatus; using internal::ParseValue; using io::internal::HdfsOutputStream; using io::internal::HdfsPathInfo; -using io::internal::MakeOutputStream; -using io::internal::MakeReadableFile; +using io::internal::HdfsReadableFile; using util::Uri; namespace fs { @@ -444,7 +444,7 @@ class HadoopFileSystem::Impl { Status OpenReadable(const std::string& path, int32_t buffer_size, const io::IOContext& io_context, - std::shared_ptr* file) { + std::shared_ptr* file) { errno = 0; hdfsFile handle = driver_->OpenFile(client_, path.c_str(), O_RDONLY, buffer_size, 0, 0); @@ -454,24 +454,24 @@ class HadoopFileSystem::Impl { } // std::make_shared does not work with private ctors - RETURN_NOT_OK( - MakeReadableFile(path, buffer_size, io_context, driver_, client_, handle, file)); - + std::shared_ptr file_tmp; + ARROW_ASSIGN_OR_RAISE(file_tmp, HdfsReadableFile::Make(path, buffer_size, io_context, + driver_, client_, handle)); + *file = std::move(file_tmp); return Status::OK(); } Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { + std::shared_ptr* file) { return OpenReadable(path, buffer_size, io::default_io_context(), file); } - Status OpenReadable(const std::string& path, - std::shared_ptr* file) { + Status OpenReadable(const std::string& path, std::shared_ptr* file) { return OpenReadable(path, kDefaultHdfsBufferSize, io::default_io_context(), file); } Status OpenReadable(const std::string& path, const io::IOContext& io_context, - std::shared_ptr* file) { + std::shared_ptr* file) { return OpenReadable(path, kDefaultHdfsBufferSize, io_context, file); } @@ -491,8 +491,10 @@ class HadoopFileSystem::Impl { } // std::make_shared does not work with private ctors - RETURN_NOT_OK(MakeOutputStream(path, buffer_size, 
driver_, client_, handle, file)); - + std::shared_ptr file_tmp; + ARROW_ASSIGN_OR_RAISE( + file_tmp, HdfsOutputStream::Make(path, buffer_size, driver_, client_, handle)); + *file = std::move(file_tmp); return Status::OK(); } @@ -503,14 +505,14 @@ class HadoopFileSystem::Impl { Result> OpenInputStream(const std::string& path) { ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(OpenReadable(path, io_context_, &file)); return file; } Result> OpenInputFile(const std::string& path) { ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(path)); - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(OpenReadable(path, io_context_, &file)); return file; } @@ -846,27 +848,25 @@ Status HadoopFileSystem::Copy(const std::string& src, const std::string& dst) { return impl_->Copy(src, dst); } -Status HadoopFileSystem::OpenReadable( - const std::string& path, int32_t buffer_size, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, + std::shared_ptr* file) { return impl_->OpenReadable(path, buffer_size, file); } -Status HadoopFileSystem::OpenReadable( - const std::string& path, int32_t buffer_size, const io::IOContext& io_context, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenReadable(const std::string& path, int32_t buffer_size, + const io::IOContext& io_context, + std::shared_ptr* file) { return impl_->OpenReadable(path, buffer_size, io_context, file); } -Status HadoopFileSystem::OpenReadable( - const std::string& path, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenReadable(const std::string& path, + std::shared_ptr* file) { return impl_->OpenReadable(path, file); } -Status HadoopFileSystem::OpenReadable( - const std::string& path, const io::IOContext& io_context, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenReadable(const std::string& path, + const io::IOContext& io_context, + std::shared_ptr* file) { return impl_->OpenReadable(path, io_context, file); } diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index 5a88d3da430..db679d3589b 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -576,6 +576,18 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} + static Result> Make(const std::string& path, + int32_t buffer_size, + const io::IOContext& io_context, + internal::LibHdfsShim* driver, + hdfsFS fs, hdfsFile handle) { + // Must use `new` due to private constructor + std::shared_ptr file(new HdfsReadableFile(io_context)); + file->impl_->set_members(path, driver, fs, handle); + file->impl_->set_buffer_size(buffer_size); + return std::move(file); + } + Status Close() { if (is_open_) { // is_open_ must be set to false in the beginning, because the destructor @@ -725,6 +737,15 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { public: HdfsOutputStreamImpl() {} + static Result> Make(const std::string& path, + int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle) { + std::shared_ptr file(new HdfsOutputStream()); + file->impl_->set_members(path, driver, fs, handle); + return std::move(file); + } + Status Close() { if (is_open_) { // is_open_ must be set to false in the beginning, because the destructor @@ -790,27 +811,5 @@ Status HdfsOutputStream::Flush() { return impl_->Flush(); } Result 
HdfsOutputStream::Tell() const { return impl_->Tell(); } -// Factory function to construct a HdfsReadableFile for use with the public API. -Status MakeReadableFile(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, internal::LibHdfsShim* driver, - hdfsFS fs, hdfsFile file, - std::shared_ptr* out) { - // Must use `new` due to private constructor - std::shared_ptr file_obj(new HdfsReadableFile(io_context)); - file_obj->impl_->set_members(path, driver, fs, file); - file_obj->impl_->set_buffer_size(buffer_size); - *out = std::move(file_obj); - return Status::OK(); -} - -Status MakeOutputStream(const std::string& path, int32_t buffer_size, LibHdfsShim* driver, - hdfsFS fs, hdfsFile file, - std::shared_ptr* out) { - std::shared_ptr stream(new HdfsOutputStream()); - stream->impl_->set_members(path, driver, fs, file); - *out = std::move(stream); - return Status::OK(); -} - } // namespace io::internal } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h index fcf6efb112c..f4d6e995809 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.h +++ b/cpp/src/arrow/filesystem/hdfs_internal.h @@ -238,15 +238,16 @@ struct HdfsPathInfo { int16_t permissions; }; -ARROW_EXPORT Status MakeReadableFile(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, LibHdfsShim* driver, - hdfsFS fs, hdfsFile file, - std::shared_ptr* out); - class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { public: ~HdfsReadableFile() override; + static Result> Make(const std::string& path, + int32_t buffer_size, + const io::IOContext& io_context, + LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle); + Status Close() override; bool closed() const override; @@ -269,24 +270,21 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { std::unique_ptr impl_; friend class arrow::fs::HadoopFileSystem; - friend Status MakeReadableFile(const std::string& path, int32_t buffer_size, - const io::IOContext& io_context, LibHdfsShim* driver, - hdfsFS fs, hdfsFile file, - std::shared_ptr* out); ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); }; -ARROW_EXPORT Status MakeOutputStream(const std::string& path, int32_t buffer_size, - LibHdfsShim* driver, hdfsFS fs, hdfsFile file, - std::shared_ptr* out); - // Naming this file OutputStream because it does not support seeking (like the // WritableFile interface) class ARROW_EXPORT HdfsOutputStream : public OutputStream { public: ~HdfsOutputStream() override; + static Result> Make(const std::string& path, + int32_t buffer_size, + LibHdfsShim* driver, hdfsFS fs, + hdfsFile handle); + Status Close() override; bool closed() const override; @@ -303,9 +301,6 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream { std::unique_ptr impl_; friend class arrow::fs::HadoopFileSystem; - friend Status MakeOutputStream(const std::string& path, int32_t buffer_size, - LibHdfsShim* driver, hdfsFS fs, hdfsFile file, - std::shared_ptr* out); HdfsOutputStream(); From 10146e3978e70227fc4ccccf104f22fc70eb7caf Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 19 May 2025 16:43:27 +0200 Subject: [PATCH 21/27] Try to remove the correct std::move --- cpp/src/arrow/filesystem/hdfs_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index db679d3589b..f7940edd0f3 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -585,7 
+585,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { std::shared_ptr file(new HdfsReadableFile(io_context)); file->impl_->set_members(path, driver, fs, handle); file->impl_->set_buffer_size(buffer_size); - return std::move(file); + return file; } Status Close() { @@ -743,7 +743,7 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { hdfsFile handle) { std::shared_ptr file(new HdfsOutputStream()); file->impl_->set_members(path, driver, fs, handle); - return std::move(file); + return file; } Status Close() { From e3259bc6e631fae1bca75f9efaf0984c6de06135 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 20 May 2025 11:18:12 +0200 Subject: [PATCH 22/27] Define the static method on the outer class --- cpp/src/arrow/filesystem/hdfs_internal.cc | 41 +++++++++++------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index f7940edd0f3..7ead039489b 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -576,18 +576,6 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} - static Result> Make(const std::string& path, - int32_t buffer_size, - const io::IOContext& io_context, - internal::LibHdfsShim* driver, - hdfsFS fs, hdfsFile handle) { - // Must use `new` due to private constructor - std::shared_ptr file(new HdfsReadableFile(io_context)); - file->impl_->set_members(path, driver, fs, handle); - file->impl_->set_buffer_size(buffer_size); - return file; - } - Status Close() { if (is_open_) { // is_open_ must be set to false in the beginning, because the destructor @@ -729,6 +717,16 @@ Status HdfsReadableFile::Seek(int64_t position) { return impl_->Seek(position); Result HdfsReadableFile::Tell() const { return impl_->Tell(); } +Result> HdfsReadableFile::Make( + const std::string& path, int32_t buffer_size, const io::IOContext& io_context, + internal::LibHdfsShim* driver, hdfsFS fs, hdfsFile handle) { + // Must use `new` due to private constructor + std::shared_ptr file(new HdfsReadableFile(io_context)); + file->impl_->set_members(path, driver, fs, handle); + file->impl_->set_buffer_size(buffer_size); + return file; +} + // ---------------------------------------------------------------------- // File writing @@ -737,15 +735,6 @@ class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { public: HdfsOutputStreamImpl() {} - static Result> Make(const std::string& path, - int32_t buffer_size, - LibHdfsShim* driver, hdfsFS fs, - hdfsFile handle) { - std::shared_ptr file(new HdfsOutputStream()); - file->impl_->set_members(path, driver, fs, handle); - return file; - } - Status Close() { if (is_open_) { // is_open_ must be set to false in the beginning, because the destructor @@ -811,5 +800,15 @@ Status HdfsOutputStream::Flush() { return impl_->Flush(); } Result HdfsOutputStream::Tell() const { return impl_->Tell(); } +Result> HdfsOutputStream::Make(const std::string& path, + int32_t buffer_size, + LibHdfsShim* driver, + hdfsFS fs, + hdfsFile handle) { + std::shared_ptr file(new HdfsOutputStream()); + file->impl_->set_members(path, driver, fs, handle); + return file; +} + } // namespace io::internal } // namespace arrow From 48d0c76ec78348398683583f535ad0de89542f90 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 20 May 2025 14:16:36 +0200 Subject: [PATCH 23/27] Try to safeguard client_ from nullptr 
and dangling ptr --- cpp/src/arrow/filesystem/hdfs.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 55a72a17c7a..209151bfbbc 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -129,11 +129,19 @@ class HadoopFileSystem::Impl { } Status Close() { - if (client_) { - int ret = driver_->Disconnect(client_); - CHECK_FAILURE(ret, "hdfsFS::Disconnect"); - return Status::OK(); + if (client_ == nullptr) { + return Status::OK(); // already closed + } + + if (!driver_) { + return Status::Invalid("driver_ is null in Close()"); } + + int ret = driver_->Disconnect(client_); + CHECK_FAILURE(ret, "hdfsFS::Disconnect"); + + client_ = nullptr; // prevent double-disconnect + return Status::OK(); } From 21b67cf99bbdaa458857736f14d3ef0d0655c2ca Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 20 May 2025 15:19:07 +0200 Subject: [PATCH 24/27] Update hdfs Impl --- cpp/src/arrow/filesystem/hdfs.cc | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index 209151bfbbc..e29254e3b61 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -87,19 +87,22 @@ Status GetPathInfoFailed(const std::string& path) { class HadoopFileSystem::Impl { public: Impl(HdfsOptions options, const io::IOContext& io_context) - : options_(std::move(options)), io_context_(io_context) {} + : options_(std::move(options)), + io_context_(io_context), + driver_(NULLPTR), + port_(0), + client_(NULLPTR) {} ~Impl() { ARROW_WARN_NOT_OK(Close(), "Failed to disconnect hdfs client"); } Status Init() { + const HdfsConnectionConfig* config = &options_.connection_config; RETURN_NOT_OK(ConnectLibHdfs(&driver_)); if (!driver_) { return Status::Invalid("Failed to initialize HDFS driver"); } - const HdfsConnectionConfig* config = &options_.connection_config; - // connect to HDFS with the builder object hdfsBuilder* builder = driver_->NewBuilder(); if (!config->host.empty()) { @@ -125,6 +128,11 @@ class HadoopFileSystem::Impl { return Status::IOError("HDFS connection failed"); } + namenode_host_ = config->host; + port_ = config->port; + user_ = config->user; + kerb_ticket_ = config->kerb_ticket; + return Status::OK(); } @@ -550,11 +558,17 @@ class HadoopFileSystem::Impl { } protected: - hdfsFS client_; const HdfsOptions options_; const io::IOContext io_context_; io::internal::LibHdfsShim* driver_; + std::string namenode_host_; + std::string user_; + int port_; + std::string kerb_ticket_; + + hdfsFS client_; + void PathInfoToFileInfo(const HdfsPathInfo& info, FileInfo* out) { if (info.kind == FileType::Directory) { out->set_type(FileType::Directory); From 0e8ff017160be5bdee9df1bdd011ca4e772f3a6c Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 10 Jun 2025 09:37:14 +0200 Subject: [PATCH 25/27] Change io::internal to fs::internal --- cpp/src/arrow/filesystem/hdfs.cc | 17 ++++++----- cpp/src/arrow/filesystem/hdfs.h | 29 ++++++++++--------- cpp/src/arrow/filesystem/hdfs_internal.cc | 4 +-- cpp/src/arrow/filesystem/hdfs_internal.h | 8 ++--- .../arrow/filesystem/hdfs_internal_test.cc | 13 +++++---- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index e29254e3b61..d3e65a68e97 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -34,13 +34,14 @@ namespace arrow { using 
internal::ErrnoFromStatus; using internal::ParseValue; -using io::internal::HdfsOutputStream; -using io::internal::HdfsPathInfo; -using io::internal::HdfsReadableFile; using util::Uri; namespace fs { +using internal::HdfsOutputStream; +using internal::HdfsPathInfo; +using internal::HdfsReadableFile; + #define CHECK_FAILURE(RETURN_VALUE, WHAT) \ do { \ if (RETURN_VALUE == -1) { \ @@ -560,7 +561,7 @@ class HadoopFileSystem::Impl { protected: const HdfsOptions options_; const io::IOContext io_context_; - io::internal::LibHdfsShim* driver_; + internal::LibHdfsShim* driver_; std::string namenode_host_; std::string user_; @@ -895,14 +896,14 @@ Status HadoopFileSystem::OpenReadable(const std::string& path, Status HadoopFileSystem::OpenWritable( const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { + std::shared_ptr* file) { return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, file); } Status HadoopFileSystem::OpenWritable( const std::string& path, bool append, - std::shared_ptr* file) { + std::shared_ptr* file) { return impl_->OpenWritable(path, append, file); } @@ -910,8 +911,8 @@ Status HadoopFileSystem::OpenWritable( // Allow public API users to check whether we are set up correctly Status HaveLibHdfs() { - io::internal::LibHdfsShim* driver; - return io::internal::ConnectLibHdfs(&driver); + internal::LibHdfsShim* driver; + return internal::ConnectLibHdfs(&driver); } } // namespace fs diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index 8ae68a722ff..3315b80cf46 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -24,15 +24,16 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/util/uri.h" -namespace arrow::io::internal { -class HdfsReadableFile; -class HdfsOutputStream; - -struct HdfsPathInfo; -} // namespace arrow::io::internal namespace arrow::fs { +namespace internal { + class HdfsReadableFile; + class HdfsOutputStream; + + struct HdfsPathInfo; +} // namespace internal + struct HdfsConnectionConfig { std::string host; int port; @@ -103,10 +104,10 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { bool Exists(const std::string& path); - Status GetPathInfoStatus(const std::string& path, io::internal::HdfsPathInfo* info); + Status GetPathInfoStatus(const std::string& path, internal::HdfsPathInfo* info); Status ListDirectory(const std::string& path, - std::vector* listing); + std::vector* listing); // Delete file or directory // @param path absolute path to data @@ -162,17 +163,17 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { // // @param path complete file path Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file); + std::shared_ptr* file); Status OpenReadable(const std::string& path, int32_t buffer_size, const io::IOContext& io_context, - std::shared_ptr* file); + std::shared_ptr* file); Status OpenReadable(const std::string& path, - std::shared_ptr* file); + std::shared_ptr* file); Status OpenReadable(const std::string& path, const io::IOContext& io_context, - std::shared_ptr* file); + std::shared_ptr* file); // FileMode::WRITE options // @param path complete file path @@ -181,10 +182,10 @@ class ARROW_EXPORT HadoopFileSystem : public FileSystem { // @param default_block_size 0 by default Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file); + 
std::shared_ptr* file); Status OpenWritable(const std::string& path, bool append, - std::shared_ptr* file); + std::shared_ptr* file); protected: HadoopFileSystem(const HdfsOptions& options, const io::IOContext&); diff --git a/cpp/src/arrow/filesystem/hdfs_internal.cc b/cpp/src/arrow/filesystem/hdfs_internal.cc index 7ead039489b..00c481376f8 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal.cc @@ -55,7 +55,7 @@ using internal::GetEnvVarNative; using internal::IOErrorFromErrno; using internal::PlatformFilename; -namespace io::internal { +namespace fs::internal { namespace { @@ -810,5 +810,5 @@ Result> HdfsOutputStream::Make(const std::stri return file; } -} // namespace io::internal +} // namespace fs::internal } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_internal.h b/cpp/src/arrow/filesystem/hdfs_internal.h index f4d6e995809..93178402f05 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal.h +++ b/cpp/src/arrow/filesystem/hdfs_internal.h @@ -40,7 +40,7 @@ class Status; using internal::IOErrorFromErrno; -namespace io::internal { +namespace fs::internal { class HdfsReadableFile; class HdfsOutputStream; @@ -238,7 +238,7 @@ struct HdfsPathInfo { int16_t permissions; }; -class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { +class ARROW_EXPORT HdfsReadableFile : public io::RandomAccessFile { public: ~HdfsReadableFile() override; @@ -276,7 +276,7 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { // Naming this file OutputStream because it does not support seeking (like the // WritableFile interface) -class ARROW_EXPORT HdfsOutputStream : public OutputStream { +class ARROW_EXPORT HdfsOutputStream : public io::OutputStream { public: ~HdfsOutputStream() override; @@ -307,5 +307,5 @@ class ARROW_EXPORT HdfsOutputStream : public OutputStream { ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); }; -} // namespace io::internal +} // namespace fs::internal } // namespace arrow diff --git a/cpp/src/arrow/filesystem/hdfs_internal_test.cc b/cpp/src/arrow/filesystem/hdfs_internal_test.cc index bd1460e3543..e6f102da705 100644 --- a/cpp/src/arrow/filesystem/hdfs_internal_test.cc +++ b/cpp/src/arrow/filesystem/hdfs_internal_test.cc @@ -41,10 +41,11 @@ namespace arrow { using arrow::fs::HadoopFileSystem; using arrow::fs::HdfsOptions; -using io::internal::HdfsPathInfo; -using io::internal::HdfsReadableFile; -namespace io { +namespace fs::internal { + +using arrow::fs::internal::HdfsPathInfo; +using arrow::fs::internal::HdfsReadableFile; std::vector RandomData(int64_t size) { std::vector buffer(size); @@ -65,7 +66,7 @@ class TestHadoopFileSystem : public ::testing::Test { bool append = false, int buffer_size = 0, int16_t replication = 0, int default_block_size = 0) { { - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(client_->OpenWritable(path, append, buffer_size, replication, default_block_size, &file)); @@ -88,7 +89,7 @@ class TestHadoopFileSystem : public ::testing::Test { // Set up shared state between unit tests void SetUp() { - internal::LibHdfsShim* driver_shim; + LibHdfsShim* driver_shim; client_ = nullptr; scratch_dir_ = @@ -479,5 +480,5 @@ TEST_F(TestHadoopFileSystem, ThreadSafety) { ASSERT_EQ(niter * 4, correct_count); } -} // namespace io +} // namespace fs::internal } // namespace arrow From 267a1e56515928054a61490401e86d05fd047c6b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 10 Jun 2025 10:00:49 +0200 Subject: [PATCH 26/27] Run C++ linter --- cpp/src/arrow/filesystem/hdfs.cc | 13 
++++++------- cpp/src/arrow/filesystem/hdfs.h | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/hdfs.cc b/cpp/src/arrow/filesystem/hdfs.cc index d3e65a68e97..41048c1f512 100644 --- a/cpp/src/arrow/filesystem/hdfs.cc +++ b/cpp/src/arrow/filesystem/hdfs.cc @@ -893,17 +893,16 @@ Status HadoopFileSystem::OpenReadable(const std::string& path, return impl_->OpenReadable(path, io_context, file); } -Status HadoopFileSystem::OpenWritable( - const std::string& path, bool append, int32_t buffer_size, int16_t replication, - int64_t default_block_size, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + std::shared_ptr* file) { return impl_->OpenWritable(path, append, buffer_size, replication, default_block_size, file); } -Status HadoopFileSystem::OpenWritable( - const std::string& path, bool append, - std::shared_ptr* file) { +Status HadoopFileSystem::OpenWritable(const std::string& path, bool append, + std::shared_ptr* file) { return impl_->OpenWritable(path, append, file); } diff --git a/cpp/src/arrow/filesystem/hdfs.h b/cpp/src/arrow/filesystem/hdfs.h index 3315b80cf46..94091549f7b 100644 --- a/cpp/src/arrow/filesystem/hdfs.h +++ b/cpp/src/arrow/filesystem/hdfs.h @@ -24,14 +24,13 @@ #include "arrow/filesystem/filesystem.h" #include "arrow/util/uri.h" - namespace arrow::fs { namespace internal { - class HdfsReadableFile; - class HdfsOutputStream; +class HdfsReadableFile; +class HdfsOutputStream; - struct HdfsPathInfo; +struct HdfsPathInfo; } // namespace internal struct HdfsConnectionConfig { From f09d6d9fa81b0a13f07fd7850491e780774e558a Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 10 Jun 2025 11:27:23 +0200 Subject: [PATCH 27/27] Remove HdfsDriver from PyArrow Cython file --- python/pyarrow/includes/libarrow_fs.pxd | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 147ab969e63..c75b1acc6b4 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -308,17 +308,12 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CStatus HaveLibHdfs() - cdef enum HdfsDriver" arrow::io::HdfsDriver": - HdfsDriver_LIBHDFS" arrow::io::HdfsDriver::LIBHDFS" - HdfsDriver_LIBHDFS3" arrow::io::HdfsDriver::LIBHDFS3" - cdef cppclass HdfsConnectionConfig: c_string host int port c_string user c_string kerb_ticket unordered_map[c_string, c_string] extra_conf - HdfsDriver driver # Callbacks for implementing Python filesystems # Use typedef to emulate syntax for std::function
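With HdfsDriver gone from the Cython declarations, the series leaves
arrow::fs::HadoopFileSystem as the single public HDFS entry point. A minimal
end-to-end sketch of the resulting C++ API follows; the host, port, user name,
and paths are hypothetical placeholders, and it assumes a reachable cluster and
a loadable libhdfs (arrow::fs::HaveLibHdfs(), kept by patch 25, can be used to
check the latter first):

#include <iostream>

#include "arrow/buffer.h"
#include "arrow/filesystem/hdfs.h"
#include "arrow/result.h"
#include "arrow/status.h"

arrow::Status RunHdfsDemo() {
  arrow::fs::HdfsOptions options;
  // Hypothetical endpoint and user; substitute real cluster settings.
  options.ConfigureEndPoint("namenode.example.com", 8020);
  options.ConfigureUser("alice");

  // Result-returning factory; replaces the old Connect(config, &fs) out-param.
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::HadoopFileSystem::Make(options));

  // FileInfo now carries the FileType/size that HdfsPathInfo used to expose.
  ARROW_ASSIGN_OR_RAISE(auto info, fs->GetFileInfo("/tmp/data.bin"));
  std::cout << info.path() << " size=" << info.size() << std::endl;

  // Generic FileSystem reads replace OpenReadable + shared_ptr out-params.
  ARROW_ASSIGN_OR_RAISE(auto file, fs->OpenInputFile("/tmp/data.bin"));
  ARROW_ASSIGN_OR_RAISE(auto buffer, file->ReadAt(0, 16));
  std::cout << "read " << buffer->size() << " bytes" << std::endl;
  return arrow::Status::OK();
}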