diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index a883b87222d..5d39f616827 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -139,6 +139,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_COMPUTE "Build the Arrow Compute Modules" ON) + define_option(ARROW_DATASET "Build the Arrow Dataset Modules" ON) + define_option(ARROW_FLIGHT "Build the Arrow Flight RPC System (requires GRPC, Protocol Buffers)" OFF) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 08f676af87a..c989f855a5b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -380,6 +380,10 @@ add_subdirectory(io) add_subdirectory(util) add_subdirectory(vendored) +if(ARROW_DATASET) + add_subdirectory(dataset) +endif() + if(ARROW_FLIGHT) add_subdirectory(flight) endif() diff --git a/cpp/src/arrow/dataset/CMakeLists.txt b/cpp/src/arrow/dataset/CMakeLists.txt new file mode 100644 index 00000000000..d8b3ecc0eaf --- /dev/null +++ b/cpp/src/arrow/dataset/CMakeLists.txt @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +add_custom_target(arrow_dataset) + +# Headers: top level +arrow_install_all_headers("arrow/dataset") + +set(ARROW_DATASET_SRCS scanner.cc) + +add_arrow_lib(arrow_dataset + OUTPUTS + ARROW_DATASET_LIBRARIES + SOURCES + ${ARROW_DATASET_SRCS} + SHARED_LINK_LIBS + arrow_shared + STATIC_LINK_LIBS + arrow_static) + +if("${ARROW_DATASET_TEST_LINKAGE}" STREQUAL "static") + set(ARROW_DATASET_TEST_LINK_LIBS arrow_dataset_static ${ARROW_TEST_STATIC_LINK_LIBS}) +else() + set(ARROW_DATASET_TEST_LINK_LIBS arrow_dataset_shared ${ARROW_TEST_SHARED_LINK_LIBS}) +endif() + +foreach(LIB_TARGET ${ARROW_DATASET_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_DS_EXPORTING) +endforeach() + +if(NOT WIN32) + add_arrow_test(file_test + EXTRA_LINK_LIBS + ${ARROW_DATASET_TEST_LINK_LIBS} + PREFIX + "arrow-dataset" + LABELS + "arrow_dataset") +endif() diff --git a/cpp/src/arrow/dataset/README.md b/cpp/src/arrow/dataset/README.md new file mode 100644 index 00000000000..a7379db448d --- /dev/null +++ b/cpp/src/arrow/dataset/README.md @@ -0,0 +1,31 @@ + + +# Arrow C++ Datasets + +The `arrow::dataset` subcomponent provides an API to read and write +semantic datasets stored in different locations and formats. It +facilitates parallel processing of datasets spread across different +physical files and serialization formats. Other concerns such as +partitioning, filtering (partition- and column-level), and schema +normalization are also addressed. + +## Development Status + +Pre-alpha as of June 2019. API subject to change without notice. diff --git a/cpp/src/arrow/dataset/api.h b/cpp/src/arrow/dataset/api.h new file mode 100644 index 00000000000..9ded93a7fda --- /dev/null +++ b/cpp/src/arrow/dataset/api.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/discovery.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/file_csv.h" +#include "arrow/dataset/file_feather.h" +#include "arrow/dataset/file_parquet.h" +#include "arrow/dataset/scanner.h" diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h new file mode 100644 index 00000000000..4aba8945b27 --- /dev/null +++ b/cpp/src/arrow/dataset/dataset.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { + +/// \brief A granular piece of a Dataset, such as an individual file, +/// which can be read/scanned separately from other fragments +class ARROW_DS_EXPORT DataFragment { + public: + virtual ~DataFragment() = default; + + /// \brief Return true if the fragment can benefit from parallel + /// scanning + virtual bool splittable() const = 0; + + /// \brief Partition options to use when scanning this fragment. May be + /// nullptr + virtual std::shared_ptr scan_options() const = 0; +}; + +/// \brief Conditions to apply to a dataset when reading to include or +/// exclude fragments, filter out rows, etc. +struct DataSelector { + std::vector> filters; + + // TODO(wesm): Select specific partition keys, file path globs, or + // other common desirable selections +}; + +/// \brief A basic component of a Dataset which yields zero or more +/// DataFragments +class ARROW_DS_EXPORT DataSource { + public: + virtual ~DataSource() = default; + + virtual std::string type() const = 0; + + virtual std::unique_ptr GetFragments( + const DataSelector& selector) = 0; +}; + +/// \brief A DataSource consisting of a flat sequence of DataFragments +class ARROW_DS_EXPORT SimpleDataSource : public DataSource { + public: + std::unique_ptr GetFragments( + const DataSelector& selector) override; + + private: + DataFragmentVector fragments_; +}; + +/// \brief Top-level interface for a Dataset with fragments coming +/// from possibly multiple sources +class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this { + public: + /// \param[in] source a single input data source + /// \param[in] schema a known schema to conform to, may be nullptr + explicit Dataset(std::shared_ptr source, + std::shared_ptr schema = NULLPTR); + + /// \param[in] sources one or more input data sources + /// \param[in] schema a known schema to conform 
to, may be nullptr + explicit Dataset(const std::vector>& sources, + std::shared_ptr schema = NULLPTR); + + virtual ~Dataset() = default; + + /// \brief Begin to build a new Scan operation against this Dataset + ScannerBuilder NewScan() const; + + const std::vector>& sources() const { return sources_; } + + std::shared_ptr schema() const { return schema_; } + + /// \brief Compute consensus schema from input data sources + Status InferSchema(std::shared_ptr* out); + + /// \brief Return a copy of Dataset with a new target schema + Status ReplaceSchema(std::shared_ptr schema, std::unique_ptr* out); + + protected: + // The data sources must conform their output to this schema (with + // projections and filters taken into account) + std::shared_ptr schema_; + + std::vector> sources_; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h new file mode 100644 index 00000000000..18242250f32 --- /dev/null +++ b/cpp/src/arrow/dataset/discovery.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +/// Logic for automatically determining the structure of a multi-file +/// dataset with possible partitioning according to available +/// partition schemes + +#pragma once + +#include +#include + +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace dataset { + +struct ARROW_DS_EXPORT DiscoveryOptions { + std::shared_ptr format = NULLPTR; + std::shared_ptr partition_scheme = NULLPTR; +}; + +/// \brief Discover a data source using a root directory path +ARROW_DS_EXPORT +Status DiscoverSource(const std::string& path, fs::FileSystem* filesystem, + const DiscoveryOptions& options, std::shared_ptr* out); + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/disk_store.h b/cpp/src/arrow/dataset/disk_store.h new file mode 100644 index 00000000000..a405aa2b41a --- /dev/null +++ b/cpp/src/arrow/dataset/disk_store.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/dataset/dataset.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/type_fwd.h" + +namespace arrow { +namespace dataset { + +/// \brief Loads a previously-written collection of Arrow protocol +/// files and exposes them in a way that can be consumed as a Dataset +/// source +class ARROW_DS_EXPORT DiskStoreReader : public DataSource { + public: + DiskStoreReader(const std::string& path, fs::FileSystem* filesystem); + + private: + class DiskStoreReaderImpl; + std::unique_ptr impl_; + + std::string path_; + fs::FileSystem* filesystem_; + + DiskStoreReader() {} +}; + +/// \brief Writer counterpart of DiskStoreReader (skeleton: only Write is declared) +class ARROW_DS_EXPORT DiskStoreWriter { + public: + Status Write(const RecordBatch& batch); + + private: + DiskStoreWriter() {} +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h new file mode 100644 index 00000000000..295a918103c --- /dev/null +++ b/cpp/src/arrow/dataset/file_base.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "arrow/dataset/scanner.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/dataset/writer.h" +#include "arrow/util/compression.h" + +namespace arrow { +namespace dataset { + +/// \brief Contains the location of a file to be read +class ARROW_DS_EXPORT FileSource { + public: + enum SourceType { PATH, BUFFER }; + + FileSource(std::string path, fs::FileSystem* filesystem, + Compression::type compression = Compression::UNCOMPRESSED) + : FileSource(FileSource::PATH, compression) { + path_ = std::move(path); + filesystem_ = filesystem; + } + + FileSource(std::shared_ptr buffer, + Compression::type compression = Compression::UNCOMPRESSED) + : FileSource(FileSource::BUFFER, compression) { + buffer_ = std::move(buffer); + } + + bool operator==(const FileSource& other) const { + if (type_ != other.type_) { + return false; + } else if (type_ == FileSource::PATH) { + return path_ == other.path_ && filesystem_ == other.filesystem_; + } else { + return buffer_->Equals(*other.buffer_); + } + } + + /// \brief The kind of file, whether stored in a filesystem, memory + /// resident, or other + SourceType type() const { return type_; } + + /// \brief Return the type of raw compression on the file, if any + Compression::type compression() const { return compression_; } + + /// \brief Return the file path, if any. Only valid when file source + /// type is PATH + std::string path() const { return path_; } + + /// \brief Return the filesystem, if any. Only valid when file + /// source type is PATH + fs::FileSystem* filesystem() const { return filesystem_; } + + /// \brief Return the buffer containing the file, if any. 
Only value + /// when file source type is BUFFER + std::shared_ptr buffer() const { return buffer_; } + + private: + explicit FileSource(SourceType type, + Compression::type compression = Compression::UNCOMPRESSED) + : type_(type), compression_(compression) {} + SourceType type_; + Compression::type compression_; + + // PATH-based source + std::string path_; + fs::FileSystem* filesystem_; + + // BUFFER-based source + std::shared_ptr buffer_; +}; + +/// \brief Base class for file scanning options +class ARROW_DS_EXPORT FileScanOptions : public ScanOptions { + public: + /// \brief The name of the file format this options corresponds to + virtual std::string file_type() const = 0; +}; + +/// \brief Base class for file writing options +class ARROW_DS_EXPORT FileWriteOptions : public WriteOptions { + public: + virtual ~FileWriteOptions() = default; + + /// \brief The name of the file format this options corresponds to + virtual std::string file_type() const = 0; +}; + +/// \brief Base class for file format implementation +class ARROW_DS_EXPORT FileFormat { + public: + virtual ~FileFormat() = default; + + virtual std::string name() const = 0; + + /// \brief Return true if the given file extension + virtual bool IsKnownExtension(const std::string& ext) const = 0; + + /// \brief Open a file for scanning + virtual Status ScanFile(const FileSource& location, + std::shared_ptr scan_options, + std::shared_ptr scan_context, + std::unique_ptr* out) const = 0; +}; + +/// \brief A DataFragment that is stored in a file with a known format +class ARROW_DS_EXPORT FileBasedDataFragment : public DataFragment { + public: + FileBasedDataFragment(const FileSource& location, std::shared_ptr format, + std::shared_ptr); + + const FileSource& location() const { return location_; } + std::shared_ptr format() const { return format_; } + + protected: + FileSource location_; + std::shared_ptr format_; + std::shared_ptr scan_options_; +}; + +} // namespace dataset +} // namespace arrow diff --git 
a/cpp/src/arrow/dataset/file_csv.h b/cpp/src/arrow/dataset/file_csv.h new file mode 100644 index 00000000000..1b461455756 --- /dev/null +++ b/cpp/src/arrow/dataset/file_csv.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/util/iterator.h" + +namespace arrow { + +namespace fs { + +class FileSystem; + +} // namespace fs + +namespace dataset { + +class ARROW_DS_EXPORT CsvScanOptions : public FileScanOptions { + public: + std::string file_type() const override; + + private: + csv::ParseOptions parse_options_; + csv::ConvertOptions convert_options_; + csv::ReadOptions read_options_; +}; + +class ARROW_DS_EXPORT CsvWriteOptions : public FileWriteOptions { + public: + std::string file_type() const override; +}; + +/// \brief A FileFormat implementation that reads from CSV files +class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + public: + std::string name() const override; + + /// \brief Return true if the given file extension + bool IsKnownExtension(const std::string& ext) const override; + + /// \brief Open a file for scanning + Status ScanFile(const FileSource& location, std::shared_ptr scan_options, + std::shared_ptr scan_context, + std::unique_ptr* out) const override; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_feather.h b/cpp/src/arrow/dataset/file_feather.h new file mode 100644 index 00000000000..d92cf82b9f9 --- /dev/null +++ b/cpp/src/arrow/dataset/file_feather.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { + +class ARROW_DS_EXPORT FeatherScanOptions : public FileScanOptions { + public: + std::string file_type() const override; +}; + +class ARROW_DS_EXPORT FeatherWriterOptions : public FileWriteOptions { + public: + std::string file_type() const override; +}; + +/// \brief A FileFormat implementation that reads from Feather (Arrow +/// IPC protocol) files +class ARROW_DS_EXPORT FeatherFileFormat : public FileFormat { + public: + std::string name() const override; + + /// \brief Return true if the given file extension + bool IsKnownExtension(const std::string& ext) const override; + + /// \brief Open a file for scanning + Status ScanFile(const FileSource& location, std::shared_ptr scan_options, + std::shared_ptr scan_context, + std::unique_ptr* out) const override; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_json.h b/cpp/src/arrow/dataset/file_json.h new file mode 100644 index 00000000000..11e6bbe6ade --- /dev/null +++ b/cpp/src/arrow/dataset/file_json.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/json/options.h" + +namespace arrow { +namespace dataset { + +class ARROW_DS_EXPORT JsonScanOptions : public FileScanOptions { + public: + /// + std::string file_type() const override; + + private: + json::ParseOptions parse_options_; + json::ReadOptions read_options_; +}; + +class ARROW_DS_EXPORT JsonWriteOptions : public FileWriteOptions { + public: + std::string file_type() const override; +}; + +/// \brief A FileFormat implementation that reads from JSON files +class ARROW_DS_EXPORT JsonFileFormat : public FileFormat { + public: + std::string name() const override; + + /// \brief Return true if the given file extension + bool IsKnownExtension(const std::string& ext) const override; + + /// \brief Open a file for scanning + Status ScanFile(const FileSource& location, std::shared_ptr scan_options, + std::shared_ptr scan_context, + std::unique_ptr* out) const override; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h new file mode 100644 index 00000000000..d88c6f889be --- /dev/null +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { + +class ARROW_DS_EXPORT ParquetScanOptions : public FileScanOptions { + public: + std::string file_type() const override; +}; + +class ARROW_DS_EXPORT ParquetWriteOptions : public FileWriteOptions { + public: + std::string file_type() const override; +}; + +class ARROW_DS_EXPORT ParquetFragment : public FileBasedDataFragment { + public: + bool splittable() const override { return true; } +}; + +/// \brief A FileFormat implementation that reads from Parquet files +class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { + public: + std::string name() const override; + + /// \brief Return true if the given file extension + bool IsKnownExtension(const std::string& ext) const override; + + /// \brief Open a file for scanning + Status ScanFile(const FileSource& location, std::shared_ptr scan_options, + std::shared_ptr scan_context, + std::unique_ptr* out) const override; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/file_test.cc b/cpp/src/arrow/dataset/file_test.cc new file mode 100644 index 00000000000..2db2213568b --- 
/dev/null +++ b/cpp/src/arrow/dataset/file_test.cc @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include + +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" + +#include "arrow/dataset/api.h" +#include "arrow/filesystem/localfs.h" + +namespace arrow { +namespace dataset { + +TEST(FileSource, PathBased) { + fs::LocalFileSystem localfs; + + std::string p1 = "/path/to/file.ext"; + std::string p2 = "/path/to/file.ext.gz"; + + FileSource source1(p1, &localfs); + FileSource source2(p2, &localfs, Compression::GZIP); + + ASSERT_EQ(p1, source1.path()); + ASSERT_EQ(&localfs, source1.filesystem()); + ASSERT_EQ(FileSource::PATH, source1.type()); + ASSERT_EQ(Compression::UNCOMPRESSED, source1.compression()); + + ASSERT_EQ(p2, source2.path()); + ASSERT_EQ(&localfs, source2.filesystem()); + ASSERT_EQ(FileSource::PATH, source2.type()); + ASSERT_EQ(Compression::GZIP, source2.compression()); + + // Test copy constructor and comparison + FileSource source3 = source1; + ASSERT_EQ(source1, source3); +} + +TEST(FileSource, BufferBased) { + std::string the_data = "this is the file contents"; + auto buf = std::make_shared(the_data); + + FileSource source1(buf); + FileSource source2(buf, Compression::LZ4); + + ASSERT_EQ(FileSource::BUFFER, source1.type()); + ASSERT_TRUE(source1.buffer()->Equals(*buf)); + ASSERT_EQ(Compression::UNCOMPRESSED, source1.compression()); + + ASSERT_EQ(FileSource::BUFFER, source2.type()); + ASSERT_TRUE(source2.buffer()->Equals(*buf)); + ASSERT_EQ(Compression::LZ4, source2.compression()); +} + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/filter.h b/cpp/src/arrow/dataset/filter.h new file mode 100644 index 00000000000..a727b1ce4b8 --- /dev/null +++ b/cpp/src/arrow/dataset/filter.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/dataset/visibility.h" + +namespace arrow { +namespace dataset { + +class ARROW_DS_EXPORT Filter { + public: + enum type { + /// Simple boolean predicate consisting of comparisons and boolean + /// logic (AND, OR, NOT) involving Schema fields + EXPRESSION, + + /// + GENERIC + }; +}; + +} // namespace dataset +} // namespace arrow diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h new file mode 100644 index 00000000000..28c55adcc10 --- /dev/null +++ b/cpp/src/arrow/dataset/partition.h @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+
+namespace arrow {
+namespace dataset {
+
+// ----------------------------------------------------------------------
+// Computing partition values
+
+// TODO(wesm): API for computing partition keys derived from raw
+// values. For example, year(value) or hash_function(value) instead of
+// simply value, so a dataset with a timestamp column might group all
+// data with year 2009 in the same partition
+
+// /// \brief
+// class ScalarTransform {
+//  public:
+//   virtual Status Transform(const std::shared_ptr& input,
+//                            std::shared_ptr* output) const = 0;
+// };
+
+// class PartitionField {
+//  public:
+
+//  private:
+//   std::string field_name_;
+// };
+
+// ----------------------------------------------------------------------
+// Partition identifiers
+
+/// \brief A partition level identifier, exposed as a list of fields and a list of values
+///
+/// TODO(wesm): Is this general enough? What other kinds of partition
+/// keys exist and do we need to support them?
+class PartitionKey {
+ public:
+  const std::vector& fields() const { return fields_; }
+  const std::vector>& values() const { return values_; }
+
+ private:
+  std::vector fields_;
+  std::vector> values_;
+};
+
+/// \brief Intermediate data structure for data parsed from a string
+/// partition identifier.
+///
+/// For example, the identifier "foo=5" might be parsed with a single
+/// "foo" field and the value 5. A more complex identifier might be
+/// written as "foo=5,bar=2", which would yield two fields and two
+/// values.
+///
+/// Some partition schemes may store the field names in a metadata
+/// store instead of in file paths, for example
+/// dataset_root/2009/11/... could be used when the partition fields
+/// are "year" and "month"
+struct PartitionKeyData {
+  std::vector fields;
+  std::vector> values;
+};
+
+// ----------------------------------------------------------------------
+// Partition schemes
+
+/// \brief Interface for testing paths against a partition scheme and parsing their keys
+class ARROW_DS_EXPORT PartitionScheme {
+ public:
+  virtual ~PartitionScheme() = default;
+
+  /// \brief The name identifying the kind of partition scheme
+  virtual std::string name() const = 0;
+
+  virtual bool PathMatchesScheme(const std::string& path) const = 0;
+
+  virtual Status ParseKey(const std::string& path, PartitionKeyData* out) const = 0;
+};
+
+/// \brief Multi-level, directory based partitioning scheme
+/// originating from Apache Hive with all data files stored in the
+/// leaf directories. Data is partitioned by static values of a
+/// particular column in the schema. Partition keys are represented in
+/// the form $key=$value in directory names
+class ARROW_DS_EXPORT HivePartitionScheme : public PartitionScheme {
+ public:
+  /// \brief Return true if path matches this scheme. NOTE(review): criteria TBD; confirm
+  bool PathMatchesScheme(const std::string& path) const override;
+
+  Status ParseKey(const std::string& path, PartitionKeyData* out) const override;
+};
+
+// ----------------------------------------------------------------------
+// Partitions
+
+// Partitioned datasets come in different forms.
Here is an example of
+// a Hive-style partitioned dataset:
+//
+// dataset_root/
+//   key1=$k1_v1/
+//     key2=$k2_v1/
+//       0.parquet
+//       1.parquet
+//       2.parquet
+//       3.parquet
+//     key2=$k2_v2/
+//       0.parquet
+//       1.parquet
+//   key1=$k1_v2/
+//     key2=$k2_v1/
+//       0.parquet
+//       1.parquet
+//     key2=$k2_v2/
+//       0.parquet
+//       1.parquet
+//       2.parquet
+//
+// In this case, the dataset has 11 fragments (11 files) to be
+// scanned, or potentially more if it is configured to split Parquet
+// files at the row group level
+
+class ARROW_DS_EXPORT Partition : public DataSource {
+ public:
+  std::string type() const override;
+
+  /// \brief The key for this partition source, may be nullptr,
+  /// e.g. for the top-level partitioned source container
+  virtual const PartitionKey* key() const = 0;
+
+  virtual std::unique_ptr GetFragments(
+      const DataSelector& selector) = 0;
+};
+
+/// \brief Simple implementation of Partition, which consists of a
+/// partition identifier, subpartitions, and some data fragments
+class ARROW_DS_EXPORT SimplePartition : public Partition {
+ public:
+  SimplePartition(std::unique_ptr partition_key,
+                  DataFragmentVector&& data_fragments, PartitionVector&& subpartitions,
+                  std::shared_ptr scan_options = NULLPTR)
+      : key_(std::move(partition_key)),
+        data_fragments_(std::move(data_fragments)),
+        subpartitions_(std::move(subpartitions)),
+        scan_options_(std::move(scan_options)) {}
+
+  const PartitionKey* key() const override { return key_.get(); }
+
+  int num_subpartitions() const { return static_cast<int>(subpartitions_.size()); }
+
+  int num_data_fragments() const { return static_cast<int>(data_fragments_.size()); }
+
+  const PartitionVector& subpartitions() const { return subpartitions_; }
+  const DataFragmentVector& data_fragments() const { return data_fragments_; }
+
+  std::unique_ptr GetFragments(
+      const DataSelector& selector) override;
+
+ private:
+  std::unique_ptr key_;
+
+  /// \brief Data fragments belonging to this partition level. In some
+  /// partition schemes such as Hive-style, this member is
+  /// mutually-exclusive with subpartitions, where data fragments
+  /// occur only in the partition leaves
+  std::vector> data_fragments_;
+
+  /// \brief Child partitions of this partition
+  std::vector> subpartitions_;
+
+  /// \brief Default scan options to use for data fragments
+  std::shared_ptr scan_options_;
+};
+
+/// \brief A Partition that returns fragments as the result of input iterators
+class ARROW_DS_EXPORT LazyPartition : public Partition {
+ public:
+  const PartitionKey* key() const override;
+
+  std::unique_ptr GetFragments(
+      const DataSelector& selector) override;
+
+  // TODO(wesm): Iterate over subpartitions
+
+ protected:
+  std::unique_ptr partition_iter_;
+
+  // By default, once this source is consumed using GetFragments, it cannot be
+  // consumed again. Setting this to true enables caching (NOTE(review): of what?)
+  bool cache_manifest_ = false;
+};
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
new file mode 100644
index 00000000000..ad802643017
--- /dev/null
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/dataset/scanner.h"
+
+namespace arrow {
+namespace dataset {}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
new file mode 100644
index 00000000000..36d3b84cf87
--- /dev/null
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \brief Shared state for a Scan operation
+struct ARROW_DS_EXPORT ScanContext {};
+
+// TODO(wesm): API for handling of post-materialization filters. For
+// example, if the user requests [$col1 > 0, $col2 > 0] and $col1 is a
+// partition key, but $col2 is not, then the filter "$col2 > 0" must
+// be evaluated in-memory against the RecordBatch objects resulting
+// from the Scan
+
+class ARROW_DS_EXPORT ScanOptions {
+ public:
+  virtual ~ScanOptions() = default;
+};
+
+/// \brief Read record batches from a range of a single data fragment
+class ARROW_DS_EXPORT ScanTask {
+ public:
+  virtual ~ScanTask() = default;
+
+  /// \brief Iterate through sequence of materialized record batches
+  /// resulting from the Scan. Execution semantics encapsulated in the
+  /// particular ScanTask implementation
+  virtual std::unique_ptr Scan() = 0;
+};
+
+/// \brief Main interface for obtaining the ScanTasks that make up a scan
+class ARROW_DS_EXPORT Scanner {
+ public:
+  virtual ~Scanner() = default;
+
+  /// \brief Return iterator yielding ScanTask instances to enable
+  /// serial or parallel execution of units of scanning work
+  virtual std::unique_ptr GetTasks() = 0;
+};
+
+class ARROW_DS_EXPORT ScannerBuilder {
+ public:
+  ScannerBuilder(std::shared_ptr dataset,
+                 std::shared_ptr scan_context);
+
+  /// \brief Set the columns to project. NOTE(review): presumably by name; confirm
+  ScannerBuilder* Project(const std::vector& columns);
+
+  ScannerBuilder* AddFilter(const std::shared_ptr& filter);
+
+  ScannerBuilder* SetGlobalFileOptions(std::shared_ptr options);
+
+  /// \brief If true (default), add partition keys to the
+  /// RecordBatches that the scan produces if they are not in the data
+  /// otherwise
+  ScannerBuilder* IncludePartitionKeys(bool include = true);
+
+  /// \brief Return the constructed now-immutable Scanner object
+  std::unique_ptr Finish() const;
+
+ private:
+  std::shared_ptr dataset_;
+  std::shared_ptr scan_context_;
+  std::vector project_columns_;
+  FilterVector filters_;
+  bool include_partition_keys_ = true;  // matches the documented default above
+};
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/dataset/transaction.h b/cpp/src/arrow/dataset/transaction.h
new file mode 100644
index 00000000000..d5c94b27cf0
--- /dev/null
+++
b/cpp/src/arrow/dataset/transaction.h
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+
+namespace arrow {
+namespace dataset {}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/dataset/type_fwd.h b/cpp/src/arrow/dataset/type_fwd.h
new file mode 100644
index 00000000000..8e3824625ed
--- /dev/null
+++ b/cpp/src/arrow/dataset/type_fwd.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "arrow/dataset/visibility.h"
+#include "arrow/type_fwd.h"  // IWYU pragma: export
+
+namespace arrow {
+
+namespace fs {
+
+class FileSystem;
+
+}  // namespace fs
+
+namespace dataset {
+
+class Dataset;
+class DataFragment;
+class DataSource;     // base class of Partition (partition.h)
+struct DataSelector;  // parameter of LazyPartition::GetFragments (partition.h)
+using DataFragmentIterator = Iterator>;  // Iterator: arrow/util/iterator.h
+using DataFragmentVector = std::vector>;
+
+struct DiscoveryOptions;
+
+class FileBasedDataFragment;
+class FileFormat;
+class FileScanOptions;
+class FileWriteOptions;
+
+class Filter;
+using FilterVector = std::vector>;
+
+class Partition;        // defined in partition.h
+class PartitionKey;     // defined in partition.h
+class PartitionScheme;  // defined in partition.h
+using PartitionVector = std::vector>;
+using PartitionIterator = Iterator>;
+
+struct ScanContext;    // defined in scanner.h (declared as a struct there too)
+class ScanOptions;     // defined in scanner.h
+class Scanner;         // defined in scanner.h
+class ScannerBuilder;  // defined in scanner.h
+class ScanTask;        // defined in scanner.h
+using ScanTaskIterator = Iterator>;
+
+class DatasetWriter;
+class WriteContext;
+class WriteOptions;  // defined in writer.h
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/dataset/visibility.h b/cpp/src/arrow/dataset/visibility.h
new file mode 100644
index 00000000000..324b1b269da
--- /dev/null
+++ b/cpp/src/arrow/dataset/visibility.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4251)  // C4251: member type needs dll-interface
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef ARROW_DS_STATIC  // static linkage: symbols need no import/export decoration
+#define ARROW_DS_EXPORT
+#elif defined(ARROW_DS_EXPORTING)  // defined while compiling the dataset library itself
+#define ARROW_DS_EXPORT __declspec(dllexport)
+#else  // any other includer imports the symbols from the DLL
+#define ARROW_DS_EXPORT __declspec(dllimport)
+#endif
+
+#define ARROW_DS_NO_EXPORT
+#else  // Not Windows
+#ifndef ARROW_DS_EXPORT
+#define ARROW_DS_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef ARROW_DS_NO_EXPORT
+#define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif  // Non-Windows
+
+#if defined(_MSC_VER)
+#pragma warning(pop)  // NOTE(review): re-enables C4251 for includers; confirm intended
+#endif
diff --git a/cpp/src/arrow/dataset/writer.h b/cpp/src/arrow/dataset/writer.h
new file mode 100644
index 00000000000..048a0e54d75
--- /dev/null
+++ b/cpp/src/arrow/dataset/writer.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+
+namespace arrow {
+namespace dataset {
+
+class ARROW_DS_EXPORT WriteOptions {
+ public:
+  virtual ~WriteOptions() = default;
+};
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index f80d4ed7683..2d0538db4af 100644
--- a/cpp/src/arrow/record_batch.h
+++ b/cpp/src/arrow/record_batch.h
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_RECORD_BATCH_H
-#define ARROW_RECORD_BATCH_H
+#pragma once
 
 #include
 #include
@@ -24,6 +23,7 @@
 #include
 
 #include "arrow/type_fwd.h"
+#include "arrow/util/iterator.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
@@ -186,5 +186,3 @@ class ARROW_EXPORT RecordBatchReader {
 };
 
 }  // namespace arrow
-
-#endif  // ARROW_RECORD_BATCH_H
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 918c25e6294..ea32b49d168 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -20,6 +20,7 @@
 
 #include
 
+#include "arrow/util/iterator.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -39,6 +40,8 @@ class Column;
 class RecordBatch;
 class Table;
 
+using RecordBatchIterator = Iterator>;
+
 class Buffer;
 class MemoryPool;
 class RecordBatch;
diff --git a/cpp/src/arrow/util/interfaces.h b/cpp/src/arrow/util/interfaces.h
new file mode 100644
index 00000000000..3ebe2207f7b
--- /dev/null
+++ b/cpp/src/arrow/util/interfaces.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// arrow::Iterator has a single definition, in arrow/util/iterator.h; re-export it.
+#include "arrow/util/iterator.h"  // IWYU pragma: export
diff --git a/cpp/src/arrow/util/iterator.h b/cpp/src/arrow/util/iterator.h
new file mode 100644
index 00000000000..52fb10b09a4
--- /dev/null
+++ b/cpp/src/arrow/util/iterator.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+class Status;
+
+/// \brief A generic Iterator that can return errors
+template <typename T>
+class Iterator {
+ public:
+  virtual ~Iterator() = default;  // iterators are deleted through base-class pointers
+
+  /// \brief Store the next element of the sequence in *out; *out is set to
+  /// nullptr when the iteration is completed
+  virtual Status Next(T* out) = 0;
+};
+
+}  // namespace arrow