From ace2adfbe17c84b918b0afea72729707a9a360b6 Mon Sep 17 00:00:00 2001
From: Romain Francois
Date: Wed, 5 Jun 2019 12:19:34 +0200
Subject: [PATCH 1/4] R script to copy arrow and parquet headers in inst/include

---
 r/data-raw/arrow_headers.R | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 r/data-raw/arrow_headers.R

diff --git a/r/data-raw/arrow_headers.R b/r/data-raw/arrow_headers.R
new file mode 100644
index 00000000000..b81e3901261
--- /dev/null
+++ b/r/data-raw/arrow_headers.R
@@ -0,0 +1,47 @@
+library(fs)
+library(magrittr)
+library(glue)
+
+# Copy the C++ headers of one library into the R package's `inst/include`.
+#
+# Run this script with the R package root (`r/`) as the working directory:
+# both paths below are resolved relative to it.
+#
+# @param dir Name of a directory under `../cpp/src` (e.g. "arrow"). Its tree
+#   is mirrored to `inst/include/{dir}`, keeping only the `*.h` files.
+# @return `NULL`, invisibly; called for its side effects.
+copy_headers <- function(dir) {
+  origin <- glue("../cpp/src/{dir}")
+  # Must be relative to the same directory as `origin`. With a leading "r/"
+  # the headers end up in `r/r/inst/include` (`r/R/inst/include` on a
+  # case-insensitive file system) rather than in the package itself.
+  dest <- glue("inst/include/{dir}")
+
+  if (!dir_exists(origin)) {
+    stop(
+      glue("`{origin}` not found: run this script from the `r/` directory"),
+      call. = FALSE
+    )
+  }
+
+  # copy all files, starting from a clean destination; a failure to delete
+  # is an error rather than a silent merge with a stale copy
+  if (dir_exists(dest)) {
+    dir_delete(dest)
+  }
+  dir_copy(origin, dest)
+
+  # only keep headers
+  dir_ls(
+    dest, glob = "*.h", invert = TRUE, all = TRUE, recurse = TRUE, type = "file"
+  ) %>%
+    file_delete()
+
+  # count files only, so that sub-directories are not reported as headers
+  n <- length(dir_ls(dest, recurse = TRUE, type = "file"))
+  message(glue("{n} header files copied in `{dest}`"))
+  invisible()
+}
+
+copy_headers("arrow")
+copy_headers("parquet")

From 6c919a6604603a3d93dc38f76daff90a8cda7c66 Mon Sep 17 00:00:00 2001
From: Romain Francois
Date: Wed, 5 Jun 2019 12:20:16 +0200
Subject: [PATCH 2/4] bundle arrow and parquet header files in the R package

---
 r/R/inst/include/arrow/adapters/orc/adapter.h | 152 +
 .../include/arrow/adapters/orc/adapter_util.h | 44 +
 .../arrow/adapters/tensorflow/convert.h | 158 +
 r/R/inst/include/arrow/allocator.h | 151 +
 r/R/inst/include/arrow/api.h | 43 +
 r/R/inst/include/arrow/array.h | 1072 +++
 .../include/arrow/array/builder_adaptive.h | 175 +
 r/R/inst/include/arrow/array/builder_base.h | 219 +
 r/R/inst/include/arrow/array/builder_binary.h | 365 +
 .../include/arrow/array/builder_decimal.h | 51 +
 r/R/inst/include/arrow/array/builder_dict.h | 369 +
 r/R/inst/include/arrow/array/builder_nested.h | 200 +
 .../include/arrow/array/builder_primitive.h | 427 +
 r/R/inst/include/arrow/array/builder_time.h | 70 +
r/R/inst/include/arrow/array/builder_union.h | 106 + r/R/inst/include/arrow/array/concatenate.h | 39 + r/R/inst/include/arrow/buffer-builder.h | 376 + r/R/inst/include/arrow/buffer.h | 444 + r/R/inst/include/arrow/builder.h | 58 + r/R/inst/include/arrow/compare.h | 101 + r/R/inst/include/arrow/compute/api.h | 33 + .../include/arrow/compute/benchmark-util.h | 74 + r/R/inst/include/arrow/compute/context.h | 82 + r/R/inst/include/arrow/compute/expression.h | 261 + r/R/inst/include/arrow/compute/kernel.h | 271 + .../include/arrow/compute/kernels/aggregate.h | 115 + .../include/arrow/compute/kernels/boolean.h | 76 + r/R/inst/include/arrow/compute/kernels/cast.h | 98 + .../include/arrow/compute/kernels/compare.h | 116 + .../include/arrow/compute/kernels/count.h | 88 + .../include/arrow/compute/kernels/filter.h | 67 + .../kernels/generated/cast-codegen-internal.h | 208 + r/R/inst/include/arrow/compute/kernels/hash.h | 105 + r/R/inst/include/arrow/compute/kernels/mean.h | 66 + .../arrow/compute/kernels/sum-internal.h | 207 + r/R/inst/include/arrow/compute/kernels/sum.h | 70 + r/R/inst/include/arrow/compute/kernels/take.h | 83 + .../arrow/compute/kernels/util-internal.h | 144 + r/R/inst/include/arrow/compute/logical_type.h | 308 + r/R/inst/include/arrow/compute/operation.h | 52 + .../include/arrow/compute/operations/cast.h | 46 + .../arrow/compute/operations/literal.h | 45 + r/R/inst/include/arrow/compute/test-util.h | 110 + r/R/inst/include/arrow/compute/type_fwd.h | 38 + r/R/inst/include/arrow/csv/api.h | 24 + r/R/inst/include/arrow/csv/chunker.h | 69 + r/R/inst/include/arrow/csv/column-builder.h | 87 + r/R/inst/include/arrow/csv/converter.h | 68 + r/R/inst/include/arrow/csv/options.h | 98 + r/R/inst/include/arrow/csv/parser.h | 149 + r/R/inst/include/arrow/csv/reader.h | 53 + r/R/inst/include/arrow/csv/test-common.h | 71 + r/R/inst/include/arrow/dbi/hiveserver2/api.h | 27 + .../arrow/dbi/hiveserver2/columnar-row-set.h | 155 + .../include/arrow/dbi/hiveserver2/operation.h 
| 127 + .../include/arrow/dbi/hiveserver2/service.h | 140 + .../include/arrow/dbi/hiveserver2/session.h | 84 + .../arrow/dbi/hiveserver2/thrift-internal.h | 91 + .../include/arrow/dbi/hiveserver2/types.h | 131 + r/R/inst/include/arrow/dbi/hiveserver2/util.h | 36 + r/R/inst/include/arrow/extension_type.h | 115 + .../include/arrow/filesystem/filesystem.h | 247 + r/R/inst/include/arrow/filesystem/localfs.h | 67 + r/R/inst/include/arrow/filesystem/mockfs.h | 104 + r/R/inst/include/arrow/filesystem/path-util.h | 70 + r/R/inst/include/arrow/filesystem/test-util.h | 126 + .../include/arrow/filesystem/util-internal.h | 36 + r/R/inst/include/arrow/flight/api.h | 24 + r/R/inst/include/arrow/flight/client.h | 178 + r/R/inst/include/arrow/flight/client_auth.h | 62 + .../include/arrow/flight/customize_protobuf.h | 129 + r/R/inst/include/arrow/flight/internal.h | 100 + r/R/inst/include/arrow/flight/platform.h | 32 + .../include/arrow/flight/protocol-internal.h | 28 + .../arrow/flight/serialization-internal.h | 67 + r/R/inst/include/arrow/flight/server.h | 207 + r/R/inst/include/arrow/flight/server_auth.h | 78 + r/R/inst/include/arrow/flight/test-util.h | 188 + r/R/inst/include/arrow/flight/types.h | 290 + r/R/inst/include/arrow/flight/visibility.h | 48 + r/R/inst/include/arrow/gpu/cuda_api.h | 26 + r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h | 77 + r/R/inst/include/arrow/gpu/cuda_common.h | 40 + r/R/inst/include/arrow/gpu/cuda_context.h | 168 + r/R/inst/include/arrow/gpu/cuda_memory.h | 232 + r/R/inst/include/arrow/io/api.h | 28 + r/R/inst/include/arrow/io/buffered.h | 160 + r/R/inst/include/arrow/io/compressed.h | 115 + r/R/inst/include/arrow/io/file.h | 246 + r/R/inst/include/arrow/io/hdfs-internal.h | 224 + r/R/inst/include/arrow/io/hdfs.h | 258 + r/R/inst/include/arrow/io/interfaces.h | 206 + r/R/inst/include/arrow/io/memory.h | 172 + r/R/inst/include/arrow/io/mman.h | 181 + r/R/inst/include/arrow/io/readahead.h | 98 + r/R/inst/include/arrow/io/test-common.h | 61 + 
r/R/inst/include/arrow/ipc/api.h | 28 + r/R/inst/include/arrow/ipc/dictionary.h | 106 + r/R/inst/include/arrow/ipc/feather-internal.h | 235 + r/R/inst/include/arrow/ipc/feather.h | 173 + r/R/inst/include/arrow/ipc/json-integration.h | 133 + r/R/inst/include/arrow/ipc/json-internal.h | 120 + r/R/inst/include/arrow/ipc/json-simple.h | 56 + r/R/inst/include/arrow/ipc/message.h | 241 + .../include/arrow/ipc/metadata-internal.h | 176 + r/R/inst/include/arrow/ipc/reader.h | 291 + r/R/inst/include/arrow/ipc/test-common.h | 134 + r/R/inst/include/arrow/ipc/util.h | 48 + r/R/inst/include/arrow/ipc/writer.h | 366 + r/R/inst/include/arrow/json/api.h | 21 + r/R/inst/include/arrow/json/chunked-builder.h | 76 + r/R/inst/include/arrow/json/chunker.h | 69 + r/R/inst/include/arrow/json/converter.h | 94 + r/R/inst/include/arrow/json/options.h | 63 + r/R/inst/include/arrow/json/parser.h | 96 + r/R/inst/include/arrow/json/rapidjson-defs.h | 44 + r/R/inst/include/arrow/json/reader.h | 62 + r/R/inst/include/arrow/json/test-common.h | 183 + r/R/inst/include/arrow/memory_pool-test.h | 90 + r/R/inst/include/arrow/memory_pool.h | 155 + r/R/inst/include/arrow/pretty_print.h | 112 + r/R/inst/include/arrow/python/api.h | 32 + .../include/arrow/python/arrow_to_pandas.h | 97 + r/R/inst/include/arrow/python/benchmark.h | 39 + r/R/inst/include/arrow/python/common.h | 265 + r/R/inst/include/arrow/python/config.h | 42 + r/R/inst/include/arrow/python/decimal.h | 113 + r/R/inst/include/arrow/python/deserialize.h | 92 + r/R/inst/include/arrow/python/flight.h | 207 + r/R/inst/include/arrow/python/helpers.h | 136 + r/R/inst/include/arrow/python/inference.h | 64 + r/R/inst/include/arrow/python/init.h | 29 + r/R/inst/include/arrow/python/io.h | 108 + r/R/inst/include/arrow/python/iterators.h | 157 + .../include/arrow/python/numpy-internal.h | 179 + r/R/inst/include/arrow/python/numpy_convert.h | 74 + r/R/inst/include/arrow/python/numpy_interop.h | 99 + .../include/arrow/python/numpy_to_arrow.h | 75 + 
r/R/inst/include/arrow/python/platform.h | 34 + r/R/inst/include/arrow/python/pyarrow.h | 86 + r/R/inst/include/arrow/python/pyarrow_api.h | 187 + r/R/inst/include/arrow/python/pyarrow_lib.h | 81 + .../include/arrow/python/python_to_arrow.h | 83 + r/R/inst/include/arrow/python/serialize.h | 136 + r/R/inst/include/arrow/python/type_traits.h | 302 + r/R/inst/include/arrow/python/util/datetime.h | 308 + r/R/inst/include/arrow/python/visibility.h | 39 + r/R/inst/include/arrow/record_batch.h | 190 + r/R/inst/include/arrow/scalar.h | 199 + r/R/inst/include/arrow/sparse_tensor.h | 259 + r/R/inst/include/arrow/status.h | 424 + r/R/inst/include/arrow/stl.h | 373 + r/R/inst/include/arrow/table.h | 377 + r/R/inst/include/arrow/table_builder.h | 113 + r/R/inst/include/arrow/tensor.h | 167 + r/R/inst/include/arrow/testing/gtest_common.h | 133 + r/R/inst/include/arrow/testing/gtest_util.h | 302 + r/R/inst/include/arrow/testing/random.h | 272 + r/R/inst/include/arrow/testing/util.h | 126 + r/R/inst/include/arrow/type.h | 1104 +++ r/R/inst/include/arrow/type_fwd.h | 225 + r/R/inst/include/arrow/type_traits.h | 590 ++ r/R/inst/include/arrow/util/basic_decimal.h | 175 + .../include/arrow/util/bit-stream-utils.h | 416 + r/R/inst/include/arrow/util/bit-util.h | 855 ++ r/R/inst/include/arrow/util/bpacking.h | 3308 +++++++ r/R/inst/include/arrow/util/checked_cast.h | 54 + r/R/inst/include/arrow/util/compiler-util.h | 25 + r/R/inst/include/arrow/util/compression.h | 153 + .../include/arrow/util/compression_brotli.h | 55 + r/R/inst/include/arrow/util/compression_bz2.h | 55 + r/R/inst/include/arrow/util/compression_lz4.h | 55 + .../include/arrow/util/compression_snappy.h | 54 + .../include/arrow/util/compression_zlib.h | 70 + .../include/arrow/util/compression_zstd.h | 55 + r/R/inst/include/arrow/util/cpu-info.h | 101 + r/R/inst/include/arrow/util/decimal.h | 133 + r/R/inst/include/arrow/util/hash-util.h | 310 + r/R/inst/include/arrow/util/hashing.h | 807 ++ 
r/R/inst/include/arrow/util/int-util.h | 89 + r/R/inst/include/arrow/util/io-util.h | 263 + .../include/arrow/util/key_value_metadata.h | 81 + r/R/inst/include/arrow/util/lazy.h | 128 + r/R/inst/include/arrow/util/logging.h | 244 + r/R/inst/include/arrow/util/macros.h | 164 + r/R/inst/include/arrow/util/memory.h | 46 + r/R/inst/include/arrow/util/neon-util.h | 59 + r/R/inst/include/arrow/util/parallel.h | 95 + r/R/inst/include/arrow/util/parsing.h | 512 ++ r/R/inst/include/arrow/util/rle-encoding.h | 604 ++ r/R/inst/include/arrow/util/sse-util.h | 122 + r/R/inst/include/arrow/util/stl.h | 95 + r/R/inst/include/arrow/util/stopwatch.h | 48 + r/R/inst/include/arrow/util/string.h | 68 + r/R/inst/include/arrow/util/string_builder.h | 69 + r/R/inst/include/arrow/util/string_view.h | 33 + r/R/inst/include/arrow/util/task-group.h | 91 + r/R/inst/include/arrow/util/thread-pool.h | 169 + r/R/inst/include/arrow/util/trie.h | 245 + r/R/inst/include/arrow/util/type_traits.h | 48 + r/R/inst/include/arrow/util/ubsan.h | 53 + r/R/inst/include/arrow/util/uri.h | 70 + r/R/inst/include/arrow/util/utf8.h | 176 + r/R/inst/include/arrow/util/variant.h | 36 + r/R/inst/include/arrow/util/visibility.h | 56 + .../arrow/util/windows_compatibility.h | 40 + r/R/inst/include/arrow/vendored/datetime.h | 21 + .../include/arrow/vendored/datetime/date.h | 8025 +++++++++++++++++ .../include/arrow/vendored/datetime/ios.h | 53 + r/R/inst/include/arrow/vendored/datetime/tz.h | 2590 ++++++ .../arrow/vendored/datetime/tz_private.h | 321 + .../arrow/vendored/datetime/visibility.h | 26 + .../include/arrow/vendored/xxhash/xxhash.h | 330 + r/R/inst/include/arrow/visitor.h | 138 + r/R/inst/include/arrow/visitor_inline.h | 277 + r/R/inst/include/parquet/api/io.h | 24 + r/R/inst/include/parquet/api/reader.h | 37 + r/R/inst/include/parquet/api/schema.h | 24 + r/R/inst/include/parquet/api/writer.h | 27 + r/R/inst/include/parquet/arrow/reader.h | 356 + .../include/parquet/arrow/record_reader.h | 122 + 
r/R/inst/include/parquet/arrow/schema.h | 100 + r/R/inst/include/parquet/arrow/test-util.h | 485 + r/R/inst/include/parquet/arrow/writer.h | 250 + r/R/inst/include/parquet/bloom_filter.h | 255 + r/R/inst/include/parquet/column_page.h | 173 + r/R/inst/include/parquet/column_reader.h | 255 + r/R/inst/include/parquet/column_scanner.h | 265 + r/R/inst/include/parquet/column_writer.h | 192 + r/R/inst/include/parquet/deprecated_io.h | 135 + r/R/inst/include/parquet/encoding.h | 358 + .../include/parquet/encryption_internal.h | 114 + r/R/inst/include/parquet/exception.h | 91 + r/R/inst/include/parquet/file_reader.h | 141 + r/R/inst/include/parquet/file_writer.h | 237 + r/R/inst/include/parquet/hasher.h | 75 + r/R/inst/include/parquet/metadata.h | 304 + r/R/inst/include/parquet/murmur3.h | 57 + r/R/inst/include/parquet/platform.h | 112 + r/R/inst/include/parquet/printer.h | 49 + r/R/inst/include/parquet/properties.h | 428 + r/R/inst/include/parquet/schema-internal.h | 139 + r/R/inst/include/parquet/schema.h | 470 + r/R/inst/include/parquet/statistics.h | 307 + r/R/inst/include/parquet/test-util.h | 710 ++ r/R/inst/include/parquet/thrift.h | 214 + r/R/inst/include/parquet/types.h | 662 ++ .../include/parquet/windows_compatibility.h | 30 + 248 files changed, 53735 insertions(+) create mode 100644 r/R/inst/include/arrow/adapters/orc/adapter.h create mode 100644 r/R/inst/include/arrow/adapters/orc/adapter_util.h create mode 100644 r/R/inst/include/arrow/adapters/tensorflow/convert.h create mode 100644 r/R/inst/include/arrow/allocator.h create mode 100644 r/R/inst/include/arrow/api.h create mode 100644 r/R/inst/include/arrow/array.h create mode 100644 r/R/inst/include/arrow/array/builder_adaptive.h create mode 100644 r/R/inst/include/arrow/array/builder_base.h create mode 100644 r/R/inst/include/arrow/array/builder_binary.h create mode 100644 r/R/inst/include/arrow/array/builder_decimal.h create mode 100644 r/R/inst/include/arrow/array/builder_dict.h create mode 100644 
r/R/inst/include/arrow/array/builder_nested.h create mode 100644 r/R/inst/include/arrow/array/builder_primitive.h create mode 100644 r/R/inst/include/arrow/array/builder_time.h create mode 100644 r/R/inst/include/arrow/array/builder_union.h create mode 100644 r/R/inst/include/arrow/array/concatenate.h create mode 100644 r/R/inst/include/arrow/buffer-builder.h create mode 100644 r/R/inst/include/arrow/buffer.h create mode 100644 r/R/inst/include/arrow/builder.h create mode 100644 r/R/inst/include/arrow/compare.h create mode 100644 r/R/inst/include/arrow/compute/api.h create mode 100644 r/R/inst/include/arrow/compute/benchmark-util.h create mode 100644 r/R/inst/include/arrow/compute/context.h create mode 100644 r/R/inst/include/arrow/compute/expression.h create mode 100644 r/R/inst/include/arrow/compute/kernel.h create mode 100644 r/R/inst/include/arrow/compute/kernels/aggregate.h create mode 100644 r/R/inst/include/arrow/compute/kernels/boolean.h create mode 100644 r/R/inst/include/arrow/compute/kernels/cast.h create mode 100644 r/R/inst/include/arrow/compute/kernels/compare.h create mode 100644 r/R/inst/include/arrow/compute/kernels/count.h create mode 100644 r/R/inst/include/arrow/compute/kernels/filter.h create mode 100644 r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h create mode 100644 r/R/inst/include/arrow/compute/kernels/hash.h create mode 100644 r/R/inst/include/arrow/compute/kernels/mean.h create mode 100644 r/R/inst/include/arrow/compute/kernels/sum-internal.h create mode 100644 r/R/inst/include/arrow/compute/kernels/sum.h create mode 100644 r/R/inst/include/arrow/compute/kernels/take.h create mode 100644 r/R/inst/include/arrow/compute/kernels/util-internal.h create mode 100644 r/R/inst/include/arrow/compute/logical_type.h create mode 100644 r/R/inst/include/arrow/compute/operation.h create mode 100644 r/R/inst/include/arrow/compute/operations/cast.h create mode 100644 r/R/inst/include/arrow/compute/operations/literal.h create 
mode 100644 r/R/inst/include/arrow/compute/test-util.h create mode 100644 r/R/inst/include/arrow/compute/type_fwd.h create mode 100644 r/R/inst/include/arrow/csv/api.h create mode 100644 r/R/inst/include/arrow/csv/chunker.h create mode 100644 r/R/inst/include/arrow/csv/column-builder.h create mode 100644 r/R/inst/include/arrow/csv/converter.h create mode 100644 r/R/inst/include/arrow/csv/options.h create mode 100644 r/R/inst/include/arrow/csv/parser.h create mode 100644 r/R/inst/include/arrow/csv/reader.h create mode 100644 r/R/inst/include/arrow/csv/test-common.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/api.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/operation.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/service.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/session.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/types.h create mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/util.h create mode 100644 r/R/inst/include/arrow/extension_type.h create mode 100644 r/R/inst/include/arrow/filesystem/filesystem.h create mode 100644 r/R/inst/include/arrow/filesystem/localfs.h create mode 100644 r/R/inst/include/arrow/filesystem/mockfs.h create mode 100644 r/R/inst/include/arrow/filesystem/path-util.h create mode 100644 r/R/inst/include/arrow/filesystem/test-util.h create mode 100644 r/R/inst/include/arrow/filesystem/util-internal.h create mode 100644 r/R/inst/include/arrow/flight/api.h create mode 100644 r/R/inst/include/arrow/flight/client.h create mode 100644 r/R/inst/include/arrow/flight/client_auth.h create mode 100644 r/R/inst/include/arrow/flight/customize_protobuf.h create mode 100644 r/R/inst/include/arrow/flight/internal.h create mode 100644 r/R/inst/include/arrow/flight/platform.h create mode 100644 
r/R/inst/include/arrow/flight/protocol-internal.h create mode 100644 r/R/inst/include/arrow/flight/serialization-internal.h create mode 100644 r/R/inst/include/arrow/flight/server.h create mode 100644 r/R/inst/include/arrow/flight/server_auth.h create mode 100644 r/R/inst/include/arrow/flight/test-util.h create mode 100644 r/R/inst/include/arrow/flight/types.h create mode 100644 r/R/inst/include/arrow/flight/visibility.h create mode 100644 r/R/inst/include/arrow/gpu/cuda_api.h create mode 100644 r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h create mode 100644 r/R/inst/include/arrow/gpu/cuda_common.h create mode 100644 r/R/inst/include/arrow/gpu/cuda_context.h create mode 100644 r/R/inst/include/arrow/gpu/cuda_memory.h create mode 100644 r/R/inst/include/arrow/io/api.h create mode 100644 r/R/inst/include/arrow/io/buffered.h create mode 100644 r/R/inst/include/arrow/io/compressed.h create mode 100644 r/R/inst/include/arrow/io/file.h create mode 100644 r/R/inst/include/arrow/io/hdfs-internal.h create mode 100644 r/R/inst/include/arrow/io/hdfs.h create mode 100644 r/R/inst/include/arrow/io/interfaces.h create mode 100644 r/R/inst/include/arrow/io/memory.h create mode 100644 r/R/inst/include/arrow/io/mman.h create mode 100644 r/R/inst/include/arrow/io/readahead.h create mode 100644 r/R/inst/include/arrow/io/test-common.h create mode 100644 r/R/inst/include/arrow/ipc/api.h create mode 100644 r/R/inst/include/arrow/ipc/dictionary.h create mode 100644 r/R/inst/include/arrow/ipc/feather-internal.h create mode 100644 r/R/inst/include/arrow/ipc/feather.h create mode 100644 r/R/inst/include/arrow/ipc/json-integration.h create mode 100644 r/R/inst/include/arrow/ipc/json-internal.h create mode 100644 r/R/inst/include/arrow/ipc/json-simple.h create mode 100644 r/R/inst/include/arrow/ipc/message.h create mode 100644 r/R/inst/include/arrow/ipc/metadata-internal.h create mode 100644 r/R/inst/include/arrow/ipc/reader.h create mode 100644 r/R/inst/include/arrow/ipc/test-common.h create 
mode 100644 r/R/inst/include/arrow/ipc/util.h create mode 100644 r/R/inst/include/arrow/ipc/writer.h create mode 100644 r/R/inst/include/arrow/json/api.h create mode 100644 r/R/inst/include/arrow/json/chunked-builder.h create mode 100644 r/R/inst/include/arrow/json/chunker.h create mode 100644 r/R/inst/include/arrow/json/converter.h create mode 100644 r/R/inst/include/arrow/json/options.h create mode 100644 r/R/inst/include/arrow/json/parser.h create mode 100644 r/R/inst/include/arrow/json/rapidjson-defs.h create mode 100644 r/R/inst/include/arrow/json/reader.h create mode 100644 r/R/inst/include/arrow/json/test-common.h create mode 100644 r/R/inst/include/arrow/memory_pool-test.h create mode 100644 r/R/inst/include/arrow/memory_pool.h create mode 100644 r/R/inst/include/arrow/pretty_print.h create mode 100644 r/R/inst/include/arrow/python/api.h create mode 100644 r/R/inst/include/arrow/python/arrow_to_pandas.h create mode 100644 r/R/inst/include/arrow/python/benchmark.h create mode 100644 r/R/inst/include/arrow/python/common.h create mode 100644 r/R/inst/include/arrow/python/config.h create mode 100644 r/R/inst/include/arrow/python/decimal.h create mode 100644 r/R/inst/include/arrow/python/deserialize.h create mode 100644 r/R/inst/include/arrow/python/flight.h create mode 100644 r/R/inst/include/arrow/python/helpers.h create mode 100644 r/R/inst/include/arrow/python/inference.h create mode 100644 r/R/inst/include/arrow/python/init.h create mode 100644 r/R/inst/include/arrow/python/io.h create mode 100644 r/R/inst/include/arrow/python/iterators.h create mode 100644 r/R/inst/include/arrow/python/numpy-internal.h create mode 100644 r/R/inst/include/arrow/python/numpy_convert.h create mode 100644 r/R/inst/include/arrow/python/numpy_interop.h create mode 100644 r/R/inst/include/arrow/python/numpy_to_arrow.h create mode 100644 r/R/inst/include/arrow/python/platform.h create mode 100644 r/R/inst/include/arrow/python/pyarrow.h create mode 100644 
r/R/inst/include/arrow/python/pyarrow_api.h create mode 100644 r/R/inst/include/arrow/python/pyarrow_lib.h create mode 100644 r/R/inst/include/arrow/python/python_to_arrow.h create mode 100644 r/R/inst/include/arrow/python/serialize.h create mode 100644 r/R/inst/include/arrow/python/type_traits.h create mode 100644 r/R/inst/include/arrow/python/util/datetime.h create mode 100644 r/R/inst/include/arrow/python/visibility.h create mode 100644 r/R/inst/include/arrow/record_batch.h create mode 100644 r/R/inst/include/arrow/scalar.h create mode 100644 r/R/inst/include/arrow/sparse_tensor.h create mode 100644 r/R/inst/include/arrow/status.h create mode 100644 r/R/inst/include/arrow/stl.h create mode 100644 r/R/inst/include/arrow/table.h create mode 100644 r/R/inst/include/arrow/table_builder.h create mode 100644 r/R/inst/include/arrow/tensor.h create mode 100644 r/R/inst/include/arrow/testing/gtest_common.h create mode 100644 r/R/inst/include/arrow/testing/gtest_util.h create mode 100644 r/R/inst/include/arrow/testing/random.h create mode 100644 r/R/inst/include/arrow/testing/util.h create mode 100644 r/R/inst/include/arrow/type.h create mode 100644 r/R/inst/include/arrow/type_fwd.h create mode 100644 r/R/inst/include/arrow/type_traits.h create mode 100644 r/R/inst/include/arrow/util/basic_decimal.h create mode 100644 r/R/inst/include/arrow/util/bit-stream-utils.h create mode 100644 r/R/inst/include/arrow/util/bit-util.h create mode 100644 r/R/inst/include/arrow/util/bpacking.h create mode 100644 r/R/inst/include/arrow/util/checked_cast.h create mode 100644 r/R/inst/include/arrow/util/compiler-util.h create mode 100644 r/R/inst/include/arrow/util/compression.h create mode 100644 r/R/inst/include/arrow/util/compression_brotli.h create mode 100644 r/R/inst/include/arrow/util/compression_bz2.h create mode 100644 r/R/inst/include/arrow/util/compression_lz4.h create mode 100644 r/R/inst/include/arrow/util/compression_snappy.h create mode 100644 
r/R/inst/include/arrow/util/compression_zlib.h create mode 100644 r/R/inst/include/arrow/util/compression_zstd.h create mode 100644 r/R/inst/include/arrow/util/cpu-info.h create mode 100644 r/R/inst/include/arrow/util/decimal.h create mode 100644 r/R/inst/include/arrow/util/hash-util.h create mode 100644 r/R/inst/include/arrow/util/hashing.h create mode 100644 r/R/inst/include/arrow/util/int-util.h create mode 100644 r/R/inst/include/arrow/util/io-util.h create mode 100644 r/R/inst/include/arrow/util/key_value_metadata.h create mode 100644 r/R/inst/include/arrow/util/lazy.h create mode 100644 r/R/inst/include/arrow/util/logging.h create mode 100644 r/R/inst/include/arrow/util/macros.h create mode 100644 r/R/inst/include/arrow/util/memory.h create mode 100644 r/R/inst/include/arrow/util/neon-util.h create mode 100644 r/R/inst/include/arrow/util/parallel.h create mode 100644 r/R/inst/include/arrow/util/parsing.h create mode 100644 r/R/inst/include/arrow/util/rle-encoding.h create mode 100644 r/R/inst/include/arrow/util/sse-util.h create mode 100644 r/R/inst/include/arrow/util/stl.h create mode 100644 r/R/inst/include/arrow/util/stopwatch.h create mode 100644 r/R/inst/include/arrow/util/string.h create mode 100644 r/R/inst/include/arrow/util/string_builder.h create mode 100644 r/R/inst/include/arrow/util/string_view.h create mode 100644 r/R/inst/include/arrow/util/task-group.h create mode 100644 r/R/inst/include/arrow/util/thread-pool.h create mode 100644 r/R/inst/include/arrow/util/trie.h create mode 100644 r/R/inst/include/arrow/util/type_traits.h create mode 100644 r/R/inst/include/arrow/util/ubsan.h create mode 100644 r/R/inst/include/arrow/util/uri.h create mode 100644 r/R/inst/include/arrow/util/utf8.h create mode 100644 r/R/inst/include/arrow/util/variant.h create mode 100644 r/R/inst/include/arrow/util/visibility.h create mode 100644 r/R/inst/include/arrow/util/windows_compatibility.h create mode 100644 r/R/inst/include/arrow/vendored/datetime.h create mode 
100644 r/R/inst/include/arrow/vendored/datetime/date.h create mode 100644 r/R/inst/include/arrow/vendored/datetime/ios.h create mode 100644 r/R/inst/include/arrow/vendored/datetime/tz.h create mode 100644 r/R/inst/include/arrow/vendored/datetime/tz_private.h create mode 100644 r/R/inst/include/arrow/vendored/datetime/visibility.h create mode 100644 r/R/inst/include/arrow/vendored/xxhash/xxhash.h create mode 100644 r/R/inst/include/arrow/visitor.h create mode 100644 r/R/inst/include/arrow/visitor_inline.h create mode 100644 r/R/inst/include/parquet/api/io.h create mode 100644 r/R/inst/include/parquet/api/reader.h create mode 100644 r/R/inst/include/parquet/api/schema.h create mode 100644 r/R/inst/include/parquet/api/writer.h create mode 100644 r/R/inst/include/parquet/arrow/reader.h create mode 100644 r/R/inst/include/parquet/arrow/record_reader.h create mode 100644 r/R/inst/include/parquet/arrow/schema.h create mode 100644 r/R/inst/include/parquet/arrow/test-util.h create mode 100644 r/R/inst/include/parquet/arrow/writer.h create mode 100644 r/R/inst/include/parquet/bloom_filter.h create mode 100644 r/R/inst/include/parquet/column_page.h create mode 100644 r/R/inst/include/parquet/column_reader.h create mode 100644 r/R/inst/include/parquet/column_scanner.h create mode 100644 r/R/inst/include/parquet/column_writer.h create mode 100644 r/R/inst/include/parquet/deprecated_io.h create mode 100644 r/R/inst/include/parquet/encoding.h create mode 100644 r/R/inst/include/parquet/encryption_internal.h create mode 100644 r/R/inst/include/parquet/exception.h create mode 100644 r/R/inst/include/parquet/file_reader.h create mode 100644 r/R/inst/include/parquet/file_writer.h create mode 100644 r/R/inst/include/parquet/hasher.h create mode 100644 r/R/inst/include/parquet/metadata.h create mode 100644 r/R/inst/include/parquet/murmur3.h create mode 100644 r/R/inst/include/parquet/platform.h create mode 100644 r/R/inst/include/parquet/printer.h create mode 100644 
r/R/inst/include/parquet/properties.h create mode 100644 r/R/inst/include/parquet/schema-internal.h create mode 100644 r/R/inst/include/parquet/schema.h create mode 100644 r/R/inst/include/parquet/statistics.h create mode 100644 r/R/inst/include/parquet/test-util.h create mode 100644 r/R/inst/include/parquet/thrift.h create mode 100644 r/R/inst/include/parquet/types.h create mode 100644 r/R/inst/include/parquet/windows_compatibility.h diff --git a/r/R/inst/include/arrow/adapters/orc/adapter.h b/r/R/inst/include/arrow/adapters/orc/adapter.h new file mode 100644 index 00000000000..6279f687dc1 --- /dev/null +++ b/r/R/inst/include/arrow/adapters/orc/adapter.h @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_ORC_CONVERTER_H +#define ARROW_ORC_CONVERTER_H + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace adapters { + +namespace orc { + +/// \class ORCFileReader +/// \brief Read an Arrow Table or RecordBatch from an ORC file. 
+class ARROW_EXPORT ORCFileReader { + public: + ~ORCFileReader(); + + /// \brief Creates a new ORC reader. + /// + /// \param[in] file the data source + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \param[out] reader the returned reader object + /// \return Status + static Status Open(const std::shared_ptr& file, MemoryPool* pool, + std::unique_ptr* reader); + + /// \brief Return the schema read from the ORC file + /// + /// \param[out] out the returned Schema object + Status ReadSchema(std::shared_ptr* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[out] out the returned Table + Status Read(std::shared_ptr* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] schema the Table schema + /// \param[out] out the returned Table + Status Read(const std::shared_ptr& schema, std::shared_ptr
* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned Table + Status Read(const std::vector& include_indices, std::shared_ptr
* out); + + /// \brief Read the file as a Table + /// + /// The table will be composed of one record batch per stripe. + /// + /// \param[in] schema the Table schema + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned Table + Status Read(const std::shared_ptr& schema, + const std::vector& include_indices, std::shared_ptr
* out); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[out] out the returned RecordBatch + Status ReadStripe(int64_t stripe, std::shared_ptr* out); + + /// \brief Read a single stripe as a RecordBatch + /// + /// \param[in] stripe the stripe index + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned RecordBatch + Status ReadStripe(int64_t stripe, const std::vector& include_indices, + std::shared_ptr* out); + + /// \brief Seek to designated row. Invoke NextStripeReader() after seek + /// will return stripe reader starting from designated row. + /// + /// \param[in] row_number the rows number to seek + Status Seek(int64_t row_number); + + /// \brief Get a stripe level record batch iterator with specified row count + /// in each record batch. NextStripeReader serves as an fine grain + /// alternative to ReadStripe which may cause OOM issue by loading + /// the whole stripes into memory. + /// + /// \param[in] batch_size the number of rows each record batch contains in + /// record batch iteration. + /// \param[out] out the returned stripe reader + Status NextStripeReader(int64_t batch_size, std::shared_ptr* out); + + /// \brief Get a stripe level record batch iterator with specified row count + /// in each record batch. NextStripeReader serves as an fine grain + /// alternative to ReadStripe which may cause OOM issue by loading + /// the whole stripes into memory. + /// + /// \param[in] batch_size Get a stripe level record batch iterator with specified row + /// count in each record batch. 
+ /// + /// \param[in] include_indices the selected field indices to read + /// \param[out] out the returned stripe reader + Status NextStripeReader(int64_t batch_size, const std::vector& include_indices, + std::shared_ptr* out); + + /// \brief The number of stripes in the file + int64_t NumberOfStripes(); + + /// \brief The number of rows in the file + int64_t NumberOfRows(); + + private: + class Impl; + std::unique_ptr impl_; + ORCFileReader(); +}; + +} // namespace orc + +} // namespace adapters + +} // namespace arrow + +#endif // ARROW_ORC_CONVERTER_H diff --git a/r/R/inst/include/arrow/adapters/orc/adapter_util.h b/r/R/inst/include/arrow/adapters/orc/adapter_util.h new file mode 100644 index 00000000000..eede23051d2 --- /dev/null +++ b/r/R/inst/include/arrow/adapters/orc/adapter_util.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_ADAPATER_UTIL_H +#define ARROW_ADAPATER_UTIL_H + +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/status.h" +#include "orc/OrcFile.hh" + +namespace liborc = orc; + +namespace arrow { + +namespace adapters { + +namespace orc { + +Status GetArrowType(const liborc::Type* type, std::shared_ptr* out); + +Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, + int64_t offset, int64_t length, ArrayBuilder* builder); +} // namespace orc +} // namespace adapters +} // namespace arrow + +#endif // ARROW_ADAPATER_UTIL_H diff --git a/r/R/inst/include/arrow/adapters/tensorflow/convert.h b/r/R/inst/include/arrow/adapters/tensorflow/convert.h new file mode 100644 index 00000000000..93b7e621ef8 --- /dev/null +++ b/r/R/inst/include/arrow/adapters/tensorflow/convert.h @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TENSORFLOW_CONVERTER_H +#define ARROW_TENSORFLOW_CONVERTER_H + +#include + +#include "tensorflow/core/framework/op.h" + +#include "arrow/type.h" + +// These utilities are supposed to be included in TensorFlow operators +// that need to be compiled separately from Arrow because of ABI issues. 
+// They therefore need to be header-only. + +namespace arrow { + +namespace adapters { + +namespace tensorflow { + +Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr* out) { + switch (dtype) { + case ::tensorflow::DT_BOOL: + *out = arrow::boolean(); + break; + case ::tensorflow::DT_FLOAT: + *out = arrow::float32(); + break; + case ::tensorflow::DT_DOUBLE: + *out = arrow::float64(); + break; + case ::tensorflow::DT_HALF: + *out = arrow::float16(); + break; + case ::tensorflow::DT_INT8: + *out = arrow::int8(); + break; + case ::tensorflow::DT_INT16: + *out = arrow::int16(); + break; + case ::tensorflow::DT_INT32: + *out = arrow::int32(); + break; + case ::tensorflow::DT_INT64: + *out = arrow::int64(); + break; + case ::tensorflow::DT_UINT8: + *out = arrow::uint8(); + break; + case ::tensorflow::DT_UINT16: + *out = arrow::uint16(); + break; + case ::tensorflow::DT_UINT32: + *out = arrow::uint32(); + break; + case ::tensorflow::DT_UINT64: + *out = arrow::uint64(); + break; + case ::tensorflow::DT_BFLOAT16: + case ::tensorflow::DT_COMPLEX64: + case ::tensorflow::DT_COMPLEX128: + case ::tensorflow::DT_INVALID: + case ::tensorflow::DT_QINT8: + case ::tensorflow::DT_QINT16: + case ::tensorflow::DT_QINT32: + case ::tensorflow::DT_QUINT8: + case ::tensorflow::DT_QUINT16: + case ::tensorflow::DT_RESOURCE: + case ::tensorflow::DT_STRING: + case ::tensorflow::DT_VARIANT: + default: + return Status::TypeError("TensorFlow data type is not supported"); + } + return Status::OK(); +} + +Status GetTensorFlowType(std::shared_ptr dtype, ::tensorflow::DataType* out) { + switch (dtype->id()) { + case Type::BOOL: + *out = ::tensorflow::DT_BOOL; + break; + case Type::UINT8: + *out = ::tensorflow::DT_UINT8; + break; + case Type::INT8: + *out = ::tensorflow::DT_INT8; + break; + case Type::UINT16: + *out = ::tensorflow::DT_UINT16; + break; + case Type::INT16: + *out = ::tensorflow::DT_INT16; + break; + case Type::UINT32: + *out = ::tensorflow::DT_UINT32; + break; + case 
Type::INT32: + *out = ::tensorflow::DT_INT32; + break; + case Type::UINT64: + *out = ::tensorflow::DT_UINT64; + break; + case Type::INT64: + *out = ::tensorflow::DT_INT64; + break; + case Type::HALF_FLOAT: + *out = ::tensorflow::DT_HALF; + break; + case Type::FLOAT: + *out = ::tensorflow::DT_FLOAT; + break; + case Type::DOUBLE: + *out = ::tensorflow::DT_DOUBLE; + break; + case Type::STRING: + case Type::BINARY: + case Type::FIXED_SIZE_BINARY: + case Type::DATE32: + case Type::DATE64: + case Type::TIMESTAMP: + case Type::TIME32: + case Type::TIME64: + case Type::INTERVAL: + case Type::DECIMAL: + case Type::LIST: + case Type::STRUCT: + case Type::UNION: + case Type::DICTIONARY: + case Type::MAP: + default: + return Status::TypeError("Arrow data type is not supported"); + } + return arrow::Status::OK(); +} + +} // namespace tensorflow + +} // namespace adapters + +} // namespace arrow + +#endif // ARROW_TENSORFLOW_CONVERTER_H diff --git a/r/R/inst/include/arrow/allocator.h b/r/R/inst/include/arrow/allocator.h new file mode 100644 index 00000000000..a02b8e64bb0 --- /dev/null +++ b/r/R/inst/include/arrow/allocator.h @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_ALLOCATOR_H +#define ARROW_ALLOCATOR_H + +#include +#include +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" + +namespace arrow { + +/// \brief A STL allocator delegating allocations to a Arrow MemoryPool +template +class stl_allocator { + public: + using value_type = T; + using pointer = T*; + using const_pointer = const T*; + using reference = T&; + using const_reference = const T&; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + template + struct rebind { + using other = stl_allocator; + }; + + /// \brief Construct an allocator from the default MemoryPool + stl_allocator() noexcept : pool_(default_memory_pool()) {} + /// \brief Construct an allocator from the given MemoryPool + explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {} + + template + stl_allocator(const stl_allocator& rhs) noexcept : pool_(rhs.pool_) {} + + ~stl_allocator() { pool_ = NULLPTR; } + + pointer address(reference r) const noexcept { return std::addressof(r); } + + const_pointer address(const_reference r) const noexcept { return std::addressof(r); } + + pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) { + uint8_t* data; + Status s = pool_->Allocate(n * sizeof(T), &data); + if (!s.ok()) throw std::bad_alloc(); + return reinterpret_cast(data); + } + + void deallocate(pointer p, size_type n) { + pool_->Free(reinterpret_cast(p), n * sizeof(T)); + } + + size_type size_max() const noexcept { return size_type(-1) / sizeof(T); } + + template + void construct(U* p, Args&&... 
args) { + new (reinterpret_cast(p)) U(std::forward(args)...); + } + + template + void destroy(U* p) { + p->~U(); + } + + MemoryPool* pool() const noexcept { return pool_; } + + private: + MemoryPool* pool_; +}; + +/// \brief A MemoryPool implementation delegating allocations to a STL allocator +/// +/// Note that STL allocators don't provide a resizing operation, and therefore +/// any buffer resizes will do a full reallocation and copy. +template > +class STLMemoryPool : public MemoryPool { + public: + /// \brief Construct a memory pool from the given allocator + explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} + + Status Allocate(int64_t size, uint8_t** out) override { + try { + *out = alloc_.allocate(size); + } catch (std::bad_alloc& e) { + return Status::OutOfMemory(e.what()); + } + stats_.UpdateAllocatedBytes(size); + return Status::OK(); + } + + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { + uint8_t* old_ptr = *ptr; + try { + *ptr = alloc_.allocate(new_size); + } catch (std::bad_alloc& e) { + return Status::OutOfMemory(e.what()); + } + memcpy(*ptr, old_ptr, std::min(old_size, new_size)); + alloc_.deallocate(old_ptr, old_size); + stats_.UpdateAllocatedBytes(new_size - old_size); + return Status::OK(); + } + + void Free(uint8_t* buffer, int64_t size) override { + alloc_.deallocate(buffer, size); + stats_.UpdateAllocatedBytes(-size); + } + + int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } + + int64_t max_memory() const override { return stats_.max_memory(); } + + private: + Allocator alloc_; + internal::MemoryPoolStats stats_; +}; + +template +bool operator==(const stl_allocator& lhs, const stl_allocator& rhs) noexcept { + return lhs.pool() == rhs.pool(); +} + +template +bool operator!=(const stl_allocator& lhs, const stl_allocator& rhs) noexcept { + return !(lhs == rhs); +} + +} // namespace arrow + +#endif // ARROW_ALLOCATOR_H diff --git a/r/R/inst/include/arrow/api.h 
b/r/R/inst/include/arrow/api.h new file mode 100644 index 00000000000..3d6a17961b6 --- /dev/null +++ b/r/R/inst/include/arrow/api.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Coarse public API while the library is in development + +#ifndef ARROW_API_H +#define ARROW_API_H + +#include "arrow/array.h" // IYWU pragma: export +#include "arrow/buffer.h" // IYWU pragma: export +#include "arrow/builder.h" // IYWU pragma: export +#include "arrow/compare.h" // IYWU pragma: export +#include "arrow/extension_type.h" // IYWU pragma: export +#include "arrow/memory_pool.h" // IYWU pragma: export +#include "arrow/pretty_print.h" // IYWU pragma: export +#include "arrow/record_batch.h" // IYWU pragma: export +#include "arrow/status.h" // IYWU pragma: export +#include "arrow/table.h" // IYWU pragma: export +#include "arrow/table_builder.h" // IYWU pragma: export +#include "arrow/tensor.h" // IYWU pragma: export +#include "arrow/type.h" // IYWU pragma: export +#include "arrow/util/config.h" // IYWU pragma: export +#include "arrow/util/key_value_metadata.h" // IWYU pragma: export +#include "arrow/visitor.h" // IYWU pragma: export + +/// \brief Top-level namespace for Apache Arrow C++ API +namespace arrow {} + 
+#endif // ARROW_API_H diff --git a/r/R/inst/include/arrow/array.h b/r/R/inst/include/arrow/array.h new file mode 100644 index 00000000000..de8df2bb031 --- /dev/null +++ b/r/R/inst/include/arrow/array.h @@ -0,0 +1,1072 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_ARRAY_H +#define ARROW_ARRAY_H + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/compare.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" // IWYU pragma: export +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class ArrayVisitor; + +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. 
See ARROW-33 +constexpr int64_t kUnknownNullCount = -1; + +class MemoryPool; +class Status; + +// ---------------------------------------------------------------------- +// Generic array data container + +/// \class ArrayData +/// \brief Mutable container for generic Arrow array data +/// +/// This data structure is a self-contained representation of the memory and +/// metadata inside an Arrow array data structure (called vectors in Java). The +/// classes arrow::Array and its subclasses provide strongly-typed accessors +/// with support for the visitor pattern and other affordances. +/// +/// This class is designed for easy internal data manipulation, analytical data +/// processing, and data transport to and from IPC messages. For example, we +/// could cast from int64 to float64 like so: +/// +/// Int64Array arr = GetMyData(); +/// auto new_data = arr.data()->Copy(); +/// new_data->type = arrow::float64(); +/// DoubleArray double_arr(new_data); +/// +/// This object is also useful in an analytics setting where memory may be +/// reused. For example, if we had a group of operations all returning doubles, +/// say: +/// +/// Log(Sqrt(Expr(arr))) +/// +/// Then the low-level implementations of each of these functions could have +/// the signatures +/// +/// void Log(const ArrayData& values, ArrayData* out); +/// +/// As another example a function may consume one or more memory buffers in an +/// input array and replace them with newly-allocated data, changing the output +/// data type as well. 
+struct ARROW_EXPORT ArrayData { + ArrayData() : length(0), null_count(0), offset(0) {} + + ArrayData(const std::shared_ptr& type, int64_t length, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : type(type), length(length), null_count(null_count), offset(offset) {} + + ArrayData(const std::shared_ptr& type, int64_t length, + const std::vector>& buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(type, length, null_count, offset) { + this->buffers = buffers; + } + + ArrayData(const std::shared_ptr& type, int64_t length, + const std::vector>& buffers, + const std::vector>& child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(type, length, null_count, offset) { + this->buffers = buffers; + this->child_data = child_data; + } + + ArrayData(const std::shared_ptr& type, int64_t length, + std::vector>&& buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : ArrayData(type, length, null_count, offset) { + this->buffers = std::move(buffers); + } + + static std::shared_ptr Make(const std::shared_ptr& type, + int64_t length, + std::vector>&& buffers, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + static std::shared_ptr Make( + const std::shared_ptr& type, int64_t length, + const std::vector>& buffers, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + static std::shared_ptr Make( + const std::shared_ptr& type, int64_t length, + const std::vector>& buffers, + const std::vector>& child_data, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + static std::shared_ptr Make(const std::shared_ptr& type, + int64_t length, + int64_t null_count = kUnknownNullCount, + int64_t offset = 0); + + // Move constructor + ArrayData(ArrayData&& other) noexcept + : type(std::move(other.type)), + length(other.length), + null_count(other.null_count), + offset(other.offset), + buffers(std::move(other.buffers)), + 
child_data(std::move(other.child_data)), + dictionary(std::move(other.dictionary)) {} + + // Copy constructor + ArrayData(const ArrayData& other) noexcept + : type(other.type), + length(other.length), + null_count(other.null_count), + offset(other.offset), + buffers(other.buffers), + child_data(other.child_data), + dictionary(other.dictionary) {} + + // Move assignment + ArrayData& operator=(ArrayData&& other) = default; + + // Copy assignment + ArrayData& operator=(const ArrayData& other) = default; + + std::shared_ptr Copy() const { return std::make_shared(*this); } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, offset); + } + + // Access a buffer's data as a typed C pointer + template + inline T* GetMutableValues(int i, int64_t absolute_offset) { + if (buffers[i]) { + return reinterpret_cast(buffers[i]->mutable_data()) + absolute_offset; + } else { + return NULLPTR; + } + } + + template + inline T* GetMutableValues(int i) { + return GetMutableValues(i, offset); + } + + // Construct a zero-copy slice of the data with the indicated offset and length + ArrayData Slice(int64_t offset, int64_t length) const; + + /// \brief Return null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + std::shared_ptr type; + int64_t length; + mutable int64_t null_count; + // The logical start point into the physical buffers (in values, not bytes). + // Note that, for child data, this must be *added* to the child data's own offset. + int64_t offset; + std::vector> buffers; + std::vector> child_data; + + // The dictionary for this Array, if any. 
Only used for dictionary + // type + std::shared_ptr dictionary; +}; + +/// \brief Create a strongly-typed Array instance from generic ArrayData +/// \param[in] data the array contents +/// \return the resulting Array instance +ARROW_EXPORT +std::shared_ptr MakeArray(const std::shared_ptr& data); + +// ---------------------------------------------------------------------- +// User array accessor types + +/// \brief Array base type +/// Immutable data array with some logical type and some length. +/// +/// Any memory is owned by the respective Buffer instance (or its parents). +/// +/// The base class is only required to have a null bitmap buffer if the null +/// count is greater than 0 +/// +/// If known, the null count can be provided in the base Array constructor. If +/// the null count is not known, pass -1 to indicate that the null count is to +/// be computed on the first call to null_count() +class ARROW_EXPORT Array { + public: + virtual ~Array() = default; + + /// \brief Return true if value at index is null. Does not boundscheck + bool IsNull(int64_t i) const { + return null_bitmap_data_ != NULLPTR && + !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); + } + + /// \brief Return true if value at index is valid (not null). Does not + /// boundscheck + bool IsValid(int64_t i) const { + return null_bitmap_data_ == NULLPTR || + BitUtil::GetBit(null_bitmap_data_, i + data_->offset); + } + + /// Size in the number of elements this array contains. + int64_t length() const { return data_->length; } + + /// A relative position into another array's data, to enable zero-copy + /// slicing. This value defaults to zero + int64_t offset() const { return data_->offset; } + + /// The number of null entries in the array. 
If the null count was not known + /// at time of construction (and set to a negative value), then the null + /// count will be computed and cached on the first invocation of this + /// function + int64_t null_count() const; + + std::shared_ptr type() const { return data_->type; } + Type::type type_id() const { return data_->type->id(); } + + /// Buffer for the null bitmap. + /// + /// Note that for `null_count == 0`, this can be null. + /// This buffer does not account for any slice offset + std::shared_ptr null_bitmap() const { return data_->buffers[0]; } + + /// Raw pointer to the null bitmap. + /// + /// Note that for `null_count == 0`, this can be null. + /// This buffer does not account for any slice offset + const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } + + /// Equality comparison with another array + bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; + bool Equals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// Approximate equality comparison with another array + /// + /// epsilon is only used if this is FloatArray or DoubleArray + bool ApproxEquals(const std::shared_ptr& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + bool ApproxEquals(const Array& arr, + const EqualOptions& = EqualOptions::Defaults()) const; + + /// Compare if the range of slots specified are equal for the given array and + /// this array. end_idx exclusive. This methods does not bounds check. 
+ bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const Array& other) const; + bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, + const std::shared_ptr& other) const; + bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, + int64_t other_start_idx) const; + bool RangeEquals(const std::shared_ptr& other, int64_t start_idx, + int64_t end_idx, int64_t other_start_idx) const; + + Status Accept(ArrayVisitor* visitor) const; + + /// Construct a zero-copy slice of the array with the indicated offset and + /// length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. If there are not enough + /// elements in the array, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// Slice from offset until end of the array + std::shared_ptr Slice(int64_t offset) const; + + std::shared_ptr data() const { return data_; } + + int num_fields() const { return static_cast(data_->child_data.size()); } + + /// \return PrettyPrint representation of array suitable for debugging + std::string ToString() const; + + protected: + Array() : null_bitmap_data_(NULLPTR) {} + + std::shared_ptr data_; + const uint8_t* null_bitmap_data_; + + /// Protected method for constructors + inline void SetData(const std::shared_ptr& data) { + if (data->buffers.size() > 0 && data->buffers[0]) { + null_bitmap_data_ = data->buffers[0]->data(); + } else { + null_bitmap_data_ = NULLPTR; + } + data_ = data; + } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Array); +}; + +using ArrayVector = std::vector>; + +namespace internal { + +/// Given a number of ArrayVectors, treat each ArrayVector as the +/// chunks of a chunked array. Then rechunk each ArrayVector such that +/// all ArrayVectors are chunked identically. 
It is mandatory that +/// all ArrayVectors contain the same total number of elements. +ARROW_EXPORT +std::vector RechunkArraysConsistently(const std::vector&); + +} // namespace internal + +static inline std::ostream& operator<<(std::ostream& os, const Array& x) { + os << x.ToString(); + return os; +} + +/// Base class for non-nested arrays +class ARROW_EXPORT FlatArray : public Array { + protected: + using Array::Array; +}; + +/// Degenerate null type Array +class ARROW_EXPORT NullArray : public FlatArray { + public: + using TypeClass = NullType; + + explicit NullArray(const std::shared_ptr& data) { SetData(data); } + explicit NullArray(int64_t length); + + private: + inline void SetData(const std::shared_ptr& data) { + null_bitmap_data_ = NULLPTR; + data->null_count = data->length; + data_ = data; + } +}; + +/// Base class for arrays of fixed-size logical types +class ARROW_EXPORT PrimitiveArray : public FlatArray { + public: + PrimitiveArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// Does not account for any slice offset + std::shared_ptr values() const { return data_->buffers[1]; } + + protected: + PrimitiveArray() : raw_values_(NULLPTR) {} + + inline void SetData(const std::shared_ptr& data) { + auto values = data->buffers[1]; + this->Array::SetData(data); + raw_values_ = values == NULLPTR ? NULLPTR : values->data(); + } + + explicit inline PrimitiveArray(const std::shared_ptr& data) { + SetData(data); + } + + const uint8_t* raw_values_; +}; + +/// Concrete Array class for numeric data. 
+template +class NumericArray : public PrimitiveArray { + public: + using TypeClass = TYPE; + using value_type = typename TypeClass::c_type; + + explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} + + // Only enable this constructor without a type argument for types without additional + // metadata + template + NumericArray( + typename std::enable_if::is_parameter_free, int64_t>::type length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0) + : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, + null_count, offset) {} + + const value_type* raw_values() const { + return reinterpret_cast(raw_values_) + data_->offset; + } + + value_type Value(int64_t i) const { return raw_values()[i]; } + + // For API compatibility with BinaryArray etc. + value_type GetView(int64_t i) const { return Value(i); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +/// Concrete Array class for boolean data +class ARROW_EXPORT BooleanArray : public PrimitiveArray { + public: + using TypeClass = BooleanType; + + explicit BooleanArray(const std::shared_ptr& data); + + BooleanArray(int64_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + bool Value(int64_t i) const { + return BitUtil::GetBit(reinterpret_cast(raw_values_), + i + data_->offset); + } + + bool GetView(int64_t i) const { return Value(i); } + + protected: + using PrimitiveArray::PrimitiveArray; +}; + +// ---------------------------------------------------------------------- +// ListArray + +/// Concrete Array class for list data +class ARROW_EXPORT ListArray : public Array { + public: + using TypeClass = ListType; + + explicit ListArray(const std::shared_ptr& data); + + ListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const 
std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListArray from array of offsets and child value array + /// + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type + /// \param[in] values Array containing + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[out] out Will have length equal to offsets.length() - 1 + static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr* out); + + const ListType* list_type() const; + + /// \brief Return array object containing the list's values + std::shared_ptr values() const; + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + + std::shared_ptr value_type() const; + + /// Return pointer to raw value offsets accounting for any slice offset + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + + // Neither of these functions will perform boundschecking + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + int32_t value_length(int64_t i) const { + i += data_->offset; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + + protected: + void SetData(const std::shared_ptr& data); + const int32_t* raw_value_offsets_; + + private: + std::shared_ptr values_; +}; + +// ---------------------------------------------------------------------- +// FixedSizeListArray + +/// Concrete Array class for fixed size list data +class 
ARROW_EXPORT FixedSizeListArray : public Array { + public: + using TypeClass = FixedSizeListType; + + explicit FixedSizeListArray(const std::shared_ptr& data); + + FixedSizeListArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& values, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const FixedSizeListType* list_type() const; + + /// \brief Return array object containing the list's values + std::shared_ptr values() const; + + std::shared_ptr value_type() const; + + // Neither of these functions will perform boundschecking + int32_t value_offset(int64_t i) const { + i += data_->offset; + return static_cast(list_size_ * i); + } + int32_t value_length(int64_t i = 0) const { return list_size_; } + + protected: + void SetData(const std::shared_ptr& data); + int32_t list_size_; + + private: + std::shared_ptr values_; +}; + +// ---------------------------------------------------------------------- +// Binary and String + +/// Concrete Array class for variable-size binary data +class ARROW_EXPORT BinaryArray : public FlatArray { + public: + using TypeClass = BinaryType; + + explicit BinaryArray(const std::shared_ptr& data); + + BinaryArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// Return the pointer to the given elements bytes + // XXX should GetValue(int64_t i) return a string_view? 
+ const uint8_t* GetValue(int64_t i, int32_t* out_length) const { + // Account for base offset + i += data_->offset; + const int32_t pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; + return raw_data_ + pos; + } + + /// \brief Get binary value as a string_view + /// + /// \param i the value index + /// \return the view over the selected value + util::string_view GetView(int64_t i) const { + // Account for base offset + i += data_->offset; + const int32_t pos = raw_value_offsets_[i]; + return util::string_view(reinterpret_cast(raw_data_ + pos), + raw_value_offsets_[i + 1] - pos); + } + + /// \brief Get binary value as a std::string + /// + /// \param i the value index + /// \return the value copied into a std::string + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[1]; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_data() const { return data_->buffers[2]; } + + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + + // Neither of these functions will perform boundschecking + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + int32_t value_length(int64_t i) const { + i += data_->offset; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } + + protected: + // For subclasses + BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} + + /// Protected method for constructors + void SetData(const std::shared_ptr& data); + + // Constructor that allows sub-classes/builders to propagate their logical type up the + // class hierarchy. 
+ BinaryArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const int32_t* raw_value_offsets_; + const uint8_t* raw_data_; +}; + +/// Concrete Array class for variable-size string (utf-8) data +class ARROW_EXPORT StringArray : public BinaryArray { + public: + using TypeClass = StringType; + + explicit StringArray(const std::shared_ptr& data); + + StringArray(int64_t length, const std::shared_ptr& value_offsets, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); +}; + +// ---------------------------------------------------------------------- +// Fixed width binary + +/// Concrete Array class for fixed-size binary data +class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { + public: + using TypeClass = FixedSizeBinaryType; + + explicit FixedSizeBinaryArray(const std::shared_ptr& data); + + FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + const uint8_t* GetValue(int64_t i) const; + const uint8_t* Value(int64_t i) const { return GetValue(i); } + + util::string_view GetView(int64_t i) const { + return util::string_view(reinterpret_cast(GetValue(i)), byte_width()); + } + + std::string GetString(int64_t i) const { return std::string(GetView(i)); } + + int32_t byte_width() const { return byte_width_; } + + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } + + protected: + inline void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + byte_width_ = + internal::checked_cast(*type()).byte_width(); + } + + int32_t byte_width_; +}; + +/// DayTimeArray +/// 
--------------------- +/// \brief Array of Day and Millisecond values. +class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { + public: + using TypeClass = DayTimeIntervalType; + + explicit DayTimeIntervalArray(const std::shared_ptr& data); + + DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + TypeClass::DayMilliseconds GetValue(int64_t i) const; + TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } + + // For compatibility with Take kernel. + TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } + + int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } + + const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } + + protected: + inline void SetData(const std::shared_ptr& data) { + this->PrimitiveArray::SetData(data); + } +}; + +// ---------------------------------------------------------------------- +// Decimal128Array + +/// Concrete Array class for 128-bit decimal data +class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { + public: + using TypeClass = Decimal128Type; + + using FixedSizeBinaryArray::FixedSizeBinaryArray; + + /// \brief Construct Decimal128Array from ArrayData instance + explicit Decimal128Array(const std::shared_ptr& data); + + std::string FormatValue(int64_t i) const; +}; + +// Backward compatibility +using DecimalArray = Decimal128Array; + +// ---------------------------------------------------------------------- +// Struct + +/// Concrete Array class for struct data +class ARROW_EXPORT StructArray : public Array { + public: + using TypeClass = StructType; + + explicit StructArray(const std::shared_ptr& data); + + StructArray(const std::shared_ptr& type, int64_t length, + const std::vector>& children, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = 
kUnknownNullCount, int64_t offset = 0); + + const StructType* struct_type() const; + + // Return a shared pointer in case the requestor desires to share ownership + // with this array. The returned array has its offset, length and null + // count adjusted. + std::shared_ptr field(int pos) const; + + /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// \brief Flatten this array as a vector of arrays, one for each field + /// + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + /// \param[out] out The resulting vector of arrays + Status Flatten(MemoryPool* pool, ArrayVector* out) const; + + private: + // For caching boxed child data + mutable std::vector> boxed_fields_; +}; + +// ---------------------------------------------------------------------- +// Union + +/// Concrete Array class for union data +class ARROW_EXPORT UnionArray : public Array { + public: + using TypeClass = UnionType; + using type_id_t = uint8_t; + + explicit UnionArray(const std::shared_ptr& data); + + UnionArray(const std::shared_ptr& type, int64_t length, + const std::vector>& children, + const std::shared_ptr& type_ids, + const std::shared_ptr& value_offsets = NULLPTR, + const std::shared_ptr& null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. 
+ /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + const std::vector& field_names, + const std::vector& type_codes, + std::shared_ptr* out); + + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + const std::vector& field_names, + std::shared_ptr* out) { + return MakeDense(type_ids, value_offsets, children, field_names, {}, out); + } + + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. 
+ /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + const std::vector& type_codes, + std::shared_ptr* out) { + return MakeDense(type_ids, value_offsets, children, {}, type_codes, out); + } + + /// \brief Construct Dense UnionArray from types_ids, value_offsets and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. The value_offsets are assumed to be well-formed. + /// + /// The name of each field is filled by the index of the field. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] value_offsets An array of signed int32 values indicating the + /// relative offset into the respective child array for the type in a given slot. + /// The respective offsets for each child value array must be in order / increasing. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to value_offsets.length() + static Status MakeDense(const Array& type_ids, const Array& value_offsets, + const std::vector>& children, + std::shared_ptr* out) { + return MakeDense(type_ids, value_offsets, children, {}, {}, out); + } + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. 
+ /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[in] type_codes Vector of type codes. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + const std::vector& field_names, + const std::vector& type_codes, + std::shared_ptr* out); + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] field_names Vector of strings containing the name of each field. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + const std::vector& field_names, + std::shared_ptr* out) { + return MakeSparse(type_ids, children, field_names, {}, out); + } + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[in] type_codes Vector of type codes. 
+ /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + const std::vector& type_codes, + std::shared_ptr* out) { + return MakeSparse(type_ids, children, {}, type_codes, out); + } + + /// \brief Construct Sparse UnionArray from type_ids and children + /// + /// This function does the bare minimum of validation of the offsets and + /// input types. + /// + /// The name of each field is filled by the index of the field. + /// + /// \param[in] type_ids An array of 8-bit signed integers, enumerated from + /// 0 corresponding to each type. + /// \param[in] children Vector of children Arrays containing the data for each type. + /// \param[out] out Will have length equal to type_ids.length() + static Status MakeSparse(const Array& type_ids, + const std::vector>& children, + std::shared_ptr* out) { + return MakeSparse(type_ids, children, {}, {}, out); + } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr type_ids() const { return data_->buffers[1]; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return data_->buffers[2]; } + + int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + + const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } + + UnionMode::type mode() const { + return internal::checked_cast(*type()).mode(); + } + + // Return the given field as an individual array. + // For sparse unions, the returned array has its offset, length and null + // count adjusted. + // For dense unions, the returned array is unchanged. 
+ std::shared_ptr child(int pos) const; + + /// Only use this while the UnionArray is in scope + const Array* UnsafeChild(int pos) const; + + protected: + void SetData(const std::shared_ptr& data); + + const type_id_t* raw_type_ids_; + const int32_t* raw_value_offsets_; + + // For caching boxed child data + mutable std::vector> boxed_fields_; +}; + +// ---------------------------------------------------------------------- +// DictionaryArray + +/// \brief Array type for dictionary-encoded data with a +/// data-dependent dictionary +/// +/// A dictionary array contains an array of non-negative integers (the +/// "dictionary indices") along with a data type containing a "dictionary" +/// corresponding to the distinct values represented in the data. +/// +/// For example, the array +/// +/// ["foo", "bar", "foo", "bar", "foo", "bar"] +/// +/// with dictionary ["bar", "foo"], would have dictionary array representation +/// +/// indices: [1, 0, 1, 0, 1, 0] +/// dictionary: ["bar", "foo"] +/// +/// The indices in principle may have any integer type (signed or unsigned), +/// though presently data in IPC exchanges must be signed int32. +class ARROW_EXPORT DictionaryArray : public Array { + public: + using TypeClass = DictionaryType; + + explicit DictionaryArray(const std::shared_ptr& data); + + DictionaryArray(const std::shared_ptr& type, + const std::shared_ptr& indices, + const std::shared_ptr& dictionary); + + /// \brief Construct DictionaryArray from dictionary and indices + /// array and validate + /// + /// This function does the validation of the indices and input type. 
It checks if + /// all indices are non-negative and smaller than the size of the dictionary + /// + /// \param[in] type a dictionary type + /// \param[in] dictionary the dictionary with same value type as the + /// type object + /// \param[in] indices an array of non-negative signed + /// integers smaller than the size of the dictionary + /// \param[out] out the resulting DictionaryArray instance + static Status FromArrays(const std::shared_ptr& type, + const std::shared_ptr& indices, + const std::shared_ptr& dictionary, + std::shared_ptr* out); + + /// \brief Transpose this DictionaryArray + /// + /// This method constructs a new dictionary array with the given dictionary type, + /// transposing indices using the transpose map. + /// The type and the transpose map are typically computed using + /// DictionaryType::Unify. + /// + /// \param[in] pool a pool to allocate the array data from + /// \param[in] type the new type object + /// \param[in] dictionary the new dictionary + /// \param[in] transpose_map a vector transposing this array's indices + /// into the target array's indices + /// \param[out] out the resulting DictionaryArray instance + Status Transpose(MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& dictionary, + const std::vector& transpose_map, + std::shared_ptr* out) const; + + /// \brief Return the dictionary for this array, which is stored as + /// a member of the ArrayData internal structure + std::shared_ptr dictionary() const; + std::shared_ptr indices() const; + + const DictionaryType* dict_type() const { return dict_type_; } + + private: + void SetData(const std::shared_ptr& data); + const DictionaryType* dict_type_; + std::shared_ptr indices_; +}; + +/// \brief Perform any validation checks to determine obvious inconsistencies +/// with the array's internal data +/// +/// This can be an expensive check. 
+/// +/// \param array an Array instance +/// \return Status +ARROW_EXPORT +Status ValidateArray(const Array& array); + +} // namespace arrow + +#endif // ARROW_ARRAY_H diff --git a/r/R/inst/include/arrow/array/builder_adaptive.h b/r/R/inst/include/arrow/array/builder_adaptive.h new file mode 100644 index 00000000000..7f24109526b --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_adaptive.h @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/array/builder_base.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + explicit AdaptiveIntBuilderBase(MemoryPool* pool); + + /// \brief Append multiple nulls + /// \param[in] length the number of nulls to append + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeSetNull(length); + return Status::OK(); + } + + Status AppendNull() final { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + virtual Status CommitPendingData() = 0; + + std::shared_ptr data_; + uint8_t* raw_data_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_; + bool pending_has_nulls_; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] 
valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { + auto v = static_cast(val); + + pending_data_[pending_pos_] = v; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* 
valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_base.h b/r/R/inst/include/arrow/array/builder_base.h new file mode 100644 index 00000000000..36f6c7a2a4d --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_base.h @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer-builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +struct ArrayData; +class MemoryPool; + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + +/// Base class for all data array builders. 
+/// +/// This class provides facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), pool_(pool), null_bitmap_builder_(pool) {} + + virtual ~ArrayBuilder() = default; + + /// For nested types. Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + int num_children() const { return static_cast(children_.size()); } + + int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to add the indicated + /// number of elements without any further calls to Resize. Overallocation is + /// used in order to minimize the impact of incremental Reserve() calls. 
+ /// + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity) { + auto current_capacity = capacity(); + auto min_capacity = length() + additional_capacity; + if (min_capacity <= current_capacity) return Status::OK(); + + // leave growth factor up to BufferBuilder + auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); + return Resize(new_capacity); + } + + /// Reset the builder. + virtual void Reset(); + + virtual Status AppendNull() = 0; + virtual Status AppendNulls(int64_t length) = 0; + + /// For cases where raw data was memcpy'd into the internal buffers, allows us + /// to advance the length of the builder. It is your responsibility to use + /// this function responsibly. + Status Advance(int64_t elements); + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + std::shared_ptr type() const { return type_; } + + protected: + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Uniform append. Append N times the same validity bit. + Status AppendToBitmap(int64_t num_bits, bool value); + + /// Set the next length bits to not null (i.e. valid). 
+ Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + null_bitmap_builder_.UnsafeAppend(is_valid); + ++length_; + if (!is_valid) ++null_count_; + } + + // Vector append. Treat each zero byte as a null. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + if (valid_bytes == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(valid_bytes, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + + // Append the same validity value a given number of times. + void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { + if (value) { + UnsafeSetNotNull(num_bits); + } else { + UnsafeSetNull(num_bits); + } + } + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next validity bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + // Set the next validity bits to null (i.e. invalid). 
+ void UnsafeSetNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + /// \brief Finish to an array of the specified ArrayType + template + Status FinishTyped(std::shared_ptr* out) { + std::shared_ptr out_untyped; + ARROW_RETURN_NOT_OK(Finish(&out_untyped)); + *out = std::static_pointer_cast(std::move(out_untyped)); + return Status::OK(); + } + + static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { + if (new_capacity < 0) { + return Status::Invalid("Resize capacity must be positive"); + } + + if (new_capacity < old_capacity) { + return Status::Invalid("Resize cannot downsize"); + } + + return Status::OK(); + } + + std::shared_ptr type_; + MemoryPool* pool_; + + TypedBufferBuilder null_bitmap_builder_; + int64_t null_count_ = 0; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_ = 0; + int64_t capacity_ = 0; + + // Child value array builders. These are owned by this class + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_binary.h b/r/R/inst/include/arrow/array/builder_binary.h new file mode 100644 index 00000000000..23a96450366 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_binary.h @@ -0,0 +1,365 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" // IWYU pragma: export + +namespace arrow { + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +// ---------------------------------------------------------------------- +// Binary and String + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + Status Append(const uint8_t* value, int32_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + ARROW_RETURN_NOT_OK(AppendNextOffset()); + // Safety check for UBSAN. 
+ if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendNulls(int64_t length) final { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + return AppendOverflow(num_bytes); + } + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } + UnsafeAppendToBitmap(length, false); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. 
+ void UnsafeAppend(const uint8_t* value, int32_t length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int32_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(util::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. 
+ util::string_view GetView(int64_t i) const; + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendOverflow(int64_t num_bytes); + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + return AppendOverflow(num_bytes); + } + return offsets_builder_.Append(static_cast(num_bytes)); + } + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using BinaryBuilder::Append; + using BinaryBuilder::Reset; + using BinaryBuilder::UnsafeAppend; + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. 
+ /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + return Status::OK(); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(const util::string_view& view) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(view); + return Status::OK(); + } + + Status Append(const std::string& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(s); + return Status::OK(); + } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend( + util::string_view(reinterpret_cast(value.data()), value.size())); + return Status::OK(); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status AppendNull() final; + + Status AppendNulls(int64_t length) final; + + void UnsafeAppend(const uint8_t* value) { + UnsafeAppendToBitmap(true); + if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { + byte_builder_.UnsafeAppend(value, byte_width_); + } + } + + void UnsafeAppend(util::string_view value) { +#ifndef NDEBUG + CheckValueSize(static_cast(value.size())); +#endif + UnsafeAppend(reinterpret_cast(value.data())); + } + + void 
UnsafeAppendNull() { + UnsafeAppendToBitmap(false); + byte_builder_.UnsafeAdvance(byte_width_); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + +#ifndef NDEBUG + void CheckValueSize(int64_t size); +#endif +}; + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + ChunkedBinaryBuilder(int32_t max_chunk_size, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { + // Move onto next chunk, unless the builder length is currently 0, which + // means that max_chunk_size_ is less than the item length + if (builder_->length() > 0) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + // else fall through + } + + chunk_data_size_ += length; + return builder_->Append(value, length); + } + + Status Append(const util::string_view& value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + 
Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits::max())) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return builder_->AppendNull(); + } + + Status Reserve(int64_t values) { return builder_->Reserve(values); } + + virtual Status Finish(ArrayVector* out); + + protected: + Status NextChunk(); + + int64_t max_chunk_size_; + int64_t chunk_data_size_; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_decimal.h b/r/R/inst/include/arrow/array/builder_decimal.h new file mode 100644 index 00000000000..d5a26ff42f5 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_decimal.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" + +namespace arrow { + +class Decimal128; + +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { + public: + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal128& val); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +using DecimalBuilder = Decimal128Builder; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_dict.h b/r/R/inst/include/arrow/array/builder_dict.h new file mode 100644 index 00000000000..93cad2975a2 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_dict.h @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export + +#include "arrow/array.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryScalar { + using type = typename T::c_type; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +class ARROW_EXPORT DictionaryMemoTable { + public: + explicit DictionaryMemoTable(const std::shared_ptr& type); + explicit DictionaryMemoTable(const std::shared_ptr& dictionary); + ~DictionaryMemoTable(); + + int32_t GetOrInsert(const bool& value); + int32_t GetOrInsert(const int8_t& value); + int32_t GetOrInsert(const int16_t& value); + int32_t GetOrInsert(const int32_t& value); + int32_t GetOrInsert(const int64_t& value); + int32_t GetOrInsert(const uint8_t& value); + int32_t GetOrInsert(const uint16_t& value); + int32_t GetOrInsert(const uint32_t& value); + int32_t GetOrInsert(const uint64_t& value); + int32_t GetOrInsert(const float& value); + int32_t GetOrInsert(const double& value); + int32_t GetOrInsert(const util::string_view& value); + + Status GetArrayData(MemoryPool* pool, int64_t start_offset, + std::shared_ptr* out); + + int32_t size() const; + + private: + class DictionaryMemoTableImpl; + std::unique_ptr impl_; +}; + +} // namespace internal + +/// \brief Array builder for creating an encoded DictionaryArray from a +/// dense array +/// +/// Unlike other builders, dictionary builder does not completely +/// reset the state on Finish calls. The arrays built after the +/// initial Finish call will reuse the previously created encoding and +/// build a delta dictionary when new terms occur.
+/// +/// data +template +class DictionaryBuilder : public ArrayBuilder { + public: + using Scalar = typename internal::DictionaryScalar::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. + template + DictionaryBuilder( + typename std::enable_if::value, + const std::shared_ptr&>::type type, + MemoryPool* pool) + : ArrayBuilder(type, pool), + memo_table_(new internal::DictionaryMemoTable(type)), + delta_offset_(0), + byte_width_(-1), + values_builder_(pool) {} + + template + explicit DictionaryBuilder( + typename std::enable_if::value, + const std::shared_ptr&>::type type, + MemoryPool* pool) + : ArrayBuilder(type, pool), + memo_table_(new internal::DictionaryMemoTable(type)), + delta_offset_(0), + byte_width_(static_cast(*type).byte_width()), + values_builder_(pool) {} + + template + explicit DictionaryBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} + + DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), + memo_table_(new internal::DictionaryMemoTable(dictionary)), + delta_offset_(0), + byte_width_(-1), + values_builder_(pool) {} + + ~DictionaryBuilder() override = default; + + /// \brief Append a scalar value + Status Append(const Scalar& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + + auto memo_index = memo_table_->GetOrInsert(value); + ARROW_RETURN_NOT_OK(values_builder_.Append(memo_index)); + length_ += 1; + + return Status::OK(); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const uint8_t*>::type value) { + return Append(util::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const 
char*>::type value) { + return Append(util::string_view(value, byte_width_)); + } + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return values_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return values_builder_.AppendNulls(length); + } + + /// \brief Append a whole dense array to the builder + template + Status AppendArray( + typename std::enable_if::value, + const Array&>::type array) { + using ArrayType = typename TypeTraits::ArrayType; + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); + } + + template + Status AppendArray( + typename std::enable_if::value, + const Array&>::type array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid( + "Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); + } + } + return Status::OK(); + } + + void Reset() override { + ArrayBuilder::Reset(); + values_builder_.Reset(); + memo_table_.reset(new internal::DictionaryMemoTable(type_)); + delta_offset_ = 0; + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + if (capacity_ == 0) { + // Initialize hash table + // XXX should we let the user pass additional size heuristics? 
+ delta_offset_ = 0; + } + ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); + capacity_ = values_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Finalize indices array + ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); + + // Generate dictionary array from hash table contents + std::shared_ptr dictionary_data; + + ARROW_RETURN_NOT_OK( + memo_table_->GetArrayData(pool_, delta_offset_, &dictionary_data)); + + // Set type of array data to the right dictionary type + (*out)->type = dictionary((*out)->type, type_); + (*out)->dictionary = MakeArray(dictionary_data); + + // Update internals for further uses of this DictionaryBuilder + delta_offset_ = memo_table_->size(); + values_builder_.Reset(); + + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// is the dictionary builder in the delta building mode + bool is_building_delta() { return delta_offset_ > 0; } + + protected: + std::unique_ptr memo_table_; + + int32_t delta_offset_; + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + AdaptiveIntBuilder values_builder_; +}; + +template <> +class DictionaryBuilder : public ArrayBuilder { + public: + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), values_builder_(pool) {} + explicit DictionaryBuilder(MemoryPool* pool) + : ArrayBuilder(null(), pool), values_builder_(pool) {} + + DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {} + + /// \brief Append a scalar null value + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return values_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return values_builder_.AppendNulls(length); + } + + /// \brief 
Append a whole dense array to the builder + Status AppendArray(const Array& array) { + for (int64_t i = 0; i < array.length(); i++) { + ARROW_RETURN_NOT_OK(AppendNull()); + } + return Status::OK(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); + capacity_ = values_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr dictionary = std::make_shared(0); + + ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); + (*out)->type = std::make_shared((*out)->type, type_); + (*out)->dictionary = dictionary; + + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + protected: + AdaptiveIntBuilder values_builder_; +}; + +class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +/// \brief Dictionary array builder with convenience methods for strings +class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_nested.h b/r/R/inst/include/arrow/array/builder_nested.h new file mode 100644 index 
00000000000..d3695e525a9 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_nested.h @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// List builder + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offsets and null values. +/// +/// A note on types. Per arrow/type.h all types in the C++ implementation are +/// logical so even though this class always builds a list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List<T> where T is taken from the +/// value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. + ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + const std::shared_ptr& type = NULLPTR); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true); + + Status AppendNull() final { return Append(false); } + + Status AppendNulls(int64_t length) final; + + ArrayBuilder* value_builder() const; + + protected: + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr values_; + + Status CheckNextOffset() const; + Status AppendNextOffset(); + Status AppendNextOffset(int64_t num_repeats); +}; + +// ---------------------------------------------------------------------- +// FixedSizeList builder + +/// \class FixedSizeListBuilder +/// \brief Builder class for fixed-length list array value types +class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { + public: + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int32_t list_size); + + FixedSizeListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out)
override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a valid fixed length list. + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. + Status Append(); + + /// \brief Vector append + /// + /// If passed, valid_bytes will be read and any zero byte + /// will cause the corresponding slot to be null + /// + /// This function affects only the validity bitmap; the child values must be appended + /// using the child array builder. This includes appending nulls for null lists. + /// XXX this restriction is confusing, should this method be omitted? + Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a null fixed length list. + /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNull() final; + + /// \brief Append length null fixed length lists. + /// + /// The child array builder will have the appropriate number of nulls appended + /// automatically. + Status AppendNulls(int64_t length) final; + + ArrayBuilder* value_builder() const { return value_builder_.get(); } + + protected: + const int32_t list_size_; + std::shared_ptr value_builder_; +}; + +// ---------------------------------------------------------------------- +// Struct + +// --------------------------------------------------------------------------------- +// StructArray builder +/// Append, Resize and Reserve methods are acting on StructBuilder. +/// Please make sure all these methods of all child builders are consistently +/// called to maintain data-structure consistency.
+class ARROW_EXPORT StructBuilder : public ArrayBuilder { + public: + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// Null bitmap is of equal length to every child field, and any zero byte + /// will be considered as a null for that field, but users must use the + /// append or advance methods of the child builders independently to + /// insert data. + Status AppendValues(int64_t length, const uint8_t* valid_bytes) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// Append an element to the Struct. Each child builder's Append method must + /// be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + Status AppendNull() final { return Append(false); } + + Status AppendNulls(int64_t length) final; + + void Reset() override; + + ArrayBuilder* field_builder(int i) const { return children_[i].get(); } + + int num_fields() const { return static_cast(children_.size()); } +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_primitive.h b/r/R/inst/include/arrow/array/builder_primitive.h new file mode 100644 index 00000000000..3d566846d19 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_primitive.h @@ -0,0 +1,427 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/type.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(null(), pool) {} + + /// \brief Append the specified number of null elements + Status AppendNulls(int64_t length) final { + if (length < 0) return Status::Invalid("length must be positive"); + null_count_ += length; + length_ += length; + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { return AppendNulls(1); } + + Status Append(std::nullptr_t) { return AppendNull(); } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +/// Base class for all Builders that emit an Array of a scalar numerical type. 
+template +class NumericBuilder : public ArrayBuilder { + public: + using value_type = typename T::c_type; + using ArrayType = typename TypeTraits::ArrayType; + using ArrayBuilder::ArrayBuilder; + + template + explicit NumericBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool + ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(TypeTraits::type_singleton(), pool) {} + + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent + /// uninitialized memory access + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, static_cast(0)); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(static_cast(0)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } + + void Reset() override { data_builder_.Reset(); } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + value_type operator[](int64_t index) const { return GetValue(index); } + + value_type& operator[](int64_t index) { + return reinterpret_cast(data_builder_.mutable_data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes 
where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is updated by this call + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is updated by this call + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0).
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr data, null_bitmap; + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); + *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); + capacity_ = length_ = null_count_ = 0; + return Status::OK(); + } + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. 
+ /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values_begin, values_end); + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + } + + return Status::OK(); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. + /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. 
+ void UnsafeAppend(const value_type val) { + ArrayBuilder::UnsafeAppendToBitmap(true); + data_builder_.UnsafeAppend(val); + } + + void UnsafeAppendNull() { + ArrayBuilder::UnsafeAppendToBitmap(false); + data_builder_.UnsafeAppend(0); + } + + protected: + TypedBufferBuilder data_builder_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { + public: + using value_type = bool; + explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, false); + UnsafeSetNull(length); + return Status::OK(); + } + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendNull(); + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + data_builder_.UnsafeAppend(val); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(false); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous 
array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). 
Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + length_ = null_bitmap_builder_.length(); + null_count_ = 
null_bitmap_builder_.false_count(); + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend( + length, [&values_begin]() -> bool { return *values_begin++; }); + + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + null_bitmap_builder_.UnsafeAppend( + length, [&valid_begin]() -> bool { return *valid_begin++; }); + } + length_ = null_bitmap_builder_.length(); + null_count_ = null_bitmap_builder_.false_count(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + TypedBufferBuilder data_builder_; +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_time.h b/r/R/inst/include/arrow/array/builder_time.h new file mode 100644 index 00000000000..3ff783b1b1c --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_time.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Contains declarations of time related Arrow builder types. + +#pragma once + +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/buffer-builder.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" + +namespace arrow { + +class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { + public: + using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; + + explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : DayTimeIntervalBuilder(day_time_interval(), pool) {} + + DayTimeIntervalBuilder(std::shared_ptr type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(type, pool), + builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} + + void Reset() override { builder_.Reset(); } + Status Resize(int64_t capacity) override { return builder_.Resize(capacity); } + Status Append(DayMilliseconds day_millis) { + return builder_.Append(reinterpret_cast(&day_millis)); + } + void UnsafeAppend(DayMilliseconds day_millis) { + builder_.UnsafeAppend(reinterpret_cast(&day_millis)); + } + using ArrayBuilder::UnsafeAppendNull; + Status AppendNull() override { return builder_.AppendNull(); } + Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); } + Status FinishInternal(std::shared_ptr* out) override { + auto result = builder_.FinishInternal(out); + if (*out != NULLPTR) { + (*out)->type = type(); + } + return result; + 
} + + private: + FixedSizeBinaryBuilder builder_; +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_union.h b/r/R/inst/include/arrow/array/builder_union.h new file mode 100644 index 00000000000..aac2e54f9a2 --- /dev/null +++ b/r/R/inst/include/arrow/array/builder_union.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/buffer-builder.h" + +namespace arrow { + +/// \class DenseUnionBuilder +/// +/// You need to call AppendChild for each of the children builders you want +/// to use. The function will return an int8_t, which is the type tag +/// associated with that child. You can then call Append with that tag +/// (followed by an append on the child builder) to add elements to +/// the union array. +/// +/// You can either specify the type when the UnionBuilder is constructed +/// or let the UnionBuilder infer the type at runtime (by omitting the +/// type argument from the constructor). +/// +/// This API is EXPERIMENTAL. 
+class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the union array along + /// with types, offsets, and null bitmap. + explicit DenseUnionBuilder(MemoryPool* pool, + const std::shared_ptr& type = NULLPTR); + + Status AppendNull() final { + ARROW_RETURN_NOT_OK(types_builder_.Append(0)); + ARROW_RETURN_NOT_OK(offsets_builder_.Append(0)); + return AppendToBitmap(false); + } + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(types_builder_.Reserve(length)); + ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length)); + ARROW_RETURN_NOT_OK(Reserve(length)); + for (int64_t i = 0; i < length; ++i) { + types_builder_.UnsafeAppend(0); + offsets_builder_.UnsafeAppend(0); + } + return AppendToBitmap(length, false); + } + + /// \brief Append an element to the UnionArray. This must be followed + /// by an append to the appropriate child builder. + /// \param[in] type index of the child the value will be appended + /// \param[in] offset offset of the value in that child + Status Append(int8_t type, int32_t offset) { + ARROW_RETURN_NOT_OK(types_builder_.Append(type)); + ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); + return AppendToBitmap(true); + } + + Status FinishInternal(std::shared_ptr* out) override; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + /// \brief Make a new child builder available to the UnionArray + /// + /// \param[in] child the child builder + /// \param[in] field_name the name of the field in the union array type + /// if type inference is used + /// \return child index, which is the "type" argument that needs + /// to be passed to the "Append" method to add a new element to + /// the union array. 
+ int8_t AppendChild(const std::shared_ptr& child, + const std::string& field_name = "") { + children_.push_back(child); + field_names_.push_back(field_name); + return static_cast(children_.size() - 1); + } + + private: + TypedBufferBuilder types_builder_; + TypedBufferBuilder offsets_builder_; + std::vector field_names_; +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/array/concatenate.h b/r/R/inst/include/arrow/array/concatenate.h new file mode 100644 index 00000000000..67738d547f4 --- /dev/null +++ b/r/R/inst/include/arrow/array/concatenate.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out the resulting concatenated array +/// \return Status +ARROW_EXPORT +Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out); + +} // namespace arrow diff --git a/r/R/inst/include/arrow/buffer-builder.h b/r/R/inst/include/arrow/buffer-builder.h new file mode 100644 index 00000000000..f069ea4d7dd --- /dev/null +++ b/r/R/inst/include/arrow/buffer-builder.h @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_BUFFER_BUILDER_H +#define ARROW_BUFFER_BUILDER_H + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer builder classes + +/// \class BufferBuilder +/// \brief A class for incrementally building a contiguous chunk of in-memory +/// data +class ARROW_EXPORT BufferBuilder { + public: + explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : pool_(pool), + data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ + &util::internal::non_null_filler), + + capacity_(0), + size_(0) {} + + /// \brief Resize the buffer to the nearest multiple of 64 bytes + /// + /// \param new_capacity the new capacity of the builder, rounded up to a + /// multiple of 64 bytes for padding + /// \param shrink_to_fit if new capacity is smaller than the existing size, + /// reallocate the internal buffer; set to false to avoid reallocating when shrinking.
+ /// \return Status + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + // Resize(0) is a no-op + if (new_capacity == 0) { + return Status::OK(); + } + if (buffer_ == NULLPTR) { + ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_)); + } else { + ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); + } + capacity_ = buffer_->capacity(); + data_ = buffer_->mutable_data(); + return Status::OK(); + } + + /// \brief Ensure that builder can accommodate the additional number of bytes + /// without the need to perform allocations + /// + /// \param[in] additional_bytes number of additional bytes to make space for + /// \return Status + Status Reserve(const int64_t additional_bytes) { + auto min_capacity = size_ + additional_bytes; + if (min_capacity <= capacity_) { + return Status::OK(); + } + return Resize(GrowByFactor(capacity_, min_capacity), false); + } + + /// \brief Return a capacity expanded by an unspecified growth factor + static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) { + // NOTE: Doubling isn't a great overallocation practice + // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md + // for discussion. + // Grow exactly if a large upsize (the caller might know the exact final size). + // Otherwise overallocate by 1.5 to keep a linear amortized cost. + return std::max(new_capacity, current_capacity * 3 / 2); + } + + /// \brief Append the given data to the buffer + /// + /// The buffer is automatically expanded if necessary. + Status Append(const void* data, const int64_t length) { + if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) { + ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false)); + } + UnsafeAppend(data, length); + return Status::OK(); + } + + /// \brief Append copies of a value to the buffer + /// + /// The buffer is automatically expanded if necessary. 
+ Status Append(const int64_t num_copies, uint8_t value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies)); + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + // Advance pointer and zero out memory + Status Advance(const int64_t length) { return Append(length, 0); } + + // Advance pointer, but don't allocate or zero memory + void UnsafeAdvance(const int64_t length) { size_ += length; } + + // Unsafe methods don't check existing size + void UnsafeAppend(const void* data, const int64_t length) { + memcpy(data_ + size_, data, static_cast(length)); + size_ += length; + } + + void UnsafeAppend(const int64_t num_copies, uint8_t value) { + memset(data_ + size_, value, static_cast(num_copies)); + size_ += num_copies; + } + + /// \brief Return result of builder as a Buffer object. + /// + /// The builder is reset and can be reused afterwards. + /// + /// \param[out] out the finalized Buffer object + /// \param shrink_to_fit if the buffer size is smaller than its capacity, + /// reallocate to fit more tightly in memory. Set to false to avoid + /// a reallocation, at the expense of potentially more memory consumption. + /// \return Status + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); + if (size_ != 0) buffer_->ZeroPadding(); + *out = buffer_; + Reset(); + return Status::OK(); + } + + void Reset() { + buffer_ = NULLPTR; + capacity_ = size_ = 0; + } + + /// \brief Set size to a smaller value without modifying builder + /// contents. 
For reusable BufferBuilder classes + /// \param[in] position must be non-negative and less than or equal + /// to the current length() + void Rewind(int64_t position) { size_ = position; } + + int64_t capacity() const { return capacity_; } + int64_t length() const { return size_; } + const uint8_t* data() const { return data_; } + uint8_t* mutable_data() { return data_; } + + private: + std::shared_ptr buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; +}; + +template +class TypedBufferBuilder; + +/// \brief A BufferBuilder for building a buffer of arithmetic elements +template +class TypedBufferBuilder::value>::type> { + public: + explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : bytes_builder_(pool) {} + + Status Append(T value) { + return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); + } + + Status Append(const T* values, int64_t num_elements) { + return bytes_builder_.Append(reinterpret_cast(values), + num_elements * sizeof(T)); + } + + Status Append(const int64_t num_copies, T value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies));  // Reserve() takes the *additional* count + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + void UnsafeAppend(T value) { + bytes_builder_.UnsafeAppend(reinterpret_cast(&value), sizeof(T)); + } + + void UnsafeAppend(const T* values, int64_t num_elements) { + bytes_builder_.UnsafeAppend(reinterpret_cast(values), + num_elements * sizeof(T)); + } + + template + void UnsafeAppend(Iter values_begin, Iter values_end) { + int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto data = mutable_data() + length(); + bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); + std::copy(values_begin, values_end, data); + } + + void UnsafeAppend(const int64_t num_copies, T value) { + auto data = mutable_data() + length(); + bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); + for (const auto end = data + num_copies; data != end; ++data) { + *data = value; + } +
} + + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit); + } + + Status Reserve(const int64_t additional_elements) { + return bytes_builder_.Reserve(additional_elements * sizeof(T)); + } + + Status Advance(const int64_t length) { + return bytes_builder_.Advance(length * sizeof(T)); + } + + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + return bytes_builder_.Finish(out, shrink_to_fit); + } + + void Reset() { bytes_builder_.Reset(); } + + int64_t length() const { return bytes_builder_.length() / sizeof(T); } + int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); } + const T* data() const { return reinterpret_cast(bytes_builder_.data()); } + T* mutable_data() { return reinterpret_cast(bytes_builder_.mutable_data()); } + + private: + BufferBuilder bytes_builder_; +}; + +/// \brief A BufferBuilder for building a buffer containing a bitmap +template <> +class TypedBufferBuilder { + public: + explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : bytes_builder_(pool) {} + + Status Append(bool value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(value); + return Status::OK(); + } + + Status Append(const uint8_t* valid_bytes, int64_t num_elements) { + ARROW_RETURN_NOT_OK(Reserve(num_elements)); + UnsafeAppend(valid_bytes, num_elements); + return Status::OK(); + } + + Status Append(const int64_t num_copies, bool value) { + ARROW_RETURN_NOT_OK(Reserve(num_copies)); + UnsafeAppend(num_copies, value); + return Status::OK(); + } + + void UnsafeAppend(bool value) { + BitUtil::SetBitTo(mutable_data(), bit_length_, value); + if (!value) { + ++false_count_; + } + ++bit_length_; + } + + void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { + if (num_elements == 0) return; + int64_t i = 0; + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = bytes[i++]; + false_count_ += 
!value; + return value; + }); + bit_length_ += num_elements; + } + + void UnsafeAppend(const int64_t num_copies, bool value) { + BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); + false_count_ += num_copies * !value; + bit_length_ += num_copies; + } + + template + void UnsafeAppend(const int64_t num_elements, Generator&& gen) { + if (num_elements == 0) return; + + if (count_falses) { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = gen(); + false_count_ += !value; + return value; + }); + } else { + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, + std::forward(gen)); + } + bit_length_ += num_elements; + } + + Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { + const int64_t old_byte_capacity = bytes_builder_.capacity(); + ARROW_RETURN_NOT_OK( + bytes_builder_.Resize(BitUtil::BytesForBits(new_capacity), shrink_to_fit)); + // Resize() may have chosen a larger capacity (e.g. for padding), + // so ask it again before calling memset(). + const int64_t new_byte_capacity = bytes_builder_.capacity(); + if (new_byte_capacity > old_byte_capacity) { + // The additional buffer space is 0-initialized for convenience, + // so that other methods can simply bump the length. 
+ memset(mutable_data() + old_byte_capacity, 0, + static_cast(new_byte_capacity - old_byte_capacity)); + } + return Status::OK(); + } + + Status Reserve(const int64_t additional_elements) { + return Resize( + BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements), + false); + } + + Status Advance(const int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + bit_length_ += length; + false_count_ += length; + return Status::OK(); + } + + Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { + // set bytes_builder_.size_ == byte size of data + bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) - + bytes_builder_.length()); + bit_length_ = false_count_ = 0; + return bytes_builder_.Finish(out, shrink_to_fit); + } + + void Reset() { + bytes_builder_.Reset(); + bit_length_ = false_count_ = 0; + } + + int64_t length() const { return bit_length_; } + int64_t capacity() const { return bytes_builder_.capacity() * 8; } + const uint8_t* data() const { return bytes_builder_.data(); } + uint8_t* mutable_data() { return bytes_builder_.mutable_data(); } + int64_t false_count() const { return false_count_; } + + private: + BufferBuilder bytes_builder_; + int64_t bit_length_ = 0; + int64_t false_count_ = 0; +}; + +} // namespace arrow + +#endif // ARROW_BUFFER_BUILDER_H diff --git a/r/R/inst/include/arrow/buffer.h b/r/R/inst/include/arrow/buffer.h new file mode 100644 index 00000000000..3eb9b033b92 --- /dev/null +++ b/r/R/inst/include/arrow/buffer.h @@ -0,0 +1,444 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_BUFFER_H +#define ARROW_BUFFER_H + +#include +#include +#include +#include +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Buffer classes + +/// \class Buffer +/// \brief Object containing a pointer to a piece of contiguous memory with a +/// particular size. +/// +/// Buffers have two related notions of length: size and capacity. Size is +/// the number of bytes that might have valid data. Capacity is the number +/// of bytes that were allocated for the buffer in total. +/// +/// The Buffer base class does not own its memory, but subclasses often do. 
+/// +/// The following invariant is always true: Size <= Capacity +class ARROW_EXPORT Buffer { + public: + /// \brief Construct from buffer and size without copying memory + /// + /// \param[in] data a memory buffer + /// \param[in] size buffer size + /// + /// \note The passed memory must be kept alive through some other means + Buffer(const uint8_t* data, int64_t size) + : is_mutable_(false), + data_(data), + mutable_data_(NULLPTR), + size_(size), + capacity_(size) {} + + /// \brief Construct from string_view without copying memory + /// + /// \param[in] data a string_view object + /// + /// \note The memory viewed by data must not be deallocated in the lifetime of the + /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere + explicit Buffer(util::string_view data) + : Buffer(reinterpret_cast(data.data()), + static_cast(data.size())) {} + + virtual ~Buffer() = default; + + /// An offset into data that is owned by another buffer, but we want to be + /// able to retain a valid pointer to it even after other shared_ptr's to the + /// parent buffer have been destroyed + /// + /// This method makes no assertions about alignment or padding of the buffer but + /// in general we expected buffers to be aligned and padded to 64 bytes. In the future + /// we might add utility methods to help determine if a buffer satisfies this contract. + Buffer(const std::shared_ptr& parent, const int64_t offset, const int64_t size) + : Buffer(parent->data() + offset, size) { + parent_ = parent; + } + + uint8_t operator[](std::size_t i) const { return data_[i]; } + + bool is_mutable() const { return is_mutable_; } + + /// \brief Construct a new std::string with a hexadecimal representation of the buffer. 
+ /// \return std::string + std::string ToHexString(); + + /// Return true if both buffers are the same size and contain the same bytes + /// up to the number of compared bytes + bool Equals(const Buffer& other, int64_t nbytes) const; + + /// Return true if both buffers are the same size and contain the same bytes + bool Equals(const Buffer& other) const; + + /// Copy a section of the buffer into a new Buffer. + Status Copy(const int64_t start, const int64_t nbytes, MemoryPool* pool, + std::shared_ptr* out) const; + + /// Copy a section of the buffer using the default memory pool into a new Buffer. + Status Copy(const int64_t start, const int64_t nbytes, + std::shared_ptr* out) const; + + /// Zero bytes in padding, i.e. bytes between size_ and capacity_. + void ZeroPadding() { +#ifndef NDEBUG + CheckMutable(); +#endif + // A zero-capacity buffer can have a null data pointer + if (capacity_ != 0) { + memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + } + } + + /// \brief Construct a new buffer that owns its memory from a std::string + /// + /// \param[in] data a std::string object + /// \param[in] pool a memory pool + /// \param[out] out the created buffer + /// + /// \return Status message + static Status FromString(const std::string& data, MemoryPool* pool, + std::shared_ptr* out); + + /// \brief Construct a new buffer that owns its memory from a std::string + /// using the default memory pool + static Status FromString(const std::string& data, std::shared_ptr* out); + + /// \brief Construct an immutable buffer that takes ownership of the contents + /// of an std::string + /// \param[in] data an rvalue-reference of a string + /// \return a new Buffer instance + static std::shared_ptr FromString(std::string&& data); + + /// \brief Create buffer referencing typed memory with some length without + /// copying + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + 
template + static std::shared_ptr Wrap(const T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + /// \brief Create buffer referencing std::vector with some length without + /// copying + /// \param[in] data the vector to be referenced. If this vector is changed, + /// the buffer may become invalid + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(const std::vector& data) { + return std::make_shared(reinterpret_cast(data.data()), + static_cast(sizeof(T) * data.size())); + } + + /// \brief Copy buffer contents into a new std::string + /// \return std::string + /// \note Can throw std::bad_alloc if buffer is large + std::string ToString() const; + + /// \brief View buffer contents as a util::string_view + /// \return util::string_view + explicit operator util::string_view() const { + return util::string_view(reinterpret_cast(data_), size_); + } + + /// \brief Return a pointer to the buffer's data + const uint8_t* data() const { return data_; } + /// \brief Return a writable pointer to the buffer's data + /// + /// The buffer has to be mutable. Otherwise, an assertion may be thrown + /// or a null pointer may be returned. 
+ uint8_t* mutable_data() { +#ifndef NDEBUG + CheckMutable(); +#endif + return mutable_data_; + } + + /// \brief Return the buffer's size in bytes + int64_t size() const { return size_; } + + /// \brief Return the buffer's capacity (number of allocated bytes) + int64_t capacity() const { return capacity_; } + + std::shared_ptr parent() const { return parent_; } + + protected: + bool is_mutable_; + const uint8_t* data_; + uint8_t* mutable_data_; + int64_t size_; + int64_t capacity_; + + // null by default, but may be set + std::shared_ptr parent_; + + void CheckMutable() const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); +}; + +using BufferVector = std::vector>; + +/// \defgroup buffer-slicing-functions Functions for slicing buffers +/// +/// @{ + +/// \brief Construct a view on a buffer at the given offset and length. +/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset, + const int64_t length) { + return std::make_shared(buffer, offset, length); +} + +/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. +/// +/// This function cannot fail and does not check for errors (except in debug builds) +static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, + const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceBuffer(buffer, offset, length); +} + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). +ARROW_EXPORT +std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, + const int64_t offset, const int64_t length); + +/// \brief Like SliceBuffer, but construct a mutable buffer slice. +/// +/// If the parent buffer is not mutable, behavior is undefined (it may abort +/// in debug builds). 
+static inline std::shared_ptr SliceMutableBuffer( + const std::shared_ptr& buffer, const int64_t offset) { + int64_t length = buffer->size() - offset; + return SliceMutableBuffer(buffer, offset, length); +} + +/// @} + +/// \class MutableBuffer +/// \brief A Buffer whose contents can be mutated. May or may not own its data. +class ARROW_EXPORT MutableBuffer : public Buffer { + public: + MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { + mutable_data_ = data; + is_mutable_ = true; + } + + MutableBuffer(const std::shared_ptr& parent, const int64_t offset, + const int64_t size); + + /// \brief Create buffer referencing typed memory with some length + /// \param[in] data the typed memory as C array + /// \param[in] length the number of values in the array + /// \return a new shared_ptr + template + static std::shared_ptr Wrap(T* data, SizeType length) { + return std::make_shared(reinterpret_cast(data), + static_cast(sizeof(T) * length)); + } + + protected: + MutableBuffer() : Buffer(NULLPTR, 0) {} +}; + +/// \class ResizableBuffer +/// \brief A mutable buffer that can be resized +class ARROW_EXPORT ResizableBuffer : public MutableBuffer { + public: + /// Change buffer reported size to indicated size, allocating memory if + /// necessary. This will ensure that the capacity of the buffer is a multiple + /// of 64 bytes as defined in Layout.md. + /// Consider using ZeroPadding afterwards, to conform to the Arrow layout + /// specification. + /// + /// @param new_size The new size for the buffer. + /// @param shrink_to_fit Whether to shrink the capacity if new size < current size + virtual Status Resize(const int64_t new_size, bool shrink_to_fit = true) = 0; + + /// Ensure that buffer has enough memory allocated to fit the indicated + /// capacity (and meets the 64 byte padding requirement in Layout.md). + /// It does not change buffer's reported size and doesn't zero the padding. 
+ virtual Status Reserve(const int64_t new_capacity) = 0; + + template + Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) { + return Resize(sizeof(T) * new_nb_elements, shrink_to_fit); + } + + template + Status TypedReserve(const int64_t new_nb_elements) { + return Reserve(sizeof(T) * new_nb_elements); + } + + protected: + ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} +}; + +/// \defgroup buffer-allocation-functions Functions for allocating buffers +/// +/// @{ + +/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. +/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer (contains padding) +/// +/// \return Status message +ARROW_EXPORT +Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out); + +/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. +/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer (contains padding) +/// +/// \return Status message +ARROW_EXPORT +Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::unique_ptr* out); + +/// \brief Allocate a fixed-size mutable buffer from the default memory pool +/// +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer (contains padding) +/// +/// \return Status message +ARROW_EXPORT +Status AllocateBuffer(const int64_t size, std::shared_ptr* out); + +/// \brief Allocate a fixed-size mutable buffer from the default memory pool +/// +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer (contains padding) +/// +/// \return Status message +ARROW_EXPORT +Status AllocateBuffer(const int64_t size, std::unique_ptr* out); + +/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. 
+/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, + std::shared_ptr* out); + +/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. +/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, + std::unique_ptr* out); + +/// \brief Allocate a resizeable buffer from the default memory pool +/// +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateResizableBuffer(const int64_t size, std::shared_ptr* out); + +/// \brief Allocate a resizeable buffer from the default memory pool +/// +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateResizableBuffer(const int64_t size, std::unique_ptr* out); + +/// \brief Allocate a bitmap buffer from a memory pool +/// no guarantee on values is provided. +/// +/// \param[in] pool memory pool to allocate memory from +/// \param[in] length size in bits of bitmap to allocate +/// \param[out] out the resulting buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out); + +/// \brief Allocate a zero-initialized bitmap buffer from a memory pool +/// +/// \param[in] pool memory pool to allocate memory from +/// \param[in] length size in bits of bitmap to allocate +/// \param[out] out the resulting buffer (zero-initialized). 
+/// +/// \return Status message +ARROW_EXPORT +Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr* out); + +/// \brief Allocate a zero-initialized bitmap buffer from the default memory pool +/// +/// \param[in] length size in bits of bitmap to allocate +/// \param[out] out the resulting buffer +/// +/// \return Status message +ARROW_EXPORT +Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); + +/// \brief Concatenate multiple buffers into a single buffer +/// +/// \param[in] buffers to be concatenated +/// \param[in] pool memory pool to allocate the new buffer from +/// \param[out] out the concatenated buffer +/// +/// \return Status +ARROW_EXPORT +Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool, + std::shared_ptr* out); + +/// @} + +} // namespace arrow + +#endif // ARROW_BUFFER_H diff --git a/r/R/inst/include/arrow/builder.h b/r/R/inst/include/arrow/builder.h new file mode 100644 index 00000000000..56c3e2b3716 --- /dev/null +++ b/r/R/inst/include/arrow/builder.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_binary.h" // IWYU pragma: export +#include "arrow/array/builder_decimal.h" // IWYU pragma: export +#include "arrow/array/builder_dict.h" // IWYU pragma: export +#include "arrow/array/builder_nested.h" // IWYU pragma: export +#include "arrow/array/builder_primitive.h" // IWYU pragma: export +#include "arrow/array/builder_time.h" // IWYU pragma: export +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; +class MemoryPool; + +/// \brief Construct an empty ArrayBuilder corresponding to the data +/// type +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type an instance of DictionaryType +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::unique_ptr* out); + +/// \brief Construct an empty DictionaryBuilder initialized optionally +/// with a pre-existing dictionary +/// \param[in] pool the MemoryPool to use for allocations +/// \param[in] type an instance of DictionaryType +/// \param[in] dictionary the initial dictionary, if any. May be nullptr +/// \param[out] out the created ArrayBuilder +ARROW_EXPORT +Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, + const std::shared_ptr& dictionary, + std::unique_ptr* out); + +} // namespace arrow diff --git a/r/R/inst/include/arrow/compare.h b/r/R/inst/include/arrow/compare.h new file mode 100644 index 00000000000..21da16b79e4 --- /dev/null +++ b/r/R/inst/include/arrow/compare.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for comparing Arrow data structures + +#ifndef ARROW_COMPARE_H +#define ARROW_COMPARE_H + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Tensor; +class SparseTensor; +struct Scalar; + +static constexpr double kDefaultAbsoluteTolerance = 1E-5; + +/// A container of options for equality comparisons +class EqualOptions { + public: + /// Whether or not NaNs are considered equal. + bool nans_equal() const { return nans_equal_; } + + /// Return a new EqualOptions object with the "nans_equal" property changed. + EqualOptions nans_equal(bool v) const { + auto res = EqualOptions(*this); + res.nans_equal_ = v; + return res; + } + + /// The absolute tolerance for approximate comparisons of floating-point values. + double atol() const { return atol_; } + + /// Return a new EqualOptions object with the "atol" property changed. 
+ EqualOptions atol(double v) const { + auto res = EqualOptions(*this); + res.atol_ = v; + return res; + } + + static EqualOptions Defaults() { return EqualOptions(); } + + protected: + double atol_ = kDefaultAbsoluteTolerance; + bool nans_equal_ = false; +}; + +/// Returns true if the arrays are exactly equal +bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, + const EqualOptions& = EqualOptions::Defaults()); + +bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); + +/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal +bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right); + +/// Returns true if the arrays are approximately equal. For non-floating point +/// types, this is equivalent to ArrayEquals(left, right) +bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right, + const EqualOptions& = EqualOptions::Defaults()); + +/// Returns true if indicated equal-length segment of arrays is exactly equal +bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, + int64_t start_idx, int64_t end_idx, + int64_t other_start_idx); + +/// Returns true if the type metadata are exactly equal +/// \param[in] left a DataType +/// \param[in] right a DataType +/// \param[in] check_metadata whether to compare KeyValueMetadata for child +/// fields +bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, + bool check_metadata = true); + +/// Returns true if scalars are equal +/// \param[in] left a Scalar +/// \param[in] right a Scalar +bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right); + +} // namespace arrow + +#endif // ARROW_COMPARE_H diff --git a/r/R/inst/include/arrow/compute/api.h b/r/R/inst/include/arrow/compute/api.h new file mode 100644 index 00000000000..2a2e79f1a4c --- /dev/null +++ b/r/R/inst/include/arrow/compute/api.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one 
+// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_API_H +#define ARROW_COMPUTE_API_H + +#include "arrow/compute/context.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export + +#include "arrow/compute/kernels/boolean.h" // IWYU pragma: export +#include "arrow/compute/kernels/cast.h" // IWYU pragma: export +#include "arrow/compute/kernels/compare.h" // IWYU pragma: export +#include "arrow/compute/kernels/count.h" // IWYU pragma: export +#include "arrow/compute/kernels/hash.h" // IWYU pragma: export +#include "arrow/compute/kernels/mean.h" // IWYU pragma: export +#include "arrow/compute/kernels/sum.h" // IWYU pragma: export +#include "arrow/compute/kernels/take.h" // IWYU pragma: export + +#endif // ARROW_COMPUTE_API_H diff --git a/r/R/inst/include/arrow/compute/benchmark-util.h b/r/R/inst/include/arrow/compute/benchmark-util.h new file mode 100644 index 00000000000..ee9cb9504a3 --- /dev/null +++ b/r/R/inst/include/arrow/compute/benchmark-util.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/testing/gtest_util.h" +#include "arrow/util/cpu-info.h" + +namespace arrow { +namespace compute { + +using internal::CpuInfo; +static CpuInfo* cpu_info = CpuInfo::GetInstance(); + +static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE); +static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::L2_CACHE); +static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE); +static const int64_t kCantFitInL3Size = kL3Size * 4; +static const std::vector kMemorySizes = {kL1Size, kL2Size, kL3Size, + kCantFitInL3Size}; + +template +struct BenchmarkArgsType; + +// Pattern matching that extracts the vector element type of Benchmark::Args() +template +struct BenchmarkArgsType&)> { + using type = Values; +}; + +// Benchmark changed its parameter type between releases from +// int to int64_t. As it doesn't have version macros, we need +// to apply C++ template magic. 
+using ArgsType = + typename BenchmarkArgsType::type; + +void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench, + const std::vector& sizes = kMemorySizes) { + bench->Unit(benchmark::kMicrosecond); + + for (auto size : sizes) + for (auto nulls : std::vector({0, 1, 10, 50})) + bench->Args({static_cast(size), nulls}); +} + +void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) { + BenchmarkSetArgsWithSizes(bench, kMemorySizes); +} + +void RegressionSetArgs(benchmark::internal::Benchmark* bench) { + // Regression do not need to account for cache hierarchy, thus optimize for + // the best case. + BenchmarkSetArgsWithSizes(bench, {kL1Size}); +} + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/context.h b/r/R/inst/include/arrow/compute/context.h new file mode 100644 index 00000000000..8ac4700b91f --- /dev/null +++ b/r/R/inst/include/arrow/compute/context.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_COMPUTE_CONTEXT_H +#define ARROW_COMPUTE_CONTEXT_H + +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; + +namespace internal { +class CpuInfo; +} // namespace internal + +namespace compute { + +#define RETURN_IF_ERROR(ctx) \ + if (ARROW_PREDICT_FALSE(ctx->HasError())) { \ + Status s = ctx->status(); \ + ctx->ResetStatus(); \ + return s; \ + } + +/// \brief Container for variables and options used by function evaluation +class ARROW_EXPORT FunctionContext { + public: + explicit FunctionContext(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + MemoryPool* memory_pool() const; + + /// \brief Allocate buffer from the context's memory pool + Status Allocate(const int64_t nbytes, std::shared_ptr* out); + + /// \brief Indicate that an error has occurred, to be checked by a parent caller + /// \param[in] status a Status instance + /// + /// \note Will not overwrite a prior set Status, so we will have the first + /// error that occurred until FunctionContext::ResetStatus is called + void SetStatus(const Status& status); + + /// \brief Clear any error status + void ResetStatus(); + + /// \brief Return true if an error has occurred + bool HasError() const { return !status_.ok(); } + + /// \brief Return the current status of the context + const Status& status() const { return status_; } + + internal::CpuInfo* cpu_info() const { return cpu_info_; } + + private: + Status status_; + MemoryPool* pool_; + internal::CpuInfo* cpu_info_; +}; + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_CONTEXT_H diff --git a/r/R/inst/include/arrow/compute/expression.h b/r/R/inst/include/arrow/compute/expression.h new file mode 100644 index 00000000000..cc558141546 --- /dev/null +++ b/r/R/inst/include/arrow/compute/expression.h @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class LogicalType; +class ExprVisitor; +class Operation; + +/// \brief Base class for all analytic expressions. Expressions may represent +/// data values (scalars, arrays, tables) +class ARROW_EXPORT Expr { + public: + /// \brief Instantiate expression from an abstract operation + /// \param[in] op the operation that generates the expression + explicit Expr(ConstOpPtr op); + + virtual ~Expr() = default; + + /// \brief A unique string identifier for the kind of expression + virtual std::string kind() const = 0; + + /// \brief Accept expression visitor + /// TODO(wesm) + // virtual Status Accept(ExprVisitor* visitor) const = 0; + + /// \brief The underlying operation + ConstOpPtr op() const { return op_; } + + protected: + ConstOpPtr op_; +}; + +/// The value cardinality: one or many. These correspond to the arrow::Scalar +/// and arrow::Array types +enum class ValueRank { SCALAR, ARRAY }; + +/// \brief Base class for a data-generated expression with a fixed and known +/// type. 
This includes arrays and scalars +class ARROW_EXPORT ValueExpr : public Expr { + public: + /// \brief The name of the expression, if any. The default is unnamed + // virtual const ExprName& name() const; + LogicalTypePtr type() const; + + /// \brief The value cardinality (scalar or array) of the expression + virtual ValueRank rank() const = 0; + + protected: + ValueExpr(ConstOpPtr op, LogicalTypePtr type); + + /// \brief The semantic data type of the expression + LogicalTypePtr type_; +}; + +class ARROW_EXPORT ArrayExpr : public ValueExpr { + protected: + using ValueExpr::ValueExpr; + std::string kind() const override; + ValueRank rank() const override; +}; + +class ARROW_EXPORT ScalarExpr : public ValueExpr { + protected: + using ValueExpr::ValueExpr; + std::string kind() const override; + ValueRank rank() const override; +}; + +namespace value { + +// These are mixin classes to provide a type hierarchy for values identify +class ValueMixin {}; +class Null : public ValueMixin {}; +class Bool : public ValueMixin {}; +class Number : public ValueMixin {}; +class Integer : public Number {}; +class SignedInteger : public Integer {}; +class Int8 : public SignedInteger {}; +class Int16 : public SignedInteger {}; +class Int32 : public SignedInteger {}; +class Int64 : public SignedInteger {}; +class UnsignedInteger : public Integer {}; +class UInt8 : public UnsignedInteger {}; +class UInt16 : public UnsignedInteger {}; +class UInt32 : public UnsignedInteger {}; +class UInt64 : public UnsignedInteger {}; +class Floating : public Number {}; +class Float16 : public Floating {}; +class Float32 : public Floating {}; +class Float64 : public Floating {}; +class Binary : public ValueMixin {}; +class Utf8 : public Binary {}; +class List : public ValueMixin {}; +class Struct : public ValueMixin {}; + +} // namespace value + +#define SIMPLE_EXPR_FACTORY(NAME) ARROW_EXPORT ExprPtr NAME(ConstOpPtr op); + +namespace scalar { + +#define DECLARE_SCALAR_EXPR(TYPE) \ + class ARROW_EXPORT 
TYPE : public ScalarExpr, public value::TYPE { \ + public: \ + explicit TYPE(ConstOpPtr op); \ + using ScalarExpr::kind; \ + }; + +DECLARE_SCALAR_EXPR(Null) +DECLARE_SCALAR_EXPR(Bool) +DECLARE_SCALAR_EXPR(Int8) +DECLARE_SCALAR_EXPR(Int16) +DECLARE_SCALAR_EXPR(Int32) +DECLARE_SCALAR_EXPR(Int64) +DECLARE_SCALAR_EXPR(UInt8) +DECLARE_SCALAR_EXPR(UInt16) +DECLARE_SCALAR_EXPR(UInt32) +DECLARE_SCALAR_EXPR(UInt64) +DECLARE_SCALAR_EXPR(Float16) +DECLARE_SCALAR_EXPR(Float32) +DECLARE_SCALAR_EXPR(Float64) +DECLARE_SCALAR_EXPR(Binary) +DECLARE_SCALAR_EXPR(Utf8) + +#undef DECLARE_SCALAR_EXPR + +SIMPLE_EXPR_FACTORY(null); +SIMPLE_EXPR_FACTORY(boolean); +SIMPLE_EXPR_FACTORY(int8); +SIMPLE_EXPR_FACTORY(int16); +SIMPLE_EXPR_FACTORY(int32); +SIMPLE_EXPR_FACTORY(int64); +SIMPLE_EXPR_FACTORY(uint8); +SIMPLE_EXPR_FACTORY(uint16); +SIMPLE_EXPR_FACTORY(uint32); +SIMPLE_EXPR_FACTORY(uint64); +SIMPLE_EXPR_FACTORY(float16); +SIMPLE_EXPR_FACTORY(float32); +SIMPLE_EXPR_FACTORY(float64); +SIMPLE_EXPR_FACTORY(binary); +SIMPLE_EXPR_FACTORY(utf8); + +class ARROW_EXPORT List : public ScalarExpr, public value::List { + public: + List(ConstOpPtr op, LogicalTypePtr type); + using ScalarExpr::kind; +}; + +class ARROW_EXPORT Struct : public ScalarExpr, public value::Struct { + public: + Struct(ConstOpPtr op, LogicalTypePtr type); + using ScalarExpr::kind; +}; + +} // namespace scalar + +namespace array { + +#define DECLARE_ARRAY_EXPR(TYPE) \ + class ARROW_EXPORT TYPE : public ArrayExpr, public value::TYPE { \ + public: \ + explicit TYPE(ConstOpPtr op); \ + using ArrayExpr::kind; \ + }; + +DECLARE_ARRAY_EXPR(Null) +DECLARE_ARRAY_EXPR(Bool) +DECLARE_ARRAY_EXPR(Int8) +DECLARE_ARRAY_EXPR(Int16) +DECLARE_ARRAY_EXPR(Int32) +DECLARE_ARRAY_EXPR(Int64) +DECLARE_ARRAY_EXPR(UInt8) +DECLARE_ARRAY_EXPR(UInt16) +DECLARE_ARRAY_EXPR(UInt32) +DECLARE_ARRAY_EXPR(UInt64) +DECLARE_ARRAY_EXPR(Float16) +DECLARE_ARRAY_EXPR(Float32) +DECLARE_ARRAY_EXPR(Float64) +DECLARE_ARRAY_EXPR(Binary) +DECLARE_ARRAY_EXPR(Utf8) + +#undef 
DECLARE_ARRAY_EXPR + +SIMPLE_EXPR_FACTORY(null); +SIMPLE_EXPR_FACTORY(boolean); +SIMPLE_EXPR_FACTORY(int8); +SIMPLE_EXPR_FACTORY(int16); +SIMPLE_EXPR_FACTORY(int32); +SIMPLE_EXPR_FACTORY(int64); +SIMPLE_EXPR_FACTORY(uint8); +SIMPLE_EXPR_FACTORY(uint16); +SIMPLE_EXPR_FACTORY(uint32); +SIMPLE_EXPR_FACTORY(uint64); +SIMPLE_EXPR_FACTORY(float16); +SIMPLE_EXPR_FACTORY(float32); +SIMPLE_EXPR_FACTORY(float64); +SIMPLE_EXPR_FACTORY(binary); +SIMPLE_EXPR_FACTORY(utf8); + +class ARROW_EXPORT List : public ArrayExpr, public value::List { + public: + List(ConstOpPtr op, LogicalTypePtr type); + using ArrayExpr::kind; +}; + +class ARROW_EXPORT Struct : public ArrayExpr, public value::Struct { + public: + Struct(ConstOpPtr op, LogicalTypePtr type); + using ArrayExpr::kind; +}; + +} // namespace array + +#undef SIMPLE_EXPR_FACTORY + +template +inline bool InheritsFrom(const ObjectType* obj) { + return dynamic_cast(obj) != NULLPTR; +} + +template +inline bool InheritsFrom(const ObjectType& obj) { + return dynamic_cast(&obj) != NULLPTR; +} + +/// \brief Construct a ScalarExpr containing an Operation given a logical type +ARROW_EXPORT +Status GetScalarExpr(ConstOpPtr op, LogicalTypePtr ty, ExprPtr* out); + +/// \brief Construct an ArrayExpr containing an Operation given a logical type +ARROW_EXPORT +Status GetArrayExpr(ConstOpPtr op, LogicalTypePtr ty, ExprPtr* out); + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernel.h b/r/R/inst/include/arrow/compute/kernel.h new file mode 100644 index 00000000000..aba659ebdd3 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernel.h @@ -0,0 +1,271 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNEL_H +#define ARROW_COMPUTE_KERNEL_H + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/scalar.h" +#include "arrow/table.h" +#include "arrow/util/macros.h" +#include "arrow/util/memory.h" +#include "arrow/util/variant.h" // IWYU pragma: export +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class FunctionContext; + +/// \class OpKernel +/// \brief Base class for operator kernels +/// +/// Note to implementors: +/// Operator kernels are intended to be the lowest level of an analytics/compute +/// engine. They will generally not be exposed directly to end-users. Instead +/// they will be wrapped by higher level constructs (e.g. top-level functions +/// or physical execution plan nodes). These higher level constructs are +/// responsible for user input validation and returning the appropriate +/// error Status. +/// +/// Due to this design, implementations of Call (the execution +/// method on subclasses) should use assertions (i.e. DCHECK) to double-check +/// parameter arguments when in higher level components returning an +/// InvalidArgument error might be more appropriate. 
+/// +class ARROW_EXPORT OpKernel { + public: + virtual ~OpKernel() = default; + /// \brief EXPERIMENTAL The output data type of the kernel + /// \return the output type + virtual std::shared_ptr out_type() const = 0; +}; + +struct Datum; +static inline bool CollectionEquals(const std::vector& left, + const std::vector& right); + +/// \class Datum +/// \brief Variant type for various Arrow C++ data structures +struct ARROW_EXPORT Datum { + enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION }; + + util::variant, std::shared_ptr, + std::shared_ptr, std::shared_ptr, + std::shared_ptr
, std::vector> + value; + + /// \brief Empty datum, to be populated elsewhere + Datum() : value(NULLPTR) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(value ? value->data() : NULLPTR) {} + + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr
& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::vector& value) // NOLINT implicit conversion + : value(value) {} + + // Cast from subtypes of Array to Datum + template ::value>::type> + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(std::shared_ptr(value)) {} + + // Convenience constructors + explicit Datum(bool value) : value(std::make_shared(value)) {} + explicit Datum(int8_t value) : value(std::make_shared(value)) {} + explicit Datum(uint8_t value) : value(std::make_shared(value)) {} + explicit Datum(int16_t value) : value(std::make_shared(value)) {} + explicit Datum(uint16_t value) : value(std::make_shared(value)) {} + explicit Datum(int32_t value) : value(std::make_shared(value)) {} + explicit Datum(uint32_t value) : value(std::make_shared(value)) {} + explicit Datum(int64_t value) : value(std::make_shared(value)) {} + explicit Datum(uint64_t value) : value(std::make_shared(value)) {} + explicit Datum(float value) : value(std::make_shared(value)) {} + explicit Datum(double value) : value(std::make_shared(value)) {} + + ~Datum() {} + + Datum(const Datum& other) noexcept { this->value = other.value; } + + Datum& operator=(const Datum& other) noexcept { + value = other.value; + return *this; + } + + // Define move constructor and move assignment, for better performance + Datum(Datum&& other) noexcept : value(std::move(other.value)) {} + + Datum& operator=(Datum&& other) noexcept { + value = std::move(other.value); + return *this; + } + + Datum::type kind() const { + switch (this->value.index()) { + case 0: + return Datum::NONE; + case 1: + return Datum::SCALAR; + case 2: + return Datum::ARRAY; + case 3: + return Datum::CHUNKED_ARRAY; + case 4: + return Datum::RECORD_BATCH; + case 5: + return Datum::TABLE; + case 6: + return Datum::COLLECTION; + default: + return Datum::NONE; + } + } + + std::shared_ptr array() const { + return util::get>(this->value); + } + + std::shared_ptr make_array() const { + return 
MakeArray(util::get>(this->value)); + } + + std::shared_ptr chunked_array() const { + return util::get>(this->value); + } + + std::shared_ptr record_batch() const { + return util::get>(this->value); + } + + std::shared_ptr
table() const { + return util::get>(this->value); + } + + const std::vector collection() const { + return util::get>(this->value); + } + + std::shared_ptr scalar() const { + return util::get>(this->value); + } + + bool is_array() const { return this->kind() == Datum::ARRAY; } + + bool is_arraylike() const { + return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY; + } + + bool is_scalar() const { return this->kind() == Datum::SCALAR; } + + /// \brief The value type of the variant, if any + /// + /// \return nullptr if no type + std::shared_ptr type() const { + if (this->kind() == Datum::ARRAY) { + return util::get>(this->value)->type; + } else if (this->kind() == Datum::CHUNKED_ARRAY) { + return util::get>(this->value)->type(); + } else if (this->kind() == Datum::SCALAR) { + return util::get>(this->value)->type; + } + return NULLPTR; + } + + bool Equals(const Datum& other) const { + if (this->kind() != other.kind()) return false; + + switch (this->kind()) { + case Datum::NONE: + return true; + case Datum::SCALAR: + return internal::SharedPtrEquals(this->scalar(), other.scalar()); + case Datum::ARRAY: + return internal::SharedPtrEquals(this->make_array(), other.make_array()); + case Datum::CHUNKED_ARRAY: + return internal::SharedPtrEquals(this->chunked_array(), other.chunked_array()); + case Datum::RECORD_BATCH: + return internal::SharedPtrEquals(this->record_batch(), other.record_batch()); + case Datum::TABLE: + return internal::SharedPtrEquals(this->table(), other.table()); + case Datum::COLLECTION: + return CollectionEquals(this->collection(), other.collection()); + default: + return false; + } + } +}; + +/// \class UnaryKernel +/// \brief An array-valued function of a single input argument. +/// +/// Note to implementors: Try to avoid making kernels that allocate memory if +/// the output size is a deterministic function of the Input Datum's metadata. 
+/// Instead separate the logic of the kernel and allocations necessary into +/// two different kernels. Some reusable kernels that allocate buffers +/// and delegate computation to another kernel are available in util-internal.h. +class ARROW_EXPORT UnaryKernel : public OpKernel { + public: + /// \brief Executes the kernel. + /// + /// \param[in] ctx The function context for the kernel + /// \param[in] input The kernel input data + /// \param[out] out The output of the function. Each implementation of this + /// function might assume different things about the existing contents of out + /// (e.g. which buffers are preallocated). In the future it is expected that + /// there will be a more generic mechansim for understanding the necessary + /// contracts. + virtual Status Call(FunctionContext* ctx, const Datum& input, Datum* out) = 0; +}; + +/// \class BinaryKernel +/// \brief An array-valued function of a two input arguments +class ARROW_EXPORT BinaryKernel : public OpKernel { + public: + virtual Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, + Datum* out) = 0; +}; + +static inline bool CollectionEquals(const std::vector& left, + const std::vector& right) { + if (left.size() != right.size()) return false; + + for (size_t i = 0; i < left.size(); i++) + if (!left[i].Equals(right[i])) return false; + + return true; +} + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNEL_H diff --git a/r/R/inst/include/arrow/compute/kernels/aggregate.h b/r/R/inst/include/arrow/compute/kernels/aggregate.h new file mode 100644 index 00000000000..2fe82636f81 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/aggregate.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/kernel.h" + +namespace arrow { + +class Array; +class Status; + +namespace compute { + +class FunctionContext; +struct Datum; + +/// AggregateFunction is an interface for Aggregates +/// +/// An aggregates transforms an array into single result called a state via the +/// Consume method.. State supports the merge operation via the Merge method. +/// State can be sealed into a final result via the Finalize method. +// +/// State ownership is handled by callers, thus the interface exposes 3 methods +/// for the caller to manage memory: +/// - Size +/// - New (placement new constructor invocation) +/// - Delete (state desctructor) +/// +/// Design inspired by ClickHouse aggregate functions. +class AggregateFunction { + public: + /// \brief Consume an array into a state. + virtual Status Consume(const Array& input, void* state) const = 0; + + /// \brief Merge states. + virtual Status Merge(const void* src, void* dst) const = 0; + + /// \brief Convert state into a final result. + virtual Status Finalize(const void* src, Datum* output) const = 0; + + virtual ~AggregateFunction() {} + + virtual std::shared_ptr out_type() const = 0; + + /// State management methods. 
+ virtual int64_t Size() const = 0; + virtual void New(void* ptr) const = 0; + virtual void Delete(void* ptr) const = 0; +}; + +/// AggregateFunction partial implementation for static type state +template +class AggregateFunctionStaticState : public AggregateFunction { + virtual Status Consume(const Array& input, State* state) const = 0; + virtual Status Merge(const State& src, State* dst) const = 0; + virtual Status Finalize(const State& src, Datum* output) const = 0; + + Status Consume(const Array& input, void* state) const final { + return Consume(input, static_cast(state)); + } + + Status Merge(const void* src, void* dst) const final { + return Merge(*static_cast(src), static_cast(dst)); + } + + /// \brief Convert state into a final result. + Status Finalize(const void* src, Datum* output) const final { + return Finalize(*static_cast(src), output); + } + + int64_t Size() const final { return sizeof(State); } + + void New(void* ptr) const final { + // By using placement-new syntax, the constructor of the State is invoked + // in the memory location defined by the caller. This only supports State + // with a parameter-less constructor. 
+ new (ptr) State; + } + + void Delete(void* ptr) const final { static_cast(ptr)->~State(); } +}; + +/// \brief UnaryKernel implemented by an AggregateState +class ARROW_EXPORT AggregateUnaryKernel : public UnaryKernel { + public: + explicit AggregateUnaryKernel(std::shared_ptr& aggregate) + : aggregate_function_(aggregate) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; + + std::shared_ptr out_type() const override; + + private: + std::shared_ptr aggregate_function_; +}; + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/boolean.h b/r/R/inst/include/arrow/compute/kernels/boolean.h new file mode 100644 index 00000000000..fb88659dbc4 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/boolean.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_COMPUTE_KERNELS_BOOLEAN_H +#define ARROW_COMPUTE_KERNELS_BOOLEAN_H + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +struct Datum; +class FunctionContext; + +/// \brief Invert the values of a boolean datum +/// \param[in] context the FunctionContext +/// \param[in] value datum to invert +/// \param[out] out resulting datum +/// +/// \since 0.11.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Invert(FunctionContext* context, const Datum& value, Datum* out); + +/// \brief Element-wise AND of two boolean datums +/// \param[in] context the FunctionContext +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[out] out resulting datum +/// +/// \since 0.11.0 +/// \note API not yet finalized +ARROW_EXPORT +Status And(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); + +/// \brief Element-wise OR of two boolean datums +/// \param[in] context the FunctionContext +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[out] out resulting datum +/// +/// \since 0.11.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Or(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); + +/// \brief Element-wise XOR of two boolean datums +/// \param[in] context the FunctionContext +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[out] out resulting datum +/// +/// \since 0.11.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Xor(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_CAST_H diff --git a/r/R/inst/include/arrow/compute/kernels/cast.h b/r/R/inst/include/arrow/compute/kernels/cast.h new file mode 100644 index 00000000000..5a7c5be93bd --- /dev/null +++ 
b/r/R/inst/include/arrow/compute/kernels/cast.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_CAST_H +#define ARROW_COMPUTE_KERNELS_CAST_H + +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace compute { + +struct Datum; +class FunctionContext; +class UnaryKernel; + +struct ARROW_EXPORT CastOptions { + CastOptions() + : allow_int_overflow(false), + allow_time_truncate(false), + allow_float_truncate(false), + allow_invalid_utf8(false) {} + + explicit CastOptions(bool safe) + : allow_int_overflow(!safe), + allow_time_truncate(!safe), + allow_float_truncate(!safe), + allow_invalid_utf8(!safe) {} + + static CastOptions Safe() { return CastOptions(true); } + + static CastOptions Unsafe() { return CastOptions(false); } + + bool allow_int_overflow; + bool allow_time_truncate; + bool allow_float_truncate; + // Indicate if conversions from Binary/FixedSizeBinary to string must + // validate the utf8 payload. 
+ bool allow_invalid_utf8; +}; + +/// \since 0.7.0 +/// \note API not yet finalized +ARROW_EXPORT +Status GetCastFunction(const DataType& in_type, std::shared_ptr to_type, + const CastOptions& options, std::unique_ptr* kernel); + +/// \brief Cast from one array type to another +/// \param[in] context the FunctionContext +/// \param[in] value array to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[out] out resulting array +/// +/// \since 0.7.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Cast(FunctionContext* context, const Array& value, + std::shared_ptr to_type, const CastOptions& options, + std::shared_ptr* out); + +/// \brief Cast from one value to another +/// \param[in] context the FunctionContext +/// \param[in] value datum to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[out] out resulting datum +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Cast(FunctionContext* context, const Datum& value, + std::shared_ptr to_type, const CastOptions& options, Datum* out); + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_CAST_H diff --git a/r/R/inst/include/arrow/compute/kernels/compare.h b/r/R/inst/include/arrow/compute/kernels/compare.h new file mode 100644 index 00000000000..a1924512916 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/compare.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +struct Scalar; +class Status; + +namespace compute { + +struct Datum; +class FilterFunction; +class FunctionContext; + +enum CompareOperator { + EQUAL, + NOT_EQUAL, + GREATER, + GREATER_EQUAL, + LESS, + LESS_EQUAL, +}; + +template +struct Comparator; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs == rhs; } +}; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs != rhs; } +}; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs > rhs; } +}; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs >= rhs; } +}; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs < rhs; } +}; + +template +struct Comparator { + constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs <= rhs; } +}; + +struct CompareOptions { + explicit CompareOptions(CompareOperator op) : op(op) {} + + enum CompareOperator op; +}; + +/// \brief Return a Compare FilterFunction +/// +/// \param[in] context FunctionContext passing context information +/// \param[in] type required to specialize the kernel +/// \param[in] options required to specify the compare operator +/// +/// \since 0.14.0 +/// \note API not yet finalized +ARROW_EXPORT +std::shared_ptr 
MakeCompareFilterFunction(FunctionContext* context, + const DataType& type, + struct CompareOptions options); + +/// \brief Compare a numeric array with a scalar. +/// +/// \param[in] context the FunctionContext +/// \param[in] left datum to compare, must be an Array +/// \param[in] right datum to compare, must be a Scalar of the same type than +/// left Datum. +/// \param[in] options compare options +/// \param[out] out resulting datum +/// +/// Note on floating point arrays, this uses ieee-754 compare semantics. +/// +/// \since 0.14.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Compare(FunctionContext* context, const Datum& left, const Datum& right, + struct CompareOptions options, Datum* out); + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/count.h b/r/R/inst/include/arrow/compute/kernels/count.h new file mode 100644 index 00000000000..c33ac48665a --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/count.h @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace compute { + +struct Datum; +class FunctionContext; +class AggregateFunction; + +/// \class CountOptions +/// +/// The user control the Count kernel behavior with this class. By default, the +/// it will count all non-null values. +struct ARROW_EXPORT CountOptions { + enum mode { + // Count all non-null values. + COUNT_ALL = 0, + // Count all null values. + COUNT_NULL, + }; + + explicit CountOptions(enum mode count_mode) : count_mode(count_mode) {} + + enum mode count_mode = COUNT_ALL; +}; + +/// \brief Return Count function aggregate +ARROW_EXPORT +std::shared_ptr MakeCount(FunctionContext* context, + const CountOptions& options); + +/// \brief Count non-null (or null) values in an array. +/// +/// \param[in] context the FunctionContext +/// \param[in] options counting options, see CountOptions for more information +/// \param[in] datum to count +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Count(FunctionContext* context, const CountOptions& options, const Datum& datum, + Datum* out); + +/// \brief Count non-null (or null) values in an array. 
+/// +/// \param[in] context the FunctionContext +/// \param[in] options counting options, see CountOptions for more information +/// \param[in] array to count +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Count(FunctionContext* context, const CountOptions& options, const Array& array, + Datum* out); + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/filter.h b/r/R/inst/include/arrow/compute/kernels/filter.h new file mode 100644 index 00000000000..becd2d5a11a --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/filter.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/kernel.h" + +namespace arrow { + +class Array; +struct Scalar; +class Status; + +namespace compute { + +class FunctionContext; +struct Datum; + +/// FilterFunction is an interface for Filters +/// +/// Filters takes an array and emits a selection vector. The selection vector +/// is given in the form of a bitmask as a BooleanArray result. +class ARROW_EXPORT FilterFunction { + public: + /// Filter an array with a scalar argument. 
+ virtual Status Filter(const ArrayData& input, const Scalar& scalar, + ArrayData* output) const = 0; + + /// By default, FilterFunction emits a result bitmap. + virtual std::shared_ptr out_type() const { return boolean(); } + + virtual ~FilterFunction() {} +}; + +/// \brief BinaryKernel bound to a filter function +class ARROW_EXPORT FilterBinaryKernel : public BinaryKernel { + public: + explicit FilterBinaryKernel(std::shared_ptr& filter) + : filter_function_(filter) {} + + Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, + Datum* out) override; + + std::shared_ptr out_type() const override; + + private: + std::shared_ptr filter_function_; +}; + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h b/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h new file mode 100644 index 00000000000..77334af36b5 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// THIS FILE IS AUTOMATICALLY GENERATED, DO NOT EDIT +// Generated by codegen.py script +#define BOOLEAN_CASES(TEMPLATE) \ + TEMPLATE(BooleanType, UInt8Type) \ + TEMPLATE(BooleanType, Int8Type) \ + TEMPLATE(BooleanType, UInt16Type) \ + TEMPLATE(BooleanType, Int16Type) \ + TEMPLATE(BooleanType, UInt32Type) \ + TEMPLATE(BooleanType, Int32Type) \ + TEMPLATE(BooleanType, UInt64Type) \ + TEMPLATE(BooleanType, Int64Type) \ + TEMPLATE(BooleanType, FloatType) \ + TEMPLATE(BooleanType, DoubleType) + +#define UINT8_CASES(TEMPLATE) \ + TEMPLATE(UInt8Type, BooleanType) \ + TEMPLATE(UInt8Type, Int8Type) \ + TEMPLATE(UInt8Type, UInt16Type) \ + TEMPLATE(UInt8Type, Int16Type) \ + TEMPLATE(UInt8Type, UInt32Type) \ + TEMPLATE(UInt8Type, Int32Type) \ + TEMPLATE(UInt8Type, UInt64Type) \ + TEMPLATE(UInt8Type, Int64Type) \ + TEMPLATE(UInt8Type, FloatType) \ + TEMPLATE(UInt8Type, DoubleType) + +#define INT8_CASES(TEMPLATE) \ + TEMPLATE(Int8Type, BooleanType) \ + TEMPLATE(Int8Type, UInt8Type) \ + TEMPLATE(Int8Type, UInt16Type) \ + TEMPLATE(Int8Type, Int16Type) \ + TEMPLATE(Int8Type, UInt32Type) \ + TEMPLATE(Int8Type, Int32Type) \ + TEMPLATE(Int8Type, UInt64Type) \ + TEMPLATE(Int8Type, Int64Type) \ + TEMPLATE(Int8Type, FloatType) \ + TEMPLATE(Int8Type, DoubleType) + +#define UINT16_CASES(TEMPLATE) \ + TEMPLATE(UInt16Type, BooleanType) \ + TEMPLATE(UInt16Type, UInt8Type) \ + TEMPLATE(UInt16Type, Int8Type) \ + TEMPLATE(UInt16Type, Int16Type) \ + TEMPLATE(UInt16Type, UInt32Type) \ + TEMPLATE(UInt16Type, Int32Type) \ + TEMPLATE(UInt16Type, UInt64Type) \ + TEMPLATE(UInt16Type, Int64Type) \ + TEMPLATE(UInt16Type, FloatType) \ + TEMPLATE(UInt16Type, DoubleType) + +#define INT16_CASES(TEMPLATE) \ + TEMPLATE(Int16Type, BooleanType) \ + TEMPLATE(Int16Type, UInt8Type) \ + TEMPLATE(Int16Type, Int8Type) \ + TEMPLATE(Int16Type, UInt16Type) \ + TEMPLATE(Int16Type, UInt32Type) \ + TEMPLATE(Int16Type, Int32Type) \ + TEMPLATE(Int16Type, UInt64Type) \ + TEMPLATE(Int16Type, Int64Type) \ + 
TEMPLATE(Int16Type, FloatType) \ + TEMPLATE(Int16Type, DoubleType) + +#define UINT32_CASES(TEMPLATE) \ + TEMPLATE(UInt32Type, BooleanType) \ + TEMPLATE(UInt32Type, UInt8Type) \ + TEMPLATE(UInt32Type, Int8Type) \ + TEMPLATE(UInt32Type, UInt16Type) \ + TEMPLATE(UInt32Type, Int16Type) \ + TEMPLATE(UInt32Type, Int32Type) \ + TEMPLATE(UInt32Type, UInt64Type) \ + TEMPLATE(UInt32Type, Int64Type) \ + TEMPLATE(UInt32Type, FloatType) \ + TEMPLATE(UInt32Type, DoubleType) + +#define UINT64_CASES(TEMPLATE) \ + TEMPLATE(UInt64Type, BooleanType) \ + TEMPLATE(UInt64Type, UInt8Type) \ + TEMPLATE(UInt64Type, Int8Type) \ + TEMPLATE(UInt64Type, UInt16Type) \ + TEMPLATE(UInt64Type, Int16Type) \ + TEMPLATE(UInt64Type, UInt32Type) \ + TEMPLATE(UInt64Type, Int32Type) \ + TEMPLATE(UInt64Type, Int64Type) \ + TEMPLATE(UInt64Type, FloatType) \ + TEMPLATE(UInt64Type, DoubleType) + +#define INT32_CASES(TEMPLATE) \ + TEMPLATE(Int32Type, BooleanType) \ + TEMPLATE(Int32Type, UInt8Type) \ + TEMPLATE(Int32Type, Int8Type) \ + TEMPLATE(Int32Type, UInt16Type) \ + TEMPLATE(Int32Type, Int16Type) \ + TEMPLATE(Int32Type, UInt32Type) \ + TEMPLATE(Int32Type, UInt64Type) \ + TEMPLATE(Int32Type, Int64Type) \ + TEMPLATE(Int32Type, FloatType) \ + TEMPLATE(Int32Type, DoubleType) + +#define INT64_CASES(TEMPLATE) \ + TEMPLATE(Int64Type, BooleanType) \ + TEMPLATE(Int64Type, UInt8Type) \ + TEMPLATE(Int64Type, Int8Type) \ + TEMPLATE(Int64Type, UInt16Type) \ + TEMPLATE(Int64Type, Int16Type) \ + TEMPLATE(Int64Type, UInt32Type) \ + TEMPLATE(Int64Type, Int32Type) \ + TEMPLATE(Int64Type, UInt64Type) \ + TEMPLATE(Int64Type, FloatType) \ + TEMPLATE(Int64Type, DoubleType) + +#define FLOAT_CASES(TEMPLATE) \ + TEMPLATE(FloatType, BooleanType) \ + TEMPLATE(FloatType, UInt8Type) \ + TEMPLATE(FloatType, Int8Type) \ + TEMPLATE(FloatType, UInt16Type) \ + TEMPLATE(FloatType, Int16Type) \ + TEMPLATE(FloatType, UInt32Type) \ + TEMPLATE(FloatType, Int32Type) \ + TEMPLATE(FloatType, UInt64Type) \ + TEMPLATE(FloatType, Int64Type) \ + 
TEMPLATE(FloatType, DoubleType) + +#define DOUBLE_CASES(TEMPLATE) \ + TEMPLATE(DoubleType, BooleanType) \ + TEMPLATE(DoubleType, UInt8Type) \ + TEMPLATE(DoubleType, Int8Type) \ + TEMPLATE(DoubleType, UInt16Type) \ + TEMPLATE(DoubleType, Int16Type) \ + TEMPLATE(DoubleType, UInt32Type) \ + TEMPLATE(DoubleType, Int32Type) \ + TEMPLATE(DoubleType, UInt64Type) \ + TEMPLATE(DoubleType, Int64Type) \ + TEMPLATE(DoubleType, FloatType) + +#define DATE32_CASES(TEMPLATE) \ + TEMPLATE(Date32Type, Date64Type) + +#define DATE64_CASES(TEMPLATE) \ + TEMPLATE(Date64Type, Date32Type) + +#define TIME32_CASES(TEMPLATE) \ + TEMPLATE(Time32Type, Time32Type) \ + TEMPLATE(Time32Type, Time64Type) + +#define TIME64_CASES(TEMPLATE) \ + TEMPLATE(Time64Type, Time32Type) \ + TEMPLATE(Time64Type, Time64Type) + +#define TIMESTAMP_CASES(TEMPLATE) \ + TEMPLATE(TimestampType, Date32Type) \ + TEMPLATE(TimestampType, Date64Type) \ + TEMPLATE(TimestampType, TimestampType) + +#define BINARY_CASES(TEMPLATE) \ + TEMPLATE(BinaryType, StringType) + +#define STRING_CASES(TEMPLATE) \ + TEMPLATE(StringType, BooleanType) \ + TEMPLATE(StringType, UInt8Type) \ + TEMPLATE(StringType, Int8Type) \ + TEMPLATE(StringType, UInt16Type) \ + TEMPLATE(StringType, Int16Type) \ + TEMPLATE(StringType, UInt32Type) \ + TEMPLATE(StringType, Int32Type) \ + TEMPLATE(StringType, UInt64Type) \ + TEMPLATE(StringType, Int64Type) \ + TEMPLATE(StringType, FloatType) \ + TEMPLATE(StringType, DoubleType) \ + TEMPLATE(StringType, TimestampType) + +#define DICTIONARY_CASES(TEMPLATE) \ + TEMPLATE(DictionaryType, UInt8Type) \ + TEMPLATE(DictionaryType, Int8Type) \ + TEMPLATE(DictionaryType, UInt16Type) \ + TEMPLATE(DictionaryType, Int16Type) \ + TEMPLATE(DictionaryType, UInt32Type) \ + TEMPLATE(DictionaryType, Int32Type) \ + TEMPLATE(DictionaryType, UInt64Type) \ + TEMPLATE(DictionaryType, Int64Type) \ + TEMPLATE(DictionaryType, FloatType) \ + TEMPLATE(DictionaryType, DoubleType) \ + TEMPLATE(DictionaryType, Date32Type) \ + 
TEMPLATE(DictionaryType, Date64Type) \ + TEMPLATE(DictionaryType, Time32Type) \ + TEMPLATE(DictionaryType, Time64Type) \ + TEMPLATE(DictionaryType, TimestampType) \ + TEMPLATE(DictionaryType, NullType) \ + TEMPLATE(DictionaryType, BinaryType) \ + TEMPLATE(DictionaryType, FixedSizeBinaryType) \ + TEMPLATE(DictionaryType, StringType) \ + TEMPLATE(DictionaryType, Decimal128Type) diff --git a/r/R/inst/include/arrow/compute/kernels/hash.h b/r/R/inst/include/arrow/compute/kernels/hash.h new file mode 100644 index 00000000000..edc7c493e46 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/hash.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_HASH_H +#define ARROW_COMPUTE_KERNELS_HASH_H + +#include + +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +struct ArrayData; + +namespace compute { + +class FunctionContext; + +/// \brief Compute unique elements from an array-like object +/// +/// Note if a null occurs in the input it will NOT be included in the output. 
+/// +/// \param[in] context the FunctionContext +/// \param[in] datum array-like input +/// \param[out] out result as Array +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Unique(FunctionContext* context, const Datum& datum, std::shared_ptr* out); + +// Constants for accessing the output of ValueCounts +ARROW_EXPORT extern const char kValuesFieldName[]; +ARROW_EXPORT extern const char kCountsFieldName[]; +ARROW_EXPORT extern const int32_t kValuesFieldIndex; +ARROW_EXPORT extern const int32_t kCountsFieldIndex; +/// \brief Return counts of unique elements from an array-like object. +/// +/// Note that the counts do not include counts for nulls in the array. These can be +/// obtained separately from metadata. +/// +/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values +/// which can lead to unexpected results if the input Array has these values. +/// +/// \param[in] context the FunctionContext +/// \param[in] value array-like input +/// \param[out] counts An array of structs. 
+/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status ValueCounts(FunctionContext* context, const Datum& value, + std::shared_ptr* counts); + +/// \brief Dictionary-encode values in an array-like object +/// \param[in] context the FunctionContext +/// \param[in] data array-like input +/// \param[out] out result with same shape and type as input +/// +/// \since 0.8.0 +/// \note API not yet finalized +ARROW_EXPORT +Status DictionaryEncode(FunctionContext* context, const Datum& data, Datum* out); + +// TODO(wesm): Define API for incremental dictionary encoding + +// TODO(wesm): Define API for regularizing DictionaryArray objects with +// different dictionaries + +// +// ARROW_EXPORT +// Status DictionaryEncode(FunctionContext* context, const Datum& data, +// const Array& prior_dictionary, Datum* out); + +// TODO(wesm): Implement these next +// ARROW_EXPORT +// Status Match(FunctionContext* context, const Datum& values, const Datum& member_set, +// Datum* out); + +// ARROW_EXPORT +// Status IsIn(FunctionContext* context, const Datum& values, const Datum& member_set, +// Datum* out); + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_HASH_H diff --git a/r/R/inst/include/arrow/compute/kernels/mean.h b/r/R/inst/include/arrow/compute/kernels/mean.h new file mode 100644 index 00000000000..5074d4e7b7d --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/mean.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace compute { + +struct Datum; +class FunctionContext; +class AggregateFunction; + +ARROW_EXPORT +std::shared_ptr MakeMeanAggregateFunction(const DataType& type, + FunctionContext* context); + +/// \brief Compute the mean of a numeric array. +/// +/// \param[in] context the FunctionContext +/// \param[in] value datum to compute the mean, expecting Array +/// \param[out] mean datum of the computed mean as a DoubleScalar +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Mean(FunctionContext* context, const Datum& value, Datum* mean); + +/// \brief Compute the mean of a numeric array. +/// +/// \param[in] context the FunctionContext +/// \param[in] array to compute the mean +/// \param[out] mean datum of the computed mean as a DoubleScalar +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Mean(FunctionContext* context, const Array& array, Datum* mean); + +} // namespace compute +}; // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/sum-internal.h b/r/R/inst/include/arrow/compute/kernels/sum-internal.h new file mode 100644 index 00000000000..a4e7ea63439 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/sum-internal.h @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/aggregate.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +class Array; +class DataType; + +namespace compute { + +// Find the largest compatible primitive type for a primitive type. +template +struct FindAccumulatorType {}; + +template +struct FindAccumulatorType> { + using Type = Int64Type; +}; + +template +struct FindAccumulatorType> { + using Type = UInt64Type; +}; + +template +struct FindAccumulatorType> { + using Type = DoubleType; +}; + +template +class SumAggregateFunction final : public AggregateFunctionStaticState { + using CType = typename TypeTraits::CType; + using ArrayType = typename TypeTraits::ArrayType; + + // A small number of elements rounded to the next cacheline. This should + // amount to a maximum of 4 cachelines when dealing with 8 bytes elements. 
+ static constexpr int64_t kTinyThreshold = 32; + static_assert(kTinyThreshold >= (2 * CHAR_BIT) + 1, + "ConsumeSparse requires 3 bytes of null bitmap, and 17 is the" + "required minimum number of bits/elements to cover 3 bytes."); + + public: + Status Consume(const Array& input, StateType* state) const override { + const ArrayType& array = static_cast(input); + + if (input.null_count() == 0) { + *state = ConsumeDense(array); + } else if (input.length() <= kTinyThreshold) { + // In order to simplify ConsumeSparse implementation (requires at least 3 + // bytes of bitmap data), small arrays are handled differently. + *state = ConsumeTiny(array); + } else { + *state = ConsumeSparse(array); + } + + return Status::OK(); + } + + Status Merge(const StateType& src, StateType* dst) const override { + *dst += src; + return Status::OK(); + } + + Status Finalize(const StateType& src, Datum* output) const override { + *output = src.Finalize(); + return Status::OK(); + } + + std::shared_ptr out_type() const override { return StateType::out_type(); } + + private: + StateType ConsumeDense(const ArrayType& array) const { + StateType local; + + const auto values = array.raw_values(); + const int64_t length = array.length(); + for (int64_t i = 0; i < length; i++) { + local.sum += values[i]; + } + + local.count = length; + + return local; + } + + StateType ConsumeTiny(const ArrayType& array) const { + StateType local; + + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), + array.length()); + const auto values = array.raw_values(); + for (int64_t i = 0; i < array.length(); i++) { + if (reader.IsSet()) { + local.sum += values[i]; + local.count++; + } + reader.Next(); + } + + return local; + } + + // While this is not branchless, gcc needs this to be in a different function + // for it to generate cmov which ends to be slightly faster than + // multiplication but safe for handling NaN with doubles. 
+ inline CType MaskedValue(bool valid, CType value) const { return valid ? value : 0; } + + inline StateType UnrolledSum(uint8_t bits, const CType* values) const { + StateType local; + + if (bits < 0xFF) { + // Some nulls + for (size_t i = 0; i < 8; i++) { + local.sum += MaskedValue(bits & (1U << i), values[i]); + } + local.count += BitUtil::kBytePopcount[bits]; + } else { + // No nulls + for (size_t i = 0; i < 8; i++) { + local.sum += values[i]; + } + local.count += 8; + } + + return local; + } + + StateType ConsumeSparse(const ArrayType& array) const { + StateType local; + + // Sliced bitmaps on non-byte positions induce problem with the branchless + // unrolled technique. Thus extra padding is added on both left and right + // side of the slice such that both ends are byte-aligned. The first and + // last bitmap are properly masked to ignore extra values induced by + // padding. + // + // The execution is divided in 3 sections. + // + // 1. Compute the sum of the first masked byte. + // 2. Compute the sum of the middle bytes + // 3. Compute the sum of the last masked byte. + + const int64_t length = array.length(); + const int64_t offset = array.offset(); + + // The number of bytes covering the range, this includes partial bytes. + // This number bounded by `<= (length / 8) + 2`, e.g. a possible extra byte + // on the left, and on the right. + const int64_t covering_bytes = BitUtil::CoveringBytes(offset, length); + DCHECK_GE(covering_bytes, 3); + + // Align values to the first batch of 8 elements. Note that raw_values() is + // already adjusted with the offset, thus we rewind a little to align to + // the closest 8-batch offset. + const auto values = array.raw_values() - (offset % 8); + + // Align bitmap at the first consumable byte. + const auto bitmap = array.null_bitmap_data() + BitUtil::RoundDown(offset, 8) / 8; + + // Consume the first (potentially partial) byte. 
+ const uint8_t first_mask = BitUtil::kTrailingBitmask[offset % 8]; + local += UnrolledSum(bitmap[0] & first_mask, values); + + // Consume the (full) middle bytes. The loop iterates in unit of + // batches of 8 values and 1 byte of bitmap. + for (int64_t i = 1; i < covering_bytes - 1; i++) { + local += UnrolledSum(bitmap[i], &values[i * 8]); + } + + // Consume the last (potentially partial) byte. + const int64_t last_idx = covering_bytes - 1; + const uint8_t last_mask = BitUtil::kPrecedingWrappingBitmask[(offset + length) % 8]; + local += UnrolledSum(bitmap[last_idx] & last_mask, &values[last_idx * 8]); + + return local; + } +}; // namespace compute + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/sum.h b/r/R/inst/include/arrow/compute/kernels/sum.h new file mode 100644 index 00000000000..e6f95490d7c --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/sum.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Status; + +namespace compute { + +struct Datum; +class FunctionContext; +class AggregateFunction; + +/// \brief Return a Sum Kernel +/// +/// \param[in] type required to specialize the kernel +/// \param[in] context the FunctionContext +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +std::shared_ptr MakeSumAggregateFunction(const DataType& type, + FunctionContext* context); + +/// \brief Sum values of a numeric array. +/// +/// \param[in] context the FunctionContext +/// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Sum(FunctionContext* context, const Datum& value, Datum* out); + +/// \brief Sum values of a numeric array. +/// +/// \param[in] context the FunctionContext +/// \param[in] array to sum +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Sum(FunctionContext* context, const Array& array, Datum* out); + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/take.h b/r/R/inst/include/arrow/compute/kernels/take.h new file mode 100644 index 00000000000..3aa5ed5eedf --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/take.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace compute { + +class FunctionContext; + +struct ARROW_EXPORT TakeOptions {}; + +/// \brief Take from an array of values at indices in another array +/// +/// The output array will be of the same type as the input values +/// array, with elements taken from the values array at the given +/// indices. If an index is null then the taken element will be null. +/// +/// For example given values = ["a", "b", "c", null, "e", "f"] and +/// indices = [2, 1, null, 3], the output will be +/// = [values[2], values[1], null, values[3]] +/// = ["c", "b", null, null] +/// +/// \param[in] context the FunctionContext +/// \param[in] values array from which to take +/// \param[in] indices which values to take +/// \param[in] options options +/// \param[out] out resulting array +ARROW_EXPORT +Status Take(FunctionContext* context, const Array& values, const Array& indices, + const TakeOptions& options, std::shared_ptr* out); + +/// \brief Take from an array of values at indices in another array +/// +/// \param[in] context the FunctionContext +/// \param[in] values datum from which to take +/// \param[in] indices which values to take +/// \param[in] options options +/// \param[out] out resulting datum +ARROW_EXPORT +Status Take(FunctionContext* context, const Datum& values, const Datum& indices, + const TakeOptions& options, Datum* out); + +/// \brief BinaryKernel implementing Take operation 
+class ARROW_EXPORT TakeKernel : public BinaryKernel { + public: + explicit TakeKernel(const std::shared_ptr& type, TakeOptions options = {}) + : type_(type), options_(options) {} + + Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, + Datum* out) override; + + std::shared_ptr out_type() const override { return type_; } + + private: + std::shared_ptr type_; + TakeOptions options_; +}; +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/util-internal.h b/r/R/inst/include/arrow/compute/kernels/util-internal.h new file mode 100644 index 00000000000..25a670c8b25 --- /dev/null +++ b/r/R/inst/include/arrow/compute/kernels/util-internal.h @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H +#define ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H + +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class FunctionContext; + +// \brief Make a copy of the buffers into a destination array without carrying +// the type. 
+static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) { + output->length = input.length; + output->null_count = input.null_count; + output->buffers = input.buffers; + output->offset = input.offset; + output->child_data = input.child_data; +} + +namespace detail { + +/// \brief Invoke the kernel on value using the ctx and store results in outputs. +/// +/// \param[in,out] ctx The function context to use when invoking the kernel. +/// \param[in,out] kernel The kernel to execute. +/// \param[in] value The input value to execute the kernel with. +/// \param[out] outputs One ArrayData datum for each ArrayData available in value. +ARROW_EXPORT +Status InvokeUnaryArrayKernel(FunctionContext* ctx, UnaryKernel* kernel, + const Datum& value, std::vector* outputs); + +ARROW_EXPORT +Status InvokeBinaryArrayKernel(FunctionContext* ctx, BinaryKernel* kernel, + const Datum& left, const Datum& right, + std::vector* outputs); +ARROW_EXPORT +Status InvokeBinaryArrayKernel(FunctionContext* ctx, BinaryKernel* kernel, + const Datum& left, const Datum& right, Datum* output); + +/// \brief Assign validity bitmap to output, copying bitmap if necessary, but +/// zero-copy otherwise, so that the same value slots are valid/not-null in the +/// output (sliced arrays). +/// +/// \param[in] ctx the kernel FunctionContext +/// \param[in] input the input array +/// \param[out] output the output array. Must have length set correctly. +ARROW_EXPORT +Status PropagateNulls(FunctionContext* ctx, const ArrayData& input, ArrayData* output); + +/// \brief Set validity bitmap in output with all null values. +/// +/// \param[in] ctx the kernel FunctionContext +/// \param[in] input the input array +/// \param[out] output the output array. Must have length and buffer set correctly. 
+ARROW_EXPORT +Status SetAllNulls(FunctionContext* ctx, const ArrayData& input, ArrayData* output); + +/// \brief Assign validity bitmap to output, taking the intersection of left and right +/// null bitmaps if necessary, but zero-copy otherwise. +/// +/// \param[in] ctx the kernel FunctionContext +/// \param[in] left the left operand +/// \param[in] right the right operand +/// \param[out] output the output array. Must have length set correctly. +ARROW_EXPORT +Status AssignNullIntersection(FunctionContext* ctx, const ArrayData& left, + const ArrayData& right, ArrayData* output); + +ARROW_EXPORT +Datum WrapArraysLike(const Datum& value, + const std::vector>& arrays); + +ARROW_EXPORT +Datum WrapDatumsLike(const Datum& value, const std::vector& datums); + +/// \brief Kernel used to preallocate outputs for primitive types. This +/// does not include allocations for the validity bitmap (PropagateNulls +/// should be used for that). +class ARROW_EXPORT PrimitiveAllocatingUnaryKernel : public UnaryKernel { + public: + // \brief Construct with a delegate that must live longer + // than this object. + explicit PrimitiveAllocatingUnaryKernel(UnaryKernel* delegate); + /// \brief Allocates ArrayData with the necessary data buffers allocated and + /// then written into by the delegate kernel + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; + + std::shared_ptr out_type() const override; + + private: + UnaryKernel* delegate_; +}; + +/// \brief Kernel used to preallocate outputs for primitive types. +class ARROW_EXPORT PrimitiveAllocatingBinaryKernel : public BinaryKernel { + public: + // \brief Construct with a kernel to delegate operations to. + // + // Ownership is not taken of the delegate kernel, it must outlive + // the lifetime of this object. + explicit PrimitiveAllocatingBinaryKernel(BinaryKernel* delegate); + + /// \brief Sets out to be of type ArrayData with the necessary + /// data buffers prepopulated.
+ Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, + Datum* out) override; + + std::shared_ptr out_type() const override; + + private: + BinaryKernel* delegate_; +}; + +} // namespace detail + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H diff --git a/r/R/inst/include/arrow/compute/logical_type.h b/r/R/inst/include/arrow/compute/logical_type.h new file mode 100644 index 00000000000..7acbeefe4a5 --- /dev/null +++ b/r/R/inst/include/arrow/compute/logical_type.h @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Metadata objects for creating well-typed expressions. 
These are distinct +// from (and higher level than) arrow::DataType as some type parameters (like +// decimal scale and precision) may not be known at expression build time, and +// these are resolved later on evaluation + +#pragma once + +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace compute { + +class Expr; + +/// \brief An object that represents either a single concrete value type or a +/// group of related types, to help with expression type validation and other +/// purposes +class ARROW_EXPORT LogicalType { + public: + enum Id { + ANY, + NUMBER, + INTEGER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING, + NULL_, + BOOL, + UINT8, + INT8, + UINT16, + INT16, + UINT32, + INT32, + UINT64, + INT64, + FLOAT16, + FLOAT32, + FLOAT64, + BINARY, + UTF8, + DATE, + TIME, + TIMESTAMP, + DECIMAL, + LIST, + STRUCT + }; + + Id id() const { return id_; } + + virtual ~LogicalType() = default; + + virtual std::string ToString() const = 0; + + /// \brief Check if expression is an instance of this type class + virtual bool IsInstance(const Expr& expr) const = 0; + + /// \brief Get a logical expression type from a concrete Arrow in-memory + /// array type + static Status FromArrow(const ::arrow::DataType& type, LogicalTypePtr* out); + + protected: + explicit LogicalType(Id id) : id_(id) {} + Id id_; +}; + +namespace type { + +/// \brief Logical type for any value type +class ARROW_EXPORT Any : public LogicalType { + public: + Any() : LogicalType(LogicalType::ANY) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for null +class ARROW_EXPORT Null : public LogicalType { + public: + Null() : LogicalType(LogicalType::NULL_) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for concrete boolean +class ARROW_EXPORT Bool : public 
LogicalType { + public: + Bool() : LogicalType(LogicalType::BOOL) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for any number (integer or floating point) +class ARROW_EXPORT Number : public LogicalType { + public: + Number() : Number(LogicalType::NUMBER) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit Number(Id type_id) : LogicalType(type_id) {} +}; + +/// \brief Logical type for any integer +class ARROW_EXPORT Integer : public Number { + public: + Integer() : Integer(LogicalType::INTEGER) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit Integer(Id type_id) : Number(type_id) {} +}; + +/// \brief Logical type for any floating point number +class ARROW_EXPORT Floating : public Number { + public: + Floating() : Floating(LogicalType::FLOATING) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit Floating(Id type_id) : Number(type_id) {} +}; + +/// \brief Logical type for any signed integer +class ARROW_EXPORT SignedInteger : public Integer { + public: + SignedInteger() : SignedInteger(LogicalType::SIGNED_INTEGER) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit SignedInteger(Id type_id) : Integer(type_id) {} +}; + +/// \brief Logical type for any unsigned integer +class ARROW_EXPORT UnsignedInteger : public Integer { + public: + UnsignedInteger() : UnsignedInteger(LogicalType::UNSIGNED_INTEGER) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit UnsignedInteger(Id type_id) : Integer(type_id) {} +}; + +/// \brief Logical type for int8 +class ARROW_EXPORT Int8 : public SignedInteger { + public: + Int8() : SignedInteger(LogicalType::INT8) 
{} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for int16 +class ARROW_EXPORT Int16 : public SignedInteger { + public: + Int16() : SignedInteger(LogicalType::INT16) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for int32 +class ARROW_EXPORT Int32 : public SignedInteger { + public: + Int32() : SignedInteger(LogicalType::INT32) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for int64 +class ARROW_EXPORT Int64 : public SignedInteger { + public: + Int64() : SignedInteger(LogicalType::INT64) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for uint8 +class ARROW_EXPORT UInt8 : public UnsignedInteger { + public: + UInt8() : UnsignedInteger(LogicalType::UINT8) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for uint16 +class ARROW_EXPORT UInt16 : public UnsignedInteger { + public: + UInt16() : UnsignedInteger(LogicalType::UINT16) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for uint32 +class ARROW_EXPORT UInt32 : public UnsignedInteger { + public: + UInt32() : UnsignedInteger(LogicalType::UINT32) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for uint64 +class ARROW_EXPORT UInt64 : public UnsignedInteger { + public: + UInt64() : UnsignedInteger(LogicalType::UINT64) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for 16-bit floating point +class ARROW_EXPORT Float16 : public Floating { + public: + Float16() : Floating(LogicalType::FLOAT16) {} + 
bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for 32-bit floating point +class ARROW_EXPORT Float32 : public Floating { + public: + Float32() : Floating(LogicalType::FLOAT32) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for 64-bit floating point +class ARROW_EXPORT Float64 : public Floating { + public: + Float64() : Floating(LogicalType::FLOAT64) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +/// \brief Logical type for variable-size binary +class ARROW_EXPORT Binary : public LogicalType { + public: + Binary() : Binary(LogicalType::BINARY) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; + + protected: + explicit Binary(Id type_id) : LogicalType(type_id) {} +}; + +/// \brief Logical type for variable-size UTF-8 string (a kind of Binary) +class ARROW_EXPORT Utf8 : public Binary { + public: + Utf8() : Binary(LogicalType::UTF8) {} + bool IsInstance(const Expr& expr) const override; + std::string ToString() const override; +}; + +#define SIMPLE_TYPE_FACTORY(NAME) ARROW_EXPORT LogicalTypePtr NAME(); + +SIMPLE_TYPE_FACTORY(any); +SIMPLE_TYPE_FACTORY(null); +SIMPLE_TYPE_FACTORY(boolean); +SIMPLE_TYPE_FACTORY(number); +SIMPLE_TYPE_FACTORY(integer); +SIMPLE_TYPE_FACTORY(signed_integer); +SIMPLE_TYPE_FACTORY(unsigned_integer); +SIMPLE_TYPE_FACTORY(floating); +SIMPLE_TYPE_FACTORY(int8); +SIMPLE_TYPE_FACTORY(int16); +SIMPLE_TYPE_FACTORY(int32); +SIMPLE_TYPE_FACTORY(int64); +SIMPLE_TYPE_FACTORY(uint8); +SIMPLE_TYPE_FACTORY(uint16); +SIMPLE_TYPE_FACTORY(uint32); +SIMPLE_TYPE_FACTORY(uint64); +SIMPLE_TYPE_FACTORY(float16); +SIMPLE_TYPE_FACTORY(float32); +SIMPLE_TYPE_FACTORY(float64); +SIMPLE_TYPE_FACTORY(binary); +SIMPLE_TYPE_FACTORY(utf8); + +#undef SIMPLE_TYPE_FACTORY + +} // namespace type +} // namespace compute +} // namespace arrow diff --git
a/r/R/inst/include/arrow/compute/operation.h b/r/R/inst/include/arrow/compute/operation.h new file mode 100644 index 00000000000..c06f8c311cc --- /dev/null +++ b/r/R/inst/include/arrow/compute/operation.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace compute { + +/// \brief An operation is a node in a computation graph, taking input data +/// expression dependencies and emitting an output expression +class ARROW_EXPORT Operation : public std::enable_shared_from_this { + public: + virtual ~Operation() = default; + + /// \brief Check input expression arguments and output the type of resulting + /// expression that this operation produces. If the input arguments are + /// invalid, error Status is returned + /// \param[out] out the returned well-typed expression + /// \return success or failure + virtual Status ToExpr(ExprPtr* out) const = 0; + + /// \brief Return the input expressions used to instantiate the + /// operation. 
The default implementation returns an empty vector + /// \return a vector of expressions + virtual std::vector input_args() const; +}; + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/operations/cast.h b/r/R/inst/include/arrow/compute/operations/cast.h new file mode 100644 index 00000000000..0052ebb6082 --- /dev/null +++ b/r/R/inst/include/arrow/compute/operations/cast.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/compute/operation.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class LogicalType; + +namespace ops { + +/// \brief A cast operation creates an expression that casts the input value +/// to the requested output type +class ARROW_EXPORT Cast : public Operation { + public: + Cast(std::shared_ptr value, std::shared_ptr out_type); + Status ToExpr(std::shared_ptr* out) const override; + + private: + std::shared_ptr value_; + std::shared_ptr out_type_; +}; + +} // namespace ops +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/operations/literal.h b/r/R/inst/include/arrow/compute/operations/literal.h new file mode 100644 index 00000000000..b596b339c89 --- /dev/null +++ b/r/R/inst/include/arrow/compute/operations/literal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#pragma once + +#include + +#include "arrow/compute/operation.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +struct Scalar; + +namespace compute { +namespace ops { + +/// \brief A literal operation creates an expression from a known constant +/// scalar value +class ARROW_EXPORT Literal : public Operation { + public: + explicit Literal(const std::shared_ptr& value); + Status ToExpr(std::shared_ptr* out) const override; + + private: + std::shared_ptr value_; +}; + +} // namespace ops +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/test-util.h b/r/R/inst/include/arrow/compute/test-util.h new file mode 100644 index 00000000000..bec54cc3615 --- /dev/null +++ b/r/R/inst/include/arrow/compute/test-util.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_COMPUTE_TEST_UTIL_H +#define ARROW_COMPUTE_TEST_UTIL_H + +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" + +namespace arrow { +namespace compute { + +class ComputeFixture { + public: + ComputeFixture() : ctx_(default_memory_pool()) {} + + protected: + FunctionContext ctx_; +}; + +class MockUnaryKernel : public UnaryKernel { + public: + MOCK_METHOD3(Call, Status(FunctionContext* ctx, const Datum& input, Datum* out)); + MOCK_CONST_METHOD0(out_type, std::shared_ptr()); +}; + +class MockBinaryKernel : public BinaryKernel { + public: + MOCK_METHOD4(Call, Status(FunctionContext* ctx, const Datum& left, const Datum& right, + Datum* out)); + MOCK_CONST_METHOD0(out_type, std::shared_ptr()); +}; + +template +std::shared_ptr _MakeArray(const std::shared_ptr& type, + const std::vector& values, + const std::vector& is_valid) { + std::shared_ptr result; + if (is_valid.size() > 0) { + ArrayFromVector(type, is_valid, values, &result); + } else { + ArrayFromVector(type, values, &result); + } + return result; +} + +template +struct DatumEqual {}; + +template +struct DatumEqual::value>::type> { + static constexpr double kArbitraryDoubleErrorBound = 1.0; + using ScalarType = typename TypeTraits::ScalarType; + + static void EnsureEqual(const Datum& lhs, const Datum& rhs) { + ASSERT_EQ(lhs.kind(), rhs.kind()); + if (lhs.kind() == Datum::SCALAR) { + auto left = internal::checked_cast(lhs.scalar().get()); + auto right = internal::checked_cast(rhs.scalar().get()); + ASSERT_EQ(left->is_valid, right->is_valid); + ASSERT_EQ(left->type->id(), right->type->id()); + ASSERT_NEAR(left->value, right->value, kArbitraryDoubleErrorBound); + } + } +}; + +template +struct DatumEqual::value>::type> { + using ScalarType = typename TypeTraits::ScalarType; + static void 
EnsureEqual(const Datum& lhs, const Datum& rhs) { + ASSERT_EQ(lhs.kind(), rhs.kind()); + if (lhs.kind() == Datum::SCALAR) { + auto left = internal::checked_cast(lhs.scalar().get()); + auto right = internal::checked_cast(rhs.scalar().get()); + ASSERT_EQ(left->is_valid, right->is_valid); + ASSERT_EQ(left->type->id(), right->type->id()); + ASSERT_EQ(left->value, right->value); + } + } +}; + +} // namespace compute +} // namespace arrow + +#endif diff --git a/r/R/inst/include/arrow/compute/type_fwd.h b/r/R/inst/include/arrow/compute/type_fwd.h new file mode 100644 index 00000000000..48d45ecd118 --- /dev/null +++ b/r/R/inst/include/arrow/compute/type_fwd.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/type_fwd.h" + +namespace arrow { +namespace compute { + +class Expr; +class LogicalType; +class Operation; + +using ArrowTypePtr = std::shared_ptr<::arrow::DataType>; +using ExprPtr = std::shared_ptr; +using ConstOpPtr = std::shared_ptr; +using OpPtr = std::shared_ptr; +using LogicalTypePtr = std::shared_ptr; + +} // namespace compute +} // namespace arrow diff --git a/r/R/inst/include/arrow/csv/api.h b/r/R/inst/include/arrow/csv/api.h new file mode 100644 index 00000000000..8e311844c52 --- /dev/null +++ b/r/R/inst/include/arrow/csv/api.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_CSV_API_H +#define ARROW_CSV_API_H + +#include "arrow/csv/options.h" +#include "arrow/csv/reader.h" + +#endif // ARROW_CSV_API_H diff --git a/r/R/inst/include/arrow/csv/chunker.h b/r/R/inst/include/arrow/csv/chunker.h new file mode 100644 index 00000000000..6c61632614c --- /dev/null +++ b/r/R/inst/include/arrow/csv/chunker.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_CSV_CHUNKER_H +#define ARROW_CSV_CHUNKER_H + +#include + +#include "arrow/csv/options.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace csv { + +/// \class Chunker +/// \brief A reusable block-based chunker for CSV data +/// +/// The chunker takes a block of CSV data and finds a suitable place +/// to cut it up without splitting a row. +/// If the block is truncated (i.e. not all data can be chunked), it is up +/// to the caller to arrange the next block to start with the trailing data. +/// +/// Note: if the previous block ends with CR (0x0d) and a new block starts +/// with LF (0x0a), the chunker will consider the leading newline as an empty line. +class ARROW_EXPORT Chunker { + public: + explicit Chunker(ParseOptions options); + + /// \brief Carve up a chunk in a block of data + /// + /// Process a block of CSV data, reading up to size bytes. + /// The number of bytes in the chunk is returned in out_size. 
+ Status Process(const char* data, uint32_t size, uint32_t* out_size); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); + + // Like Process(), but specialized for some parsing options + template + Status ProcessSpecialized(const char* data, uint32_t size, uint32_t* out_size); + + // Detect a single line from the data pointer. Return the line end, + // or nullptr if the remaining line is truncated. + template + inline const char* ReadLine(const char* data, const char* data_end); + + ParseOptions options_; +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_CHUNKER_H diff --git a/r/R/inst/include/arrow/csv/column-builder.h b/r/R/inst/include/arrow/csv/column-builder.h new file mode 100644 index 00000000000..054a642295c --- /dev/null +++ b/r/R/inst/include/arrow/csv/column-builder.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_CSV_COLUMN_BUILDER_H +#define ARROW_CSV_COLUMN_BUILDER_H + +#include +#include + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class ChunkedArray; +class DataType; + +namespace internal { + +class TaskGroup; + +} // namespace internal + +namespace csv { + +class BlockParser; +struct ConvertOptions; + +class ARROW_EXPORT ColumnBuilder { + public: + virtual ~ColumnBuilder() = default; + + /// Spawn a task that will try to convert and append the given CSV block. + /// All calls to Append() should happen on the same thread, otherwise + /// call Insert() instead. + virtual void Append(const std::shared_ptr& parser); + + /// Spawn a task that will try to convert and insert the given CSV block + virtual void Insert(int64_t block_index, + const std::shared_ptr& parser) = 0; + + /// Return the final chunked array. The TaskGroup _must_ have finished! + virtual Status Finish(std::shared_ptr* out) = 0; + + /// Change the task group. The previous TaskGroup _must_ have finished! + void SetTaskGroup(const std::shared_ptr& task_group); + + std::shared_ptr task_group() { return task_group_; } + + /// Construct a strictly-typed ColumnBuilder. + static Status Make(const std::shared_ptr& type, int32_t col_index, + const ConvertOptions& options, + const std::shared_ptr& task_group, + std::shared_ptr* out); + + /// Construct a type-inferring ColumnBuilder. 
+ static Status Make(int32_t col_index, const ConvertOptions& options, + const std::shared_ptr& task_group, + std::shared_ptr* out); + + protected: + explicit ColumnBuilder(const std::shared_ptr& task_group) + : task_group_(task_group) {} + + std::shared_ptr task_group_; + ArrayVector chunks_; +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_COLUMN_BUILDER_H diff --git a/r/R/inst/include/arrow/csv/converter.h b/r/R/inst/include/arrow/csv/converter.h new file mode 100644 index 00000000000..d64fe695d0a --- /dev/null +++ b/r/R/inst/include/arrow/csv/converter.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_CSV_CONVERTER_H +#define ARROW_CSV_CONVERTER_H + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class MemoryPool; +class Status; + +namespace csv { + +class BlockParser; + +class ARROW_EXPORT Converter { + public: + Converter(const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool); + virtual ~Converter() = default; + + virtual Status Convert(const BlockParser& parser, int32_t col_index, + std::shared_ptr* out) = 0; + + std::shared_ptr type() const { return type_; } + + static Status Make(const std::shared_ptr& type, const ConvertOptions& options, + std::shared_ptr* out); + static Status Make(const std::shared_ptr& type, const ConvertOptions& options, + MemoryPool* pool, std::shared_ptr* out); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); + + virtual Status Initialize() = 0; + + const ConvertOptions options_; + MemoryPool* pool_; + std::shared_ptr type_; +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_CONVERTER_H diff --git a/r/R/inst/include/arrow/csv/options.h b/r/R/inst/include/arrow/csv/options.h new file mode 100644 index 00000000000..9cd312ac079 --- /dev/null +++ b/r/R/inst/include/arrow/csv/options.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_CSV_OPTIONS_H +#define ARROW_CSV_OPTIONS_H + +#include +#include +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; + +namespace csv { + +struct ARROW_EXPORT ParseOptions { + // Parsing options + + // Field delimiter + char delimiter = ','; + // Whether quoting is used + bool quoting = true; + // Quoting character (if `quoting` is true) + char quote_char = '"'; + // Whether a quote inside a value is double-quoted + bool double_quote = true; + // Whether escaping is used + bool escaping = false; + // Escaping character (if `escaping` is true) + char escape_char = '\\'; + // Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters + bool newlines_in_values = false; + // Whether empty lines are ignored. If false, an empty line represents + // a single empty value (assuming a one-column CSV file). + bool ignore_empty_lines = true; + + // XXX Should this be in ReadOptions? 
+ // Number of header rows to skip (including the first row containing column names) + int32_t header_rows = 1; + + static ParseOptions Defaults(); +}; + +struct ARROW_EXPORT ConvertOptions { + // Conversion options + + // Whether to check UTF8 validity of string columns + bool check_utf8 = true; + // Optional per-column types (disabling type inference on those columns) + std::unordered_map> column_types; + // Recognized spellings for null values + std::vector null_values; + // Recognized spellings for boolean values + std::vector true_values; + std::vector false_values; + // Whether string / binary columns can have null values. + // If true, then strings in "null_values" are considered null for string columns. + // If false, then all strings are valid string values. + bool strings_can_be_null = false; + + static ConvertOptions Defaults(); +}; + +struct ARROW_EXPORT ReadOptions { + // Reader options + + // Whether to use the global CPU thread pool + bool use_threads = true; + // Block size we request from the IO layer; also determines the size of + // chunks when use_threads is true + int32_t block_size = 1 << 20; // 1 MB + + static ReadOptions Defaults(); +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_OPTIONS_H diff --git a/r/R/inst/include/arrow/csv/parser.h b/r/R/inst/include/arrow/csv/parser.h new file mode 100644 index 00000000000..fdddc37a2c0 --- /dev/null +++ b/r/R/inst/include/arrow/csv/parser.h @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_CSV_PARSER_H +#define ARROW_CSV_PARSER_H + +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/csv/options.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace csv { + +constexpr int32_t kMaxParserNumRows = 100000; + +/// \class BlockParser +/// \brief A reusable block-based parser for CSV data +/// +/// The parser takes a block of CSV data and delimits rows and fields, +/// unquoting and unescaping them on the fly. Parsed data is owned by the +/// parser, so the original buffer can be discarded after Parse() returns. +/// +/// If the block is truncated (i.e. not all data can be parsed), it is up +/// to the caller to arrange the next block to start with the trailing data. +/// Also, if the previous block ends with CR (0x0d) and a new block starts +/// with LF (0x0a), the parser will consider the leading newline as an empty +/// line; the caller should therefore strip it. +class ARROW_EXPORT BlockParser { + public: + explicit BlockParser(ParseOptions options, int32_t num_cols = -1, + int32_t max_num_rows = kMaxParserNumRows); + explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1, + int32_t max_num_rows = kMaxParserNumRows); + + /// \brief Parse a block of data + /// + /// Parse a block of CSV data, ingesting up to max_num_rows rows. + /// The number of bytes actually parsed is returned in out_size.
+ Status Parse(const char* data, uint32_t size, uint32_t* out_size); + + /// \brief Parse the final block of data + /// + /// Like Parse(), but called with the final block in a file. + /// The last row may lack a trailing line separator. + Status ParseFinal(const char* data, uint32_t size, uint32_t* out_size); + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return num_rows_; } + /// \brief Return the number of parsed columns + int32_t num_cols() const { return num_cols_; } + /// \brief Return the total size in bytes of parsed data + uint32_t num_bytes() const { return parsed_size_; } + + /// \brief Visit parsed values in a column + /// + /// The signature of the visitor is + /// Status(const uint8_t* data, uint32_t size, bool quoted) + template + Status VisitColumn(int32_t col_index, Visitor&& visit) const { + for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { + const auto& values_buffer = values_buffers_[buf_index]; + const auto values = reinterpret_cast(values_buffer->data()); + const auto max_pos = + static_cast(values_buffer->size() / sizeof(ValueDesc)) - 1; + for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) { + auto start = values[pos].offset; + auto stop = values[pos + 1].offset; + auto quoted = values[pos + 1].quoted; + ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); + } + } + return Status::OK(); + } + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); + + Status DoParse(const char* data, uint32_t size, bool is_final, uint32_t* out_size); + template + Status DoParseSpecialized(const char* data, uint32_t size, bool is_final, + uint32_t* out_size); + + template + Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer, + const char* data, const char* data_end, bool is_final, + int32_t rows_in_chunk, const char** out_data, bool* finished_parsing); + + // Parse a single line from the data pointer + template + Status ParseLine(ValuesWriter* 
values_writer, ParsedWriter* parsed_writer, + const char* data, const char* data_end, bool is_final, + const char** out_data); + + MemoryPool* pool_; + const ParseOptions options_; + // The number of rows parsed from the block + int32_t num_rows_; + // The number of columns (can be -1 at start) + int32_t num_cols_; + // The maximum number of rows to parse from this block + int32_t max_num_rows_; + + // Linear scratchpad for parsed values + struct ValueDesc { + uint32_t offset : 31; + bool quoted : 1; + }; + + // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes? + // It may help with null parsing... + std::vector> values_buffers_; + std::shared_ptr parsed_buffer_; + const uint8_t* parsed_; + int32_t values_size_; + int32_t parsed_size_; + + class ResizableValuesWriter; + class PresizedValuesWriter; + class PresizedParsedWriter; +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_PARSER_H diff --git a/r/R/inst/include/arrow/csv/reader.h b/r/R/inst/include/arrow/csv/reader.h new file mode 100644 index 00000000000..edf6f110980 --- /dev/null +++ b/r/R/inst/include/arrow/csv/reader.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_CSV_READER_H +#define ARROW_CSV_READER_H + +#include + +#include "arrow/csv/options.h" // IWYU pragma: keep +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class Table; + +namespace io { +class InputStream; +} // namespace io + +namespace csv { + +class ARROW_EXPORT TableReader { + public: + virtual ~TableReader() = default; + + virtual Status Read(std::shared_ptr
* out) = 0; + + // XXX pass optional schema? + static Status Make(MemoryPool* pool, std::shared_ptr input, + const ReadOptions&, const ParseOptions&, const ConvertOptions&, + std::shared_ptr* out); +}; + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_READER_H diff --git a/r/R/inst/include/arrow/csv/test-common.h b/r/R/inst/include/arrow/csv/test-common.h new file mode 100644 index 00000000000..624023f6037 --- /dev/null +++ b/r/R/inst/include/arrow/csv/test-common.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_CSV_TEST_COMMON_H +#define ARROW_CSV_TEST_COMMON_H + +#include +#include +#include + +#include "arrow/csv/parser.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { +namespace csv { + +std::string MakeCSVData(std::vector lines) { + std::string s; + for (const auto& line : lines) { + s += line; + } + return s; +} + +// Make a BlockParser from a vector of lines representing a CSV file +void MakeCSVParser(std::vector lines, ParseOptions options, + std::shared_ptr* out) { + auto csv = MakeCSVData(lines); + auto parser = std::make_shared(options); + uint32_t out_size; + ASSERT_OK(parser->Parse(csv.data(), static_cast(csv.size()), &out_size)); + ASSERT_EQ(out_size, csv.size()) << "trailing CSV data not parsed"; + *out = parser; +} + +void MakeCSVParser(std::vector lines, std::shared_ptr* out) { + MakeCSVParser(lines, ParseOptions::Defaults(), out); +} + +// Make a BlockParser from a vector of strings representing a single CSV column +void MakeColumnParser(std::vector items, std::shared_ptr* out) { + auto options = ParseOptions::Defaults(); + // Need this to test for null (empty) values + options.ignore_empty_lines = false; + std::vector lines; + for (const auto& item : items) { + lines.push_back(item + '\n'); + } + MakeCSVParser(lines, options, out); + ASSERT_EQ((*out)->num_cols(), 1) << "Should have seen only 1 CSV column"; + ASSERT_EQ((*out)->num_rows(), items.size()); +} + +} // namespace csv +} // namespace arrow + +#endif // ARROW_CSV_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/api.h b/r/R/inst/include/arrow/dbi/hiveserver2/api.h new file mode 100644 index 00000000000..6ac849ef87b --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/api.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/dbi/hiveserver2/columnar-row-set.h" +#include "arrow/dbi/hiveserver2/operation.h" +#include "arrow/dbi/hiveserver2/service.h" +#include "arrow/dbi/hiveserver2/session.h" +#include "arrow/dbi/hiveserver2/types.h" +#include "arrow/dbi/hiveserver2/util.h" + +#include "arrow/status.h" diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h b/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h new file mode 100644 index 00000000000..a62c738020b --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace hiveserver2 { + +// The Column class is used to access data that was fetched in columnar format. +// The contents of the data can be accessed through the data() fn, which returns +// a ptr to a vector containing the contents of this column in the fetched +// results, avoiding copies. This vector will be of size length(). +// +// If any of the values are null, they will be represented in the data vector as +// default values, i.e. 0 for numeric types. The nulls() fn returns a ptr to a +// bit array representing which values are null, and the IsNull() fn is provided +// for convenience when working with this bit array. The user should check +// IsNull() to distinguish between actual instances of the default values and nulls. +// +// A Column object is returned from a ColumnarRowSet and is only valid as long +// as that ColumnarRowSet still exists. +// +// Example: +// unique_ptr col = columnar_row_set->GetInt32Col(); +// for (int i = 0; i < col->length(); i++) { +// if (col->IsNull(i)) { +// cout << "NULL\n"; +// } else { +// cout << col->data()[i] << "\n"; +// } +// } +class ARROW_EXPORT Column { + public: + virtual ~Column() {} + + virtual int64_t length() const = 0; + + const uint8_t* nulls() const { return nulls_; } + int64_t nulls_size() const { return nulls_size_; } + + // Returns true iff the value for the i-th row within this set of data for this + // column is null. + bool IsNull(int64_t i) const { return (nulls_[i / 8] & (1 << (i % 8))) != 0; } + + protected: + explicit Column(const std::string* nulls); + + // The memory for these ptrs is owned by the ColumnarRowSet that + // created this Column. 
+ // + // Due to the issue described in HUE-2722, the null bitmap may have fewer + // bytes than expected for some versions of Hive, so we retain the ability to + // check the buffer size in case this happens. + const uint8_t* nulls_; + int64_t nulls_size_; +}; + +template +class ARROW_EXPORT TypedColumn : public Column { + public: + const std::vector& data() const { return *data_; } + int64_t length() const { return data().size(); } + + // Returns the value for the i-th row within this set of data for this column. + const T& GetData(int64_t i) const { return data()[i]; } + + private: + // For access to the c'tor. + friend class ColumnarRowSet; + + TypedColumn(const std::string* nulls, const std::vector* data) + : Column(nulls), data_(data) {} + + const std::vector* data_; +}; + +typedef TypedColumn BoolColumn; +typedef TypedColumn ByteColumn; +typedef TypedColumn Int16Column; +typedef TypedColumn Int32Column; +typedef TypedColumn Int64Column; +typedef TypedColumn DoubleColumn; +typedef TypedColumn StringColumn; +typedef TypedColumn BinaryColumn; + +// A ColumnarRowSet represents the full results returned by a call to +// Operation::Fetch() when a columnar format is being used. +// +// ColumnarRowSet provides access to specific columns by their type and index in +// the results. All Column objects returned from a given ColumnarRowSet will have +// the same length(). A Column object returned by a ColumnarRowSet is only valid +// as long as the ColumnarRowSet still exists. 
+// +// Example: +// unique_ptr op; +// session->ExecuteStatement("select int_col, string_col from tbl", &op); +// unique_ptr columnar_row_set; +// if (op->Fetch(&columnar_row_set).ok()) { +// unique_ptr int32_col = columnar_row_set->GetInt32Col(0); +// unique_ptr string_col = columnar_row_set->GetStringCol(1); +// } +class ARROW_EXPORT ColumnarRowSet { + public: + ~ColumnarRowSet(); + + std::unique_ptr GetBoolCol(int i) const; + std::unique_ptr GetByteCol(int i) const; + std::unique_ptr GetInt16Col(int i) const; + std::unique_ptr GetInt32Col(int i) const; + std::unique_ptr GetInt64Col(int i) const; + std::unique_ptr GetDoubleCol(int i) const; + std::unique_ptr GetStringCol(int i) const; + std::unique_ptr GetBinaryCol(int i) const; + + template + std::unique_ptr GetCol(int i) const; + + private: + // Hides Thrift objects from the header. + struct ColumnarRowSetImpl; + + ARROW_DISALLOW_COPY_AND_ASSIGN(ColumnarRowSet); + + // For access to the c'tor. + friend class Operation; + + explicit ColumnarRowSet(ColumnarRowSetImpl* impl); + + std::unique_ptr impl_; +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/operation.h b/r/R/inst/include/arrow/dbi/hiveserver2/operation.h new file mode 100644 index 00000000000..f275592e23d --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/operation.h @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/dbi/hiveserver2/columnar-row-set.h" +#include "arrow/dbi/hiveserver2/types.h" + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace hiveserver2 { + +struct ThriftRPC; + +// Maps directly to TFetchOrientation in the HiveServer2 interface. +enum class FetchOrientation { + NEXT, // supported + PRIOR, // not supported + RELATIVE, // not supported + ABSOLUTE, // not supported + FIRST, // supported if query result caching is enabled in Impala + LAST // not supported +}; + +// Represents a single HiveServer2 operation. Used to monitor the status of an operation +// and to retrieve its results. The only Operation function that will block is Fetch, +// which blocks if there aren't any results ready yet. +// +// Operations are created using Session functions, eg. ExecuteStatement. They must +// have Close called on them before they can be deleted. +// +// This class is not thread-safe. +class ARROW_EXPORT Operation { + public: + // Maps directly to TOperationState in the HiveServer2 interface. + enum class State { + INITIALIZED, + RUNNING, + FINISHED, + CANCELED, + CLOSED, + ERROR, + UNKNOWN, + PENDING, + }; + + ~Operation(); + + // Fetches the current state of this operation. If successful, sets the operation state + // in 'out' and returns an OK status, otherwise an error status is returned. May be + // called after successfully creating the operation and before calling Close. 
+ Status GetState(Operation::State* out) const; + + // May be called after successfully creating the operation and before calling Close. + Status GetLog(std::string* out) const; + + // May be called after successfully creating the operation and before calling Close. + Status GetProfile(std::string* out) const; + + // Fetches metadata for the columns in the output of this operation, such as the + // names and types of the columns, and returns it as a list of column descriptions. + // May be called after successfully creating the operation and before calling Close. + Status GetResultSetMetadata(std::vector* column_descs) const; + + // Fetches a batch of results, stores them in 'results', and sets has_more_rows. + // Fetch will block if there aren't any results that are ready. + Status Fetch(std::unique_ptr* results, bool* has_more_rows) const; + Status Fetch(int max_rows, FetchOrientation orientation, + std::unique_ptr* results, bool* has_more_rows) const; + + // May be called after successfully creating the operation and before calling Close. + Status Cancel() const; + + // Closes the operation. Must be called before the operation is deleted. May be safely + // called on an invalid or already closed operation - will only return an error if the + // operation is open but the close rpc fails. + Status Close(); + + // May be called after successfully creating the operation and before calling Close. + bool HasResultSet() const; + + // Returns true iff this operation's results will be returned in a columnar format. + // May be called at any time. + bool IsColumnar() const; + + protected: + // Hides Thrift objects from the header. + struct OperationImpl; + + explicit Operation(const std::shared_ptr& rpc); + + std::unique_ptr impl_; + std::shared_ptr rpc_; + + // True iff this operation has been successfully created and has not been closed yet, + // corresponding to when the operation has a valid operation handle. 
+ bool open_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Operation); +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/service.h b/r/R/inst/include/arrow/dbi/hiveserver2/service.h new file mode 100644 index 00000000000..bfa7a97db3a --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/service.h @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace hiveserver2 { + +class Session; +struct ThriftRPC; + +// Stores per-session or per-operation configuration parameters. +class HS2ClientConfig { + public: + void SetOption(const std::string& key, const std::string& value) { + config_[key] = value; + } + + bool GetOption(const std::string& key, std::string* value_out) { + if (config_.find(key) != config_.end() && value_out) { + *value_out = config_[key]; + return true; + } + return false; + } + + const std::map& GetConfig() const { return config_; } + + private: + std::map config_; +}; + +// Maps directly to TProtocolVersion in the HiveServer2 interface. 
+enum class ProtocolVersion { + PROTOCOL_V1, // not supported + PROTOCOL_V2, // not supported + PROTOCOL_V3, // not supported + PROTOCOL_V4, // not supported + PROTOCOL_V5, // not supported + PROTOCOL_V6, // supported + PROTOCOL_V7, // supported +}; + +// Manages a connection to a HiveServer2 server. Primarily used to create +// new sessions via OpenSession. +// +// Service objects are created using Service::Connect(). They must +// have Close called on them before they can be deleted. +// +// This class is not thread-safe. +// +// Example: +// unique_ptr service; +// if (Service::Connect(host, port, protocol_version, &service).ok()) { +// // do some work +// service->Close(); +// } +class ARROW_EXPORT Service { + public: + // Creates a new connection to a HS2 service at the given host and port. If + // conn_timeout > 0, connection attempts will time out after conn_timeout ms, otherwise + // no timeout is used. protocol_version is the HiveServer2 protocol to use, and + // determines whether the results returned by operations from this service are row or + // column oriented. Only column oriented protocols are currently supported. + // + // The client calling Connect has ownership of the new Service that is created. + // Executing RPCs with a Session or Operation corresponding to a particular + // Service after that Service has been closed or deleted is undefined. + static Status Connect(const std::string& host, int port, int conn_timeout, + ProtocolVersion protocol_version, + std::unique_ptr* service); + + ~Service(); + + // Closes the connection. Must be called before the service is deleted. May be + // safely called on an invalid or already closed service - will only return an + // error if the service is open but the close rpc fails. + Status Close(); + + // Returns true iff this service has an active connection to the HiveServer2 server. + bool IsConnected() const; + + // Set the send and receive timeout for Thrift RPCs in ms.
0 indicates no timeout, + // negative values are ignored. + void SetRecvTimeout(int timeout); + void SetSendTimeout(int timeout); + + // Opens a new HS2 session using this service. + // The client calling OpenSession has ownership of the Session that is created. + // Operations on the Session are undefined once it is closed. + Status OpenSession(const std::string& user, const HS2ClientConfig& config, + std::unique_ptr* session) const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Service); + + // Hides Thrift objects from the header. + struct ServiceImpl; + + Service(const std::string& host, int port, int conn_timeout, + ProtocolVersion protocol_version); + + // Opens the connection to the server. Called by Connect before new service is returned + // to the user. Must be called before OpenSession. + Status Open(); + + std::string host_; + int port_; + int conn_timeout_; + + std::unique_ptr impl_; + std::shared_ptr rpc_; +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/session.h b/r/R/inst/include/arrow/dbi/hiveserver2/session.h new file mode 100644 index 00000000000..4e223de6c17 --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/session.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/dbi/hiveserver2/operation.h" +#include "arrow/dbi/hiveserver2/service.h" + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +namespace hiveserver2 { + +struct ThriftRPC; + +// Manages a single HiveServer2 session - stores the session handle returned by +// the OpenSession RPC and uses it to create and return operations. +// +// Sessions are created with Service::OpenSession(). They must have Close +// called on them before they can be deleted. +// +// Executing RPCs with an Operation corresponding to a particular Session after +// that Session has been closed or deleted is undefined. +// +// This class is not thread-safe. +class ARROW_EXPORT Session { + public: + ~Session(); + + // Closes the session. Must be called before the session is deleted. May be safely + // called on an invalid or already closed session - will only return an error if the + // session is open but the close rpc fails. + Status Close(); + + Status ExecuteStatement(const std::string& statement, + std::unique_ptr* operation) const; + Status ExecuteStatement(const std::string& statement, + const HS2ClientConfig& conf_overlay, + std::unique_ptr* operation) const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Session); + + // Hides Thrift objects from the header. + struct SessionImpl; + + // For access to the c'tor. + friend class Service; + + explicit Session(const std::shared_ptr& rpc); + + // Performs the RPC that initiates the session and stores the returned handle. + // Must be called before operations can be executed. + Status Open(const HS2ClientConfig& config, const std::string& user); + + std::unique_ptr impl_; + std::shared_ptr rpc_; + + // True if Open has been called and Close has not. 
+ bool open_; +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h b/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h new file mode 100644 index 00000000000..aad535fc1f3 --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/dbi/hiveserver2/columnar-row-set.h" +#include "arrow/dbi/hiveserver2/operation.h" +#include "arrow/dbi/hiveserver2/service.h" +#include "arrow/dbi/hiveserver2/types.h" + +#include "arrow/dbi/hiveserver2/ImpalaHiveServer2Service.h" +#include "arrow/dbi/hiveserver2/TCLIService.h" + +namespace arrow { +namespace hiveserver2 { + +// PIMPL structs. 
+struct ColumnarRowSet::ColumnarRowSetImpl { + apache::hive::service::cli::thrift::TFetchResultsResp resp; +}; + +struct Operation::OperationImpl { + apache::hive::service::cli::thrift::TOperationHandle handle; + apache::hive::service::cli::thrift::TSessionHandle session_handle; +}; + +struct ThriftRPC { + std::unique_ptr client; +}; + +const std::string OperationStateToString(const Operation::State& state); + +const std::string TypeIdToString(const ColumnType::TypeId& type_id); + +// Functions for converting Thrift object to hs2client objects and vice-versa. +apache::hive::service::cli::thrift::TFetchOrientation::type +FetchOrientationToTFetchOrientation(FetchOrientation orientation); + +apache::hive::service::cli::thrift::TProtocolVersion::type +ProtocolVersionToTProtocolVersion(ProtocolVersion protocol); + +Operation::State TOperationStateToOperationState( + const apache::hive::service::cli::thrift::TOperationState::type& tstate); + +Status TStatusToStatus(const apache::hive::service::cli::thrift::TStatus& tstatus); + +// Converts a TTypeDesc to a ColumnType. Currently only primitive types are supported. +// The converted type is returned as a pointer to allow for polymorphism with ColumnType +// and its subclasses. 
+std::unique_ptr TTypeDescToColumnType( + const apache::hive::service::cli::thrift::TTypeDesc& ttype_desc); + +ColumnType::TypeId TTypeIdToTypeId( + const apache::hive::service::cli::thrift::TTypeId::type& type_id); + +} // namespace hiveserver2 +} // namespace arrow + +#define TRY_RPC_OR_RETURN(rpc) \ + do { \ + try { \ + (rpc); \ + } catch (apache::thrift::TException & tx) { \ + return Status::IOError(tx.what()); \ + } \ + } while (0) + +#define THRIFT_RETURN_NOT_OK(tstatus) \ + do { \ + if (tstatus.statusCode != hs2::TStatusCode::SUCCESS_STATUS && \ + tstatus.statusCode != hs2::TStatusCode::SUCCESS_WITH_INFO_STATUS) { \ + return TStatusToStatus(tstatus); \ + } \ + } while (0) diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/types.h b/r/R/inst/include/arrow/dbi/hiveserver2/types.h new file mode 100644 index 00000000000..38cebcc2eeb --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/types.h @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace arrow { +namespace hiveserver2 { + +// Represents a column's type. +// +// For now only PrimitiveType is implemented, as these are the only types Impala will +// currently return.
In the future, nested types will be represented as other subclasses +// of ColumnType containing ptrs to other ColumnTypes - for example, an ArrayType subclass +// would contain a single ptr to another ColumnType representing the type of objects +// stored in the array. +class ColumnType { + public: + virtual ~ColumnType() = default; + + // Maps directly to TTypeId in the HiveServer2 interface. + enum class TypeId { + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + FLOAT, + DOUBLE, + STRING, + TIMESTAMP, + BINARY, + ARRAY, + MAP, + STRUCT, + UNION, + USER_DEFINED, + DECIMAL, + NULL_TYPE, + DATE, + VARCHAR, + CHAR, + INVALID, + }; + + virtual TypeId type_id() const = 0; + virtual std::string ToString() const = 0; +}; + +class PrimitiveType : public ColumnType { + public: + explicit PrimitiveType(const TypeId& type_id) : type_id_(type_id) {} + + TypeId type_id() const override { return type_id_; } + std::string ToString() const override; + + private: + const TypeId type_id_; +}; + +// Represents CHAR and VARCHAR types. +class CharacterType : public PrimitiveType { + public: + CharacterType(const TypeId& type_id, int max_length) + : PrimitiveType(type_id), max_length_(max_length) {} + + int max_length() const { return max_length_; } + + private: + const int max_length_; +}; + +// Represents DECIMAL types. +class DecimalType : public PrimitiveType { + public: + DecimalType(const TypeId& type_id, int precision, int scale) + : PrimitiveType(type_id), precision_(precision), scale_(scale) {} + + int precision() const { return precision_; } + int scale() const { return scale_; } + + private: + const int precision_; + const int scale_; +}; + +// Represents the metadata for a single column. 
+class ColumnDesc { + public: + ColumnDesc(const std::string& column_name, std::unique_ptr type, + int position, const std::string& comment) + : column_name_(column_name), + type_(move(type)), + position_(position), + comment_(comment) {} + + const std::string& column_name() const { return column_name_; } + const ColumnType* type() const { return type_.get(); } + int position() const { return position_; } + const std::string& comment() const { return comment_; } + + const PrimitiveType* GetPrimitiveType() const; + const CharacterType* GetCharacterType() const; + const DecimalType* GetDecimalType() const; + + private: + const std::string column_name_; + std::unique_ptr type_; + const int position_; + const std::string comment_; +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/util.h b/r/R/inst/include/arrow/dbi/hiveserver2/util.h new file mode 100644 index 00000000000..a17e7b2286b --- /dev/null +++ b/r/R/inst/include/arrow/dbi/hiveserver2/util.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/dbi/hiveserver2/operation.h" + +namespace arrow { +namespace hiveserver2 { + +// Utility functions. 
Intended primarily for testing purposes - clients should not +// rely on stability of the behavior or API of these functions. +class Util { + public: + // Fetches the operation's results and returns them in a nicely formatted string. + static void PrintResults(const Operation* op, std::ostream& out); +}; + +} // namespace hiveserver2 +} // namespace arrow diff --git a/r/R/inst/include/arrow/extension_type.h b/r/R/inst/include/arrow/extension_type.h new file mode 100644 index 00000000000..48bc1e9bff7 --- /dev/null +++ b/r/R/inst/include/arrow/extension_type.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// User-defined extension types. EXPERIMENTAL in 0.13.0 +/// \since 0.13.0 + +#pragma once + +#include +#include + +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief The base class for custom / user-defined types.
+class ARROW_EXPORT ExtensionType : public DataType { + public: + static constexpr Type::type type_id = Type::EXTENSION; + + /// \brief The type of array used to represent this extension type's data + std::shared_ptr storage_type() const { return storage_type_; } + + std::string ToString() const override; + std::string name() const override; + + /// \brief Unique name of extension type used to identify type for + /// serialization + /// \return the string name of the extension + virtual std::string extension_name() const = 0; + + /// \brief Determine if two instances of the same extension types are + /// equal. Invoked from ExtensionType::Equals + /// \param[in] other the type to compare this type with + /// \return bool true if type instances are equal + virtual bool ExtensionEquals(const ExtensionType& other) const = 0; + + /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance + /// \param[in] data the physical storage for the extension type + virtual std::shared_ptr MakeArray(std::shared_ptr data) const = 0; + + /// \brief Create an instance of the ExtensionType given the actual storage + /// type and the serialized representation + /// \param[in] storage_type the physical storage type of the extension + /// \param[in] serialized_data the serialized representation produced by + /// Serialize + /// \param[out] out the reconstructed extension type + /// \return Status + virtual Status Deserialize(std::shared_ptr storage_type, + const std::string& serialized_data, + std::shared_ptr* out) const = 0; + + /// \brief Create a serialized representation of the extension type's + /// metadata. 
The storage type will be handled automatically in IPC code + /// paths + /// \return the serialized representation + virtual std::string Serialize() const = 0; + + protected: + explicit ExtensionType(std::shared_ptr storage_type) + : DataType(Type::EXTENSION), storage_type_(storage_type) {} + + std::shared_ptr storage_type_; +}; + +/// \brief Base array class for user-defined extension types +class ARROW_EXPORT ExtensionArray : public Array { + public: + explicit ExtensionArray(const std::shared_ptr& data) { SetData(data); } + + /// \brief The physical storage for the extension array + std::shared_ptr storage() const { return storage_; } + + protected: + void SetData(const std::shared_ptr& data); + std::shared_ptr storage_; +}; + +/// \brief Register an extension type globally. The name returned by the type's +/// extension_name() method should be unique. This method is thread-safe +/// \param[in] type an instance of the extension type +/// \return Status +ARROW_EXPORT +Status RegisterExtensionType(std::shared_ptr type); + +/// \brief Delete an extension type from the global registry. This method is +/// thread-safe +/// \param[in] type_name the unique name of a registered extension type +/// \return Status error if the type name is unknown +ARROW_EXPORT +Status UnregisterExtensionType(const std::string& type_name); + +/// \brief Retrieve an extension type from the global registry. Returns nullptr +/// if not found. This method is thread-safe +/// \return the globally-registered extension type +ARROW_EXPORT +std::shared_ptr GetExtensionType(const std::string& type_name); + +} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/filesystem.h b/r/R/inst/include/arrow/filesystem/filesystem.h new file mode 100644 index 00000000000..9a3e5a0dd58 --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/filesystem.h @@ -0,0 +1,247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +// The Windows API defines macros from *File resolving to either +// *FileA or *FileW. Need to undo them. +#ifdef _WIN32 +#ifdef DeleteFile +#undef DeleteFile +#endif +#ifdef CopyFile +#undef CopyFile +#endif +#endif + +namespace arrow { + +namespace io { + +class InputStream; +class OutputStream; +class RandomAccessFile; + +} // namespace io + +namespace fs { + +// A system clock time point expressed as a 64-bit (or more) number of +// nanoseconds since the epoch. +using TimePoint = + std::chrono::time_point; + +/// \brief EXPERIMENTAL: FileSystem entry type +enum class ARROW_EXPORT FileType { + // Target does not exist + NonExistent, + // Target exists but its type is unknown (could be a special file such + // as a Unix socket or character device, or Windows NUL / CON / ...) 
+ Unknown, + // Target is a regular file + File, + // Target is a directory + Directory +}; + +ARROW_EXPORT std::string ToString(FileType); + +static const int64_t kNoSize = -1; +static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); + +/// \brief EXPERIMENTAL: FileSystem entry stats +struct ARROW_EXPORT FileStats { + FileStats() = default; + FileStats(FileStats&&) = default; + FileStats& operator=(FileStats&&) = default; + FileStats(const FileStats&) = default; + FileStats& operator=(const FileStats&) = default; + + // The file type. + FileType type() const { return type_; } + void set_type(FileType type) { type_ = type; } + + // The full file path in the filesystem. + std::string path() const { return path_; } + void set_path(const std::string& path) { path_ = path; } + + // The file base name (component after the last directory separator). + std::string base_name() const; + + // The size in bytes, if available. Only regular files are guaranteed + // to have a size. + int64_t size() const { return size_; } + void set_size(int64_t size) { size_ = size; } + + // The time of last modification, if available. + TimePoint mtime() const { return mtime_; } + void set_mtime(TimePoint mtime) { mtime_ = mtime; } + + protected: + FileType type_ = FileType::Unknown; + std::string path_; + int64_t size_ = kNoSize; + TimePoint mtime_ = kNoTime; +}; + +/// \brief EXPERIMENTAL: file selector +struct ARROW_EXPORT Selector { + // The directory in which to select files. + // If the path exists but doesn't point to a directory, this should be an error. + std::string base_dir; + // The behavior if `base_dir` doesn't exist in the filesystem. If false, + // an error is returned. If true, an empty selection is returned. + bool allow_non_existent = false; + // Whether to recurse into subdirectories. 
+ bool recursive = false; + + Selector() {} +}; + +/// \brief EXPERIMENTAL: abstract file system API +class ARROW_EXPORT FileSystem { + public: + virtual ~FileSystem(); + + /// Get statistics for the given target. + /// + /// Any symlink is automatically dereferenced, recursively. + /// A non-existing or unreachable file returns an Ok status and + /// has a FileType of value NonExistent. An error status indicates + /// a truly exceptional condition (low-level I/O error, etc.). + virtual Status GetTargetStats(const std::string& path, FileStats* out) = 0; + /// Same, for many targets at once. + virtual Status GetTargetStats(const std::vector& paths, + std::vector* out); + /// Same, according to a selector. + /// + /// The selector's base directory will not be part of the results, even if + /// it exists. + /// If it doesn't exist, see `Selector::allow_non_existent`. + virtual Status GetTargetStats(const Selector& select, std::vector* out) = 0; + + /// Create a directory and subdirectories. + /// + /// This function succeeds if the directory already exists. + virtual Status CreateDir(const std::string& path, bool recursive = true) = 0; + + /// Delete a directory and its contents, recursively. + virtual Status DeleteDir(const std::string& path) = 0; + + /// Delete a file. + virtual Status DeleteFile(const std::string& path) = 0; + /// Delete many files. + /// + /// The default implementation issues individual delete operations in sequence. + virtual Status DeleteFiles(const std::vector& paths); + + /// Move / rename a file or directory. + /// + /// If the destination exists: + /// - if it is a non-empty directory, an error is returned + /// - otherwise, if it has the same type as the source, it is replaced + /// - otherwise, behavior is unspecified (implementation-dependent). + virtual Status Move(const std::string& src, const std::string& dest) = 0; + + /// Copy a file. + /// + /// If the destination exists and is a directory, an error is returned. 
+ /// Otherwise, it is replaced. + virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; + + /// Open an input stream for sequential reading. + virtual Status OpenInputStream(const std::string& path, + std::shared_ptr* out) = 0; + + /// Open an input file for random access reading. + virtual Status OpenInputFile(const std::string& path, + std::shared_ptr* out) = 0; + + /// Open an output stream for sequential writing. + /// + /// If the target already exists, existing data is truncated. + virtual Status OpenOutputStream(const std::string& path, + std::shared_ptr* out) = 0; + + /// Open an output stream for appending. + /// + /// If the target doesn't exist, a new empty file is created. + virtual Status OpenAppendStream(const std::string& path, + std::shared_ptr* out) = 0; +}; + +/// \brief EXPERIMENTAL: a FileSystem implementation that delegates to another +/// implementation after prepending a fixed base path. +/// +/// This is useful to expose a logical view of a subtree of a filesystem, +/// for example a directory in a LocalFileSystem. +/// This makes no security guarantee. For example, symlinks may allow to +/// "escape" the subtree and access other parts of the underlying filesystem. 
+class ARROW_EXPORT SubTreeFileSystem : public FileSystem { + public: + explicit SubTreeFileSystem(const std::string& base_path, + std::shared_ptr base_fs); + ~SubTreeFileSystem() override; + + using FileSystem::GetTargetStats; + Status GetTargetStats(const std::string& path, FileStats* out) override; + Status GetTargetStats(const Selector& select, std::vector* out) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Status OpenInputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenInputFile(const std::string& path, + std::shared_ptr* out) override; + + Status OpenOutputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenAppendStream(const std::string& path, + std::shared_ptr* out) override; + + protected: + const std::string base_path_; + std::shared_ptr base_fs_; + + std::string PrependBase(const std::string& s) const; + Status PrependBaseNonEmpty(std::string* s) const; + Status StripBase(const std::string& s, std::string* out) const; + Status FixStats(FileStats* st) const; +}; + +} // namespace fs +} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/localfs.h b/r/R/inst/include/arrow/filesystem/localfs.h new file mode 100644 index 00000000000..c720ac2b93c --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/localfs.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace fs { + +/// \brief EXPERIMENTAL: a FileSystem implementation accessing files +/// on the local machine. +/// +/// Details such as symlinks are abstracted away (symlinks are always followed, +/// except when deleting an entry). +class ARROW_EXPORT LocalFileSystem : public FileSystem { + public: + LocalFileSystem(); + ~LocalFileSystem() override; + + using FileSystem::GetTargetStats; + Status GetTargetStats(const std::string& path, FileStats* out) override; + Status GetTargetStats(const Selector& select, std::vector* out) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Status OpenInputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenInputFile(const std::string& path, + std::shared_ptr* out) override; + + Status OpenOutputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenAppendStream(const std::string& path, + std::shared_ptr* out) override; +}; + +} // namespace fs +} // namespace arrow diff --git 
a/r/R/inst/include/arrow/filesystem/mockfs.h b/r/R/inst/include/arrow/filesystem/mockfs.h new file mode 100644 index 00000000000..ba7b57636d3 --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/mockfs.h @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace fs { +namespace internal { + +struct DirInfo { + std::string full_path; + TimePoint mtime; + + bool operator==(const DirInfo& other) const { + return mtime == other.mtime && full_path == other.full_path; + } + + friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const DirInfo&); +}; + +struct FileInfo { + std::string full_path; + TimePoint mtime; + std::string data; + + bool operator==(const FileInfo& other) const { + return mtime == other.mtime && full_path == other.full_path && data == other.data; + } + + friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const FileInfo&); +}; + +/// A mock FileSystem implementation that holds its contents in memory. +/// +/// Useful for validating the FileSystem API, writing conformance suite, +/// and bootstrapping FileSystem-based APIs. 
+class ARROW_EXPORT MockFileSystem : public FileSystem { + public: + explicit MockFileSystem(TimePoint current_time); + ~MockFileSystem() override; + + // XXX It's not very practical to have to explicitly declare inheritance + // of default overrides. + using FileSystem::GetTargetStats; + Status GetTargetStats(const std::string& path, FileStats* out) override; + Status GetTargetStats(const Selector& select, std::vector* out) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Status OpenInputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenInputFile(const std::string& path, + std::shared_ptr* out) override; + + Status OpenOutputStream(const std::string& path, + std::shared_ptr* out) override; + + Status OpenAppendStream(const std::string& path, + std::shared_ptr* out) override; + + // Contents-dumping helpers to ease testing. + // Output is lexicographically-ordered by full path. + std::vector AllDirs(); + std::vector AllFiles(); + + class Impl; + + protected: + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/path-util.h b/r/R/inst/include/arrow/filesystem/path-util.h new file mode 100644 index 00000000000..444451d32ab --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/path-util.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/status.h" + +namespace arrow { +namespace fs { +namespace internal { + +constexpr char kSep = '/'; + +// Computations on abstract paths (not local paths with system-dependent behaviour). +// Abstract paths are typically used in URIs. + +// Split an abstract path into its individual components. +ARROW_EXPORT +std::vector SplitAbstractPath(const std::string& s); + +// Return the parent directory and basename of an abstract path. Both values may be +// empty. +ARROW_EXPORT +std::pair GetAbstractPathParent(const std::string& s); + +// Validate the components of an abstract path. +ARROW_EXPORT +Status ValidateAbstractPathParts(const std::vector& parts); + +// Append a non-empty stem to an abstract path. +ARROW_EXPORT +std::string ConcatAbstractPath(const std::string& base, const std::string& stem); + +ARROW_EXPORT +std::string EnsureTrailingSlash(const std::string& s); + +// Join the components of an abstract path. 
+template +std::string JoinAbstractPath(StringIt it, StringIt end) { + std::string path; + for (; it != end; ++it) { + if (!path.empty()) { + path += kSep; + } + path += *it; + } + return path; +} + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/test-util.h b/r/R/inst/include/arrow/filesystem/test-util.h new file mode 100644 index 00000000000..179b08cf7e7 --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/test-util.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace fs { + +static constexpr double kTimeSlack = 2.0; // In seconds + +ARROW_EXPORT +void AssertFileStats(const FileStats& st, const std::string& path, FileType type); + +ARROW_EXPORT +void AssertFileStats(const FileStats& st, const std::string& path, FileType type, + TimePoint mtime); + +ARROW_EXPORT +void AssertFileStats(const FileStats& st, const std::string& path, FileType type, + TimePoint mtime, int64_t size); + +ARROW_EXPORT +void AssertFileStats(const FileStats& st, const std::string& path, FileType type, + int64_t size); + +ARROW_EXPORT +void CreateFile(FileSystem* fs, const std::string& path, const std::string& data); + +// Sort a vector of FileStats by lexicographic path order +ARROW_EXPORT +void SortStats(std::vector* stats); + +template +void AssertDurationBetween(Duration d, double min_secs, double max_secs) { + auto seconds = std::chrono::duration_cast>(d); + ASSERT_GE(seconds.count(), min_secs); + ASSERT_LE(seconds.count(), max_secs); +} + +// Generic tests for FileSystem implementations. +// To use this class, subclass both from it and ::testing::Test, +// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS() +// to define the various tests.
+class ARROW_EXPORT GenericFileSystemTest { + public: + virtual ~GenericFileSystemTest(); + + void TestEmpty(); + void TestCreateDir(); + void TestDeleteDir(); + void TestDeleteFile(); + void TestDeleteFiles(); + void TestMoveFile(); + void TestMoveDir(); + void TestCopyFile(); + void TestGetTargetStatsSingle(); + void TestGetTargetStatsVector(); + void TestGetTargetStatsSelector(); + void TestOpenOutputStream(); + void TestOpenAppendStream(); + void TestOpenInputStream(); + void TestOpenInputFile(); + + protected: + virtual std::shared_ptr GetEmptyFileSystem() = 0; + + void TestEmpty(FileSystem* fs); + void TestCreateDir(FileSystem* fs); + void TestDeleteDir(FileSystem* fs); + void TestDeleteFile(FileSystem* fs); + void TestDeleteFiles(FileSystem* fs); + void TestMoveFile(FileSystem* fs); + void TestMoveDir(FileSystem* fs); + void TestCopyFile(FileSystem* fs); + void TestGetTargetStatsSingle(FileSystem* fs); + void TestGetTargetStatsVector(FileSystem* fs); + void TestGetTargetStatsSelector(FileSystem* fs); + void TestOpenOutputStream(FileSystem* fs); + void TestOpenAppendStream(FileSystem* fs); + void TestOpenInputStream(FileSystem* fs); + void TestOpenInputFile(FileSystem* fs); +}; + +#define GENERIC_FS_TEST_FUNCTION(TEST_CLASS, NAME) \ + TEST_F(TEST_CLASS, NAME) { Test##NAME(); } + +#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, Empty) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, CreateDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteFiles) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, MoveFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, MoveDir) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, CopyFile) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsSingle) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsVector) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsSelector) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, 
OpenOutputStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenAppendStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenInputStream) \ + GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenInputFile) + +} // namespace fs +} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/util-internal.h b/r/R/inst/include/arrow/filesystem/util-internal.h new file mode 100644 index 00000000000..eabdad4a6fa --- /dev/null +++ b/r/R/inst/include/arrow/filesystem/util-internal.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace fs { +namespace internal { + +ARROW_EXPORT +Status CopyStream(const std::shared_ptr& src, + const std::shared_ptr& dest, int64_t chunk_size); + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/api.h b/r/R/inst/include/arrow/flight/api.h new file mode 100644 index 00000000000..855ef7c3553 --- /dev/null +++ b/r/R/inst/include/arrow/flight/api.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/flight/client.h" +#include "arrow/flight/client_auth.h" +#include "arrow/flight/server.h" +#include "arrow/flight/server_auth.h" +#include "arrow/flight/types.h" diff --git a/r/R/inst/include/arrow/flight/client.h b/r/R/inst/include/arrow/flight/client.h new file mode 100644 index 00000000000..689c9f8c5b5 --- /dev/null +++ b/r/R/inst/include/arrow/flight/client.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// \brief Implementation of Flight RPC client using gRPC. 
API should be +// considered experimental for now + +#pragma once + +#include +#include +#include +#include + +#include "arrow/ipc/writer.h" +#include "arrow/status.h" + +#include "arrow/flight/types.h" // IWYU pragma: keep +#include "arrow/flight/visibility.h" + +namespace arrow { + +class MemoryPool; +class RecordBatch; +class RecordBatchReader; +class Schema; + +namespace flight { + +class ClientAuthHandler; + +/// \brief A duration type for Flight call timeouts. +typedef std::chrono::duration TimeoutDuration; + +/// \brief Hints to the underlying RPC layer for Arrow Flight calls. +class ARROW_FLIGHT_EXPORT FlightCallOptions { + public: + /// Create a default set of call options. + FlightCallOptions(); + + /// \brief An optional timeout for this call. Negative durations + /// mean an implementation-defined default behavior will be used + /// instead. This is the default value. + TimeoutDuration timeout; +}; + +class ARROW_FLIGHT_EXPORT FlightClientOptions { + public: + std::string tls_root_certs; +}; + +/// \brief Client class for Arrow Flight RPC services (gRPC-based). +/// API experimental for now +class ARROW_FLIGHT_EXPORT FlightClient { + public: + ~FlightClient(); + + /// \brief Connect to an unauthenticated flight service + /// \param[in] location the URI + /// \param[out] client the created FlightClient + /// \return Status OK status may not indicate that the connection was + /// successful + static Status Connect(const Location& location, std::unique_ptr* client); + + /// \brief Connect to an unauthenticated flight service + /// \param[in] location the URI + /// \param[in] options Other options for setting up the client + /// \param[out] client the created FlightClient + /// \return Status OK status may not indicate that the connection was + /// successful + static Status Connect(const Location& location, const FlightClientOptions& options, + std::unique_ptr* client); + + /// \brief Authenticate to the server using the given handler. 
+ /// \param[in] options Per-RPC options + /// \param[in] auth_handler The authentication mechanism to use + /// \return Status OK if the client authenticated successfully + Status Authenticate(const FlightCallOptions& options, + std::unique_ptr auth_handler); + + /// \brief Perform the indicated action, returning an iterator to the stream + /// of results, if any + /// \param[in] options Per-RPC options + /// \param[in] action the action to be performed + /// \param[out] results an iterator object for reading the returned results + /// \return Status + Status DoAction(const FlightCallOptions& options, const Action& action, + std::unique_ptr* results); + Status DoAction(const Action& action, std::unique_ptr* results) { + return DoAction({}, action, results); + } + + /// \brief Retrieve a list of available Action types + /// \param[in] options Per-RPC options + /// \param[out] actions the available actions + /// \return Status + Status ListActions(const FlightCallOptions& options, std::vector* actions); + Status ListActions(std::vector* actions) { + return ListActions({}, actions); + } + + /// \brief Request access plan for a single flight, which may be an existing + /// dataset or a command to be executed + /// \param[in] options Per-RPC options + /// \param[in] descriptor the dataset request, whether a named dataset or + /// command + /// \param[out] info the FlightInfo describing where to access the dataset + /// \return Status + Status GetFlightInfo(const FlightCallOptions& options, + const FlightDescriptor& descriptor, + std::unique_ptr* info); + Status GetFlightInfo(const FlightDescriptor& descriptor, + std::unique_ptr* info) { + return GetFlightInfo({}, descriptor, info); + } + + /// \brief List all available flights known to the server + /// \param[out] listing an iterator that returns a FlightInfo for each flight + /// \return Status + Status ListFlights(std::unique_ptr* listing); + + /// \brief List available flights given indicated filter criteria + /// 
\param[in] options Per-RPC options + /// \param[in] criteria the filter criteria (opaque) + /// \param[out] listing an iterator that returns a FlightInfo for each flight + /// \return Status + Status ListFlights(const FlightCallOptions& options, const Criteria& criteria, + std::unique_ptr* listing); + + /// \brief Given a flight ticket and schema, request to be sent the + /// stream. Returns record batch stream reader + /// \param[in] options Per-RPC options + /// \param[in] ticket The flight ticket to use + /// \param[out] stream the returned RecordBatchReader + /// \return Status + Status DoGet(const FlightCallOptions& options, const Ticket& ticket, + std::unique_ptr* stream); + Status DoGet(const Ticket& ticket, std::unique_ptr* stream) { + return DoGet({}, ticket, stream); + } + + /// \brief Upload data to a Flight described by the given + /// descriptor. The caller must call Close() on the returned stream + /// once they are done writing. + /// \param[in] options Per-RPC options + /// \param[in] descriptor the descriptor of the stream + /// \param[in] schema the schema for the data to upload + /// \param[out] stream a writer to write record batches to + /// \return Status + Status DoPut(const FlightCallOptions& options, const FlightDescriptor& descriptor, + const std::shared_ptr& schema, + std::unique_ptr* stream); + Status DoPut(const FlightDescriptor& descriptor, const std::shared_ptr& schema, + std::unique_ptr* stream) { + return DoPut({}, descriptor, schema, stream); + } + + private: + FlightClient(); + class FlightClientImpl; + std::unique_ptr impl_; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/client_auth.h b/r/R/inst/include/arrow/flight/client_auth.h new file mode 100644 index 00000000000..9dad36aa094 --- /dev/null +++ b/r/R/inst/include/arrow/flight/client_auth.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/flight/visibility.h" +#include "arrow/status.h" + +namespace arrow { + +namespace flight { + +/// \brief A reader for messages from the server during an +/// authentication handshake. +class ARROW_FLIGHT_EXPORT ClientAuthReader { + public: + virtual ~ClientAuthReader() = default; + virtual Status Read(std::string* response) = 0; +}; + +/// \brief A writer for messages to the server during an +/// authentication handshake. +class ARROW_FLIGHT_EXPORT ClientAuthSender { + public: + virtual ~ClientAuthSender() = default; + virtual Status Write(const std::string& token) = 0; +}; + +/// \brief An authentication implementation for a Flight service. +/// Authentication includes both an initial negotiation and a per-call +/// token validation. Implementations may choose to use either or both +/// mechanisms. +class ARROW_FLIGHT_EXPORT ClientAuthHandler { + public: + virtual ~ClientAuthHandler() = default; + /// \brief Authenticate the client on initial connection. The client + /// can send messages to/read responses from the server at any time. + /// \return Status OK if authenticated successfully + virtual Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) = 0; + /// \brief Get a per-call token. 
+ /// \param[out] token The token to send to the server. + virtual Status GetToken(std::string* token) = 0; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/customize_protobuf.h b/r/R/inst/include/arrow/flight/customize_protobuf.h new file mode 100644 index 00000000000..f27ab0b6878 --- /dev/null +++ b/r/R/inst/include/arrow/flight/customize_protobuf.h @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/flight/platform.h" +#include "arrow/util/config.h" + +// Silence protobuf warnings +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4244) +#endif + +#ifdef GRPCPP_PP_INCLUDE +#include +#else +#include +#endif + +// It is necessary to undefine this macro so that the protobuf +// SerializationTraits specialization is not declared in proto_utils.h.
We've +// copied that specialization below and modified it to exclude +// protocol::FlightData from the default implementation so we can specialize +// for our faster serialization-deserialization path +#undef GRPC_OPEN_SOURCE_PROTO + +#ifdef GRPCPP_PP_INCLUDE +#include +#else +#include +#endif + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +namespace grpc { + +class ByteBuffer; + +} // namespace grpc + +namespace arrow { +namespace flight { + +struct FlightPayload; + +namespace internal { + +struct FlightData; + +// Those two functions are defined in serialization-internal.cc + +// Write FlightData to a grpc::ByteBuffer without extra copying +grpc::Status FlightDataSerialize(const FlightPayload& msg, grpc::ByteBuffer* out, + bool* own_buffer); + +// Read internal::FlightData from grpc::ByteBuffer containing FlightData +// protobuf without copying +grpc::Status FlightDataDeserialize(grpc::ByteBuffer* buffer, FlightData* out); + +} // namespace internal + +namespace protocol { + +class FlightData; + +} // namespace protocol +} // namespace flight +} // namespace arrow + +namespace grpc { + +// This class provides a protobuf serializer. It translates between protobuf +// objects and grpc_byte_buffers. More information about SerializationTraits can +// be found in include/grpcpp/impl/codegen/serialization_traits.h. +template +class SerializationTraits< + T, typename std::enable_if< + std::is_base_of::value && + !std::is_same::value>::type> { + public: + static Status Serialize(const grpc::protobuf::Message& msg, ByteBuffer* bb, + bool* own_buffer) { + return GenericSerialize(msg, bb, own_buffer); + } + + static Status Deserialize(ByteBuffer* buffer, grpc::protobuf::Message* msg) { + return GenericDeserialize(buffer, msg); + } +}; + +template +class SerializationTraits::value>::type> { + public: + // In the functions below, we cast back the Message argument to its real + // type (see ReadPayload() and WritePayload() for the initial cast). 
+ static Status Serialize(const grpc::protobuf::Message& msg, ByteBuffer* bb, + bool* own_buffer) { + return arrow::flight::internal::FlightDataSerialize( + *reinterpret_cast(&msg), bb, own_buffer); + } + + static Status Deserialize(ByteBuffer* buffer, grpc::protobuf::Message* msg) { + return arrow::flight::internal::FlightDataDeserialize( + buffer, reinterpret_cast(msg)); + } +}; + +} // namespace grpc diff --git a/r/R/inst/include/arrow/flight/internal.h b/r/R/inst/include/arrow/flight/internal.h new file mode 100644 index 00000000000..784e8ebae1c --- /dev/null +++ b/r/R/inst/include/arrow/flight/internal.h @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/flight/protocol-internal.h" // IWYU pragma: keep +#include "arrow/flight/types.h" +#include "arrow/util/macros.h" + +namespace grpc { + +class Status; + +} // namespace grpc + +namespace arrow { + +class Schema; +class Status; + +namespace pb = arrow::flight::protocol; + +namespace ipc { + +class Message; + +} // namespace ipc + +namespace flight { + +#define GRPC_RETURN_NOT_OK(expr) \ + do { \ + ::arrow::Status _s = (expr); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + return ::arrow::flight::internal::ToGrpcStatus(_s); \ + } \ + } while (0) + +#define GRPC_RETURN_NOT_GRPC_OK(expr) \ + do { \ + ::grpc::Status _s = (expr); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + return _s; \ + } \ + } while (0) + +namespace internal { + +static const char* AUTH_HEADER = "auth-token-bin"; + +ARROW_FLIGHT_EXPORT +Status SchemaToString(const Schema& schema, std::string* out); + +ARROW_FLIGHT_EXPORT +Status FromGrpcStatus(const grpc::Status& grpc_status); + +ARROW_FLIGHT_EXPORT +grpc::Status ToGrpcStatus(const Status& arrow_status); + +// These functions depend on protobuf types which are not exported in the Flight DLL. 
+ +Status FromProto(const pb::ActionType& pb_type, ActionType* type); +Status FromProto(const pb::Action& pb_action, Action* action); +Status FromProto(const pb::Result& pb_result, Result* result); +Status FromProto(const pb::Criteria& pb_criteria, Criteria* criteria); +Status FromProto(const pb::Location& pb_location, Location* location); +Status FromProto(const pb::Ticket& pb_ticket, Ticket* ticket); +Status FromProto(const pb::FlightData& pb_data, FlightDescriptor* descriptor, + std::unique_ptr* message); +Status FromProto(const pb::FlightDescriptor& pb_descr, FlightDescriptor* descr); +Status FromProto(const pb::FlightEndpoint& pb_endpoint, FlightEndpoint* endpoint); +Status FromProto(const pb::FlightInfo& pb_info, FlightInfo::Data* info); + +Status ToProto(const FlightDescriptor& descr, pb::FlightDescriptor* pb_descr); +Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info); +Status ToProto(const ActionType& type, pb::ActionType* pb_type); +Status ToProto(const Action& action, pb::Action* pb_action); +Status ToProto(const Result& result, pb::Result* pb_result); +void ToProto(const Ticket& ticket, pb::Ticket* pb_ticket); + +} // namespace internal +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/platform.h b/r/R/inst/include/arrow/flight/platform.h new file mode 100644 index 00000000000..7f1b0954d84 --- /dev/null +++ b/r/R/inst/include/arrow/flight/platform.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Internal header. Platform-specific definitions for gRPC. + +#pragma once + +#ifdef _MSC_VER + +// The protobuf documentation says that C4251 warnings when using the +// library are spurious and suppressed when building the library and +// compiler, but must also be suppressed in downstream projects +#pragma warning(disable : 4251) + +#endif // _MSC_VER + +#include "arrow/util/config.h" // IWYU pragma: keep +#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep diff --git a/r/R/inst/include/arrow/flight/protocol-internal.h b/r/R/inst/include/arrow/flight/protocol-internal.h new file mode 100644 index 00000000000..98bf9238809 --- /dev/null +++ b/r/R/inst/include/arrow/flight/protocol-internal.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations + +#pragma once + +// This addresses platform-specific defines, e.g. on Windows +#include "arrow/flight/platform.h" // IWYU pragma: keep + +// This header holds the Flight protobuf definitions. + +// Need to include this first to get our gRPC customizations +#include "arrow/flight/customize_protobuf.h" // IWYU pragma: export + +#include "arrow/flight/Flight.grpc.pb.h" // IWYU pragma: export +#include "arrow/flight/Flight.pb.h" // IWYU pragma: export diff --git a/r/R/inst/include/arrow/flight/serialization-internal.h b/r/R/inst/include/arrow/flight/serialization-internal.h new file mode 100644 index 00000000000..aa47af6ae35 --- /dev/null +++ b/r/R/inst/include/arrow/flight/serialization-internal.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// (De)serialization utilities that hook into gRPC, efficiently +// handling Arrow-encoded data in a gRPC call. 
+ +#pragma once + +#include + +#include "arrow/flight/internal.h" +#include "arrow/flight/types.h" +#include "arrow/ipc/message.h" +#include "arrow/status.h" + +namespace arrow { + +class Buffer; + +namespace flight { +namespace internal { + +/// Internal, not user-visible type used for memory-efficient reads from gRPC +/// stream +struct FlightData { + /// Used only for puts, may be null + std::unique_ptr descriptor; + + /// Non-length-prefixed Message header as described in format/Message.fbs + std::shared_ptr metadata; + + /// Message body + std::shared_ptr body; + + /// Open IPC message from the metadata and body + Status OpenMessage(std::unique_ptr* message); +}; + +/// Write Flight message on gRPC stream with zero-copy optimizations. +/// True is returned on success, false if some error occurred (connection closed?). +bool WritePayload(const FlightPayload& payload, + grpc::ClientWriter* writer); +bool WritePayload(const FlightPayload& payload, + grpc::ServerWriter* writer); + +/// Read Flight message from gRPC stream with zero-copy optimizations. +/// True is returned on success, false if stream ended. +bool ReadPayload(grpc::ClientReader* reader, FlightData* data); +bool ReadPayload(grpc::ServerReader* reader, FlightData* data); + +} // namespace internal +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/server.h b/r/R/inst/include/arrow/flight/server.h new file mode 100644 index 00000000000..7164b64c4ab --- /dev/null +++ b/r/R/inst/include/arrow/flight/server.h @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Interfaces to use for defining Flight RPC servers. API should be considered +// experimental for now + +#pragma once + +#include +#include +#include + +#include "arrow/flight/server_auth.h" +#include "arrow/flight/types.h" // IWYU pragma: keep +#include "arrow/flight/visibility.h" // IWYU pragma: keep +#include "arrow/ipc/dictionary.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" + +namespace arrow { + +class MemoryPool; +class Schema; +class Status; + +namespace flight { + +/// \brief Interface that produces a sequence of IPC payloads to be sent in +/// FlightData protobuf messages +class ARROW_FLIGHT_EXPORT FlightDataStream { + public: + virtual ~FlightDataStream(); + + virtual std::shared_ptr schema() = 0; + + /// \brief Compute FlightPayload containing serialized RecordBatch schema + virtual Status GetSchemaPayload(FlightPayload* payload) = 0; + + // When the stream is completed, the last payload written will have null + // metadata + virtual Status Next(FlightPayload* payload) = 0; +}; + +/// \brief A basic implementation of FlightDataStream that will provide +/// a sequence of FlightData messages to be written to a gRPC stream +class ARROW_FLIGHT_EXPORT RecordBatchStream : public FlightDataStream { + public: + /// \param[in] reader produces a sequence of record batches + /// \param[in,out] pool a MemoryPool to use for allocations + explicit RecordBatchStream(const std::shared_ptr& reader, + MemoryPool* pool = default_memory_pool()); + ~RecordBatchStream() override; + + std::shared_ptr schema() override; + Status 
GetSchemaPayload(FlightPayload* payload) override; + Status Next(FlightPayload* payload) override; + + private: + class RecordBatchStreamImpl; + std::unique_ptr impl_; +}; + +// Silence warning +// "non dll-interface class RecordBatchReader used as base for dll-interface class" +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4275) +#endif + +/// \brief A reader for IPC payloads uploaded by a client +class ARROW_FLIGHT_EXPORT FlightMessageReader : public RecordBatchReader { + public: + /// \brief Get the descriptor for this upload. + virtual const FlightDescriptor& descriptor() const = 0; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/// \brief Call state/contextual data. +class ARROW_FLIGHT_EXPORT ServerCallContext { + public: + virtual ~ServerCallContext() = default; + /// \brief The name of the authenticated peer (may be the empty string) + virtual const std::string& peer_identity() const = 0; +}; + +class ARROW_FLIGHT_EXPORT FlightServerOptions { + public: + explicit FlightServerOptions(const Location& location_); + + Location location; + std::unique_ptr auth_handler; + std::string tls_cert_chain; + std::string tls_private_key; +}; + +/// \brief Skeleton RPC server implementation which can be used to create +/// custom servers by implementing its abstract methods +class ARROW_FLIGHT_EXPORT FlightServerBase { + public: + FlightServerBase(); + virtual ~FlightServerBase(); + + // Lifecycle methods. + + /// \brief Initialize a Flight server listening at the given location. + /// This method must be called before any other method. + /// \param[in] options The configuration for this server. + Status Init(FlightServerOptions& options); + + /// \brief Set the server to stop when receiving any of the given signal + /// numbers. + /// This method must be called before Serve(). + Status SetShutdownOnSignals(const std::vector sigs); + + /// \brief Start serving. 
+ /// This method blocks until either Shutdown() is called or one of the signals + /// registered in SetShutdownOnSignals() is received. + Status Serve(); + + /// \brief Query whether Serve() was interrupted by a signal. + /// This method must be called after Serve() has returned. + /// + /// \return int the signal number that interrupted Serve(), if any, otherwise 0 + int GotSignal() const; + + /// \brief Shut down the server. Can be called from signal handler or another + /// thread while Serve() blocks. + /// + /// TODO(wesm): Shutdown with deadline + void Shutdown(); + + // Implement these methods to create your own server. The default + // implementations will return a not-implemented result to the client + + /// \brief Retrieve a list of available fields given an optional opaque + /// criteria + /// \param[in] context The call context. + /// \param[in] criteria may be null + /// \param[out] listings the returned listings iterator + /// \return Status + virtual Status ListFlights(const ServerCallContext& context, const Criteria* criteria, + std::unique_ptr* listings); + + /// \brief Retrieve the schema and an access plan for the indicated + /// descriptor + /// \param[in] context The call context. + /// \param[in] request may be null + /// \param[out] info the returned flight info provider + /// \return Status + virtual Status GetFlightInfo(const ServerCallContext& context, + const FlightDescriptor& request, + std::unique_ptr* info); + + /// \brief Get a stream of IPC payloads to put on the wire + /// \param[in] context The call context. + /// \param[in] request an opaque ticket + /// \param[out] stream the returned stream provider + /// \return Status + virtual Status DoGet(const ServerCallContext& context, const Ticket& request, + std::unique_ptr* stream); + + /// \brief Process a stream of IPC payloads sent from a client + /// \param[in] context The call context. 
+ /// \param[in] reader a sequence of uploaded record batches + /// \return Status + virtual Status DoPut(const ServerCallContext& context, + std::unique_ptr reader); + + /// \brief Execute an action, return stream of zero or more results + /// \param[in] context The call context. + /// \param[in] action the action to execute, with type and body + /// \param[out] result the result iterator + /// \return Status + virtual Status DoAction(const ServerCallContext& context, const Action& action, + std::unique_ptr* result); + + /// \brief Retrieve the list of available actions + /// \param[in] context The call context. + /// \param[out] actions a vector of available action types + /// \return Status + virtual Status ListActions(const ServerCallContext& context, + std::vector* actions); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/server_auth.h b/r/R/inst/include/arrow/flight/server_auth.h new file mode 100644 index 00000000000..b1ccb096d7b --- /dev/null +++ b/r/R/inst/include/arrow/flight/server_auth.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +/// \brief Server-side APIs to implement authentication for Flight. + +#pragma once + +#include + +#include "arrow/flight/visibility.h" +#include "arrow/status.h" + +namespace arrow { + +namespace flight { + +/// \brief A reader for messages from the client during an +/// authentication handshake. +class ARROW_FLIGHT_EXPORT ServerAuthReader { + public: + virtual ~ServerAuthReader() = default; + virtual Status Read(std::string* token) = 0; +}; + +/// \brief A writer for messages to the client during an +/// authentication handshake. +class ARROW_FLIGHT_EXPORT ServerAuthSender { + public: + virtual ~ServerAuthSender() = default; + virtual Status Write(const std::string& message) = 0; +}; + +/// \brief An authentication implementation for a Flight service. +/// Authentication includes both an initial negotiation and a per-call +/// token validation. Implementations may choose to use either or both +/// mechanisms. +/// An implementation may need to track some state, e.g. a mapping of +/// client tokens to authenticated identities. +class ARROW_FLIGHT_EXPORT ServerAuthHandler { + public: + virtual ~ServerAuthHandler(); + /// \brief Authenticate the client on initial connection. The server + /// can send and read responses from the client at any time. + virtual Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) = 0; + /// \brief Validate a per-call client token. + /// \param[in] token The client token. May be the empty string if + /// the client does not provide a token. + /// \param[out] peer_identity The identity of the peer, if this + /// authentication method supports it. + /// \return Status OK if the token is valid, any other status if + /// validation failed + virtual Status IsValid(const std::string& token, std::string* peer_identity) = 0; +}; + +/// \brief An authentication mechanism that does nothing. 
+class ARROW_FLIGHT_EXPORT NoOpAuthHandler : public ServerAuthHandler { + public: + ~NoOpAuthHandler() override; + Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) override; + Status IsValid(const std::string& token, std::string* peer_identity) override; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/test-util.h b/r/R/inst/include/arrow/flight/test-util.h new file mode 100644 index 00000000000..2e1f4b0ed15 --- /dev/null +++ b/r/R/inst/include/arrow/flight/test-util.h @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" + +#include "arrow/flight/client_auth.h" +#include "arrow/flight/server_auth.h" +#include "arrow/flight/types.h" +#include "arrow/flight/visibility.h" + +namespace boost { +namespace process { + +class child; + +} // namespace process +} // namespace boost + +namespace arrow { +namespace flight { + +// ---------------------------------------------------------------------- +// Fixture to use for running test servers + +// Get a TCP port number to listen on. 
This is a different number every time, +// as reusing the same port across tests can produce spurious "Stream removed" +// errors on Windows. +ARROW_FLIGHT_EXPORT +int GetListenPort(); + +class ARROW_FLIGHT_EXPORT TestServer { + public: + explicit TestServer(const std::string& executable_name) + : executable_name_(executable_name), port_(GetListenPort()) {} + explicit TestServer(const std::string& executable_name, int port) + : executable_name_(executable_name), port_(port) {} + + void Start(); + + int Stop(); + + bool IsRunning(); + + int port() const; + + private: + std::string executable_name_; + int port_; + std::shared_ptr<::boost::process::child> server_process_; +}; + +class ARROW_FLIGHT_EXPORT InProcessTestServer { + public: + explicit InProcessTestServer(std::unique_ptr server, + const Location& location) + : server_(std::move(server)), location_(location), thread_() {} + ~InProcessTestServer(); + Status Start(); + void Stop(); + const Location& location() const; + + private: + std::unique_ptr server_; + Location location_; + std::thread thread_; +}; + +// ---------------------------------------------------------------------- +// A RecordBatchReader for serving a sequence of in-memory record batches + +// Silence warning +// "non dll-interface class RecordBatchReader used as base for dll-interface class" +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4275) +#endif + +class ARROW_FLIGHT_EXPORT BatchIterator : public RecordBatchReader { + public: + BatchIterator(const std::shared_ptr& schema, + const std::vector>& batches) + : schema_(schema), batches_(batches), position_(0) {} + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* out) override { + if (position_ >= batches_.size()) { + *out = nullptr; + } else { + *out = batches_[position_++]; + } + return Status::OK(); + } + + private: + std::shared_ptr schema_; + std::vector> batches_; + size_t position_; +}; + +#ifdef _MSC_VER +#pragma
warning(pop) +#endif + +// ---------------------------------------------------------------------- +// Example data for test-server and unit tests + +using BatchVector = std::vector>; + +ARROW_FLIGHT_EXPORT +std::shared_ptr ExampleIntSchema(); + +ARROW_FLIGHT_EXPORT +std::shared_ptr ExampleStringSchema(); + +ARROW_FLIGHT_EXPORT +std::shared_ptr ExampleDictSchema(); + +ARROW_FLIGHT_EXPORT +Status ExampleIntBatches(BatchVector* out); + +ARROW_FLIGHT_EXPORT +Status ExampleDictBatches(BatchVector* out); + +ARROW_FLIGHT_EXPORT +std::vector ExampleFlightInfo(); + +ARROW_FLIGHT_EXPORT +std::vector ExampleActionTypes(); + +ARROW_FLIGHT_EXPORT +Status MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor, + const std::vector& endpoints, int64_t total_records, + int64_t total_bytes, FlightInfo::Data* out); + +// ---------------------------------------------------------------------- +// A pair of authentication handlers that check for a predefined password +// and set the peer identity to a predefined username. 
+ +class ARROW_FLIGHT_EXPORT TestServerAuthHandler : public ServerAuthHandler { + public: + explicit TestServerAuthHandler(const std::string& username, + const std::string& password); + ~TestServerAuthHandler() override; + Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) override; + Status IsValid(const std::string& token, std::string* peer_identity) override; + + private: + std::string username_; + std::string password_; +}; + +class ARROW_FLIGHT_EXPORT TestClientAuthHandler : public ClientAuthHandler { + public: + explicit TestClientAuthHandler(const std::string& username, + const std::string& password); + ~TestClientAuthHandler() override; + Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; + Status GetToken(std::string* token) override; + + private: + std::string username_; + std::string password_; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/types.h b/r/R/inst/include/arrow/flight/types.h new file mode 100644 index 00000000000..8d372252636 --- /dev/null +++ b/r/R/inst/include/arrow/flight/types.h @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Data structure for Flight RPC. 
API should be considered experimental for now + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/flight/visibility.h" +#include "arrow/ipc/writer.h" + +namespace arrow { + +class Buffer; +class Schema; +class Status; + +namespace ipc { + +class DictionaryMemo; + +} // namespace ipc + +namespace internal { + +class Uri; + +} // namespace internal + +namespace flight { + +/// \brief A type of action that can be performed with the DoAction RPC +struct ARROW_FLIGHT_EXPORT ActionType { + /// Name of action + std::string type; + + /// Opaque action description + std::string description; +}; + +/// \brief Opaque selection criteria for ListFlights RPC +struct ARROW_FLIGHT_EXPORT Criteria { + /// Opaque criteria expression, dependent on server implementation + std::string expression; +}; + +/// \brief An action to perform with the DoAction RPC +struct ARROW_FLIGHT_EXPORT Action { + /// The action type + std::string type; + + /// The action content as a Buffer + std::shared_ptr body; +}; + +/// \brief Opaque result returned after executing an action +struct ARROW_FLIGHT_EXPORT Result { + std::shared_ptr body; +}; + +/// \brief A message received after completing a DoPut stream +struct ARROW_FLIGHT_EXPORT PutResult {}; + +/// \brief A request to retrieve or generate a dataset +struct ARROW_FLIGHT_EXPORT FlightDescriptor { + enum DescriptorType { + UNKNOWN = 0, /// Unused + PATH = 1, /// Named path identifying a dataset + CMD = 2 /// Opaque command to generate a dataset + }; + + /// The descriptor type + DescriptorType type; + + /// Opaque value used to express a command. Should only be defined when type + /// is CMD + std::string cmd; + + /// List of strings identifying a particular dataset.
Should only be defined + /// when type is PATH + std::vector path; + + bool Equals(const FlightDescriptor& other) const; + + std::string ToString() const; + + // Convenience factory functions + + static FlightDescriptor Command(const std::string& c) { + return FlightDescriptor{CMD, c, {}}; + } + + static FlightDescriptor Path(const std::vector& p) { + return FlightDescriptor{PATH, "", p}; + } +}; + +/// \brief Data structure providing an opaque identifier or credential to use +/// when requesting a data stream with the DoGet RPC +struct ARROW_FLIGHT_EXPORT Ticket { + std::string ticket; +}; + +class FlightClient; +class FlightServerBase; + +static const char* kSchemeGrpc = "grpc"; +static const char* kSchemeGrpcTcp = "grpc+tcp"; +static const char* kSchemeGrpcUnix = "grpc+unix"; +static const char* kSchemeGrpcTls = "grpc+tls"; + +/// \brief A host location (a URI) +struct ARROW_FLIGHT_EXPORT Location { + public: + /// \brief Initialize a blank location. + Location(); + + /// \brief Initialize a location by parsing a URI string + static Status Parse(const std::string& uri_string, Location* location); + + /// \brief Initialize a location for a non-TLS, gRPC-based Flight + /// service from a host and port + /// \param[in] host The hostname to connect to + /// \param[in] port The port + /// \param[out] location The resulting location + static Status ForGrpcTcp(const std::string& host, const int port, Location* location); + + /// \brief Initialize a location for a domain socket-based Flight + /// service + /// \param[in] path The path to the domain socket + /// \param[out] location The resulting location + static Status ForGrpcUnix(const std::string& path, Location* location); + + /// \brief Get a representation of this URI as a string. + std::string ToString() const; + + /// \brief Get the scheme of this URI. 
+ std::string scheme() const; + + bool Equals(const Location& other) const; + + friend bool operator==(const Location& left, const Location& right) { + return left.Equals(right); + } + friend bool operator!=(const Location& left, const Location& right) { + return !(left == right); + } + + private: + friend class FlightClient; + friend class FlightServerBase; + std::shared_ptr uri_; +}; + +/// \brief A flight ticket and list of locations where the ticket can be +/// redeemed +struct ARROW_FLIGHT_EXPORT FlightEndpoint { + /// Opaque ticket identifier; use with DoGet RPC + Ticket ticket; + + /// List of locations where ticket can be redeemed. If the list is empty, the + /// ticket can only be redeemed on the current service where the ticket was + /// generated + std::vector locations; +}; + +/// \brief Staging data structure for messages about to be put on the wire +/// +/// This structure corresponds to FlightData in the protocol. +struct ARROW_FLIGHT_EXPORT FlightPayload { + std::shared_ptr descriptor; + ipc::internal::IpcPayload ipc_message; +}; + +/// \brief The access coordinates for retrieval of a dataset, returned by +/// GetFlightInfo +class ARROW_FLIGHT_EXPORT FlightInfo { + public: + struct Data { + std::string schema; + FlightDescriptor descriptor; + std::vector endpoints; + int64_t total_records; + int64_t total_bytes; + }; + + explicit FlightInfo(const Data& data) : data_(data), reconstructed_schema_(false) {} + explicit FlightInfo(Data&& data) + : data_(std::move(data)), reconstructed_schema_(false) {} + + /// \brief Deserialize the Arrow schema of the dataset, to be passed + /// to each call to DoGet.
Populate any dictionary encoded fields + /// into a DictionaryMemo for bookkeeping + /// \param[in,out] dictionary_memo for dictionary bookkeeping, will + /// be modified + /// \param[out] out the reconstructed Schema + Status GetSchema(ipc::DictionaryMemo* dictionary_memo, + std::shared_ptr* out) const; + + const std::string& serialized_schema() const { return data_.schema; } + + /// The descriptor associated with this flight, may not be set + const FlightDescriptor& descriptor() const { return data_.descriptor; } + + /// A list of endpoints associated with the flight (dataset). To consume the + /// whole flight, all endpoints must be consumed + const std::vector& endpoints() const { return data_.endpoints; } + + /// The total number of records (rows) in the dataset. If unknown, set to -1 + int64_t total_records() const { return data_.total_records; } + + /// The total number of bytes in the dataset. If unknown, set to -1 + int64_t total_bytes() const { return data_.total_bytes; } + + private: + Data data_; + mutable std::shared_ptr schema_; + mutable bool reconstructed_schema_; +}; + +/// \brief An iterator to FlightInfo instances returned by ListFlights +class ARROW_FLIGHT_EXPORT FlightListing { + public: + virtual ~FlightListing() = default; + + /// \brief Retrieve the next FlightInfo from the iterator. Returns nullptr + /// when there are none left + /// \param[out] info a single FlightInfo + /// \return Status + virtual Status Next(std::unique_ptr* info) = 0; +}; + +/// \brief An iterator to Result instances returned by DoAction +class ARROW_FLIGHT_EXPORT ResultStream { + public: + virtual ~ResultStream() = default; + + /// \brief Retrieve the next Result from the iterator. Returns nullptr + /// when there are none left + /// \param[out] info a single Result + /// \return Status + virtual Status Next(std::unique_ptr* info) = 0; +}; + +// \brief Create a FlightListing from a vector of FlightInfo objects. 
This can +// be iterated once, then it is consumed +class ARROW_FLIGHT_EXPORT SimpleFlightListing : public FlightListing { + public: + explicit SimpleFlightListing(const std::vector& flights); + explicit SimpleFlightListing(std::vector&& flights); + + Status Next(std::unique_ptr* info) override; + + private: + int position_; + std::vector flights_; +}; + +class ARROW_FLIGHT_EXPORT SimpleResultStream : public ResultStream { + public: + explicit SimpleResultStream(std::vector&& results); + Status Next(std::unique_ptr* result) override; + + private: + std::vector results_; + size_t position_; +}; + +} // namespace flight +} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/visibility.h b/r/R/inst/include/arrow/flight/visibility.h new file mode 100644 index 00000000000..bdee8b751d8 --- /dev/null +++ b/r/R/inst/include/arrow/flight/visibility.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_FLIGHT_STATIC +#define ARROW_FLIGHT_EXPORT +#elif defined(ARROW_FLIGHT_EXPORTING) +#define ARROW_FLIGHT_EXPORT __declspec(dllexport) +#else +#define ARROW_FLIGHT_EXPORT __declspec(dllimport) +#endif + +#define ARROW_FLIGHT_NO_EXPORT +#else // Not Windows +#ifndef ARROW_FLIGHT_EXPORT +#define ARROW_FLIGHT_EXPORT __attribute__((visibility("default"))) +#endif +#ifndef ARROW_FLIGHT_NO_EXPORT +#define ARROW_FLIGHT_NO_EXPORT __attribute__((visibility("hidden"))) +#endif +#endif // Non-Windows + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/r/R/inst/include/arrow/gpu/cuda_api.h b/r/R/inst/include/arrow/gpu/cuda_api.h new file mode 100644 index 00000000000..c63b77e8721 --- /dev/null +++ b/r/R/inst/include/arrow/gpu/cuda_api.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_GPU_CUDA_API_H +#define ARROW_GPU_CUDA_API_H + +#include "arrow/gpu/cuda_arrow_ipc.h" +#include "arrow/gpu/cuda_context.h" +#include "arrow/gpu/cuda_memory.h" +#include "arrow/gpu/cuda_version.h" + +#endif // ARROW_GPU_CUDA_API_H diff --git a/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h b/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h new file mode 100644 index 00000000000..4eb85e797c7 --- /dev/null +++ b/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_GPU_CUDA_ARROW_IPC_H +#define ARROW_GPU_CUDA_ARROW_IPC_H + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +#include "arrow/gpu/cuda_memory.h" + +namespace arrow { + +class MemoryPool; +class RecordBatch; +class Schema; + +namespace ipc { + +class Message; + +} // namespace ipc + +namespace cuda { + +/// \brief Write record batch message to GPU device memory +/// \param[in] batch record batch to write +/// \param[in] ctx CudaContext to allocate device memory from +/// \param[out] out the returned device buffer which contains the record batch message +/// \return Status +ARROW_EXPORT +Status SerializeRecordBatch(const RecordBatch& batch, CudaContext* ctx, + std::shared_ptr* out); + +/// \brief Read Arrow IPC message located on GPU device +/// \param[in] reader a CudaBufferReader +/// \param[in] pool a MemoryPool to allocate CPU memory for the metadata +/// \param[out] message the deserialized message, body still on device +/// +/// This function reads the message metadata into host memory, but leaves the +/// message body on the device +ARROW_EXPORT +Status ReadMessage(CudaBufferReader* reader, MemoryPool* pool, + std::unique_ptr* message); + +/// \brief ReadRecordBatch specialized to handle metadata on CUDA device +/// \param[in] schema the Schema for the record batch +/// \param[in] buffer a CudaBuffer containing the complete IPC message +/// \param[in] pool a MemoryPool to use for allocating space for the metadata +/// \param[out] out the reconstructed RecordBatch, with device pointers +ARROW_EXPORT +Status ReadRecordBatch(const std::shared_ptr& schema, + const std::shared_ptr& buffer, MemoryPool* pool, + std::shared_ptr* out); + +} // namespace cuda +} // namespace arrow + +#endif // ARROW_GPU_CUDA_ARROW_IPC_H diff --git a/r/R/inst/include/arrow/gpu/cuda_common.h b/r/R/inst/include/arrow/gpu/cuda_common.h new file mode 100644 index 00000000000..87371ce20ad --- /dev/null +++ 
b/r/R/inst/include/arrow/gpu/cuda_common.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Non-public header + +#ifndef ARROW_GPU_CUDA_COMMON_H +#define ARROW_GPU_CUDA_COMMON_H + +#include + +namespace arrow { +namespace cuda { + +#define CU_RETURN_NOT_OK(STMT) \ + do { \ + CUresult ret = (STMT); \ + if (ret != CUDA_SUCCESS) { \ + return Status::IOError("Cuda Driver API call in ", __FILE__, " at line ", \ + __LINE__, " failed with code ", ret, ": ", #STMT); \ + } \ + } while (0) + +} // namespace cuda +} // namespace arrow + +#endif // ARROW_GPU_CUDA_COMMON_H diff --git a/r/R/inst/include/arrow/gpu/cuda_context.h b/r/R/inst/include/arrow/gpu/cuda_context.h new file mode 100644 index 00000000000..99c3fc2ba42 --- /dev/null +++ b/r/R/inst/include/arrow/gpu/cuda_context.h @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_GPU_CUDA_CONTEXT_H +#define ARROW_GPU_CUDA_CONTEXT_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +#include "arrow/gpu/cuda_memory.h" + +namespace arrow { +namespace cuda { + +// Forward declaration +class CudaContext; + +class ARROW_EXPORT CudaDeviceManager { + public: + static Status GetInstance(CudaDeviceManager** manager); + + /// \brief Get the CUDA driver context for a particular device + /// \param[in] device_number the CUDA device + /// \param[out] out cached context + Status GetContext(int device_number, std::shared_ptr* out); + + /// \brief Get the shared CUDA driver context for a particular device + /// \param[in] device_number the CUDA device + /// \param[in] handle CUDA context handler created by another library + /// \param[out] out shared context + Status GetSharedContext(int device_number, void* handle, + std::shared_ptr* out); + + /// \brief Allocate host memory with fast access to given GPU device + /// \param[in] device_number the CUDA device + /// \param[in] nbytes number of bytes + /// \param[out] out the allocated buffer + Status AllocateHost(int device_number, int64_t nbytes, + std::shared_ptr* out); + + Status FreeHost(void* data, int64_t nbytes); + + int num_devices() const; + + private: + CudaDeviceManager(); + static std::unique_ptr instance_; + + class CudaDeviceManagerImpl; + std::unique_ptr impl_; + + friend CudaContext; +}; + +struct ARROW_EXPORT CudaDeviceInfo {}; + +/// \class CudaContext +/// \brief Friendlier interface to the CUDA driver API +class 
ARROW_EXPORT CudaContext : public std::enable_shared_from_this { + public: + ~CudaContext(); + + Status Close(); + + /// \brief Allocate CUDA memory on GPU device for this context + /// \param[in] nbytes number of bytes + /// \param[out] out the allocated buffer + /// \return Status + Status Allocate(int64_t nbytes, std::shared_ptr* out); + + /// \brief Create a view of CUDA memory on GPU device of this context + /// \param[in] data the starting device address + /// \param[in] nbytes number of bytes + /// \param[out] out the view buffer + /// \return Status + /// + /// \note The caller is responsible for allocating and freeing the + /// memory as well as ensuring that the memory belongs to the CUDA + /// context that this CudaContext instance holds. + Status View(uint8_t* data, int64_t nbytes, std::shared_ptr* out); + + /// \brief Open existing CUDA IPC memory handle + /// \param[in] ipc_handle opaque pointer to CUipcMemHandle (driver API) + /// \param[out] out a CudaBuffer referencing the IPC segment + /// \return Status + Status OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, + std::shared_ptr* out); + + /// \brief Close memory mapped with IPC buffer + /// \param[in] buffer a CudaBuffer referencing the mapped IPC memory + /// \return Status + Status CloseIpcBuffer(CudaBuffer* buffer); + + /// \brief Block until all device tasks are completed. + Status Synchronize(void); + + int64_t bytes_allocated() const; + + /// \brief Expose CUDA context handle to other libraries + void* handle() const; + + /// \brief Return device number + int device_number() const; + + /// \brief Return the device address that is reachable from kernels + /// running in the context + /// \param[in] addr device or host memory address + /// \param[out] devaddr the device address + /// \return Status + /// + /// The device address is defined as a memory address accessible by + /// device.
While it is often a device memory address, it can be + /// also a host memory address, for instance, when the memory is + /// allocated as host memory (using cudaMallocHost or cudaHostAlloc) + /// or as managed memory (using cudaMallocManaged) or the host + /// memory is page-locked (using cudaHostRegister). + Status GetDeviceAddress(uint8_t* addr, uint8_t** devaddr); + + /// \brief Release CUDA memory on GPU device for this context + /// \param[in] device_ptr the buffer address + /// \param[in] nbytes number of bytes + /// \return Status + Status Free(void* device_ptr, int64_t nbytes); + + private: + CudaContext(); + + Status ExportIpcBuffer(void* data, int64_t size, + std::shared_ptr* handle); + Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes); + Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes); + Status CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes); + Status CopyDeviceToAnotherDevice(const std::shared_ptr& dst_ctx, void* dst, + const void* src, int64_t nbytes); + + class CudaContextImpl; + std::unique_ptr impl_; + + friend CudaBuffer; + friend CudaBufferReader; + friend CudaBufferWriter; + /// \cond FALSE + // (note: emits warning on Doxygen < 1.8.15) + friend CudaDeviceManager::CudaDeviceManagerImpl; + /// \endcond +}; + +} // namespace cuda +} // namespace arrow + +#endif // ARROW_GPU_CUDA_CONTEXT_H diff --git a/r/R/inst/include/arrow/gpu/cuda_memory.h b/r/R/inst/include/arrow/gpu/cuda_memory.h new file mode 100644 index 00000000000..6b9f04cc6de --- /dev/null +++ b/r/R/inst/include/arrow/gpu/cuda_memory.h @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_GPU_CUDA_MEMORY_H +#define ARROW_GPU_CUDA_MEMORY_H + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/memory.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" + +namespace arrow { +namespace cuda { + +class CudaContext; +class CudaIpcMemHandle; + +/// \class CudaBuffer +/// \brief An Arrow buffer located on a GPU device +/// +/// Be careful using this in any Arrow code which may not be GPU-aware +class ARROW_EXPORT CudaBuffer : public Buffer { + public: + CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr& context, + bool own_data = false, bool is_ipc = false); + + CudaBuffer(const std::shared_ptr& parent, const int64_t offset, + const int64_t size); + + ~CudaBuffer(); + + /// \brief Convert back generic buffer into CudaBuffer + /// \param[in] buffer buffer to convert + /// \param[out] out conversion result + /// \return Status + /// + /// \note This function returns an error if the buffer isn't backed + /// by GPU memory + static Status FromBuffer(std::shared_ptr buffer, + std::shared_ptr* out); + + /// \brief Copy memory from GPU device to CPU host + /// \param[in] position start position inside buffer to copy bytes from + /// \param[in] nbytes number of bytes to copy + /// \param[out] out start address of the host memory area to copy to + /// \return Status + Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const; + + /// \brief Copy memory to device at position + /// \param[in] position start position to copy bytes to + /// \param[in] data the host data to 
copy + /// \param[in] nbytes number of bytes to copy + /// \return Status + Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes); + + /// \brief Copy memory from device to device at position + /// \param[in] position start position inside buffer to copy bytes to + /// \param[in] data start address of the device memory area to copy from + /// \param[in] nbytes number of bytes to copy + /// \return Status + /// + /// \note It is assumed that both source and destination device + /// memories have been allocated within the same context. + Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes); + + /// \brief Copy memory from another device to device at position + /// \param[in] src_ctx context of the source device memory + /// \param[in] position start position inside buffer to copy bytes to + /// \param[in] data start address of the other device's memory area to copy from + /// \param[in] nbytes number of bytes to copy + /// \return Status + Status CopyFromAnotherDevice(const std::shared_ptr& src_ctx, + const int64_t position, const void* data, int64_t nbytes); + + /// \brief Expose this device buffer as IPC memory which can be used in other processes + /// \param[out] handle the exported IPC handle + /// \return Status + /// + /// \note After calling this function, this device memory will not be freed + /// when the CudaBuffer is destructed + virtual Status ExportForIpc(std::shared_ptr* handle); + + std::shared_ptr context() const { return context_; } + + protected: + std::shared_ptr context_; + bool own_data_; + bool is_ipc_; + + virtual Status Close(); +}; + +/// \class CudaHostBuffer +/// \brief Device-accessible CPU memory created using cudaHostAlloc +class ARROW_EXPORT CudaHostBuffer : public MutableBuffer { + public: + using MutableBuffer::MutableBuffer; + ~CudaHostBuffer(); +}; + +/// \class CudaIpcMemHandle +/// \brief A container for a CUDA IPC handle +class ARROW_EXPORT CudaIpcMemHandle { + public: +
~CudaIpcMemHandle(); + + /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process) + /// \param[in] opaque_handle a CUipcMemHandle as a const void* + /// \param[out] handle the CudaIpcMemHandle instance + /// \return Status + static Status FromBuffer(const void* opaque_handle, + std::shared_ptr* handle); + + /// \brief Write CudaIpcMemHandle to a Buffer + /// \param[in] pool a MemoryPool to allocate memory from + /// \param[out] out the serialized buffer + /// \return Status + Status Serialize(MemoryPool* pool, std::shared_ptr* out) const; + + private: + explicit CudaIpcMemHandle(const void* handle); + CudaIpcMemHandle(int64_t memory_size, const void* cu_handle); + + struct CudaIpcMemHandleImpl; + std::unique_ptr impl_; + + const void* handle() const; + int64_t memory_size() const; + + friend CudaBuffer; + friend CudaContext; +}; + +/// \class CudaBufferReader +/// \brief File interface for zero-copy read from CUDA buffers +/// +/// Note: Reads return pointers to device memory. 
This means you must be +/// careful using this interface with any Arrow code which may expect to be +/// able to do anything other than pointer arithmetic on the returned buffers +class ARROW_EXPORT CudaBufferReader : public io::BufferReader { + public: + explicit CudaBufferReader(const std::shared_ptr& buffer); + ~CudaBufferReader() override; + + /// \brief Read bytes into pre-allocated host memory + /// \param[in] nbytes number of bytes to read + /// \param[out] bytes_read actual number of bytes read + /// \param[out] buffer pre-allocated memory to write into + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; + + /// \brief Zero-copy read from device memory + /// \param[in] nbytes number of bytes to read + /// \param[out] out a Buffer referencing device memory + /// \return Status + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + private: + std::shared_ptr cuda_buffer_; + std::shared_ptr context_; +}; + +/// \class CudaBufferWriter +/// \brief File interface for writing to CUDA buffers, with optional buffering +class ARROW_EXPORT CudaBufferWriter : public io::WritableFile { + public: + explicit CudaBufferWriter(const std::shared_ptr& buffer); + ~CudaBufferWriter() override; + + /// \brief Close writer and flush buffered bytes to GPU + Status Close() override; + + bool closed() const override; + + /// \brief Flush buffered bytes to GPU + Status Flush() override; + + Status Seek(int64_t position) override; + + Status Write(const void* data, int64_t nbytes) override; + + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; + + Status Tell(int64_t* position) const override; + + /// \brief Set CPU buffer size to limit calls to cudaMemcpy + /// \param[in] buffer_size the size of CPU buffer to allocate + /// \return Status + /// + /// By default writes are unbuffered + Status SetBufferSize(const int64_t buffer_size); + + /// \brief Returns size of host (CPU) buffer, 0 for unbuffered + int64_t buffer_size() 
const; + + /// \brief Returns number of bytes buffered on host + int64_t num_bytes_buffered() const; + + private: + class CudaBufferWriterImpl; + std::unique_ptr impl_; +}; + +/// \brief Allocate CUDA-accessible memory on CPU host +/// \param[in] device_number device to expose host memory +/// \param[in] size number of bytes +/// \param[out] out the allocated buffer +/// \return Status +ARROW_EXPORT +Status AllocateCudaHostBuffer(int device_number, const int64_t size, + std::shared_ptr* out); + +} // namespace cuda +} // namespace arrow + +#endif // ARROW_GPU_CUDA_MEMORY_H diff --git a/r/R/inst/include/arrow/io/api.h b/r/R/inst/include/arrow/io/api.h new file mode 100644 index 00000000000..cf1be337fd1 --- /dev/null +++ b/r/R/inst/include/arrow/io/api.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_IO_API_H +#define ARROW_IO_API_H + +#include "arrow/io/buffered.h" +#include "arrow/io/compressed.h" +#include "arrow/io/file.h" +#include "arrow/io/hdfs.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" + +#endif // ARROW_IO_API_H diff --git a/r/R/inst/include/arrow/io/buffered.h b/r/R/inst/include/arrow/io/buffered.h new file mode 100644 index 00000000000..03ea1c7f757 --- /dev/null +++ b/r/R/inst/include/arrow/io/buffered.h @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Buffered stream implementations + +#ifndef ARROW_IO_BUFFERED_H +#define ARROW_IO_BUFFERED_H + +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class ARROW_EXPORT BufferedOutputStream : public OutputStream { + public: + ~BufferedOutputStream() override; + + /// \brief Create a buffered output stream wrapping the given output stream.
+ /// \param[in] buffer_size the size of the temporary write buffer + /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw the raw OutputStream to wrap + /// \param[out] out the created BufferedOutputStream + /// \return Status + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, + std::shared_ptr* out); + + /// \brief Resize internal buffer + /// \param[in] new_buffer_size the new buffer size + /// \return Status + Status SetBufferSize(int64_t new_buffer_size); + + /// \brief Return the current size of the internal buffer + int64_t buffer_size() const; + + /// \brief Flush any buffered writes and release the raw + /// OutputStream. Further operations on this object are invalid + /// \param[out] raw the underlying OutputStream + /// \return Status + Status Detach(std::shared_ptr* raw); + + // OutputStream interface + + /// \brief Close the buffered output stream. This implicitly closes the + /// underlying raw output stream. + Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + // Write bytes to the stream. Thread-safe + Status Write(const void* data, int64_t nbytes) override; + + Status Flush() override; + + /// \brief Return the underlying raw output stream.
+ std::shared_ptr raw() const; + + private: + explicit BufferedOutputStream(std::shared_ptr raw, MemoryPool* pool); + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +/// \class BufferedInputStream +/// \brief An InputStream that performs buffered reads from an unbuffered +/// InputStream, which can mitigate the overhead of many small reads in some +/// cases +class ARROW_EXPORT BufferedInputStream : public InputStream { + public: + ~BufferedInputStream() override; + + /// \brief Create a BufferedInputStream from a raw InputStream + /// \param[in] buffer_size the size of the temporary read buffer + /// \param[in] pool a MemoryPool to use for allocations + /// \param[in] raw a raw InputStream + /// \param[out] out the created BufferedInputStream + /// \param[in] raw_read_bound a bound on the maximum number of bytes + /// to read from the raw input stream. The default -1 indicates that + /// it is unbounded + static Status Create(int64_t buffer_size, MemoryPool* pool, + std::shared_ptr raw, + std::shared_ptr* out, + int64_t raw_read_bound = -1); + + /// \brief Resize internal read buffer; calls to Read(...) will read at least [sentence unfinished in the original - presumably "new_buffer_size bytes from the raw stream"; TODO confirm] + /// \param[in] new_buffer_size the new read buffer size + /// \return Status + Status SetBufferSize(int64_t new_buffer_size); + + /// \brief Return the number of remaining bytes in the read buffer + int64_t bytes_buffered() const; + + /// \brief Return the current size of the internal buffer + int64_t buffer_size() const; + + /// \brief Release the raw InputStream. Any data buffered will be + /// discarded. Further operations on this object are invalid + /// \return the underlying raw InputStream + std::shared_ptr Detach(); + + /// \brief Return the unbuffered InputStream + std::shared_ptr raw() const; + + // InputStream APIs + + /// \brief Return a zero-copy string view referencing buffered data, + /// but do not advance the position of the stream.
Buffers data and + /// expands the buffer size if necessary + Status Peek(int64_t nbytes, util::string_view* out) override; + + Status Close() override; + bool closed() const override; + + /// \brief Returns the position of the buffered stream, though the position + /// of the unbuffered stream may be further advanced + Status Tell(int64_t* position) const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + + /// \brief Read into buffer. If the read is already buffered, then this will + /// return a slice into the buffer + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + private: + explicit BufferedInputStream(std::shared_ptr raw, MemoryPool* pool, + int64_t raw_total_bytes_bound); // NOTE(review): presumably receives the raw_read_bound given to Create() - the names differ; confirm + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_BUFFERED_H diff --git a/r/R/inst/include/arrow/io/compressed.h b/r/R/inst/include/arrow/io/compressed.h new file mode 100644 index 00000000000..ffb18d929ab --- /dev/null +++ b/r/R/inst/include/arrow/io/compressed.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +// Compressed stream implementations + +#ifndef ARROW_IO_COMPRESSED_H +#define ARROW_IO_COMPRESSED_H + +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class Status; + +namespace util { + +class Codec; + +} // namespace util + +namespace io { + +class ARROW_EXPORT CompressedOutputStream : public OutputStream { + public: + ~CompressedOutputStream() override; + + /// \brief Create a compressed output stream wrapping the given output stream. + static Status Make(util::Codec* codec, const std::shared_ptr& raw, + std::shared_ptr* out); + static Status Make(MemoryPool* pool, util::Codec* codec, + const std::shared_ptr& raw, + std::shared_ptr* out); // overload of the above taking an explicit MemoryPool + + // OutputStream interface + + /// \brief Close the compressed output stream. This implicitly closes the + /// underlying raw output stream. + Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Write(const void* data, int64_t nbytes) override; + Status Flush() override; + + /// \brief Return the underlying raw output stream. + std::shared_ptr raw() const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedOutputStream); + + CompressedOutputStream() = default; // private constructor; public construction goes through the static Make() overloads + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +class ARROW_EXPORT CompressedInputStream : public InputStream { + public: + ~CompressedInputStream() override; + + /// \brief Create a compressed input stream wrapping the given input stream. + static Status Make(util::Codec* codec, const std::shared_ptr& raw, + std::shared_ptr* out); + static Status Make(MemoryPool* pool, util::Codec* codec, + const std::shared_ptr& raw, + std::shared_ptr* out); // overload of the above taking an explicit MemoryPool + + // InputStream interface + + /// \brief Close the compressed input stream. This implicitly closes the + /// underlying raw input stream.
+ Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + /// \brief Return the underlying raw input stream. + std::shared_ptr raw() const; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedInputStream); + + CompressedInputStream() = default; // private constructor; public construction goes through the static Make() overloads + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_COMPRESSED_H diff --git a/r/R/inst/include/arrow/io/file.h b/r/R/inst/include/arrow/io/file.h new file mode 100644 index 00000000000..e9ac13f4c6a --- /dev/null +++ b/r/R/inst/include/arrow/io/file.h @@ -0,0 +1,246 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +// IO interface implementations for OS files + +#ifndef ARROW_IO_FILE_H +#define ARROW_IO_FILE_H + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class ARROW_EXPORT FileOutputStream : public OutputStream { + public: + ~FileOutputStream() override; + + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[out] out a base interface OutputStream instance + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing data + static Status Open(const std::string& path, std::shared_ptr* out); + + /// \brief Open a local file for writing + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \param[out] out a base interface OutputStream instance + static Status Open(const std::string& path, bool append, + std::shared_ptr* out); + + /// \brief Open a file descriptor for writing. The underlying file isn't + /// truncated. + /// \param[in] fd file descriptor + /// \param[out] out a base interface OutputStream instance + /// + /// The file descriptor becomes owned by the OutputStream, and will be closed + /// on Close() or destruction.
+ static Status Open(int fd, std::shared_ptr* out); + + /// \brief Open a local file for writing, truncating any existing file + /// \param[in] path with UTF8 encoding + /// \param[out] file a FileOutputStream instance + /// + /// When opening a new file, any existing file with the indicated path is + /// truncated to 0 bytes, deleting any existing data + static Status Open(const std::string& path, std::shared_ptr* file); + + /// \brief Open a local file for writing + /// \param[in] path with UTF8 encoding + /// \param[in] append append to existing file, otherwise truncate to 0 bytes + /// \param[out] file a FileOutputStream instance + static Status Open(const std::string& path, bool append, + std::shared_ptr* file); + + /// \brief Open a file descriptor for writing. The underlying file isn't + /// truncated. + /// \param[in] fd file descriptor + /// \param[out] out a FileOutputStream instance + /// + /// The file descriptor becomes owned by the OutputStream, and will be closed + /// on Close() or destruction. + static Status Open(int fd, std::shared_ptr* out); + + // OutputStream interface + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + + // Write bytes to the stream.
Thread-safe + Status Write(const void* data, int64_t nbytes) override; + + using Writable::Write; + + int file_descriptor() const; + + private: + FileOutputStream(); + + class ARROW_NO_EXPORT FileOutputStreamImpl; + std::unique_ptr impl_; +}; + +// Operating system file +class ARROW_EXPORT ReadableFile : public RandomAccessFile { + public: + ~ReadableFile() override; + + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[out] file ReadableFile instance + /// Open file, allocate memory (if needed) from default memory pool + static Status Open(const std::string& path, std::shared_ptr* file); + + /// \brief Open a local file for reading + /// \param[in] path with UTF8 encoding + /// \param[in] pool a MemoryPool for memory allocations + /// \param[out] file ReadableFile instance + /// Open file with one's own memory pool for memory allocations + static Status Open(const std::string& path, MemoryPool* pool, + std::shared_ptr* file); + + /// \brief Open a file descriptor for reading + /// \param[in] fd file descriptor + /// \param[out] file ReadableFile instance + /// This overload takes no MemoryPool (NOTE(review): presumably allocates from the default memory pool, as Open(path, file) does - confirm) + /// + /// The file descriptor becomes owned by the ReadableFile, and will be closed + /// on Close() or destruction. + static Status Open(int fd, std::shared_ptr* file); + + /// \brief Open a file descriptor for reading + /// \param[in] fd file descriptor + /// \param[in] pool a MemoryPool for memory allocations + /// \param[out] file ReadableFile instance + /// Open file with one's own memory pool for memory allocations + /// + /// The file descriptor becomes owned by the ReadableFile, and will be closed + /// on Close() or destruction. + static Status Open(int fd, MemoryPool* pool, std::shared_ptr* file); + + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + + // Read bytes from the file.
Thread-safe + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + /// \brief Thread-safe implementation of ReadAt (raw-pointer output overload) + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) override; + + /// \brief Thread-safe implementation of ReadAt (std::shared_ptr output overload) + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status GetSize(int64_t* size) override; + Status Seek(int64_t position) override; + + int file_descriptor() const; + + private: + explicit ReadableFile(MemoryPool* pool); + + class ARROW_NO_EXPORT ReadableFileImpl; + std::unique_ptr impl_; +}; + +// A file interface that uses memory-mapped files for memory interactions, +// supporting zero copy reads. The same class is used for both reading and +// writing. +// +// If opening a file in a writable mode, it is not truncated first, unlike with +// FileOutputStream (whose path-only Open truncates) +class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { + public: + ~MemoryMappedFile() override; + + /// Create new file with indicated size, return in read/write mode + static Status Create(const std::string& path, int64_t size, + std::shared_ptr* out); + + static Status Open(const std::string& path, FileMode::type mode, + std::shared_ptr* out); + + Status Close() override; + + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Seek(int64_t position) override; + + // Required by RandomAccessFile, copies memory into out. Not thread-safe + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + + // Zero copy read, moves position pointer. Not thread-safe + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + // Zero-copy read, leaves position unchanged. Acquires a reader lock + // for the duration of slice creation (typically very short). Is thread-safe.
+ Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + // Raw copy of the memory at specified position. Thread-safe, but + // locks out other readers for the duration of memcpy. Prefer the + // zero copy method + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) override; + + bool supports_zero_copy() const override; + + /// Write data at the current position in the file. Thread-safe + Status Write(const void* data, int64_t nbytes) override; + + /// Set the size of the map to new_size. + Status Resize(int64_t new_size); + + /// Write data at a particular position in the file. Thread-safe + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; + + // Writes the size in bytes of the memory source to *size (const overload; the return value is a Status) + Status GetSize(int64_t* size) const; + + // Writes the size in bytes of the memory source to *size (override of the inherited GetSize) + Status GetSize(int64_t* size) override; + + int file_descriptor() const; + + private: + MemoryMappedFile(); + + Status WriteInternal(const void* data, int64_t nbytes); + + class ARROW_NO_EXPORT MemoryMap; + std::shared_ptr memory_map_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_FILE_H diff --git a/r/R/inst/include/arrow/io/hdfs-internal.h b/r/R/inst/include/arrow/io/hdfs-internal.h new file mode 100644 index 00000000000..3912f2f1144 --- /dev/null +++ b/r/R/inst/include/arrow/io/hdfs-internal.h @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IO_HDFS_INTERNAL +#define ARROW_IO_HDFS_INTERNAL + +#include +#include + +#include + +#include "arrow/util/visibility.h" +#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep + +using std::size_t; + +struct hdfsBuilder; + +namespace arrow { + +class Status; + +namespace io { +namespace internal { + +// Library handle plus a table of hdfs* function pointers, with a member function declared for each. NOTE(wesm): cpplint does not like use of short and other imprecise C types (hence the NOLINT markers below) +struct LibHdfsShim { +#ifndef _WIN32 + void* handle; +#else + HINSTANCE handle; +#endif + + hdfsBuilder* (*hdfsNewBuilder)(void); + void (*hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn); + void (*hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port); + void (*hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName); + void (*hdfsBuilderSetKerbTicketCachePath)(hdfsBuilder* bld, + const char* kerbTicketCachePath); + void (*hdfsBuilderSetForceNewInstance)(hdfsBuilder* bld); + hdfsFS (*hdfsBuilderConnect)(hdfsBuilder* bld); + int (*hdfsBuilderConfSetStr)(hdfsBuilder* bld, const char* key, const char* val); + + int (*hdfsDisconnect)(hdfsFS fs); + + hdfsFile (*hdfsOpenFile)(hdfsFS fs, const char* path, int flags, int bufferSize, + short replication, tSize blocksize); // NOLINT + + int (*hdfsCloseFile)(hdfsFS fs, hdfsFile file); + int (*hdfsExists)(hdfsFS fs, const char* path); + int (*hdfsSeek)(hdfsFS fs, hdfsFile file, tOffset desiredPos); + tOffset (*hdfsTell)(hdfsFS fs, hdfsFile file); + tSize (*hdfsRead)(hdfsFS fs, hdfsFile file, void* buffer, tSize length); + tSize (*hdfsPread)(hdfsFS fs, hdfsFile file, tOffset position,
void* buffer, + tSize length); + tSize (*hdfsWrite)(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); + int (*hdfsFlush)(hdfsFS fs, hdfsFile file); + int (*hdfsAvailable)(hdfsFS fs, hdfsFile file); + int (*hdfsCopy)(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + int (*hdfsMove)(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + int (*hdfsDelete)(hdfsFS fs, const char* path, int recursive); + int (*hdfsRename)(hdfsFS fs, const char* oldPath, const char* newPath); + char* (*hdfsGetWorkingDirectory)(hdfsFS fs, char* buffer, size_t bufferSize); + int (*hdfsSetWorkingDirectory)(hdfsFS fs, const char* path); + int (*hdfsCreateDirectory)(hdfsFS fs, const char* path); + int (*hdfsSetReplication)(hdfsFS fs, const char* path, int16_t replication); + hdfsFileInfo* (*hdfsListDirectory)(hdfsFS fs, const char* path, int* numEntries); + hdfsFileInfo* (*hdfsGetPathInfo)(hdfsFS fs, const char* path); + void (*hdfsFreeFileInfo)(hdfsFileInfo* hdfsFileInfo, int numEntries); + char*** (*hdfsGetHosts)(hdfsFS fs, const char* path, tOffset start, tOffset length); + void (*hdfsFreeHosts)(char*** blockHosts); + tOffset (*hdfsGetDefaultBlockSize)(hdfsFS fs); + tOffset (*hdfsGetCapacity)(hdfsFS fs); + tOffset (*hdfsGetUsed)(hdfsFS fs); + int (*hdfsChown)(hdfsFS fs, const char* path, const char* owner, const char* group); + int (*hdfsChmod)(hdfsFS fs, const char* path, short mode); // NOLINT + int (*hdfsUtime)(hdfsFS fs, const char* path, tTime mtime, tTime atime); + + void Initialize() { // reset the handle and every function pointer declared above to nullptr + this->handle = nullptr; + this->hdfsNewBuilder = nullptr; + this->hdfsBuilderSetNameNode = nullptr; + this->hdfsBuilderSetNameNodePort = nullptr; + this->hdfsBuilderSetUserName = nullptr; + this->hdfsBuilderSetKerbTicketCachePath = nullptr; + this->hdfsBuilderSetForceNewInstance = nullptr; + this->hdfsBuilderConfSetStr = nullptr; + this->hdfsBuilderConnect = nullptr; + this->hdfsDisconnect = nullptr; + this->hdfsOpenFile = nullptr; + this->hdfsCloseFile = nullptr;
+ this->hdfsExists = nullptr; + this->hdfsSeek = nullptr; + this->hdfsTell = nullptr; + this->hdfsRead = nullptr; + this->hdfsPread = nullptr; + this->hdfsWrite = nullptr; + this->hdfsFlush = nullptr; + this->hdfsAvailable = nullptr; + this->hdfsCopy = nullptr; + this->hdfsMove = nullptr; + this->hdfsDelete = nullptr; + this->hdfsRename = nullptr; + this->hdfsGetWorkingDirectory = nullptr; + this->hdfsSetWorkingDirectory = nullptr; + this->hdfsCreateDirectory = nullptr; + this->hdfsSetReplication = nullptr; + this->hdfsListDirectory = nullptr; + this->hdfsGetPathInfo = nullptr; + this->hdfsFreeFileInfo = nullptr; + this->hdfsGetHosts = nullptr; + this->hdfsFreeHosts = nullptr; + this->hdfsGetDefaultBlockSize = nullptr; + this->hdfsGetCapacity = nullptr; + this->hdfsGetUsed = nullptr; + this->hdfsChown = nullptr; + this->hdfsChmod = nullptr; + this->hdfsUtime = nullptr; + } + + hdfsBuilder* NewBuilder(void); + + void BuilderSetNameNode(hdfsBuilder* bld, const char* nn); + + void BuilderSetNameNodePort(hdfsBuilder* bld, tPort port); + + void BuilderSetUserName(hdfsBuilder* bld, const char* userName); + + void BuilderSetKerbTicketCachePath(hdfsBuilder* bld, const char* kerbTicketCachePath); + + void BuilderSetForceNewInstance(hdfsBuilder* bld); + + int BuilderConfSetStr(hdfsBuilder* bld, const char* key, const char* val); + + hdfsFS BuilderConnect(hdfsBuilder* bld); + + int Disconnect(hdfsFS fs); + + hdfsFile OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, + short replication, tSize blocksize); // NOLINT + + int CloseFile(hdfsFS fs, hdfsFile file); + + int Exists(hdfsFS fs, const char* path); + + int Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos); + + tOffset Tell(hdfsFS fs, hdfsFile file); + + tSize Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length); + + bool HasPread(); // NOTE(review): presumably reports whether the hdfsPread pointer was resolved - confirm + + tSize Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length); + + tSize Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); + +
int Flush(hdfsFS fs, hdfsFile file); + + int Available(hdfsFS fs, hdfsFile file); + + int Copy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + int Move(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); + + int Delete(hdfsFS fs, const char* path, int recursive); + + int Rename(hdfsFS fs, const char* oldPath, const char* newPath); + + char* GetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize); + + int SetWorkingDirectory(hdfsFS fs, const char* path); + + int MakeDirectory(hdfsFS fs, const char* path); + + int SetReplication(hdfsFS fs, const char* path, int16_t replication); + + hdfsFileInfo* ListDirectory(hdfsFS fs, const char* path, int* numEntries); + + hdfsFileInfo* GetPathInfo(hdfsFS fs, const char* path); + + void FreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries); + + char*** GetHosts(hdfsFS fs, const char* path, tOffset start, tOffset length); + + void FreeHosts(char*** blockHosts); + + tOffset GetDefaultBlockSize(hdfsFS fs); + tOffset GetCapacity(hdfsFS fs); + + tOffset GetUsed(hdfsFS fs); + + int Chown(hdfsFS fs, const char* path, const char* owner, const char* group); + + int Chmod(hdfsFS fs, const char* path, short mode); // NOLINT + + int Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime); + + Status GetRequiredSymbols(); // NOTE(review): presumably fills the function pointers above from the loaded library - confirm +}; + +// TODO(wesm): Remove these exports when we are linking statically (both functions hand back a LibHdfsShim through the driver out-parameter) +Status ARROW_EXPORT ConnectLibHdfs(LibHdfsShim** driver); +Status ARROW_EXPORT ConnectLibHdfs3(LibHdfsShim** driver); + +} // namespace internal +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_HDFS_INTERNAL diff --git a/r/R/inst/include/arrow/io/hdfs.h b/r/R/inst/include/arrow/io/hdfs.h new file mode 100644 index 00000000000..45a47ddedad --- /dev/null +++ b/r/R/inst/include/arrow/io/hdfs.h @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IO_HDFS +#define ARROW_IO_HDFS + +#include +#include +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class HdfsReadableFile; +class HdfsOutputStream; + +struct HdfsPathInfo { + ObjectType::type kind; + + std::string name; + std::string owner; + std::string group; + + // Times (last_modified_time, last_access_time below) are UNIX timestamps in seconds + int64_t size; + int64_t block_size; + + int32_t last_modified_time; + int32_t last_access_time; + + int16_t replication; + int16_t permissions; +}; + +enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 }; + +struct HdfsConnectionConfig { + std::string host; + int port; + std::string user; + std::string kerb_ticket; + std::unordered_map extra_conf; + HdfsDriver driver; +}; + +class ARROW_EXPORT HadoopFileSystem : public FileSystem { + public: + ~HadoopFileSystem() override; + + // Connect to an HDFS cluster given a configuration + // + // @param config (in): configuration for connecting + // @param fs (out): the created client + // @returns Status + static Status Connect(const HdfsConnectionConfig* config, + std::shared_ptr* fs); + + // Create directory and all parents + // +
// @param path (in): absolute HDFS path + // @returns Status + Status MakeDirectory(const std::string& path) override; + + // Delete file or directory + // @param path: absolute path to data + // @param recursive: if path is a directory, delete contents as well + // @returns error status on failure + Status Delete(const std::string& path, bool recursive = false); + + Status DeleteDirectory(const std::string& path) override; + + // Disconnect from cluster + // + // @returns Status + Status Disconnect(); + + // @param path (in): absolute HDFS path + // @returns bool, true if the path exists, false if not (or on error) + bool Exists(const std::string& path); + + // @param path (in): absolute HDFS path + // @param info (out) + // @returns Status + Status GetPathInfo(const std::string& path, HdfsPathInfo* info); + + // @param nbytes (out): total capacity of the filesystem + // @returns Status + Status GetCapacity(int64_t* nbytes); + + // @param nbytes (out): total bytes used on the filesystem + // @returns Status + Status GetUsed(int64_t* nbytes); + + Status GetChildren(const std::string& path, std::vector* listing) override; + + Status ListDirectory(const std::string& path, std::vector* listing); + + /// Change owner and/or group of a path + /// + /// @param path file path to change + /// @param owner pass null for no change + /// @param group pass null for no change + Status Chown(const std::string& path, const char* owner, const char* group); + + /// Change path permissions + /// + /// \param path Absolute path in file system + /// \param mode Mode bitset + /// \return Status + Status Chmod(const std::string& path, int mode); + + // Move file or directory from source path to destination path within the + // current filesystem + Status Rename(const std::string& src, const std::string& dst) override; + + Status Stat(const std::string& path, FileStatistics* stat) override; + + // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory + + // Open an HDFS file in READ mode.
Returns error + // status if the file is not found. + // + // @param path complete file path + Status OpenReadable(const std::string& path, int32_t buffer_size, + std::shared_ptr* file); + + Status OpenReadable(const std::string& path, std::shared_ptr* file); + + // Open an HDFS file for writing (FileMode::WRITE options) + // @param path complete file path + // @param buffer_size, 0 for default + // @param replication, 0 for default + // @param default_block_size, 0 for default + Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file); + + Status OpenWritable(const std::string& path, bool append, + std::shared_ptr* file); + + ARROW_DEPRECATED("Use OpenWritable") + Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file); + + ARROW_DEPRECATED("Use OpenWritable") + Status OpenWriteable(const std::string& path, bool append, + std::shared_ptr* file); + + private: + friend class HdfsReadableFile; + friend class HdfsOutputStream; + + class ARROW_NO_EXPORT HadoopFileSystemImpl; + std::unique_ptr impl_; + + HadoopFileSystem(); // private constructor; clients obtain an instance through the static Connect() + ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem); +}; + +class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { + public: + ~HdfsReadableFile() override; + + Status Close() override; + + bool closed() const override; + + Status GetSize(int64_t* size) override; + + // NOTE: If you wish to read a particular range of a file in a multithreaded + // context, you may prefer to use ReadAt to avoid locking issues + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; + + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* buffer) override; + + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status Seek(int64_t position) override; + Status
Tell(int64_t* position) const override; + + void set_memory_pool(MemoryPool* pool); + + private: + explicit HdfsReadableFile(MemoryPool* pool = NULLPTR); + + class ARROW_NO_EXPORT HdfsReadableFileImpl; + std::unique_ptr impl_; + + friend class HadoopFileSystem::HadoopFileSystemImpl; + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); +}; + +// Naming this file OutputStream because it does not support seeking (like the +// WritableFile interface) +class ARROW_EXPORT HdfsOutputStream : public OutputStream { + public: + ~HdfsOutputStream() override; + + Status Close() override; + + bool closed() const override; + + Status Write(const void* buffer, int64_t nbytes) override; + + Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written); + + Status Flush() override; + + Status Tell(int64_t* position) const override; + + private: + class ARROW_NO_EXPORT HdfsOutputStreamImpl; + std::unique_ptr impl_; + + friend class HadoopFileSystem::HadoopFileSystemImpl; + + HdfsOutputStream(); + + ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); +}; + +Status ARROW_EXPORT HaveLibHdfs(); +Status ARROW_EXPORT HaveLibHdfs3(); + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_HDFS diff --git a/r/R/inst/include/arrow/io/interfaces.h b/r/R/inst/include/arrow/io/interfaces.h new file mode 100644 index 00000000000..3a5cfe3d778 --- /dev/null +++ b/r/R/inst/include/arrow/io/interfaces.h @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IO_INTERFACES_H +#define ARROW_IO_INTERFACES_H + +#include +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class Status; + +namespace io { + +struct FileMode { + enum type { READ, WRITE, READWRITE }; +}; + +struct ObjectType { + enum type { FILE, DIRECTORY }; +}; + +struct ARROW_EXPORT FileStatistics { + /// Size of file, -1 if finding length is unsupported + int64_t size; + ObjectType::type kind; +}; + +class ARROW_EXPORT FileSystem { + public: + virtual ~FileSystem() = default; + + virtual Status MakeDirectory(const std::string& path) = 0; + + virtual Status DeleteDirectory(const std::string& path) = 0; + + virtual Status GetChildren(const std::string& path, + std::vector* listing) = 0; + + virtual Status Rename(const std::string& src, const std::string& dst) = 0; + + virtual Status Stat(const std::string& path, FileStatistics* stat) = 0; +}; + +class ARROW_EXPORT FileInterface { + public: + virtual ~FileInterface() = 0; + virtual Status Close() = 0; + virtual Status Tell(int64_t* position) const = 0; + virtual bool closed() const = 0; + + FileMode::type mode() const { return mode_; } + + protected: + FileInterface() : mode_(FileMode::READ) {} + FileMode::type mode_; + void set_mode(FileMode::type mode) { mode_ = mode; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(FileInterface); +}; + +class ARROW_EXPORT Seekable { + public: + virtual ~Seekable() = default; + virtual Status Seek(int64_t position) = 0; 
+}; + +class ARROW_EXPORT Writable { + public: + virtual ~Writable() = default; + + virtual Status Write(const void* data, int64_t nbytes) = 0; + + /// \brief Flush buffered bytes, if any + virtual Status Flush(); + + Status Write(const std::string& data); +}; + +class ARROW_EXPORT Readable { + public: + virtual ~Readable() = default; + + virtual Status Read(int64_t nbytes, int64_t* bytes_read, void* out) = 0; + + // Does not copy if not necessary + virtual Status Read(int64_t nbytes, std::shared_ptr* out) = 0; +}; + +class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable { + protected: + OutputStream() = default; +}; + +class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Readable { + public: + /// \brief Advance or skip stream indicated number of bytes + /// \param[in] nbytes the number to move forward + /// \return Status + Status Advance(int64_t nbytes); + + /// \brief Return zero-copy string_view to upcoming bytes in the + /// stream but do not modify stream position. View becomes invalid + /// after any operation on file. If the InputStream is unbuffered, + /// returns 0-length string_view. May trigger buffering if the + /// requested size is larger than the number of buffered bytes + /// \param[in] nbytes the maximum number of bytes to see + /// \param[out] out the returned arrow::util::string_view + /// \return Status + virtual Status Peek(int64_t nbytes, util::string_view* out); + + /// \brief Return true if InputStream is capable of zero copy Buffer reads + virtual bool supports_zero_copy() const; + + protected: + InputStream() = default; +}; + +class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { + public: + /// Necessary because we hold a std::unique_ptr + ~RandomAccessFile() override; + + virtual Status GetSize(int64_t* size) = 0; + + /// \brief Read nbytes at position, provide default implementations using + /// Read(...), but can be overridden. 
The default implementation is + /// thread-safe. It is unspecified whether this method updates the file + /// position or not. + /// + /// \param[in] position Where to read bytes from + /// \param[in] nbytes The number of bytes to read + /// \param[out] bytes_read The number of bytes read + /// \param[out] out The buffer to read bytes into + /// \return Status + virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out); + + /// \brief Read nbytes at position, provide default implementations using + /// Read(...), but can be overridden. The default implementation is + /// thread-safe. It is unspecified whether this method updates the file + /// position or not. + /// + /// \param[in] position Where to read bytes from + /// \param[in] nbytes The number of bytes to read + /// \param[out] out The buffer to read bytes into. The number of bytes read can be + /// retrieved by calling Buffer::size(). + virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out); + + protected: + RandomAccessFile(); + + private: + struct ARROW_NO_EXPORT RandomAccessFileImpl; + std::unique_ptr interface_impl_; +}; + +class ARROW_EXPORT WritableFile : public OutputStream, public Seekable { + public: + virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0; + + protected: + WritableFile() = default; +}; + +class ARROW_EXPORT ReadWriteFileInterface : public RandomAccessFile, public WritableFile { + protected: + ReadWriteFileInterface() { RandomAccessFile::set_mode(FileMode::READWRITE); } +}; + +// TODO(kszucs): remove this after 0.13 +#ifndef _MSC_VER +using WriteableFile ARROW_DEPRECATED("Use WritableFile") = WritableFile; +using ReadableFileInterface ARROW_DEPRECATED("Use RandomAccessFile") = RandomAccessFile; +#else +// MSVC does not like using ARROW_DEPRECATED with using declarations +using WriteableFile = WritableFile; +using ReadableFileInterface = RandomAccessFile; +#endif + +} // namespace io +} // namespace arrow + 
+#endif // ARROW_IO_INTERFACES_H diff --git a/r/R/inst/include/arrow/io/memory.h b/r/R/inst/include/arrow/io/memory.h new file mode 100644 index 00000000000..d820d46552c --- /dev/null +++ b/r/R/inst/include/arrow/io/memory.h @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Public API for different memory sharing / IO mechanisms + +#pragma once + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/interfaces.h" +#include "arrow/memory_pool.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class ResizableBuffer; +class Status; + +namespace io { + +// \brief An output stream that writes to a resizable buffer +class ARROW_EXPORT BufferOutputStream : public OutputStream { + public: + explicit BufferOutputStream(const std::shared_ptr& buffer); + + /// \brief Create in-memory output stream with indicated capacity using a + /// memory pool + /// \param[in] initial_capacity the initial allocated internal capacity of + /// the OutputStream + /// \param[in,out] pool a MemoryPool to use for allocations + /// \param[out] out the created stream + static Status Create(int64_t initial_capacity, MemoryPool* pool, + std::shared_ptr* out); + + ~BufferOutputStream() override; + + // Implement the OutputStream interface + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + Status Write(const void* data, int64_t nbytes) override; + + using OutputStream::Write; + + /// Close the stream and return the buffer + Status Finish(std::shared_ptr* result); + + /// \brief Initialize state of OutputStream with newly allocated memory and + /// set position to 0 + /// \param[in] initial_capacity the starting allocated capacity + /// \param[in,out] pool the memory pool to use for allocations + /// \return Status + Status Reset(int64_t initial_capacity = 1024, MemoryPool* pool = default_memory_pool()); + + int64_t capacity() const { return capacity_; } + + private: + BufferOutputStream(); + + // Ensures there is sufficient space available to write nbytes + Status Reserve(int64_t nbytes); + + std::shared_ptr buffer_; + bool is_open_; + int64_t capacity_; + int64_t position_; + uint8_t* mutable_data_; +}; + +// \brief A helper 
class to tracks the size of allocations +class ARROW_EXPORT MockOutputStream : public OutputStream { + public: + MockOutputStream() : extent_bytes_written_(0), is_open_(true) {} + + // Implement the OutputStream interface + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + Status Write(const void* data, int64_t nbytes) override; + + int64_t GetExtentBytesWritten() const { return extent_bytes_written_; } + + private: + int64_t extent_bytes_written_; + bool is_open_; +}; + +/// \brief Enables random writes into a fixed-size mutable buffer +class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile { + public: + /// Input buffer must be mutable, will abort if not + explicit FixedSizeBufferWriter(const std::shared_ptr& buffer); + ~FixedSizeBufferWriter() override; + + Status Close() override; + bool closed() const override; + Status Seek(int64_t position) override; + Status Tell(int64_t* position) const override; + Status Write(const void* data, int64_t nbytes) override; + Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; + + void set_memcopy_threads(int num_threads); + void set_memcopy_blocksize(int64_t blocksize); + void set_memcopy_threshold(int64_t threshold); + + protected: + class FixedSizeBufferWriterImpl; + std::unique_ptr impl_; +}; + +/// \class BufferReader +/// \brief Random access zero-copy reads on an arrow::Buffer +class ARROW_EXPORT BufferReader : public RandomAccessFile { + public: + explicit BufferReader(const std::shared_ptr& buffer); + explicit BufferReader(const Buffer& buffer); + BufferReader(const uint8_t* data, int64_t size); + + /// \brief Instantiate from std::string or arrow::util::string_view. 
Does not + /// own data + explicit BufferReader(const util::string_view& data); + + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; + // Zero copy read + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + Status Peek(int64_t nbytes, util::string_view* out) override; + + bool supports_zero_copy() const override; + + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status GetSize(int64_t* size) override; + Status Seek(int64_t position) override; + + std::shared_ptr buffer() const { return buffer_; } + + protected: + inline Status CheckClosed() const; + + std::shared_ptr buffer_; + const uint8_t* data_; + int64_t size_; + int64_t position_; + bool is_open_; +}; + +} // namespace io +} // namespace arrow diff --git a/r/R/inst/include/arrow/io/mman.h b/r/R/inst/include/arrow/io/mman.h new file mode 100644 index 00000000000..61254925609 --- /dev/null +++ b/r/R/inst/include/arrow/io/mman.h @@ -0,0 +1,181 @@ +// Copyright https://code.google.com/p/mman-win32/ +// +// Licensed under the MIT License; +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/MIT + +#ifndef _MMAN_WIN32_H +#define _MMAN_WIN32_H + +#include "arrow/util/windows_compatibility.h" + +#include +#include +#include + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void*)-1) + +/* Flags for msync. 
*/ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif + +static inline int __map_mman_error(const DWORD err, const int deferr) { + if (err == 0) return 0; + // TODO: implement + return err; +} + +static inline DWORD __map_mmap_prot_page(const int prot) { + DWORD protect = 0; + + if (prot == PROT_NONE) return protect; + + if ((prot & PROT_EXEC) != 0) { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } else { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static inline DWORD __map_mmap_prot_file(const int prot) { + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) return desiredAccess; + + if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes, + off_t off) { + HANDLE fm, h; + + void* map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4293) +#endif + + const DWORD dwFileOffsetLow = + (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = + (sizeof(off_t) <= sizeof(DWORD)) ? 
(DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const size_t maxSize = off + len; + + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Usupported protection combinations */ + || prot == PROT_EXEC) { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes) + : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +static inline int munmap(void* addr, size_t len) { + if (UnmapViewOfFile(addr)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int mprotect(void* addr, size_t len, int prot) { + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int msync(void* addr, size_t len, int flags) { + if (FlushViewOfFile(addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +static inline int mlock(const void* addr, size_t len) { + if (VirtualLock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + 
return -1; +} + +static inline int munlock(const void* addr, size_t len) { + if (VirtualUnlock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +#endif diff --git a/r/R/inst/include/arrow/io/readahead.h b/r/R/inst/include/arrow/io/readahead.h new file mode 100644 index 00000000000..950520ba597 --- /dev/null +++ b/r/R/inst/include/arrow/io/readahead.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IO_READAHEAD_H +#define ARROW_IO_READAHEAD_H + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class ResizableBuffer; +class Status; + +namespace io { + +class InputStream; + +namespace internal { + +struct ARROW_EXPORT ReadaheadBuffer { + std::shared_ptr buffer; + int64_t left_padding; + int64_t right_padding; +}; + +class ARROW_EXPORT ReadaheadSpooler { + public: + /// \brief EXPERIMENTAL: Create a readahead spooler wrapping the given input stream. + /// + /// The spooler launches a background thread that reads up to a given number + /// of fixed-size blocks in advance from the underlying stream. 
+ /// The buffers returned by Read() will be padded at the beginning and the end + /// with the configured amount of (zeroed) bytes. + ReadaheadSpooler(MemoryPool* pool, std::shared_ptr raw, + int64_t read_size = kDefaultReadSize, int32_t readahead_queue_size = 1, + int64_t left_padding = 0, int64_t right_padding = 0); + + explicit ReadaheadSpooler(std::shared_ptr raw, + int64_t read_size = kDefaultReadSize, + int32_t readahead_queue_size = 1, int64_t left_padding = 0, + int64_t right_padding = 0); + + ~ReadaheadSpooler(); + + /// Configure zero-padding at beginning and end of buffers (default 0 bytes). + /// The buffers returned by Read() will be padded at the beginning and the end + /// with the configured amount of (zeroed) bytes. + /// Note that, as reading happens in background and in advance, changing the + /// configured values might not affect Read() results immediately. + int64_t GetLeftPadding(); + void SetLeftPadding(int64_t size); + + int64_t GetRightPadding(); + void SetRightPadding(int64_t size); + + /// \brief Close the spooler. This implicitly closes the underlying input stream. + Status Close(); + + /// \brief Read a buffer from the queue. + /// + /// If the buffer pointer in the ReadaheadBuffer is null, then EOF was + /// reached and/or the spooler was explicitly closed. + /// Otherwise, the buffer will contain at most read_size bytes in addition + /// to the configured padding (short reads are possible at the end of a file). + // How do we allow reusing the buffer in ReadaheadBuffer? perhaps by using + // a caching memory pool? 
+ Status Read(ReadaheadBuffer* out); + + private: + static constexpr int64_t kDefaultReadSize = 1 << 20; // 1 MB + + class ARROW_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_READAHEAD_H diff --git a/r/R/inst/include/arrow/io/test-common.h b/r/R/inst/include/arrow/io/test-common.h new file mode 100644 index 00000000000..75e134732e3 --- /dev/null +++ b/r/R/inst/include/arrow/io/test-common.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_IO_TEST_COMMON_H +#define ARROW_IO_TEST_COMMON_H + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +class MemoryMappedFile; + +ARROW_EXPORT +void AssertFileContents(const std::string& path, const std::string& contents); + +ARROW_EXPORT bool FileExists(const std::string& path); + +ARROW_EXPORT bool FileIsClosed(int fd); + +ARROW_EXPORT +Status ZeroMemoryMap(MemoryMappedFile* file); + +class ARROW_EXPORT MemoryMapFixture { + public: + void TearDown(); + + void CreateFile(const std::string& path, int64_t size); + + Status InitMemoryMap(int64_t size, const std::string& path, + std::shared_ptr* mmap); + + void AppendFile(const std::string& path); + + private: + std::vector tmp_files_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/ipc/api.h b/r/R/inst/include/arrow/ipc/api.h new file mode 100644 index 00000000000..1895c313193 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/api.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_IPC_API_H +#define ARROW_IPC_API_H + +#include "arrow/ipc/dictionary.h" +#include "arrow/ipc/feather.h" +#include "arrow/ipc/json-simple.h" +#include "arrow/ipc/message.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" + +#endif // ARROW_IPC_API_H diff --git a/r/R/inst/include/arrow/ipc/dictionary.h b/r/R/inst/include/arrow/ipc/dictionary.h new file mode 100644 index 00000000000..787cd0ddd5a --- /dev/null +++ b/r/R/inst/include/arrow/ipc/dictionary.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Tools for dictionaries in IPC context + +#ifndef ARROW_IPC_DICTIONARY_H +#define ARROW_IPC_DICTIONARY_H + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Field; +class RecordBatch; + +namespace ipc { + +using DictionaryMap = std::unordered_map>; + +/// \brief Memoization data structure for assigning id numbers to +/// dictionaries and tracking their current state through possible +/// deltas in an IPC stream +class ARROW_EXPORT DictionaryMemo { + public: + DictionaryMemo(); + DictionaryMemo(DictionaryMemo&&) = default; + DictionaryMemo& operator=(DictionaryMemo&&) = default; + + /// \brief Return current dictionary corresponding to a particular + /// id. Returns KeyError if id not found + Status GetDictionary(int64_t id, std::shared_ptr* dictionary) const; + + /// \brief Return dictionary value type corresponding to a + /// particular dictionary id. This permits multiple fields to + /// reference the same dictionary in IPC and JSON + Status GetDictionaryType(int64_t id, std::shared_ptr* type) const; + + /// \brief Return id for dictionary, computing new id if necessary + Status GetOrAssignId(const std::shared_ptr& field, int64_t* out); + + /// \brief Return id for dictionary if it exists, otherwise return + /// KeyError + Status GetId(const Field& type, int64_t* id) const; + + /// \brief Return true if dictionary for type is in this memo + bool HasDictionary(const Field& type) const; + + /// \brief Return true if we have a dictionary for the input id + bool HasDictionary(int64_t id) const; + + /// \brief Add field to the memo, return KeyError if already present + Status AddField(int64_t id, const std::shared_ptr& field); + + /// \brief Add a dictionary to the memo with a particular id. 
Returns + /// KeyError if that dictionary already exists + Status AddDictionary(int64_t id, const std::shared_ptr& dictionary); + + const DictionaryMap& id_to_dictionary() const { return id_to_dictionary_; } + + /// \brief The number of fields tracked in the memo + int num_fields() const { return static_cast(field_to_id_.size()); } + int num_dictionaries() const { return static_cast(id_to_dictionary_.size()); } + + private: + Status AddFieldInternal(int64_t id, const std::shared_ptr& field); + + // Dictionary memory addresses, to track whether a particular + // dictionary-encoded field has been seen before + std::unordered_map field_to_id_; + + // Map of dictionary id to dictionary array + DictionaryMap id_to_dictionary_; + std::unordered_map> id_to_type_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(DictionaryMemo); +}; + +ARROW_EXPORT +Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_DICTIONARY_H diff --git a/r/R/inst/include/arrow/ipc/feather-internal.h b/r/R/inst/include/arrow/ipc/feather-internal.h new file mode 100644 index 00000000000..2aa04b2db72 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/feather-internal.h @@ -0,0 +1,235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// Internal metadata structures and builders for the "Feather" file format, originally created at +// http://github.com/wesm/feather + +#ifndef ARROW_IPC_FEATHER_INTERNAL_H +#define ARROW_IPC_FEATHER_INTERNAL_H + +#include +#include +#include +#include +#include + +#include "flatbuffers/flatbuffers.h" + +#include "arrow/buffer.h" +#include "arrow/ipc/feather.h" +#include "arrow/ipc/feather_generated.h" +#include "arrow/type.h" + +namespace arrow { +namespace ipc { +namespace feather { + +typedef std::vector> ColumnVector; +typedef flatbuffers::FlatBufferBuilder FBB; +typedef flatbuffers::Offset FBString; + +struct ARROW_EXPORT ColumnType { + enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; // used as an index into COLUMN_TYPE_ENUM_MAPPING; keep the order in sync +}; + +struct ARROW_EXPORT ArrayMetadata { + ArrayMetadata() {} // NOTE(review): leaves every member uninitialized + + ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, + int64_t total_bytes) + : type(type), + offset(offset), + length(length), + null_count(null_count), + total_bytes(total_bytes) {} + + bool Equals(const ArrayMetadata& other) const { // member-wise comparison of all five fields + return this->type == other.type && this->offset == other.offset && + this->length == other.length && this->null_count == other.null_count && + this->total_bytes == other.total_bytes; + } + + fbs::Type type; + int64_t offset; + int64_t length; + int64_t null_count; + int64_t total_bytes; +}; + +struct ARROW_EXPORT CategoryMetadata { + ArrayMetadata levels; + bool ordered; +}; + +struct ARROW_EXPORT TimestampMetadata { + TimeUnit::type unit; + + // A timezone name known to the Olson timezone database.
For display purposes + // because the actual data is all UTC + std::string timezone; +}; + +struct ARROW_EXPORT TimeMetadata { + TimeUnit::type unit; +}; + +static constexpr const char* kFeatherMagicBytes = "FEA1"; +static constexpr const int kFeatherDefaultAlignment = 8; + +class ColumnBuilder; + +class ARROW_EXPORT TableBuilder { + public: + explicit TableBuilder(int64_t num_rows); + ~TableBuilder() = default; + + FBB& fbb(); + Status Finish(); + std::shared_ptr GetBuffer() const; + + std::unique_ptr AddColumn(const std::string& name); + void SetDescription(const std::string& description); + void SetNumRows(int64_t num_rows); + void add_column(const flatbuffers::Offset& col); + + private: + flatbuffers::FlatBufferBuilder fbb_; + ColumnVector columns_; + + friend class ColumnBuilder; + + bool finished_; + std::string description_; + int64_t num_rows_; +}; + +class ARROW_EXPORT TableMetadata { + public: + TableMetadata() : table_(NULLPTR) {} // table_ stays null until Open(); the accessors below dereference it unchecked + ~TableMetadata() = default; + + Status Open(const std::shared_ptr& buffer) { // stores buffer and the CTable obtained from its data; always returns OK + metadata_buffer_ = buffer; + table_ = fbs::GetCTable(buffer->data()); + + if (table_->version() < kFeatherVersion) { // only warns on stdout; the file is still opened + std::cout << "This Feather file is old" + << " and will not be readable beyond the 0.3.0 release" << std::endl; + } + return Status::OK(); + } + + bool HasDescription() const { return table_->description() != 0; } + + std::string GetDescription() const { + if (!HasDescription()) { + return std::string(""); + } + return table_->description()->str(); + } + + int version() const { return table_->version(); } + int64_t num_rows() const { return table_->num_rows(); } + int64_t num_columns() const { return table_->columns()->size(); } + + const fbs::Column* column(int i) { return table_->columns()->Get(i); } // NOTE(review): not const-qualified, unlike the other accessors + + private: + std::shared_ptr metadata_buffer_; + const fbs::CTable* table_; +}; + +static inline flatbuffers::Offset GetPrimitiveArray( + FBB& fbb, const ArrayMetadata& array) { + return fbs::CreatePrimitiveArray(fbb, array.type,
fbs::Encoding_PLAIN, array.offset, + array.length, array.null_count, array.total_bytes); +} + +static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { + return static_cast(static_cast(unit)); // plain numeric cast; assumes both enums use the same values -- TODO confirm +} + +static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { + return static_cast(static_cast(unit)); // plain numeric cast; assumes both enums use the same values -- TODO confirm +} + +// Convert Feather enums to Flatbuffer enums (the table is indexed by ColumnType::type, so entry order must follow that enum) + +const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { + fbs::TypeMetadata_NONE, // PRIMITIVE + fbs::TypeMetadata_CategoryMetadata, // CATEGORY + fbs::TypeMetadata_TimestampMetadata, // TIMESTAMP + fbs::TypeMetadata_DateMetadata, // DATE + fbs::TypeMetadata_TimeMetadata // TIME +}; + +static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { + return COLUMN_TYPE_ENUM_MAPPING[column_type]; // unchecked array index +} + +static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) { // copies the five metadata fields from values into out + out->type = values->type(); + out->offset = values->offset(); + out->length = values->length(); + out->null_count = values->null_count(); + out->total_bytes = values->total_bytes(); +} + +class ARROW_EXPORT ColumnBuilder { + public: + ColumnBuilder(TableBuilder* parent, const std::string& name); + ~ColumnBuilder() = default; + + flatbuffers::Offset CreateColumnMetadata(); + + Status Finish(); + void SetValues(const ArrayMetadata& values); + void SetUserMetadata(const std::string& data); + void SetCategory(const ArrayMetadata& levels, bool ordered = false); + void SetTimestamp(TimeUnit::type unit); + void SetTimestamp(TimeUnit::type unit, const std::string& timezone); + void SetDate(); + void SetTime(TimeUnit::type unit); + FBB& fbb(); + + private: + TableBuilder* parent_; + + std::string name_; + ArrayMetadata values_; + std::string user_metadata_; + + // Column metadata + + // Is this a primitive type, or one of the types having metadata?
Default is + // primitive + ColumnType::type type_; + + // Type-specific metadata (kept as separate members, not an actual union) + CategoryMetadata meta_category_; + TimeMetadata meta_time_; + + TimestampMetadata meta_timestamp_; + + FBB* fbb_; +}; + +} // namespace feather +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_FEATHER_INTERNAL_H diff --git a/r/R/inst/include/arrow/ipc/feather.h b/r/R/inst/include/arrow/ipc/feather.h new file mode 100644 index 00000000000..b6bd4ff5e5b --- /dev/null +++ b/r/R/inst/include/arrow/ipc/feather.h @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +// Public API for the "Feather" file format, originally created at +// http://github.com/wesm/feather + +#ifndef ARROW_IPC_FEATHER_H +#define ARROW_IPC_FEATHER_H + +#include +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Column; +class Status; +class Table; + +namespace io { + +class OutputStream; +class RandomAccessFile; + +} // namespace io + +namespace ipc { +namespace feather { + +static constexpr const int kFeatherVersion = 2; + +// ---------------------------------------------------------------------- +// Table reader and writer classes + +/// \class TableReader +/// \brief An interface for reading columns from Feather files +class ARROW_EXPORT TableReader { + public: + TableReader(); + ~TableReader(); + + /// \brief Open a Feather file from a RandomAccessFile interface + /// + /// \param[in] source a RandomAccessFile instance + /// \param[out] out the table reader + static Status Open(const std::shared_ptr& source, + std::unique_ptr* out); + + /// \brief Optional table description + /// + /// This does not return a const std::string& because a string has to be + /// copied from the flatbuffer to be able to return a non-flatbuffer type + std::string GetDescription() const; + + /// \brief Return true if the table has a description field populated + bool HasDescription() const; + + /// \brief Return the version number of the Feather file + int version() const; + + /// \brief Return the number of rows in the file + int64_t num_rows() const; + + /// \brief Return the number of columns in the file + int64_t num_columns() const; + + std::string GetColumnName(int i) const; // NOTE(review): undocumented; presumably the name of the i-th column -- confirm + + /// \brief Read a column from the file as an arrow::Column.
+ /// + /// \param[in] i the column index to read + /// \param[out] out the returned column + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + Status GetColumn(int i, std::shared_ptr* out); + + /// \brief Read all columns from the file as an arrow::Table. + /// + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + Status Read(std::shared_ptr
* out); + + /// \brief Read only the columns at the specified indices from the file as an arrow::Table. + /// + /// \param[in] indices the column indices to read + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + Status Read(const std::vector& indices, std::shared_ptr
* out); + + /// \brief Read only the columns with the specified names from the file as an arrow::Table. + /// + /// \param[in] names the column names to read + /// \param[out] out the returned table + /// \return Status + /// + /// This function is zero-copy if the file source supports zero-copy reads + Status Read(const std::vector& names, std::shared_ptr
* out); + + private: + class ARROW_NO_EXPORT TableReaderImpl; + std::unique_ptr impl_; +}; + +/// \class TableWriter +/// \brief Interface for writing Feather files +class ARROW_EXPORT TableWriter { + public: + ~TableWriter(); + + /// \brief Create a new TableWriter that writes to an OutputStream + /// \param[in] stream an output stream + /// \param[out] out the returned table writer + /// \return Status + static Status Open(const std::shared_ptr& stream, + std::unique_ptr* out); + + /// \brief Set the description field in the file metadata + void SetDescription(const std::string& desc); + + /// \brief Set the number of rows in the file + void SetNumRows(int64_t num_rows); + + /// \brief Append a column to the file + /// + /// \param[in] name the column name + /// \param[in] values the column values as a contiguous arrow::Array + /// \return Status + Status Append(const std::string& name, const Array& values); + + /// \brief Write a table to the file + /// + /// \param[in] table the table to be written + /// \return Status + Status Write(const Table& table); + + /// \brief Finalize the file by writing the file metadata and footer + /// \return Status + Status Finalize(); + + private: + TableWriter(); // private constructor: instances are obtained through Open() + class ARROW_NO_EXPORT TableWriterImpl; + std::unique_ptr impl_; +}; + +} // namespace feather +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_FEATHER_H diff --git a/r/R/inst/include/arrow/ipc/json-integration.h b/r/R/inst/include/arrow/ipc/json-integration.h new file mode 100644 index 00000000000..0256532a4a9 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/json-integration.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Reader and writer for the Arrow JSON format used in integration testing + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class RecordBatch; +class Schema; + +namespace io { +class ReadableFile; +} // namespace io + +namespace ipc { +namespace internal { +namespace json { + +/// \class JsonWriter +/// \brief Write the JSON representation of an Arrow record batch file or stream +/// +/// This is used for integration testing +class ARROW_EXPORT JsonWriter { + public: + ~JsonWriter(); + + /// \brief Create a new JSON writer that writes to memory + /// + /// \param[in] schema the schema of record batches + /// \param[out] out the returned writer object + /// \return Status + static Status Open(const std::shared_ptr& schema, + std::unique_ptr* out); + + /// \brief Append a record batch + Status WriteRecordBatch(const RecordBatch& batch); + + /// \brief Finish the JSON payload and return as a std::string + /// + /// \param[out] result the JSON as a std::string + /// \return Status + Status Finish(std::string* result); + + private: + explicit JsonWriter(const std::shared_ptr& schema); + + // Hide RapidJSON details from public API + class JsonWriterImpl; + std::unique_ptr impl_; +}; + +/// \class JsonReader +/// \brief Read the JSON representation of an Arrow record batch file or stream +/// +/// This is
used for integration testing +class ARROW_EXPORT JsonReader { + public: + ~JsonReader(); + + /// \brief Create a new JSON reader + /// + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \param[in] data a Buffer containing the JSON data + /// \param[out] reader the returned reader object + /// \return Status + static Status Open(MemoryPool* pool, const std::shared_ptr& data, + std::unique_ptr* reader); + + /// \brief Create a new JSON reader that uses the default memory pool + /// + /// \param[in] data a Buffer containing the JSON data + /// \param[out] reader the returned reader object + /// \return Status + static Status Open(const std::shared_ptr& data, + std::unique_ptr* reader); + + /// \brief Create a new JSON reader from a file + /// + /// \param[in] pool a MemoryPool to use for buffer allocations + /// \param[in] in_file a ReadableFile containing JSON data + /// \param[out] reader the returned reader object + /// \return Status + static Status Open(MemoryPool* pool, const std::shared_ptr& in_file, + std::unique_ptr* reader); + + /// \brief Return the schema read from the JSON + std::shared_ptr schema() const; + + /// \brief Return the number of record batches + int num_record_batches() const; + + /// \brief Read a particular record batch from the file + /// + /// \param[in] i the record batch index; it is not bounds-checked + /// \param[out] batch the read record batch + Status ReadRecordBatch(int i, std::shared_ptr* batch) const; + + private: + JsonReader(MemoryPool* pool, const std::shared_ptr& data); + + // Hide RapidJSON details from public API + class JsonReaderImpl; + std::unique_ptr impl_; +}; + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/r/R/inst/include/arrow/ipc/json-internal.h b/r/R/inst/include/arrow/ipc/json-internal.h new file mode 100644 index 00000000000..aa2e06a189d --- /dev/null +++ b/r/R/inst/include/arrow/ipc/json-internal.h @@ -0,0 +1,120 @@ +// Licensed to the
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IPC_JSON_INTERNAL_H +#define ARROW_IPC_JSON_INTERNAL_H + +#include +#include + +#include "arrow/json/rapidjson-defs.h" +#include "rapidjson/document.h" // IWYU pragma: export +#include "rapidjson/encodings.h" // IWYU pragma: export +#include "rapidjson/error/en.h" // IWYU pragma: export +#include "rapidjson/stringbuffer.h" // IWYU pragma: export +#include "rapidjson/writer.h" // IWYU pragma: export + +#include "arrow/status.h" // IWYU pragma: export +#include "arrow/type_fwd.h" // IWYU pragma: keep +#include "arrow/util/visibility.h" +// NOTE(review): the aliases below are declared at global scope by this header +namespace rj = arrow::rapidjson; +using RjWriter = rj::Writer; +using RjArray = rj::Value::ConstArray; +using RjObject = rj::Value::ConstObject; +// Validation helpers: each macro returns Status::Invalid from the enclosing function when member iterator NAME is missing from PARENT or has the wrong JSON type. They expand to bare if-statements (no do/while(0) wrapper) and mention NAME more than once, so use them only as complete statements with a side-effect-free NAME. +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == (PARENT).MemberEnd()) { \ + return Status::Invalid("field ", TOK, " not found"); \ + } + +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + return Status::Invalid("field was not a string line ", __LINE__); \ + } + +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + return Status::Invalid("field
was not a boolean line ", __LINE__); \ + } + +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + return Status::Invalid("field was not an int line ", __LINE__); \ + } + +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + return Status::Invalid("field was not an array line ", __LINE__); \ + } + +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + return Status::Invalid("field was not an object line ", __LINE__); \ + } + +namespace arrow { +namespace ipc { + +class DictionaryMemo; + +namespace internal { +namespace json { + +/// \brief Append the integration-test JSON form of a Schema to a rapidjson writer +ARROW_EXPORT +Status WriteSchema(const Schema& schema, DictionaryMemo* dict_memo, RjWriter* writer); + +ARROW_EXPORT +Status WriteDictionary(int64_t id, const std::shared_ptr& dictionary, + RjWriter* writer); + +ARROW_EXPORT +Status WriteRecordBatch(const RecordBatch& batch, RjWriter* writer); + +ARROW_EXPORT +Status WriteArray(const std::string& name, const Array& array, RjWriter* writer); + +ARROW_EXPORT +Status ReadSchema(const rj::Value& json_obj, MemoryPool* pool, + DictionaryMemo* dictionary_memo, std::shared_ptr* schema); + +ARROW_EXPORT +Status ReadRecordBatch(const rj::Value& json_obj, const std::shared_ptr& schema, + DictionaryMemo* dict_memo, MemoryPool* pool, + std::shared_ptr* batch); + +ARROW_EXPORT +Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, + const std::shared_ptr& type, DictionaryMemo* dict_memo, + std::shared_ptr* array); // overload taking an explicit type + +ARROW_EXPORT +Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, const Schema& schema, + DictionaryMemo* dict_memo, std::shared_ptr* array); // overload taking a Schema instead of a type + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_INTERNAL_H diff --git
a/r/R/inst/include/arrow/ipc/json-simple.h b/r/R/inst/include/arrow/ipc/json-simple.h new file mode 100644 index 00000000000..da6483ff155 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/json-simple.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Implement a simple JSON representation format for arrays + +#ifndef ARROW_IPC_JSON_SIMPLE_H +#define ARROW_IPC_JSON_SIMPLE_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace ipc { +namespace internal { +namespace json { +// The three ArrayFromJSON overloads differ only in how the JSON text is passed (std::string, util::string_view, const char*). NOTE(review): the unnamed first parameter is presumably the type of the array to build -- confirm. +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, + std::shared_ptr* out); + +ARROW_EXPORT +Status ArrayFromJSON(const std::shared_ptr&, const char* json, + std::shared_ptr* out); + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/r/R/inst/include/arrow/ipc/message.h b/r/R/inst/include/arrow/ipc/message.h new file mode 100644 index 00000000000..fcc7e778377 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/message.h @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +// C++ object model and user API for interprocess schema messaging + +#ifndef ARROW_IPC_MESSAGE_H +#define ARROW_IPC_MESSAGE_H + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; + +namespace io { + +class FileInterface; +class InputStream; +class OutputStream; +class RandomAccessFile; + +} // namespace io + +namespace ipc { + +enum class MetadataVersion : char { + /// 0.1.0 + V1, + + /// 0.2.0 + V2, + + /// 0.3.0 to 0.7.1 + V3, + + /// >= 0.8.0 + V4 +}; + +// ARROW-109: We set this number arbitrarily to help catch user mistakes. For +// deeply nested schemas, it is expected the user will indicate explicitly the +// maximum allowed recursion depth +constexpr int kMaxNestingDepth = 64; + +// Read interface classes. We do not fully deserialize the flatbuffers so that +// individual field metadata can be retrieved from very large schemas without +// (sentence left unfinished in the original; presumably: without deserializing everything) + +/// \class Message +/// \brief An IPC message including metadata and body +class ARROW_EXPORT Message { + public: + enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR, SPARSE_TENSOR }; + + /// \brief Construct message, but do not validate + /// + /// Use at your own risk; Message::Open has more metadata validation + Message(const std::shared_ptr& metadata, const std::shared_ptr& body); + + ~Message(); + + /// \brief Create and validate a Message instance from two buffers + /// + /// \param[in] metadata a buffer containing the Flatbuffer metadata + /// \param[in] body a buffer containing the message body, which may be null + /// \param[out] out the created message + /// \return Status + static Status Open(const std::shared_ptr& metadata, + const std::shared_ptr& body, std::unique_ptr* out); + + /// \brief Read message body and create Message given Flatbuffer metadata + /// \param[in] metadata a buffer containing a serialized Message flatbuffer + /// \param[in] stream an InputStream + /// \param[out]
out the created Message + /// \return Status + /// + /// \note If stream supports zero-copy, this is zero-copy + static Status ReadFrom(const std::shared_ptr& metadata, io::InputStream* stream, + std::unique_ptr* out); + + /// \brief Read message body from position in file, and create Message given + /// the Flatbuffer metadata + /// \param[in] offset the position in the file where the message body starts. + /// \param[in] metadata a buffer containing a serialized Message flatbuffer + /// \param[in] file the seekable file interface to read from + /// \param[out] out the created Message + /// \return Status + /// + /// \note If file supports zero-copy, this is zero-copy + static Status ReadFrom(const int64_t offset, const std::shared_ptr& metadata, + io::RandomAccessFile* file, std::unique_ptr* out); + + /// \brief Return true if message type and contents are equal + /// + /// \param[in] other another message + /// \return true if contents equal + bool Equals(const Message& other) const; + + /// \brief the Message metadata + /// + /// \return buffer + std::shared_ptr metadata() const; + + /// \brief the Message body, if any + /// + /// \return buffer is null if no body + std::shared_ptr body() const; + + /// \brief The expected body length according to the metadata, for + /// verification purposes + int64_t body_length() const; + + /// \brief The Message type + Type type() const; + + /// \brief The Message metadata version + MetadataVersion metadata_version() const; + + const void* header() const; // NOTE(review): undocumented; presumably an opaque pointer to the message header inside the metadata -- confirm + + /// \brief Write length-prefixed metadata and body to output stream + /// + /// \param[in] file output stream to write to + /// \param[in] alignment byte alignment for metadata, usually 8 or + /// 64.
Whether the body is padded depends on the metadata; if the body + /// buffer is smaller than the size indicated in the metadata, then extra + /// padding bytes will be written + /// \param[out] output_length the number of bytes written + /// \return Status + Status SerializeTo(io::OutputStream* file, int32_t alignment, + int64_t* output_length) const; + + /// \brief Return true if the Message metadata passes Flatbuffer validation + bool Verify() const; + + /// \brief Whether a given message type needs a body: every type except NONE and SCHEMA. + static bool HasBody(Type type) { return type != NONE && type != SCHEMA; } + + private: + // Hide serialization details from user API + class MessageImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(Message); +}; + +ARROW_EXPORT std::string FormatMessageType(Message::Type type); // NOTE(review): undocumented; presumably a printable name for the message type -- confirm + +/// \brief Abstract interface for a sequence of messages +/// \since 0.5.0 +class ARROW_EXPORT MessageReader { + public: + virtual ~MessageReader() = default; + + /// \brief Create MessageReader that reads from InputStream + static std::unique_ptr Open(io::InputStream* stream); + + /// \brief Create MessageReader that reads from owned InputStream + static std::unique_ptr Open( + const std::shared_ptr& owned_stream); + + /// \brief Read next Message from the interface + /// + /// \param[out] message an arrow::ipc::Message instance + /// \return Status + virtual Status ReadNextMessage(std::unique_ptr* message) = 0; +}; + +/// \brief Read encapsulated RPC message from position in file +/// +/// Read a length-prefixed message flatbuffer starting at the indicated file +/// offset. If the message has a body with non-zero length, it will also be +/// read +/// +/// The metadata_length includes at least the length prefix and the flatbuffer +/// +/// \param[in] offset the position in the file where the message starts.
The +/// first 4 bytes after the offset are the message length +/// \param[in] metadata_length the total number of bytes to read from file +/// \param[in] file the seekable file interface to read from +/// \param[out] message the message read +/// \return Status success or failure +ARROW_EXPORT +Status ReadMessage(const int64_t offset, const int32_t metadata_length, + io::RandomAccessFile* file, std::unique_ptr* message); + +/// \brief Advance stream to a multiple of the given alignment (8 bytes by +/// default) if its position is not already aligned +/// \param[in] stream an input stream +/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 +/// or 64, to ensure the body starts on a multiple of that alignment +/// \return Status +ARROW_EXPORT +Status AlignStream(io::InputStream* stream, int32_t alignment = 8); + +/// \brief Advance stream to a multiple of the given alignment (8 bytes by +/// default) if its position is not already aligned +/// \param[in] stream an output stream +/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 +/// or 64, to ensure the body starts on a multiple of that alignment +/// \return Status +ARROW_EXPORT +Status AlignStream(io::OutputStream* stream, int32_t alignment = 8); + +/// \brief Return error Status if file position is not a multiple of the +/// indicated alignment +ARROW_EXPORT +Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8); + +/// \brief Read encapsulated RPC message (metadata and body) from InputStream +/// +/// Read length-prefixed message with as-yet unknown length. Returns null if +/// there are not enough bytes available or the message length is 0 (e.g.
EOS +/// in a stream) +ARROW_EXPORT +Status ReadMessage(io::InputStream* stream, std::unique_ptr* message); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_MESSAGE_H diff --git a/r/R/inst/include/arrow/ipc/metadata-internal.h b/r/R/inst/include/arrow/ipc/metadata-internal.h new file mode 100644 index 00000000000..4563fb029d6 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/metadata-internal.h @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +// Internal metadata serialization matters + +#ifndef ARROW_IPC_METADATA_INTERNAL_H +#define ARROW_IPC_METADATA_INTERNAL_H + +#include +#include +#include +#include +#include + +#include + +#include "arrow/buffer.h" +#include "arrow/ipc/Schema_generated.h" +#include "arrow/ipc/dictionary.h" // IYWU pragma: keep +#include "arrow/ipc/message.h" +#include "arrow/memory_pool.h" +#include "arrow/sparse_tensor.h" +#include "arrow/status.h" + +namespace arrow { + +class DataType; +class Schema; +class Tensor; +class SparseTensor; + +namespace flatbuf = org::apache::arrow::flatbuf; + +namespace io { + +class OutputStream; + +} // namespace io + +namespace ipc { + +class DictionaryMemo; + +namespace internal { + +static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion = + flatbuf::MetadataVersion_V4; + +static constexpr flatbuf::MetadataVersion kMinMetadataVersion = + flatbuf::MetadataVersion_V4; + +MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version); + +static constexpr const char* kArrowMagicBytes = "ARROW1"; + +struct FieldMetadata { + int64_t length; + int64_t null_count; + int64_t offset; +}; + +struct BufferMetadata { + /// The relative offset into the memory page to the starting byte of the buffer + int64_t offset; + + /// Absolute length in bytes of the buffer + int64_t length; +}; + +struct FileBlock { + int64_t offset; + int32_t metadata_length; + int64_t body_length; +}; + +// Read interface classes. We do not fully deserialize the flatbuffers so that +// individual fields metadata can be retrieved from very large schema without +// + +// Construct a complete Schema from the message and add +// dictinory-encoded fields to a DictionaryMemo instance. 
May be +// expensive for very large schemas if you are only interested in a +// few fields +Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, + std::shared_ptr* out); + +Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, std::vector* strides, + std::vector* dim_names); + +// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message +Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, + std::vector* shape, + std::vector* dim_names, int64_t* length, + SparseTensorFormat::type* sparse_tensor_format_id); + +/// Write a serialized message metadata with a length-prefix and padding to an +/// 8-byte offset. Does not make assumptions about whether the stream is +/// aligned already +/// +/// +/// +/// \param[in] message a buffer containing the metadata to write +/// \param[in] alignment the size multiple of the total message size including +/// length prefix, metadata, and padding. Usually 8 or 64 +/// \param[in,out] file the OutputStream to write to +/// \param[out] message_length the total size of the payload written including +/// padding +/// \return Status +Status WriteMessage(const Buffer& message, int32_t alignment, io::OutputStream* file, + int32_t* message_length); + +// Serialize arrow::Schema as a Flatbuffer +// +// \param[in] schema a Schema instance +// \param[in,out] dictionary_memo class for tracking dictionaries and assigning +// dictionary ids +// \param[out] out the serialized arrow::Buffer +// \return Status outcome +Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo, + std::shared_ptr* out); + +Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); + +Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, + std::shared_ptr* out); + +Status WriteSparseTensorMessage(const SparseTensor& 
sparse_tensor, int64_t body_length, + const std::vector& buffers, + std::shared_ptr* out); + +Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, + const std::vector& record_batches, + io::OutputStream* out); + +Status WriteDictionaryMessage(const int64_t id, const int64_t length, + const int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); + +static inline Status WriteFlatbufferBuilder(flatbuffers::FlatBufferBuilder& fbb, + std::shared_ptr* out) { + int32_t size = fbb.GetSize(); + + std::shared_ptr result; + RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), size, &result)); + + uint8_t* dst = result->mutable_data(); + memcpy(dst, fbb.GetBufferPointer(), size); + *out = result; + return Status::OK(); +} + +} // namespace internal +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_METADATA_INTERNAL_H diff --git a/r/R/inst/include/arrow/ipc/reader.h b/r/R/inst/include/arrow/ipc/reader.h new file mode 100644 index 00000000000..34a0eefbbb5 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/reader.h @@ -0,0 +1,291 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Read Arrow files and streams + +#ifndef ARROW_IPC_READER_H +#define ARROW_IPC_READER_H + +#include +#include + +#include "arrow/ipc/dictionary.h" +#include "arrow/ipc/message.h" +#include "arrow/record_batch.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class Schema; +class Status; +class Tensor; +class SparseTensor; + +namespace io { + +class InputStream; +class RandomAccessFile; + +} // namespace io + +namespace ipc { + +using RecordBatchReader = ::arrow::RecordBatchReader; + +/// \class RecordBatchStreamReader +/// \brief Synchronous batch stream reader that reads from io::InputStream +/// +/// This class reads the schema (plus any dictionaries) as the first messages +/// in the stream, followed by record batches. For more granular zero-copy +/// reads see the ReadRecordBatch functions +class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { + public: + ~RecordBatchStreamReader() override; + + /// Create batch reader from generic MessageReader. + /// This will take ownership of the given MessageReader. + /// + /// \param[in] message_reader a MessageReader implementation + /// \param[out] out the created RecordBatchReader object + /// \return Status + static Status Open(std::unique_ptr message_reader, + std::shared_ptr* out); + static Status Open(std::unique_ptr message_reader, + std::unique_ptr* out); + + /// \brief Record batch stream reader from InputStream + /// + /// \param[in] stream an input stream instance. 
Must stay alive throughout + /// lifetime of stream reader + /// \param[out] out the created RecordBatchStreamReader object + /// \return Status + static Status Open(io::InputStream* stream, std::shared_ptr* out); + + /// \brief Open stream and retain ownership of stream object + /// \param[in] stream the input stream + /// \param[out] out the batch reader + /// \return Status + static Status Open(const std::shared_ptr& stream, + std::shared_ptr* out); + + /// \brief Returns the schema read from the stream + std::shared_ptr schema() const override; + + Status ReadNext(std::shared_ptr* batch) override; + + private: + RecordBatchStreamReader(); + + class ARROW_NO_EXPORT RecordBatchStreamReaderImpl; + std::unique_ptr impl_; +}; + +/// \brief Reads the record batch file format +class ARROW_EXPORT RecordBatchFileReader { + public: + ~RecordBatchFileReader(); + + /// \brief Open a RecordBatchFileReader + /// + /// Open a file-like object that is assumed to be self-contained; i.e., the + /// end of the file interface is the end of the Arrow file. Note that there + /// can be any amount of data preceding the Arrow-formatted data, because we + /// need only locate the end of the Arrow file stream to discover the metadata + /// and then proceed to read the data into memory. + static Status Open(io::RandomAccessFile* file, + std::shared_ptr* reader); + + /// \brief Open a RecordBatchFileReader + /// If the file is embedded within some larger file or memory region, you can + /// pass the absolute memory offset to the end of the file (which contains the + /// metadata footer). 
The metadata must have been written with memory offsets + /// relative to the start of the containing file + /// + /// \param[in] file the data source + /// \param[in] footer_offset the position of the end of the Arrow file + /// \param[out] reader the returned reader + /// \return Status + static Status Open(io::RandomAccessFile* file, int64_t footer_offset, + std::shared_ptr* reader); + + /// \brief Version of Open that retains ownership of file + /// + /// \param[in] file the data source + /// \param[out] reader the returned reader + /// \return Status + static Status Open(const std::shared_ptr& file, + std::shared_ptr* reader); + + /// \brief Version of Open that retains ownership of file + /// + /// \param[in] file the data source + /// \param[in] footer_offset the position of the end of the Arrow file + /// \param[out] reader the returned reader + /// \return Status + static Status Open(const std::shared_ptr& file, + int64_t footer_offset, + std::shared_ptr* reader); + + /// \brief The schema read from the file + std::shared_ptr schema() const; + + /// \brief Returns the number of record batches in the file + int num_record_batches() const; + + /// \brief Return the metadata version from the file metadata + MetadataVersion version() const; + + /// \brief Read a particular record batch from the file. Does not copy memory + /// if the input source supports zero-copy. 
+ /// + /// \param[in] i the index of the record batch to return + /// \param[out] batch the read batch + /// \return Status + Status ReadRecordBatch(int i, std::shared_ptr* batch); + + private: + RecordBatchFileReader(); + + class ARROW_NO_EXPORT RecordBatchFileReaderImpl; + std::unique_ptr impl_; +}; + +// Generic read functions; does not copy data if the input supports zero copy reads + +/// \brief Read Schema from stream serialized as a single IPC message +/// and populate any dictionary-encoded fields into a DictionaryMemo +/// +/// \param[in] stream an InputStream +/// \param[in] dictionary_memo for recording dictionary-encoded fields +/// \param[out] out the output Schema +/// \return Status +/// +/// If record batches follow the schema, it is better to use +/// RecordBatchStreamReader +ARROW_EXPORT +Status ReadSchema(io::InputStream* stream, DictionaryMemo* dictionary_memo, + std::shared_ptr* out); + +/// \brief Read Schema from encapsulated Message +/// +/// \param[in] message a message instance containing metadata +/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded +/// fields. Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[out] out the resulting Schema +/// \return Status +ARROW_EXPORT +Status ReadSchema(const Message& message, DictionaryMemo* dictionary_memo, + std::shared_ptr* out); + +/// Read record batch as encapsulated IPC message with metadata size prefix and +/// header +/// +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. 
Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] stream the file where the batch is located +/// \param[out] out the read record batch +/// \return Status +ARROW_EXPORT +Status ReadRecordBatch(const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, io::InputStream* stream, + std::shared_ptr* out); + +/// \brief Read record batch from file given metadata and schema +/// +/// \param[in] metadata a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] file a random access file +/// \param[out] out the read record batch +/// \return Status +ARROW_EXPORT +Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, io::RandomAccessFile* file, + std::shared_ptr* out); + +/// \brief Read record batch from encapsulated Message +/// +/// \param[in] message a message instance containing metadata and body +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[out] out the resulting RecordBatch +/// \return Status +ARROW_EXPORT +Status ReadRecordBatch(const Message& message, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, + std::shared_ptr* out); + +/// Read record batch from file given metadata and schema +/// +/// \param[in] metadata a Message containing the record batch metadata +/// \param[in] schema the record batch schema +/// \param[in] dictionary_memo DictionaryMemo which has any +/// dictionaries. 
Can be nullptr if you are sure there are no +/// dictionary-encoded fields +/// \param[in] file a random access file +/// \param[in] max_recursion_depth the maximum permitted nesting depth +/// \param[out] out the read record batch +/// \return Status +ARROW_EXPORT +Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, + const DictionaryMemo* dictionary_memo, int max_recursion_depth, + io::RandomAccessFile* file, std::shared_ptr* out); + +/// \brief Read arrow::Tensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \param[out] out the read tensor +/// \return Status +ARROW_EXPORT +Status ReadTensor(io::InputStream* file, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read tensor +/// \return Status +ARROW_EXPORT +Status ReadTensor(const Message& message, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file +/// +/// \param[in] file an InputStream pointed at the start of the message +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); + +/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message +/// +/// \param[in] message a Message containing the tensor metadata and body +/// \param[out] out the read sparse tensor +/// \return Status +ARROW_EXPORT +Status ReadSparseTensor(const Message& message, std::shared_ptr* out); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_READER_H diff --git a/r/R/inst/include/arrow/ipc/test-common.h b/r/R/inst/include/arrow/ipc/test-common.h new file mode 100644 index 00000000000..adbc57bfe26 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/test-common.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IPC_TEST_COMMON_H +#define ARROW_IPC_TEST_COMMON_H + +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/type.h" + +namespace arrow { +namespace ipc { +namespace test { + +// A typedef used for test parameterization +typedef Status MakeRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +void CompareArraysDetailed(int index, const Array& result, const Array& expected); + +ARROW_EXPORT +void CompareBatchColumnsDetailed(const RecordBatch& result, const RecordBatch& expected); + +ARROW_EXPORT +Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out, uint32_t seed = 0); + +ARROW_EXPORT +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + +ARROW_EXPORT +Status MakeRandomBooleanArray(const int length, bool include_nulls, + std::shared_ptr* out); + +ARROW_EXPORT +Status MakeBooleanBatchSized(const int length, std::shared_ptr* out); + +ARROW_EXPORT +Status MakeBooleanBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeIntBatchSized(int length, std::shared_ptr* out, + uint32_t seed = 0); + 
+ARROW_EXPORT +Status MakeIntRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out); + +ARROW_EXPORT +Status MakeStringTypesRecordBatch(std::shared_ptr* out, + bool with_nulls = true); + +ARROW_EXPORT +Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeNullRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeListRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeZeroLengthRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeNonNullRecordBatch(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeDeeplyNestedList(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeStruct(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeUnion(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeDictionary(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeDictionaryFlat(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeDates(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeTimestamps(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeIntervals(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeTimes(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeFWBinary(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeDecimal(std::shared_ptr* out); + +ARROW_EXPORT +Status MakeNull(std::shared_ptr* out); + +} // namespace test +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/ipc/util.h b/r/R/inst/include/arrow/ipc/util.h new file mode 100644 index 00000000000..80f9f3c5102 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/util.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IPC_UTIL_H +#define ARROW_IPC_UTIL_H + +#include + +#include "arrow/array.h" +#include "arrow/io/interfaces.h" +#include "arrow/status.h" + +namespace arrow { +namespace ipc { + +// Buffers are padded to 64-byte boundaries (for SIMD) +static constexpr int32_t kArrowAlignment = 64; + +// Tensors are padded to 64-byte boundaries +static constexpr int32_t kTensorAlignment = 64; + +// Align on 8-byte boundaries in IPC +static constexpr int32_t kArrowIpcAlignment = 8; + +static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; + +static inline int64_t PaddedLength(int64_t nbytes, int32_t alignment = kArrowAlignment) { + return ((nbytes + alignment - 1) / alignment) * alignment; +} + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_UTIL_H diff --git a/r/R/inst/include/arrow/ipc/writer.h b/r/R/inst/include/arrow/ipc/writer.h new file mode 100644 index 00000000000..6bb55dbc1a5 --- /dev/null +++ b/r/R/inst/include/arrow/ipc/writer.h @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Implement Arrow streaming binary format + +#ifndef ARROW_IPC_WRITER_H +#define ARROW_IPC_WRITER_H + +#include +#include +#include + +#include "arrow/ipc/message.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class MemoryPool; +class RecordBatch; +class Schema; +class Status; +class Table; +class Tensor; +class SparseTensor; + +namespace io { + +class OutputStream; + +} // namespace io + +namespace ipc { + +class DictionaryMemo; + +/// \class RecordBatchWriter +/// \brief Abstract interface for writing a stream of record batches +class ARROW_EXPORT RecordBatchWriter { + public: + virtual ~RecordBatchWriter(); + + /// \brief Write a record batch to the stream + /// + /// \param[in] batch the record batch to write to the stream + /// \param[in] allow_64bit if true, allow field lengths that don't fit + /// in a signed 32-bit int + /// \return Status + virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0; + + /// \brief Write possibly-chunked table by creating sequence of record batches + /// \param[in] table table to write + /// \return Status + Status WriteTable(const Table& table); + + /// \brief Write Table with a particular chunksize + /// \param[in] table table to write + /// \param[in] max_chunksize maximum chunk size for table chunks + /// \return Status + Status WriteTable(const Table& table, int64_t max_chunksize); + + /// \brief Perform any logic necessary to finish the stream + /// + /// \return Status + virtual Status Close() = 0; + + /// In some cases, 
writing may require memory allocation. We use the default + /// memory pool, but provide the option to override + /// + /// \param pool the memory pool to use for required allocations + virtual void set_memory_pool(MemoryPool* pool) = 0; +}; + +/// \class RecordBatchStreamWriter +/// \brief Synchronous batch stream writer that writes the Arrow streaming +/// format +class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter { + public: + ~RecordBatchStreamWriter() override; + + /// Create a new writer from stream sink and schema. User is responsible for + /// closing the actual OutputStream. + /// + /// \param[in] sink output stream to write to + /// \param[in] schema the schema of the record batches to be written + /// \param[out] out the created stream writer + /// \return Status + static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, + std::shared_ptr* out); + + /// \brief Write a record batch to the stream + /// + /// \param[in] batch the record batch to write + /// \param[in] allow_64bit allow array lengths over INT32_MAX - 1 + /// \return Status + Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; + + /// \brief Close the stream by writing a 4-byte int32 0 EOS marker + /// \return Status + Status Close() override; + + void set_memory_pool(MemoryPool* pool) override; + + protected: + RecordBatchStreamWriter(); + class ARROW_NO_EXPORT RecordBatchStreamWriterImpl; + std::unique_ptr impl_; +}; + +/// \brief Creates the Arrow record batch file format +/// +/// Implements the random access file format, which structurally is a record +/// batch stream followed by a metadata footer at the end of the file. 
Magic +/// numbers are written at the start and end of the file +class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { + public: + ~RecordBatchFileWriter() override; + + /// Create a new writer from stream sink and schema + /// + /// \param[in] sink output stream to write to + /// \param[in] schema the schema of the record batches to be written + /// \param[out] out the created stream writer + /// \return Status + static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, + std::shared_ptr* out); + + /// \brief Write a record batch to the file + /// + /// \param[in] batch the record batch to write + /// \param[in] allow_64bit allow array lengths over INT32_MAX - 1 + /// \return Status + Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; + + /// \brief Close the file stream by writing the file footer and magic number + /// \return Status + Status Close() override; + + private: + RecordBatchFileWriter(); + class ARROW_NO_EXPORT RecordBatchFileWriterImpl; + std::unique_ptr file_impl_; +}; + +/// \brief Low-level API for writing a record batch (without schema) to an OutputStream +/// +/// \param[in] batch the record batch to write +/// \param[in] buffer_start_offset the start offset to use in the buffer metadata, +/// generally should be 0 +/// \param[in] dst an OutputStream +/// \param[out] metadata_length the size of the length-prefixed flatbuffer +/// including padding to a 64-byte boundary +/// \param[out] body_length the size of the contiguous buffer block plus +/// padding bytes +/// \param[in] pool the memory pool to allocate memory from +/// \param[in] max_recursion_depth the maximum permitted nesting schema depth +/// \param[in] allow_64bit permit field lengths exceeding INT32_MAX. May not be +/// readable by other Arrow implementations +/// \return Status +/// +/// Write the RecordBatch (collection of equal-length Arrow arrays) to the +/// output stream in a contiguous block. 
The record batch metadata is written as +/// a flatbuffer (see format/Message.fbs -- the RecordBatch message type) +/// prefixed by its size, followed by each of the memory buffers in the batch +/// written end to end (with appropriate alignment and padding): +/// +/// \code +/// +/// \endcode +/// +/// Finally, the absolute offsets (relative to the start of the output stream) +/// to the end of the body and end of the metadata / data header (suffixed by +/// the header size) is returned in out-variables +ARROW_EXPORT +Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, + io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length, MemoryPool* pool, + int max_recursion_depth = kMaxNestingDepth, + bool allow_64bit = false); + +/// \brief Serialize record batch as encapsulated IPC message in a new buffer +/// +/// \param[in] batch the record batch +/// \param[in] pool a MemoryPool to allocate memory from +/// \param[out] out the serialized message +/// \return Status +ARROW_EXPORT +Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool, + std::shared_ptr* out); + +/// \brief Write record batch to OutputStream +/// +/// \param[in] batch the record batch to write +/// \param[in] pool a MemoryPool to use for temporary allocations, if needed +/// \param[in] out the OutputStream to write the output to +/// \return Status +/// +/// If writing to pre-allocated memory, you can use +/// arrow::ipc::GetRecordBatchSize to compute how much space is required +ARROW_EXPORT +Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool, + io::OutputStream* out); + +/// \brief Serialize schema as encapsulated IPC message +/// +/// \param[in] schema the schema to write +/// \param[in] dictionary_memo a DictionaryMemo for recording dictionary ids +/// \param[in] pool a MemoryPool to allocate memory from +/// \param[out] out the serialized schema +/// \return Status +ARROW_EXPORT +Status SerializeSchema(const Schema& 
schema, DictionaryMemo* dictionary_memo, + MemoryPool* pool, std::shared_ptr* out); + +/// \brief Write multiple record batches to OutputStream, including schema +/// \param[in] batches a vector of batches. Must all have same schema +/// \param[out] dst an OutputStream +/// \return Status +ARROW_EXPORT +Status WriteRecordBatchStream(const std::vector>& batches, + io::OutputStream* dst); + +/// \brief Compute the number of bytes needed to write a record batch including metadata +/// +/// \param[in] batch the record batch to write +/// \param[out] size the size of the complete encapsulated message +/// \return Status +ARROW_EXPORT +Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size); + +/// \brief Compute the number of bytes needed to write a tensor including metadata +/// +/// \param[in] tensor the tensor to write +/// \param[out] size the size of the complete encapsulated message +/// \return Status +ARROW_EXPORT +Status GetTensorSize(const Tensor& tensor, int64_t* size); + +/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory +/// allocation +/// +/// \param[in] tensor the Tensor to write +/// \param[in] pool MemoryPool to allocate space for metadata +/// \param[out] out the resulting Message +/// \return Status +ARROW_EXPORT +Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, + std::unique_ptr* out); + +/// \brief Write arrow::Tensor as a contiguous message. +/// +/// The metadata and body are written assuming 64-byte alignment. It is the +/// user's responsibility to ensure that the OutputStream has been aligned +/// to a 64-byte multiple before writing the message. 
+/// +/// The message is written out as follows: +/// \code +/// +/// \endcode +/// +/// \param[in] tensor the Tensor to write +/// \param[in] dst the OutputStream to write to +/// \param[out] metadata_length the actual metadata length, including padding +/// \param[out] body_length the actual message body length +/// \return Status +ARROW_EXPORT +Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, + int64_t* body_length); + +// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata, +// sparse index, and body are written assuming 64-byte alignment. It is the +// user's responsibility to ensure that the OutputStream has been aligned +// to a 64-byte multiple before writing the message. +// +// \param[in] sparse_tensor the SparseTensor to write +// \param[in] dst the OutputStream to write to +// \param[out] metadata_length the actual metadata length, including padding +// \param[out] body_length the actual message body length +ARROW_EXPORT +Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, + int32_t* metadata_length, int64_t* body_length, + MemoryPool* pool); + +namespace internal { + +// These internal APIs may change without warning or deprecation + +// Intermediate data structure with metadata header, and zero or more buffers +// for the message body. +struct IpcPayload { + Message::Type type = Message::NONE; + std::shared_ptr metadata; + std::vector> body_buffers; + int64_t body_length = 0; +}; + +class ARROW_EXPORT IpcPayloadWriter { + public: + virtual ~IpcPayloadWriter(); + + // Default implementation is a no-op + virtual Status Start(); + + virtual Status WritePayload(const IpcPayload& payload) = 0; + + virtual Status Close() = 0; +}; + +/// Create a new RecordBatchWriter from IpcPayloadWriter and schema. 
+/// +/// \param[in] sink the IpcPayloadWriter to write to +/// \param[in] schema the schema of the record batches to be written +/// \param[out] out the created RecordBatchWriter +/// \return Status +ARROW_EXPORT +Status OpenRecordBatchWriter(std::unique_ptr sink, + const std::shared_ptr& schema, + std::unique_ptr* out); + +/// \brief Compute IpcPayload for the given schema +/// \param[in] schema the Schema that is being serialized +/// \param[in,out] dictionary_memo class to populate with assigned dictionary ids +/// \param[out] out the returned IpcPayload +/// \return Status +ARROW_EXPORT +Status GetSchemaPayload(const Schema& schema, DictionaryMemo* dictionary_memo, + IpcPayload* out); + +/// \brief Compute IpcPayload for a dictionary +/// \param[in] id the dictionary id +/// \param[in] dictionary the dictionary values +/// \param[out] payload the output IpcPayload +/// \return Status +ARROW_EXPORT +Status GetDictionaryPayload(int64_t id, const std::shared_ptr& dictionary, + MemoryPool* pool, IpcPayload* payload); + +/// \brief Compute IpcPayload for the given record batch +/// \param[in] batch the RecordBatch that is being serialized +/// \param[in,out] pool for any required temporary memory allocations +/// \param[out] out the returned IpcPayload +/// \return Status +ARROW_EXPORT +Status GetRecordBatchPayload(const RecordBatch& batch, MemoryPool* pool, IpcPayload* out); + +} // namespace internal + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_WRITER_H diff --git a/r/R/inst/include/arrow/json/api.h b/r/R/inst/include/arrow/json/api.h new file mode 100644 index 00000000000..47b56684b5a --- /dev/null +++ b/r/R/inst/include/arrow/json/api.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/json/options.h" +#include "arrow/json/reader.h" diff --git a/r/R/inst/include/arrow/json/chunked-builder.h b/r/R/inst/include/arrow/json/chunked-builder.h new file mode 100644 index 00000000000..b2cfbefdf45 --- /dev/null +++ b/r/R/inst/include/arrow/json/chunked-builder.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { +class TaskGroup; +} // namespace internal + +class Array; +class MemoryPool; +class DataType; +class Field; +class ChunkedArray; + +namespace json { + +class PromotionGraph; + +class ARROW_EXPORT ChunkedArrayBuilder { + public: + virtual ~ChunkedArrayBuilder() = default; + + /// Spawn a task that will try to convert and insert the given JSON block + virtual void Insert(int64_t block_index, + const std::shared_ptr& unconverted_field, + const std::shared_ptr& unconverted) = 0; + + /// Return the final chunked array. + /// Every chunk must be inserted before this is called! + virtual Status Finish(std::shared_ptr* out) = 0; + + /// Finish current task group and substitute a new one + virtual Status ReplaceTaskGroup( + const std::shared_ptr& task_group) = 0; + + protected: + explicit ChunkedArrayBuilder(const std::shared_ptr& task_group) + : task_group_(task_group) {} + + std::shared_ptr task_group_; +}; + +/// create a chunked builder +/// +/// if unexpected fields and promotion need to be handled, promotion_graph must be +/// non-null +ARROW_EXPORT Status MakeChunkedArrayBuilder( + const std::shared_ptr& task_group, MemoryPool* pool, + const PromotionGraph* promotion_graph, const std::shared_ptr& type, + std::unique_ptr* out); + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/chunker.h b/r/R/inst/include/arrow/json/chunker.h new file mode 100644 index 00000000000..0f94d81afd3 --- /dev/null +++ b/r/R/inst/include/arrow/json/chunker.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; + +namespace json { + +struct ParseOptions; + +/// \class Chunker +/// \brief A reusable block-based chunker for JSON data +/// +/// The chunker takes a block of JSON data and finds a suitable place +/// to cut it up without splitting an object. +class ARROW_EXPORT Chunker { + public: + virtual ~Chunker() = default; + + /// \brief Carve up a chunk in a block of data to contain only whole objects + /// \param[in] block json data to be chunked + /// \param[out] whole subrange of block containing whole json objects + /// \param[out] partial subrange of block a partial json object + virtual Status Process(const std::shared_ptr& block, + std::shared_ptr* whole, + std::shared_ptr* partial) = 0; + + /// \brief Carve the completion of a partial object out of a block + /// \param[in] partial incomplete json object + /// \param[in] block json data + /// \param[out] completion subrange of block containing the completion of partial + /// \param[out] rest subrange of block containing what completion does not cover + virtual Status ProcessWithPartial(const std::shared_ptr& partial, + const std::shared_ptr& block, + std::shared_ptr* completion, + std::shared_ptr* rest) = 0; + + static std::unique_ptr Make(const ParseOptions& options); + + 
protected: + Chunker() = default; + ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); +}; + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/converter.h b/r/R/inst/include/arrow/json/converter.h new file mode 100644 index 00000000000..9a812dd3c3a --- /dev/null +++ b/r/R/inst/include/arrow/json/converter.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; +class Field; +class MemoryPool; + +namespace json { + +/// \brief interface for conversion of Arrays +/// +/// Converters are not required to be correct for arbitrary input- only +/// for unconverted arrays emitted by a corresponding parser. +class ARROW_EXPORT Converter { + public: + virtual ~Converter() = default; + + /// convert an array + /// on failure, this converter may be promoted to another converter which + /// *can* convert the given input. 
+ virtual Status Convert(const std::shared_ptr& in, + std::shared_ptr* out) = 0; + + std::shared_ptr out_type() const { return out_type_; } + + MemoryPool* pool() { return pool_; } + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); + + Converter(MemoryPool* pool, const std::shared_ptr& out_type) + : pool_(pool), out_type_(out_type) {} + + MemoryPool* pool_; + std::shared_ptr out_type_; +}; + +/// \brief produce a single converter to the specified out_type +ARROW_EXPORT Status MakeConverter(const std::shared_ptr& out_type, + MemoryPool* pool, std::shared_ptr* out); + +class ARROW_EXPORT PromotionGraph { + public: + virtual ~PromotionGraph() = default; + + /// \brief produce a valid field which will be inferred as null + virtual std::shared_ptr Null(const std::string& name) const = 0; + + /// \brief given an unexpected field encountered during parsing, return a type to which + /// it may be convertible (may return null if none is available) + virtual std::shared_ptr Infer( + const std::shared_ptr& unexpected_field) const = 0; + + /// \brief given a type to which conversion failed, return a promoted type to which + /// conversion may succeed (may return null if none is available) + virtual std::shared_ptr Promote( + const std::shared_ptr& failed, + const std::shared_ptr& unexpected_field) const = 0; + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph); + PromotionGraph() = default; +}; + +ARROW_EXPORT const PromotionGraph* GetPromotionGraph(); + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/options.h b/r/R/inst/include/arrow/json/options.h new file mode 100644 index 00000000000..8d27faabea2 --- /dev/null +++ b/r/R/inst/include/arrow/json/options.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class DataType; +class Schema; + +namespace json { + +enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType }; + +struct ARROW_EXPORT ParseOptions { + // Parsing options + + // Optional explicit schema (no type inference, ignores other fields) + std::shared_ptr explicit_schema; + + // Whether objects may be printed across multiple lines (for example pretty printed) + // NB: if false, input must end with an empty line + bool newlines_in_values = false; + + // How should parse handle fields outside the explicit_schema? 
+ UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; + + static ParseOptions Defaults(); +}; + +struct ARROW_EXPORT ReadOptions { + // Reader options + + // Whether to use the global CPU thread pool + bool use_threads = true; + // Block size we request from the IO layer; also determines the size of + // chunks when use_threads is true + int32_t block_size = 1 << 20; // 1 MB + + static ReadOptions Defaults(); +}; + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/parser.h b/r/R/inst/include/arrow/json/parser.h new file mode 100644 index 00000000000..ec12eeec370 --- /dev/null +++ b/r/R/inst/include/arrow/json/parser.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/json/options.h" +#include "arrow/status.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class MemoryPool; +class KeyValueMetadata; +class ResizableBuffer; + +namespace json { + +struct Kind { + enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; + + static const std::string& Name(Kind::type); + + static const std::shared_ptr& Tag(Kind::type); + + static Kind::type FromTag(const std::shared_ptr& tag); + + static Status ForType(const DataType& type, Kind::type* kind); +}; + +constexpr int32_t kMaxParserNumRows = 100000; + +/// \class BlockParser +/// \brief A reusable block-based parser for JSON data +/// +/// The parser takes a block of newline delimited JSON data and extracts Arrays +/// of unconverted strings which can be fed to a Converter to obtain a usable Array. +/// +/// Note that in addition to parse errors (such as malformed JSON) some conversion +/// errors are caught at parse time: +/// - A null value in non-nullable column +/// - Change in the JSON kind of a column. For example, if an explicit schema is provided +/// which stipulates that field "a" is integral, a row of {"a": "not a number"} will +/// result in an error. This also applies to fields outside an explicit schema. 
+class ARROW_EXPORT BlockParser { + public: + virtual ~BlockParser() = default; + + /// \brief Reserve storage for scalars parsed from a block of json + virtual Status ReserveScalarStorage(int64_t nbytes) = 0; + + /// \brief Parse a block of data + virtual Status Parse(const std::shared_ptr& json) = 0; + + /// \brief Extract parsed data + virtual Status Finish(std::shared_ptr* parsed) = 0; + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return num_rows_; } + + static Status Make(MemoryPool* pool, const ParseOptions& options, + std::unique_ptr* out); + + static Status Make(const ParseOptions& options, std::unique_ptr* out); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); + + explicit BlockParser(MemoryPool* pool) : pool_(pool) {} + + MemoryPool* pool_; + int32_t num_rows_ = 0; +}; + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/rapidjson-defs.h b/r/R/inst/include/arrow/json/rapidjson-defs.h new file mode 100644 index 00000000000..68dd0be6386 --- /dev/null +++ b/r/R/inst/include/arrow/json/rapidjson-defs.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Include this file before including any RapidJSON headers. 
+ +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +// rapidjson will be defined in namespace arrow::rapidjson +#define RAPIDJSON_NAMESPACE arrow::rapidjson +#define RAPIDJSON_NAMESPACE_BEGIN \ + namespace arrow { \ + namespace rapidjson { +#define RAPIDJSON_NAMESPACE_END \ + } \ + } + +#include "arrow/util/sse-util.h" + +// enable SIMD whitespace skipping, if available +#if defined(ARROW_HAVE_SSE2) +#define RAPIDJSON_SSE2 1 +#define ARROW_RAPIDJSON_SKIP_WHITESPACE_SIMD 1 +#endif + +#if defined(ARROW_HAVE_SSE4_2) +#define RAPIDJSON_SSE42 1 +#define ARROW_RAPIDJSON_SKIP_WHITESPACE_SIMD 1 +#endif diff --git a/r/R/inst/include/arrow/json/reader.h b/r/R/inst/include/arrow/json/reader.h new file mode 100644 index 00000000000..51a3473a04e --- /dev/null +++ b/r/R/inst/include/arrow/json/reader.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/json/options.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Table; +class RecordBatch; +class Array; +class DataType; + +namespace io { +class InputStream; +} // namespace io + +namespace json { + +class ARROW_EXPORT TableReader { + public: + virtual ~TableReader() = default; + + virtual Status Read(std::shared_ptr
* out) = 0; + + static Status Make(MemoryPool* pool, std::shared_ptr input, + const ReadOptions&, const ParseOptions&, + std::shared_ptr* out); +}; + +ARROW_EXPORT Status ParseOne(ParseOptions options, std::shared_ptr json, + std::shared_ptr* out); + +/// \brief convert an Array produced by BlockParser into an Array of out_type +ARROW_EXPORT Status Convert(const std::shared_ptr& out_type, + const std::shared_ptr& in, + std::shared_ptr* out); + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/json/test-common.h b/r/R/inst/include/arrow/json/test-common.h new file mode 100644 index 00000000000..2905ae9556b --- /dev/null +++ b/r/R/inst/include/arrow/json/test-common.h @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "arrow/json/rapidjson-defs.h" +#include "rapidjson/document.h" +#include "rapidjson/prettywriter.h" +#include "rapidjson/reader.h" +#include "rapidjson/writer.h" + +#include "arrow/io/memory.h" +#include "arrow/json/converter.h" +#include "arrow/json/options.h" +#include "arrow/json/parser.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/string_view.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace json { + +namespace rj = arrow::rapidjson; + +using rj::StringBuffer; +using util::string_view; +using Writer = rj::Writer; + +inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); } + +template +inline static Status Generate(const std::shared_ptr& type, Engine& e, + Writer* writer); + +template +inline static Status Generate(const std::vector>& fields, + Engine& e, Writer* writer); + +template +inline static Status Generate(const std::shared_ptr& schm, Engine& e, + Writer* writer) { + return Generate(schm->fields(), e, writer); +} + +template +struct GenerateImpl { + Status Visit(const BooleanType&) { + return OK(writer.Bool(std::uniform_int_distribution{}(e)&1)); + } + template + Status Visit(T const&, enable_if_unsigned_integer* = nullptr) { + auto val = std::uniform_int_distribution<>{}(e); + return OK(writer.Uint64(static_cast(val))); + } + template + Status Visit(T const&, enable_if_signed_integer* = nullptr) { + auto val = std::uniform_int_distribution<>{}(e); + return OK(writer.Int64(static_cast(val))); + } + template + Status Visit(T const&, enable_if_floating_point* = nullptr) { + auto val = std::normal_distribution{0, 1 << 10}(e); + return OK(writer.Double(val)); + } + Status Visit(HalfFloatType const&) { + auto val = std::normal_distribution{0, 1 << 10}(e); + return OK(writer.Double(val)); + } + template + Status Visit(T const&, enable_if_binary* = nullptr) { + auto size = std::poisson_distribution<>{4}(e); + 
std::uniform_int_distribution gen_char(32, 127); // FIXME generate UTF8 + std::string s(size, '\0'); + for (char& ch : s) ch = static_cast(gen_char(e)); + return OK(writer.String(s.c_str())); + } + template + Status Visit( + T const& t, typename std::enable_if::value>::type* = nullptr, + typename std::enable_if::value>::type* = nullptr) { + return Status::Invalid("can't generate a value of type " + t.name()); + } + Status Visit(const ListType& t) { + auto size = std::poisson_distribution<>{4}(e); + writer.StartArray(); + for (int i = 0; i < size; ++i) RETURN_NOT_OK(Generate(t.value_type(), e, &writer)); + return OK(writer.EndArray(size)); + } + Status Visit(const StructType& t) { return Generate(t.children(), e, &writer); } + Engine& e; + rj::Writer& writer; +}; + +template +inline static Status Generate(const std::shared_ptr& type, Engine& e, + Writer* writer) { + if (std::uniform_real_distribution<>{0, 1}(e) < .2) { + // one out of 5 chance of null, anywhere + writer->Null(); + return Status::OK(); + } + GenerateImpl visitor = {e, *writer}; + return VisitTypeInline(*type, &visitor); +} + +template +inline static Status Generate(const std::vector>& fields, + Engine& e, Writer* writer) { + RETURN_NOT_OK(OK(writer->StartObject())); + for (const auto& f : fields) { + writer->Key(f->name().c_str()); + RETURN_NOT_OK(Generate(f->type(), e, writer)); + } + return OK(writer->EndObject(static_cast(fields.size()))); +} + +inline static Status MakeStream(string_view src_str, + std::shared_ptr* out) { + auto src = std::make_shared(src_str); + *out = std::make_shared(src); + return Status::OK(); +} + +// scalar values (numbers and strings) are parsed into a +// dictionary. 
This can be decoded for ease of comparison +inline static Status DecodeStringDictionary(const DictionaryArray& dict_array, + std::shared_ptr* decoded) { + const StringArray& dict = static_cast(*dict_array.dictionary()); + const Int32Array& indices = static_cast(*dict_array.indices()); + StringBuilder builder; + RETURN_NOT_OK(builder.Resize(indices.length())); + for (int64_t i = 0; i < indices.length(); ++i) { + if (indices.IsNull(i)) { + builder.UnsafeAppendNull(); + continue; + } + auto value = dict.GetView(indices.GetView(i)); + RETURN_NOT_OK(builder.ReserveData(value.size())); + builder.UnsafeAppend(value); + } + return builder.Finish(decoded); +} + +inline static Status ParseFromString(ParseOptions options, string_view src_str, + std::shared_ptr* parsed) { + auto src = std::make_shared(src_str); + std::unique_ptr parser; + RETURN_NOT_OK(BlockParser::Make(options, &parser)); + RETURN_NOT_OK(parser->Parse(src)); + return parser->Finish(parsed); +} + +static inline std::string PrettyPrint(string_view one_line) { + rj::Document document; + + // Must pass size to avoid ASAN issues. + document.Parse(one_line.data(), one_line.size()); + rj::StringBuffer sb; + rj::PrettyWriter writer(sb); + document.Accept(writer); + return sb.GetString(); +} + +} // namespace json +} // namespace arrow diff --git a/r/R/inst/include/arrow/memory_pool-test.h b/r/R/inst/include/arrow/memory_pool-test.h new file mode 100644 index 00000000000..3eca585a1b7 --- /dev/null +++ b/r/R/inst/include/arrow/memory_pool-test.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +class TestMemoryPoolBase : public ::testing::Test { + public: + virtual ::arrow::MemoryPool* memory_pool() = 0; + + void TestMemoryTracking() { + auto pool = memory_pool(); + + uint8_t* data; + ASSERT_OK(pool->Allocate(100, &data)); + EXPECT_EQ(static_cast(0), reinterpret_cast(data) % 64); + ASSERT_EQ(100, pool->bytes_allocated()); + + uint8_t* data2; + ASSERT_OK(pool->Allocate(27, &data2)); + EXPECT_EQ(static_cast(0), reinterpret_cast(data2) % 64); + ASSERT_EQ(127, pool->bytes_allocated()); + + pool->Free(data, 100); + ASSERT_EQ(27, pool->bytes_allocated()); + pool->Free(data2, 27); + ASSERT_EQ(0, pool->bytes_allocated()); + } + + void TestOOM() { + auto pool = memory_pool(); + + uint8_t* data; + int64_t to_alloc = std::min(std::numeric_limits::max(), + std::numeric_limits::max()); + // subtract 63 to prevent overflow after the size is aligned + to_alloc -= 63; + ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); + } + + void TestReallocate() { + auto pool = memory_pool(); + + uint8_t* data; + ASSERT_OK(pool->Allocate(10, &data)); + ASSERT_EQ(10, pool->bytes_allocated()); + data[0] = 35; + data[9] = 12; + + // Expand + ASSERT_OK(pool->Reallocate(10, 20, &data)); + ASSERT_EQ(data[9], 12); + ASSERT_EQ(20, pool->bytes_allocated()); + + // Shrink + ASSERT_OK(pool->Reallocate(20, 5, &data)); + ASSERT_EQ(data[0], 35); + ASSERT_EQ(5, pool->bytes_allocated()); + + // 
Free + pool->Free(data, 5); + ASSERT_EQ(0, pool->bytes_allocated()); + } +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/memory_pool.h b/r/R/inst/include/arrow/memory_pool.h new file mode 100644 index 00000000000..60643c387f4 --- /dev/null +++ b/r/R/inst/include/arrow/memory_pool.h @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_MEMORY_POOL_H +#define ARROW_MEMORY_POOL_H + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +/////////////////////////////////////////////////////////////////////// +// Helper tracking memory statistics + +class MemoryPoolStats { + public: + MemoryPoolStats() : bytes_allocated_(0), max_memory_(0) {} + + int64_t max_memory() const { return max_memory_.load(); } + + int64_t bytes_allocated() const { return bytes_allocated_.load(); } + + inline void UpdateAllocatedBytes(int64_t diff) { + auto allocated = bytes_allocated_.fetch_add(diff) + diff; + // "maximum" allocated memory is ill-defined in multi-threaded code, + // so don't try to be too rigorous here + if (diff > 0 && allocated > max_memory_) { + max_memory_ = allocated; + } + } + + protected: + std::atomic bytes_allocated_; + std::atomic max_memory_; +}; + +} // namespace internal + +/// Base class for memory allocation. +/// +/// Besides tracking the number of allocated bytes, the allocator also should +/// take care of the required 64-byte alignment. +class ARROW_EXPORT MemoryPool { + public: + virtual ~MemoryPool(); + + /// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool + static std::unique_ptr CreateDefault(); + + /// Allocate a new memory region of at least size bytes. + /// + /// The allocated region shall be 64-byte aligned. + virtual Status Allocate(int64_t size, uint8_t** out) = 0; + + /// Resize an already allocated memory section. + /// + /// As by default most default allocators on a platform don't support aligned + /// reallocation, this function can involve a copy of the underlying data. + virtual Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) = 0; + + /// Free an allocated region. + /// + /// @param buffer Pointer to the start of the allocated memory region + /// @param size Allocated size located at buffer. 
An allocator implementation + /// may use this for tracking the amount of allocated bytes as well as for + /// faster deallocation if supported by its backend. + virtual void Free(uint8_t* buffer, int64_t size) = 0; + + /// The number of bytes that were allocated and not yet free'd through + /// this allocator. + virtual int64_t bytes_allocated() const = 0; + + /// Return peak memory allocation in this memory pool + /// + /// \return Maximum bytes allocated. If not known (or not implemented), + /// returns -1 + virtual int64_t max_memory() const; + + protected: + MemoryPool(); +}; + +class ARROW_EXPORT LoggingMemoryPool : public MemoryPool { + public: + explicit LoggingMemoryPool(MemoryPool* pool); + ~LoggingMemoryPool() override = default; + + Status Allocate(int64_t size, uint8_t** out) override; + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override; + + void Free(uint8_t* buffer, int64_t size) override; + + int64_t bytes_allocated() const override; + + int64_t max_memory() const override; + + private: + MemoryPool* pool_; +}; + +/// Derived class for memory allocation. +/// +/// Tracks the number of bytes and maximum memory allocated through its direct +/// calls. Actual allocation is delegated to MemoryPool class. +class ARROW_EXPORT ProxyMemoryPool : public MemoryPool { + public: + explicit ProxyMemoryPool(MemoryPool* pool); + ~ProxyMemoryPool() override; + + Status Allocate(int64_t size, uint8_t** out) override; + Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override; + + void Free(uint8_t* buffer, int64_t size) override; + + int64_t bytes_allocated() const override; + + int64_t max_memory() const override; + + private: + class ProxyMemoryPoolImpl; + std::unique_ptr impl_; +}; + +/// Return the process-wide default memory pool. 
+ARROW_EXPORT MemoryPool* default_memory_pool(); + +#ifdef ARROW_NO_DEFAULT_MEMORY_POOL +#define ARROW_MEMORY_POOL_DEFAULT +#else +#define ARROW_MEMORY_POOL_DEFAULT = default_memory_pool() +#endif + +} // namespace arrow + +#endif // ARROW_MEMORY_POOL_H diff --git a/r/R/inst/include/arrow/pretty_print.h b/r/R/inst/include/arrow/pretty_print.h new file mode 100644 index 00000000000..9c2708f16ee --- /dev/null +++ b/r/R/inst/include/arrow/pretty_print.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PRETTY_PRINT_H +#define ARROW_PRETTY_PRINT_H + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Column; +class ChunkedArray; +class RecordBatch; +class Schema; +class Status; +class Table; + +struct PrettyPrintOptions { + PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, + std::string null_rep_arg = "null", bool skip_new_lines_arg = false) + : indent(indent_arg), + indent_size(indent_size_arg), + window(window_arg), + null_rep(null_rep_arg), + skip_new_lines(skip_new_lines_arg) {} + + /// Number of spaces to shift entire formatted object to the right + int indent; + + /// Size of internal indents + int indent_size; + + /// Maximum number of elements to show at the beginning and at the end. + int window; + + /// String to use for representing a null value, defaults to "null" + std::string null_rep; + + /// Skip new lines between elements, defaults to false + bool skip_new_lines; +}; + +/// \brief Print human-readable representation of RecordBatch +ARROW_EXPORT +Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink); + +/// \brief Print human-readable representation of Table +ARROW_EXPORT +Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, + std::ostream* sink); + +/// \brief Print human-readable representation of Array +ARROW_EXPORT +Status PrettyPrint(const Array& arr, int indent, std::ostream* sink); + +/// \brief Print human-readable representation of Array +ARROW_EXPORT +Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, + std::ostream* sink); + +/// \brief Print human-readable representation of Array +ARROW_EXPORT +Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, + std::string* result); + +/// \brief Print human-readable representation of ChunkedArray +ARROW_EXPORT +Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, + std::ostream* sink); 
+ +/// \brief Print human-readable representation of ChunkedArray +ARROW_EXPORT +Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, + std::string* result); + +/// \brief Print human-readable representation of Column +ARROW_EXPORT +Status PrettyPrint(const Column& column, const PrettyPrintOptions& options, + std::ostream* sink); + +ARROW_EXPORT +Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, + std::ostream* sink); + +ARROW_EXPORT +Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, + std::string* result); + +ARROW_EXPORT +Status DebugPrint(const Array& arr, int indent); + +} // namespace arrow + +#endif // ARROW_PRETTY_PRINT_H diff --git a/r/R/inst/include/arrow/python/api.h b/r/R/inst/include/arrow/python/api.h new file mode 100644 index 00000000000..6bbfcbfa34b --- /dev/null +++ b/r/R/inst/include/arrow/python/api.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_API_H +#define ARROW_PYTHON_API_H + +#include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/common.h" +#include "arrow/python/deserialize.h" +#include "arrow/python/helpers.h" +#include "arrow/python/inference.h" +#include "arrow/python/io.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_to_arrow.h" +#include "arrow/python/python_to_arrow.h" +#include "arrow/python/serialize.h" + +#endif // ARROW_PYTHON_API_H diff --git a/r/R/inst/include/arrow/python/arrow_to_pandas.h b/r/R/inst/include/arrow/python/arrow_to_pandas.h new file mode 100644 index 00000000000..20bad409710 --- /dev/null +++ b/r/R/inst/include/arrow/python/arrow_to_pandas.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H +#define ARROW_PYTHON_ADAPTERS_PANDAS_H + +#include "arrow/python/platform.h" + +#include +#include +#include + +#include "arrow/python/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class Column; +class DataType; +class MemoryPool; +class Status; +class Table; + +namespace py { + +struct PandasOptions { + /// If true, we will convert all string columns to categoricals + bool strings_to_categorical = false; + bool zero_copy_only = false; + bool integer_object_nulls = false; + bool date_as_object = false; + bool use_threads = false; + + /// \brief If true, do not create duplicate PyObject versions of equal + /// objects. This only applies to immutable objects like strings or datetime + /// objects + bool deduplicate_objects = false; +}; + +ARROW_PYTHON_EXPORT +Status ConvertArrayToPandas(const PandasOptions& options, + const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); + +ARROW_PYTHON_EXPORT +Status ConvertChunkedArrayToPandas(const PandasOptions& options, + const std::shared_ptr& col, + PyObject* py_ref, PyObject** out); + +ARROW_PYTHON_EXPORT +Status ConvertColumnToPandas(const PandasOptions& options, + const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); + +// Convert a whole table as efficiently as possible to a pandas.DataFrame. +// +// The returned Python object is a list of tuples consisting of the exact 2D +// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. +// +// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) +ARROW_PYTHON_EXPORT +Status ConvertTableToPandas(const PandasOptions& options, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out); + +/// Convert a whole table as efficiently as possible to a pandas.DataFrame. +/// +/// Explicitly name columns that should be a categorical +/// This option is only used on conversions that are applied to a table. +ARROW_PYTHON_EXPORT +Status ConvertTableToPandas(const PandasOptions& options, + const std::unordered_set& categorical_columns, + const std::shared_ptr
& table, MemoryPool* pool, + PyObject** out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ADAPTERS_PANDAS_H diff --git a/r/R/inst/include/arrow/python/benchmark.h b/r/R/inst/include/arrow/python/benchmark.h new file mode 100644 index 00000000000..caaff32b365 --- /dev/null +++ b/r/R/inst/include/arrow/python/benchmark.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_BENCHMARK_H +#define ARROW_PYTHON_BENCHMARK_H + +#include "arrow/python/platform.h" + +#include "arrow/python/visibility.h" + +namespace arrow { +namespace py { +namespace benchmark { + +// Micro-benchmark routines for use from ASV + +// Run PandasObjectIsNull() once over every object in *list* +ARROW_PYTHON_EXPORT +void Benchmark_PandasObjectIsNull(PyObject* list); + +} // namespace benchmark +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_BENCHMARK_H diff --git a/r/R/inst/include/arrow/python/common.h b/r/R/inst/include/arrow/python/common.h new file mode 100644 index 00000000000..a759d393a66 --- /dev/null +++ b/r/R/inst/include/arrow/python/common.h @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_COMMON_H +#define ARROW_PYTHON_COMMON_H + +#include +#include + +#include "arrow/python/config.h" + +#include "arrow/buffer.h" +#include "arrow/python/visibility.h" +#include "arrow/util/macros.h" + +namespace arrow { + +class MemoryPool; + +namespace py { + +ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError); + +// Catch a pending Python exception and return the corresponding Status. 
+// If no exception is pending, Status::OK() is returned. +inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) { + if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { + return Status::OK(); + } else { + return ConvertPyError(code); + } +} + +ARROW_PYTHON_EXPORT Status PassPyError(); + +// TODO(wesm): We can just let errors pass through. To be explored later +#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError()); + +#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE)); + +// A RAII-style helper that ensures the GIL is acquired inside a lexical block. +class ARROW_PYTHON_EXPORT PyAcquireGIL { + public: + PyAcquireGIL() : acquired_gil_(false) { acquire(); } + + ~PyAcquireGIL() { release(); } + + void acquire() { + if (!acquired_gil_) { + state_ = PyGILState_Ensure(); + acquired_gil_ = true; + } + } + + // idempotent + void release() { + if (acquired_gil_) { + PyGILState_Release(state_); + acquired_gil_ = false; + } + } + + private: + bool acquired_gil_; + PyGILState_STATE state_; + ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); +}; + +// A helper to call safely into the Python interpreter from arbitrary C++ code. +// The GIL is acquired, and the current thread's error status is preserved. +template +Status SafeCallIntoPython(Function&& func) { + PyAcquireGIL lock; + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_traceback; + PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); + Status st = std::forward(func)(); + // If the return Status is a "Python error", the current Python error status + // describes the error and shouldn't be clobbered. + if (!st.IsPythonError() && exc_type != NULLPTR) { + PyErr_Restore(exc_type, exc_value, exc_traceback); + } + return st; +} + +#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2 + +// A RAII primitive that DECREFs the underlying PyObject* when it +// goes out of scope. 
+class ARROW_PYTHON_EXPORT OwnedRef { + public: + OwnedRef() : obj_(NULLPTR) {} + OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} + explicit OwnedRef(PyObject* obj) : obj_(obj) {} + + OwnedRef& operator=(OwnedRef&& other) { + obj_ = other.detach(); + return *this; + } + + ~OwnedRef() { reset(); } + + void reset(PyObject* obj) { + Py_XDECREF(obj_); + obj_ = obj; + } + + void reset() { reset(NULLPTR); } + + PyObject* detach() { + PyObject* result = obj_; + obj_ = NULLPTR; + return result; + } + + PyObject* obj() const { return obj_; } + + PyObject** ref() { return &obj_; } + + operator bool() const { return obj_ != NULLPTR; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef); + + PyObject* obj_; +}; + +// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope. +// This is for situations where the GIL is not always known to be held +// (e.g. if it is released in the middle of a function for performance reasons) +class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { + public: + OwnedRefNoGIL() : OwnedRef() {} + OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {} + explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {} + + ~OwnedRefNoGIL() { + PyAcquireGIL lock; + reset(); + } +}; + +// A temporary conversion of a Python object to a bytes area. +struct PyBytesView { + const char* bytes; + Py_ssize_t size; + + PyBytesView() : bytes(NULLPTR), size(0), ref(NULLPTR) {} + + // View the given Python object as binary-like, i.e. 
bytes + Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); } + + Status FromString(PyObject* obj) { + bool ignored = false; + return FromString(obj, false, &ignored); + } + + Status FromString(PyObject* obj, bool* is_utf8) { + return FromString(obj, true, is_utf8); + } + + Status FromUnicode(PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + Py_ssize_t size; + // The utf-8 representation is cached on the unicode object + const char* data = PyUnicode_AsUTF8AndSize(obj, &size); + RETURN_IF_PYERROR(); + this->bytes = data; + this->size = size; + this->ref.reset(); +#else + PyObject* converted = PyUnicode_AsUTF8String(obj); + RETURN_IF_PYERROR(); + this->bytes = PyBytes_AS_STRING(converted); + this->size = PyBytes_GET_SIZE(converted); + this->ref.reset(converted); +#endif + return Status::OK(); + } + + protected: + PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = NULLPTR) + : bytes(b), size(s), ref(obj) {} + + // View the given Python object as string-like, i.e. 
str or (utf8) bytes + Status FromString(PyObject* obj, bool check_utf8, bool* is_utf8) { + if (PyUnicode_Check(obj)) { + *is_utf8 = true; + return FromUnicode(obj); + } else { + ARROW_RETURN_NOT_OK(FromBinary(obj, "a string or bytes object")); + if (check_utf8) { + // Check the bytes are utf8 utf-8 + OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); + if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { + *is_utf8 = true; + } else { + *is_utf8 = false; + PyErr_Clear(); + } + } else { + *is_utf8 = false; + } + return Status::OK(); + } + } + + Status FromBinary(PyObject* obj, const char* expected_msg) { + if (PyBytes_Check(obj)) { + this->bytes = PyBytes_AS_STRING(obj); + this->size = PyBytes_GET_SIZE(obj); + this->ref.reset(); + return Status::OK(); + } else if (PyByteArray_Check(obj)) { + this->bytes = PyByteArray_AS_STRING(obj); + this->size = PyByteArray_GET_SIZE(obj); + this->ref.reset(); + return Status::OK(); + } else { + return Status::TypeError("Expected ", expected_msg, ", got a '", + Py_TYPE(obj)->tp_name, "' object"); + } + } + + OwnedRef ref; +}; + +// Return the common PyArrow memory pool +ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool); +ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool(); + +class ARROW_PYTHON_EXPORT PyBuffer : public Buffer { + public: + /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports + /// one-dimensional byte buffers. 
+ ~PyBuffer(); + + static Status FromPyObject(PyObject* obj, std::shared_ptr* out); + + private: + PyBuffer(); + Status Init(PyObject*); + + Py_buffer py_buf_; +}; + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_COMMON_H diff --git a/r/R/inst/include/arrow/python/config.h b/r/R/inst/include/arrow/python/config.h new file mode 100644 index 00000000000..5649ffe55c2 --- /dev/null +++ b/r/R/inst/include/arrow/python/config.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_CONFIG_H +#define ARROW_PYTHON_CONFIG_H + +#include "arrow/python/platform.h" + +#include "arrow/python/numpy_interop.h" +#include "arrow/python/visibility.h" + +#if PY_MAJOR_VERSION >= 3 +#define PyString_Check PyUnicode_Check +#endif + +namespace arrow { +namespace py { + +ARROW_PYTHON_EXPORT +extern PyObject* numpy_nan; + +ARROW_PYTHON_EXPORT +void set_numpy_nan(PyObject* obj); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_CONFIG_H diff --git a/r/R/inst/include/arrow/python/decimal.h b/r/R/inst/include/arrow/python/decimal.h new file mode 100644 index 00000000000..0477be87f8f --- /dev/null +++ b/r/R/inst/include/arrow/python/decimal.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_DECIMAL_H +#define ARROW_PYTHON_DECIMAL_H + +#include + +#include "arrow/python/visibility.h" +#include "arrow/type.h" + +namespace arrow { + +class Decimal128; + +namespace py { + +class OwnedRef; + +// +// Python Decimal support +// + +namespace internal { + +// \brief Import the Python Decimal type +ARROW_PYTHON_EXPORT +Status ImportDecimalType(OwnedRef* decimal_type); + +// \brief Convert a Python Decimal object to a C++ string +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[out] The string representation of the Python Decimal instance +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status PythonDecimalToString(PyObject* python_decimal, std::string* out); + +// \brief Convert a C++ std::string to a Python Decimal instance +// \param[in] decimal_constructor The decimal type object +// \param[in] decimal_string A decimal string +// \return An instance of decimal.Decimal +ARROW_PYTHON_EXPORT +PyObject* DecimalFromString(PyObject* decimal_constructor, + const std::string& decimal_string); + +// \brief Convert a Python decimal to an Arrow Decimal128 object +// \param[in] python_decimal A Python decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal128 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out); + +// \brief Convert a Python object to an Arrow Decimal128 object +// \param[in] python_decimal A Python int or decimal.Decimal instance +// \param[in] arrow_type An instance of arrow::DecimalType +// \param[out] out A pointer to a Decimal128 +// \return The status of the operation +ARROW_PYTHON_EXPORT +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out); + +// \brief Check whether obj is an instance of Decimal +ARROW_PYTHON_EXPORT +bool PyDecimal_Check(PyObject* obj); + +// 
\brief Check whether obj is nan. This function will abort the program if the argument +// is not a Decimal instance +ARROW_PYTHON_EXPORT +bool PyDecimal_ISNAN(PyObject* obj); + +// \brief Helper class to track and update the precision and scale of a decimal +class ARROW_PYTHON_EXPORT DecimalMetadata { + public: + DecimalMetadata(); + DecimalMetadata(int32_t precision, int32_t scale); + + // \brief Adjust the precision and scale of a decimal type given a new precision and a + // new scale \param[in] suggested_precision A candidate precision \param[in] + // suggested_scale A candidate scale \return The status of the operation + Status Update(int32_t suggested_precision, int32_t suggested_scale); + + // \brief A convenient interface for updating the precision and scale based on a Python + // Decimal object \param object A Python Decimal object \return The status of the + // operation + Status Update(PyObject* object); + + int32_t precision() const { return precision_; } + int32_t scale() const { return scale_; } + + private: + int32_t precision_; + int32_t scale_; +}; + +} // namespace internal +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_DECIMAL_H diff --git a/r/R/inst/include/arrow/python/deserialize.h b/r/R/inst/include/arrow/python/deserialize.h new file mode 100644 index 00000000000..b9c4984a3b0 --- /dev/null +++ b/r/R/inst/include/arrow/python/deserialize.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_ARROW_TO_PYTHON_H +#define ARROW_PYTHON_ARROW_TO_PYTHON_H + +#include +#include +#include + +#include "arrow/python/serialize.h" +#include "arrow/python/visibility.h" +#include "arrow/status.h" + +namespace arrow { + +class RecordBatch; +class Tensor; + +namespace io { + +class RandomAccessFile; + +} // namespace io + +namespace py { + +/// \brief Read serialized Python sequence from file interface using Arrow IPC +/// \param[in] src a RandomAccessFile +/// \param[out] out the reconstructed data +/// \return Status +ARROW_PYTHON_EXPORT +Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); + +/// \brief Reconstruct SerializedPyObject from representation produced by +/// SerializedPyObject::GetComponents. +/// +/// \param[in] num_tensors number of tensors in the object +/// \param[in] num_ndarrays number of numpy Ndarrays in the object +/// \param[in] num_buffers number of buffers in the object +/// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 + +/// num_tensors * 2 + num_buffers in length +/// \param[out] out the reconstructed object +/// \return Status +ARROW_PYTHON_EXPORT +Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_buffers, + PyObject* data, SerializedPyObject* out); + +/// \brief Reconstruct Python object from Arrow-serialized representation +/// \param[in] context Serialization context which contains custom serialization +/// and deserialization callbacks. 
Can be any Python object with a +/// _serialize_callback method for serialization and a _deserialize_callback +/// method for deserialization. If context is None, no custom serialization +/// will be attempted. +/// \param[in] object Object to deserialize +/// \param[in] base a Python object holding the underlying data that any NumPy +/// arrays will reference, to avoid premature deallocation +/// \param[out] out The returned object +/// \return Status +/// This acquires the GIL +ARROW_PYTHON_EXPORT +Status DeserializeObject(PyObject* context, const SerializedPyObject& object, + PyObject* base, PyObject** out); + +/// \brief Reconstruct Ndarray from Arrow-serialized representation +/// \param[in] object Object to deserialize +/// \param[out] out The deserialized tensor +/// \return Status +ARROW_PYTHON_EXPORT +Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); + +ARROW_PYTHON_EXPORT +Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ARROW_TO_PYTHON_H diff --git a/r/R/inst/include/arrow/python/flight.h b/r/R/inst/include/arrow/python/flight.h new file mode 100644 index 00000000000..432885cb764 --- /dev/null +++ b/r/R/inst/include/arrow/python/flight.h @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_FLIGHT_H +#define PYARROW_FLIGHT_H + +#include +#include +#include + +#include "arrow/flight/api.h" +#include "arrow/ipc/dictionary.h" +#include "arrow/python/common.h" +#include "arrow/python/config.h" + +namespace arrow { + +namespace py { + +namespace flight { + +/// \brief A table of function pointers for calling from C++ into +/// Python. +class ARROW_PYTHON_EXPORT PyFlightServerVtable { + public: + std::function*)> + list_flights; + std::function*)> + get_flight_info; + std::function*)> + do_get; + std::function)> + do_put; + std::function*)> + do_action; + std::function*)> + list_actions; +}; + +class ARROW_PYTHON_EXPORT PyServerAuthHandlerVtable { + public: + std::function + authenticate; + std::function is_valid; +}; + +class ARROW_PYTHON_EXPORT PyClientAuthHandlerVtable { + public: + std::function + authenticate; + std::function get_token; +}; + +/// \brief A helper to implement an auth mechanism in Python. +class ARROW_PYTHON_EXPORT PyServerAuthHandler : public arrow::flight::ServerAuthHandler { + public: + explicit PyServerAuthHandler(PyObject* handler, PyServerAuthHandlerVtable vtable); + Status Authenticate(arrow::flight::ServerAuthSender* outgoing, + arrow::flight::ServerAuthReader* incoming) override; + Status IsValid(const std::string& token, std::string* peer_identity) override; + + private: + OwnedRefNoGIL handler_; + PyServerAuthHandlerVtable vtable_; +}; + +/// \brief A helper to implement an auth mechanism in Python. 
+class ARROW_PYTHON_EXPORT PyClientAuthHandler : public arrow::flight::ClientAuthHandler { + public: + explicit PyClientAuthHandler(PyObject* handler, PyClientAuthHandlerVtable vtable); + Status Authenticate(arrow::flight::ClientAuthSender* outgoing, + arrow::flight::ClientAuthReader* incoming) override; + Status GetToken(std::string* token) override; + + private: + OwnedRefNoGIL handler_; + PyClientAuthHandlerVtable vtable_; +}; + +class ARROW_PYTHON_EXPORT PyFlightServer : public arrow::flight::FlightServerBase { + public: + explicit PyFlightServer(PyObject* server, PyFlightServerVtable vtable); + + // Like Serve(), but set up signals and invoke Python signal handlers + // if necessary. This function may return with a Python exception set. + Status ServeWithSignals(); + + Status ListFlights(const arrow::flight::ServerCallContext& context, + const arrow::flight::Criteria* criteria, + std::unique_ptr* listings) override; + Status GetFlightInfo(const arrow::flight::ServerCallContext& context, + const arrow::flight::FlightDescriptor& request, + std::unique_ptr* info) override; + Status DoGet(const arrow::flight::ServerCallContext& context, + const arrow::flight::Ticket& request, + std::unique_ptr* stream) override; + Status DoPut(const arrow::flight::ServerCallContext& context, + std::unique_ptr reader) override; + Status DoAction(const arrow::flight::ServerCallContext& context, + const arrow::flight::Action& action, + std::unique_ptr* result) override; + Status ListActions(const arrow::flight::ServerCallContext& context, + std::vector* actions) override; + + private: + OwnedRefNoGIL server_; + PyFlightServerVtable vtable_; +}; + +/// \brief A callback that obtains the next result from a Flight action. +typedef std::function*)> + PyFlightResultStreamCallback; + +/// \brief A ResultStream built around a Python callback. 
+class ARROW_PYTHON_EXPORT PyFlightResultStream : public arrow::flight::ResultStream { + public: + /// \brief Construct a FlightResultStream from a Python object and callback. + /// Must only be called while holding the GIL. + explicit PyFlightResultStream(PyObject* generator, + PyFlightResultStreamCallback callback); + Status Next(std::unique_ptr* result) override; + + private: + OwnedRefNoGIL generator_; + PyFlightResultStreamCallback callback_; +}; + +/// \brief A wrapper around a FlightDataStream that keeps alive a +/// Python object backing it. +class ARROW_PYTHON_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream { + public: + /// \brief Construct a FlightDataStream from a Python object and underlying stream. + /// Must only be called while holding the GIL. + explicit PyFlightDataStream(PyObject* data_source, + std::unique_ptr stream); + + std::shared_ptr schema() override; + Status GetSchemaPayload(arrow::flight::FlightPayload* payload) override; + Status Next(arrow::flight::FlightPayload* payload) override; + + private: + OwnedRefNoGIL data_source_; + std::unique_ptr stream_; +}; + +/// \brief A callback that obtains the next payload from a Flight result stream. +typedef std::function + PyGeneratorFlightDataStreamCallback; + +/// \brief A FlightDataStream built around a Python callback. +class ARROW_PYTHON_EXPORT PyGeneratorFlightDataStream + : public arrow::flight::FlightDataStream { + public: + /// \brief Construct a FlightDataStream from a Python object and underlying stream. + /// Must only be called while holding the GIL. 
+ explicit PyGeneratorFlightDataStream(PyObject* generator, + std::shared_ptr schema, + PyGeneratorFlightDataStreamCallback callback); + std::shared_ptr schema() override; + Status GetSchemaPayload(arrow::flight::FlightPayload* payload) override; + Status Next(arrow::flight::FlightPayload* payload) override; + + private: + OwnedRefNoGIL generator_; + std::shared_ptr schema_; + ipc::DictionaryMemo dictionary_memo_; + PyGeneratorFlightDataStreamCallback callback_; +}; + +ARROW_PYTHON_EXPORT +Status CreateFlightInfo(const std::shared_ptr& schema, + const arrow::flight::FlightDescriptor& descriptor, + const std::vector& endpoints, + int64_t total_records, int64_t total_bytes, + std::unique_ptr* out); + +} // namespace flight +} // namespace py +} // namespace arrow + +#endif // PYARROW_FLIGHT_H diff --git a/r/R/inst/include/arrow/python/helpers.h b/r/R/inst/include/arrow/python/helpers.h new file mode 100644 index 00000000000..2d44feea5ac --- /dev/null +++ b/r/R/inst/include/arrow/python/helpers.h @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_HELPERS_H +#define ARROW_PYTHON_HELPERS_H + +#include "arrow/python/platform.h" + +#include +#include +#include +#include + +#include + +#include "arrow/python/visibility.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" + +namespace arrow { + +namespace py { + +class OwnedRef; + +// \brief Get an arrow DataType instance from Arrow's Type::type enum +// \param[in] type One of the values of Arrow's Type::type enum +// \return A shared pointer to DataType +ARROW_PYTHON_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); + +// \brief Construct a np.float16 object from a npy_half value. +ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value); + +// \brief Convert a Python object to a npy_half value. +ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out); + +namespace internal { + +// \brief Import a Python module +// \param[in] module_name The name of the module +// \param[out] ref The OwnedRef containing the module PyObject* +ARROW_PYTHON_EXPORT +Status ImportModule(const std::string& module_name, OwnedRef* ref); + +// \brief Import an object from a Python module +// \param[in] module A Python module +// \param[in] name The name of the object to import +// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c +// module +ARROW_PYTHON_EXPORT +Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref); + +// \brief Check whether obj is an integer, independent of Python versions. 
+inline bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + +// \brief Use pandas missing value semantics to check if a value is null +ARROW_PYTHON_EXPORT +bool PandasObjectIsNull(PyObject* obj); + +// \brief Check whether obj is a floating-point NaN +ARROW_PYTHON_EXPORT +bool PyFloat_IsNaN(PyObject* obj); + +inline bool IsPyBinary(PyObject* obj) { + return PyBytes_Check(obj) || PyByteArray_Check(obj); +} + +// \brief Convert a Python integer into a C integer +// \param[in] obj A Python integer +// \param[out] out A pointer to a C integer to hold the result of the conversion +// \return The status of the operation +template +Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = ""); + +// \brief Convert a Python unicode string to a std::string +ARROW_PYTHON_EXPORT +Status PyUnicode_AsStdString(PyObject* obj, std::string* out); + +// \brief Convert a Python bytes object to a std::string +ARROW_PYTHON_EXPORT +std::string PyBytes_AsStdString(PyObject* obj); + +// \brief Call str() on the given object and return the result as a std::string +ARROW_PYTHON_EXPORT +Status PyObject_StdStringStr(PyObject* obj, std::string* out); + +// \brief Return the repr() of the given object (always succeeds) +ARROW_PYTHON_EXPORT +std::string PyObject_StdStringRepr(PyObject* obj); + +// \brief Cast the given size to int32_t, with error checking +inline Status CastSize(Py_ssize_t size, int32_t* out, + const char* error_msg = "Maximum size exceeded (2GB)") { + // size is assumed to be positive + if (size > std::numeric_limits::max()) { + return Status::Invalid(error_msg); + } + *out = static_cast(size); + return Status::OK(); +} + +// \brief Print the Python object's __str__ form along with the passed error +// message +ARROW_PYTHON_EXPORT +Status InvalidValue(PyObject* obj, const std::string& why); + +ARROW_PYTHON_EXPORT +Status 
IntegerScalarToDoubleSafe(PyObject* obj, double* result); +ARROW_PYTHON_EXPORT +Status IntegerScalarToFloat32Safe(PyObject* obj, float* result); + +} // namespace internal +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_HELPERS_H diff --git a/r/R/inst/include/arrow/python/inference.h b/r/R/inst/include/arrow/python/inference.h new file mode 100644 index 00000000000..8790250f543 --- /dev/null +++ b/r/R/inst/include/arrow/python/inference.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef ARROW_PYTHON_INFERENCE_H +#define ARROW_PYTHON_INFERENCE_H + +#include "arrow/python/platform.h" + +#include + +#include "arrow/python/visibility.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" + +#include "arrow/python/common.h" + +namespace arrow { + +class Array; +class Status; + +namespace py { + +// These three functions take a sequence input, not arbitrary iterables +ARROW_PYTHON_EXPORT +arrow::Status InferArrowType(PyObject* obj, std::shared_ptr* out_type); + +ARROW_PYTHON_EXPORT +arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size, + std::shared_ptr* out_type); + +/// Checks whether the passed Python object is a boolean scalar +ARROW_PYTHON_EXPORT +bool IsPyBool(PyObject* obj); + +/// Checks whether the passed Python object is an integer scalar +ARROW_PYTHON_EXPORT +bool IsPyInt(PyObject* obj); + +/// Checks whether the passed Python object is a float scalar +ARROW_PYTHON_EXPORT +bool IsPyFloat(PyObject* obj); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_INFERENCE_H diff --git a/r/R/inst/include/arrow/python/init.h b/r/R/inst/include/arrow/python/init.h new file mode 100644 index 00000000000..34d19b21fdf --- /dev/null +++ b/r/R/inst/include/arrow/python/init.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_INIT_H +#define ARROW_PYTHON_INIT_H + +#include "arrow/python/platform.h" +#include "arrow/python/visibility.h" + +extern "C" { +ARROW_PYTHON_EXPORT +int arrow_init_numpy(); +} + +#endif // ARROW_PYTHON_INIT_H diff --git a/r/R/inst/include/arrow/python/io.h b/r/R/inst/include/arrow/python/io.h new file mode 100644 index 00000000000..d3b7c999eb8 --- /dev/null +++ b/r/R/inst/include/arrow/python/io.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PYARROW_IO_H +#define PYARROW_IO_H + +#include + +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/python/visibility.h" + +#include "arrow/python/config.h" + +#include "arrow/python/common.h" + +namespace arrow { + +class MemoryPool; + +namespace py { + +class ARROW_NO_EXPORT PythonFile; + +class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile { + public: + explicit PyReadableFile(PyObject* file); + ~PyReadableFile() override; + + Status Close() override; + bool closed() const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + // Thread-safe version + Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, + void* out) override; + + // Thread-safe version + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status GetSize(int64_t* size) override; + + Status Seek(int64_t position) override; + + Status Tell(int64_t* position) const override; + + private: + std::unique_ptr file_; +}; + +class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream { + public: + explicit PyOutputStream(PyObject* file); + ~PyOutputStream() override; + + Status Close() override; + bool closed() const override; + Status Tell(int64_t* position) const override; + Status Write(const void* data, int64_t nbytes) override; + + private: + std::unique_ptr file_; + int64_t position_; +}; + +// TODO(wesm): seekable output files + +// A Buffer subclass that keeps a PyObject reference throughout its +// lifetime, such that the Python object is kept alive as long as the +// C++ buffer is still needed. +// Keeping the reference in a Python wrapper would be incorrect as +// the Python wrapper can get destroyed even though the wrapped C++ +// buffer is still alive (ARROW-2270). 
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer { + public: + static Status Make(const uint8_t* data, int64_t size, PyObject* base, + std::shared_ptr* out); + + private: + PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base) + : Buffer(data, size) { + Py_INCREF(base); + base_.reset(base); + } + + OwnedRefNoGIL base_; +}; + +} // namespace py +} // namespace arrow + +#endif // PYARROW_IO_H diff --git a/r/R/inst/include/arrow/python/iterators.h b/r/R/inst/include/arrow/python/iterators.h new file mode 100644 index 00000000000..40e40aa984a --- /dev/null +++ b/r/R/inst/include/arrow/python/iterators.h @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_ITERATORS_H +#define ARROW_PYTHON_ITERATORS_H + +#include + +#include "arrow/python/common.h" +#include "arrow/python/numpy-internal.h" + +namespace arrow { +namespace py { +namespace internal { + +// Visit the Python sequence, calling the given callable on each element. If +// the callable returns a non-OK status, iteration stops and the status is +// returned. 
+// +// The call signature for Visitor must be +// +// Visit(PyObject* obj, int64_t index, bool* keep_going) +// +// If keep_going is set to false, the iteration terminates +template +inline Status VisitSequenceGeneric(PyObject* obj, VisitorFunc&& func) { + // VisitorFunc may set to false to terminate iteration + bool keep_going = true; + + if (PyArray_Check(obj)) { + PyArrayObject* arr_obj = reinterpret_cast(obj); + if (PyArray_NDIM(arr_obj) != 1) { + return Status::Invalid("Only 1D arrays accepted"); + } + + if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) { + // It's an array object, we can fetch object pointers directly + const Ndarray1DIndexer objects(arr_obj); + for (int64_t i = 0; keep_going && i < objects.size(); ++i) { + RETURN_NOT_OK(func(objects[i], i, &keep_going)); + } + return Status::OK(); + } + // It's a non-object array, fall back on regular sequence access. + // (note PyArray_GETITEM() is slightly different: it returns standard + // Python types, not Numpy scalar types) + // This code path is inefficient: callers should implement dedicated + // logic for non-object arrays. 
+ } + if (PySequence_Check(obj)) { + if (PyList_Check(obj) || PyTuple_Check(obj)) { + // Use fast item access + const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj); + for (Py_ssize_t i = 0; keep_going && i < size; ++i) { + PyObject* value = PySequence_Fast_GET_ITEM(obj, i); + RETURN_NOT_OK(func(value, static_cast(i), &keep_going)); + } + } else { + // Regular sequence: avoid making a potentially large copy + const Py_ssize_t size = PySequence_Size(obj); + RETURN_IF_PYERROR(); + for (Py_ssize_t i = 0; keep_going && i < size; ++i) { + OwnedRef value_ref(PySequence_ITEM(obj, i)); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(func(value_ref.obj(), static_cast(i), &keep_going)); + } + } + } else { + return Status::TypeError("Object is not a sequence"); + } + return Status::OK(); +} + +// Visit sequence with no null mask +template +inline Status VisitSequence(PyObject* obj, VisitorFunc&& func) { + return VisitSequenceGeneric( + obj, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) { + return func(value, keep_going); + }); +} + +/// Visit sequence with null mask +template +inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, VisitorFunc&& func) { + if (mo == nullptr || !PyArray_Check(mo)) { + return Status::Invalid("Null mask must be NumPy array"); + } + + PyArrayObject* mask = reinterpret_cast(mo); + if (PyArray_NDIM(mask) != 1) { + return Status::Invalid("Mask must be 1D array"); + } + + const Py_ssize_t obj_size = PySequence_Size(obj); + if (PyArray_SIZE(mask) != static_cast(obj_size)) { + return Status::Invalid("Mask was a different length from sequence being converted"); + } + + const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num); + if (dtype == NPY_BOOL) { + Ndarray1DIndexer mask_values(mask); + + return VisitSequenceGeneric( + obj, [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) { + return func(value, mask_values[i], keep_going); + }); + } else { + return Status::Invalid("Mask must be boolean dtype"); + } 
+} + +// Like IterateSequence, but accepts any generic iterable (including +// non-restartable iterators, e.g. generators). +// +// The call signature for VisitorFunc must be Visit(PyObject*, bool* +// keep_going). If keep_going is set to false, the iteration terminates +template +inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) { + if (PySequence_Check(obj)) { + // Numpy arrays fall here as well + return VisitSequence(obj, std::forward(func)); + } + // Fall back on the iterator protocol + OwnedRef iter_ref(PyObject_GetIter(obj)); + PyObject* iter = iter_ref.obj(); + RETURN_IF_PYERROR(); + PyObject* value; + + bool keep_going = true; + while (keep_going && (value = PyIter_Next(iter))) { + OwnedRef value_ref(value); + RETURN_NOT_OK(func(value_ref.obj(), &keep_going)); + } + RETURN_IF_PYERROR(); // __next__() might have raised + return Status::OK(); +} + +} // namespace internal +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ITERATORS_H diff --git a/r/R/inst/include/arrow/python/numpy-internal.h b/r/R/inst/include/arrow/python/numpy-internal.h new file mode 100644 index 00000000000..19bcde0318f --- /dev/null +++ b/r/R/inst/include/arrow/python/numpy-internal.h @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// Internal utilities for dealing with NumPy + +#ifndef ARROW_PYTHON_NUMPY_INTERNAL_H +#define ARROW_PYTHON_NUMPY_INTERNAL_H + +#include "arrow/python/numpy_interop.h" + +#include "arrow/status.h" + +#include "arrow/python/platform.h" + +#include +#include +#include + +namespace arrow { +namespace py { + +/// Indexing convenience for interacting with strided 1-dim ndarray objects +template +class Ndarray1DIndexer { + public: + typedef int64_t size_type; + + Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {} + + explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { + arr_ = arr; + DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays"; + Py_INCREF(arr); + data_ = reinterpret_cast(PyArray_DATA(arr)); + stride_ = PyArray_STRIDES(arr)[0]; + } + + ~Ndarray1DIndexer() { Py_XDECREF(arr_); } + + int64_t size() const { return PyArray_SIZE(arr_); } + + T* data() const { return data_; } + + bool is_strided() const { return stride_ != sizeof(T); } + + T& operator[](size_type index) { + return *reinterpret_cast(data_ + index * stride_); + } + const T& operator[](size_type index) const { + return *reinterpret_cast(data_ + index * stride_); + } + + private: + PyArrayObject* arr_; + uint8_t* data_; + int64_t stride_; +}; + +// Handling of Numpy Types by their static numbers +// (the NPY_TYPES enum and related defines) + +static inline std::string GetNumPyTypeName(int npy_type) { +#define TYPE_CASE(TYPE, NAME) \ + case NPY_##TYPE: \ + return NAME; + + switch (npy_type) { + TYPE_CASE(BOOL, "bool") + TYPE_CASE(INT8, "int8") + TYPE_CASE(INT16, "int16") + TYPE_CASE(INT32, "int32") + TYPE_CASE(INT64, "int64") +#if !NPY_INT32_IS_INT + TYPE_CASE(INT, "intc") +#endif +#if !NPY_INT64_IS_LONG_LONG + TYPE_CASE(LONGLONG, "longlong") +#endif + TYPE_CASE(UINT8, "uint8") + TYPE_CASE(UINT16, "uint16") + TYPE_CASE(UINT32, "uint32") + TYPE_CASE(UINT64, 
"uint64") +#if !NPY_INT32_IS_INT + TYPE_CASE(UINT, "uintc") +#endif +#if !NPY_INT64_IS_LONG_LONG + TYPE_CASE(ULONGLONG, "ulonglong") +#endif + TYPE_CASE(FLOAT16, "float16") + TYPE_CASE(FLOAT32, "float32") + TYPE_CASE(FLOAT64, "float64") + TYPE_CASE(DATETIME, "datetime64") + TYPE_CASE(OBJECT, "object") + TYPE_CASE(VOID, "void") + default: + break; + } + +#undef TYPE_CASE + std::stringstream ss; + ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName"; + return ss.str(); +} + +#define TYPE_VISIT_INLINE(TYPE) \ + case NPY_##TYPE: \ + return visitor->template Visit(arr); + +template +inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { + switch (PyArray_TYPE(arr)) { + TYPE_VISIT_INLINE(BOOL); + TYPE_VISIT_INLINE(INT8); + TYPE_VISIT_INLINE(UINT8); + TYPE_VISIT_INLINE(INT16); + TYPE_VISIT_INLINE(UINT16); + TYPE_VISIT_INLINE(INT32); + TYPE_VISIT_INLINE(UINT32); + TYPE_VISIT_INLINE(INT64); + TYPE_VISIT_INLINE(UINT64); +#if !NPY_INT32_IS_INT + TYPE_VISIT_INLINE(INT); + TYPE_VISIT_INLINE(UINT); +#endif +#if !NPY_INT64_IS_LONG_LONG + TYPE_VISIT_INLINE(LONGLONG); + TYPE_VISIT_INLINE(ULONGLONG); +#endif + TYPE_VISIT_INLINE(FLOAT16); + TYPE_VISIT_INLINE(FLOAT32); + TYPE_VISIT_INLINE(FLOAT64); + TYPE_VISIT_INLINE(DATETIME); + TYPE_VISIT_INLINE(OBJECT); + } + return Status::NotImplemented("NumPy type not implemented: ", + GetNumPyTypeName(PyArray_TYPE(arr))); +} + +#undef TYPE_VISIT_INLINE + +namespace internal { + +inline bool PyFloatScalar_Check(PyObject* obj) { + return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating); +} + +inline bool PyIntScalar_Check(PyObject* obj) { +#if PY_MAJOR_VERSION < 3 + if (PyInt_Check(obj)) { + return true; + } +#endif + return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer); +} + +inline bool PyBoolScalar_Check(PyObject* obj) { + return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool); +} + +} // namespace internal + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_NUMPY_INTERNAL_H diff 
--git a/r/R/inst/include/arrow/python/numpy_convert.h b/r/R/inst/include/arrow/python/numpy_convert.h new file mode 100644 index 00000000000..dce5fe522d6 --- /dev/null +++ b/r/R/inst/include/arrow/python/numpy_convert.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures + +#ifndef ARROW_PYTHON_NUMPY_CONVERT_H +#define ARROW_PYTHON_NUMPY_CONVERT_H + +#include "arrow/python/platform.h" + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/python/visibility.h" + +namespace arrow { + +class DataType; +class MemoryPool; +class Status; +class Tensor; + +namespace py { + +class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer { + public: + explicit NumPyBuffer(PyObject* arr); + virtual ~NumPyBuffer(); + + private: + PyObject* arr_; +}; + +// Handle misbehaved types like LONGLONG and ULONGLONG +ARROW_PYTHON_EXPORT +int cast_npy_type_compat(int type_num); + +ARROW_PYTHON_EXPORT +bool is_contiguous(PyObject* array); + +ARROW_PYTHON_EXPORT +Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out); +ARROW_PYTHON_EXPORT +Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out); + +Status GetTensorType(PyObject* dtype, std::shared_ptr* out); +Status GetNumPyType(const DataType& type, int* type_num); + +ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, + std::shared_ptr* out); + +ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, + PyObject* base, PyObject** out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_NUMPY_CONVERT_H diff --git a/r/R/inst/include/arrow/python/numpy_interop.h b/r/R/inst/include/arrow/python/numpy_interop.h new file mode 100644 index 00000000000..094c3213758 --- /dev/null +++ b/r/R/inst/include/arrow/python/numpy_interop.h @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_NUMPY_INTEROP_H +#define PYARROW_NUMPY_INTEROP_H + +#include "arrow/python/platform.h" // IWYU pragma: export + +#include // IWYU pragma: export + +// Don't use the deprecated Numpy functions +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +// This is required to be able to access the NumPy C API properly in C++ files +// other than this main one +#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API +#ifndef NUMPY_IMPORT_ARRAY +#define NO_IMPORT_ARRAY +#endif + +#include // IWYU pragma: export +#include // IWYU pragma: export +#include // IWYU pragma: export + +// A bit subtle. Numpy has 5 canonical integer types: +// (or, rather, type pairs: signed and unsigned) +// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG +// It also has 4 fixed-width integer aliases. +// When mapping Arrow integer types to these 4 fixed-width aliases, +// we always miss one of the canonical types (even though it may +// have the same width as one of the aliases). +// Which one depends on the platform... +// On a LP64 system, NPY_INT64 maps to NPY_LONG and +// NPY_LONGLONG needs to be handled separately. 
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and +// NPY_INT needs to be handled separately. + +#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64 +#define NPY_INT64_IS_LONG_LONG 1 +#else +#define NPY_INT64_IS_LONG_LONG 0 +#endif + +#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64 +#define NPY_INT32_IS_INT 1 +#else +#define NPY_INT32_IS_INT 0 +#endif + +namespace arrow { +namespace py { + +inline int import_numpy() { +#ifdef NUMPY_IMPORT_ARRAY + import_array1(-1); + import_umath1(-1); +#endif + + return 0; +} + +// See above about the missing Numpy integer type numbers +inline int fix_numpy_type_num(int type_num) { +#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 + if (type_num == NPY_INT) return NPY_INT32; + if (type_num == NPY_UINT) return NPY_UINT32; +#endif +#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 + if (type_num == NPY_LONGLONG) return NPY_INT64; + if (type_num == NPY_ULONGLONG) return NPY_UINT64; +#endif + return type_num; +} + +} // namespace py +} // namespace arrow + +#endif // PYARROW_NUMPY_INTEROP_H diff --git a/r/R/inst/include/arrow/python/numpy_to_arrow.h b/r/R/inst/include/arrow/python/numpy_to_arrow.h new file mode 100644 index 00000000000..4edc7669bb8 --- /dev/null +++ b/r/R/inst/include/arrow/python/numpy_to_arrow.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +// Converting from pandas memory representation to Arrow data structures + +#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H +#define ARROW_PYTHON_NUMPY_TO_ARROW_H + +#include "arrow/python/platform.h" + +#include + +#include "arrow/compute/kernels/cast.h" +#include "arrow/python/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class DataType; +class MemoryPool; +class Status; + +namespace py { + +/// Convert NumPy arrays to Arrow. If target data type is not known, pass a +/// type with null +/// +/// \param[in] pool Memory pool for any memory allocations +/// \param[in] ao an ndarray with the array data +/// \param[in] mo an ndarray with a null mask (True is null), optional +/// \param[in] from_pandas If true, use pandas's null sentinels to determine +/// whether values are null +/// \param[in] type a specific type to cast to, may be null +/// \param[in] cast_options casting options +/// \param[out] out a ChunkedArray, to accommodate chunked output +ARROW_PYTHON_EXPORT +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + const compute::CastOptions& cast_options, + std::shared_ptr* out); + +/// Safely convert NumPy arrays to Arrow. If target data type is not known, +/// pass a type with null. 
+/// +/// \param[in] pool Memory pool for any memory allocations +/// \param[in] ao an ndarray with the array data +/// \param[in] mo an ndarray with a null mask (True is null), optional +/// \param[in] from_pandas If true, use pandas's null sentinels to determine +/// whether values are null +/// \param[in] type a specific type to cast to, may be null +/// \param[out] out a ChunkedArray, to accommodate chunked output +ARROW_PYTHON_EXPORT +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + std::shared_ptr* out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_NUMPY_TO_ARROW_H diff --git a/r/R/inst/include/arrow/python/platform.h b/r/R/inst/include/arrow/python/platform.h new file mode 100644 index 00000000000..bc06df9c38c --- /dev/null +++ b/r/R/inst/include/arrow/python/platform.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Platform-level Python includes shared by the arrow/python headers, plus +// compiler-specific workarounds + +#ifndef ARROW_PYTHON_PLATFORM_H +#define ARROW_PYTHON_PLATFORM_H + +#include // IWYU pragma: export +#include + +// Work around C2528 error (undefine the timezone macro on MSVC 1900+) +#ifdef _MSC_VER +#if _MSC_VER >= 1900 +#undef timezone +#endif +#endif + +#endif // ARROW_PYTHON_PLATFORM_H diff --git a/r/R/inst/include/arrow/python/pyarrow.h b/r/R/inst/include/arrow/python/pyarrow.h new file mode 100644 index 00000000000..a5a39108479 --- /dev/null +++ b/r/R/inst/include/arrow/python/pyarrow.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#ifndef ARROW_PYTHON_PYARROW_H +#define ARROW_PYTHON_PYARROW_H + +#include "arrow/python/platform.h" + +#include + +#include "arrow/python/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class Column; +class DataType; +class Field; +class RecordBatch; +class Schema; +class Status; +class Table; +class Tensor; + +namespace py { + +ARROW_PYTHON_EXPORT int import_pyarrow(); + +ARROW_PYTHON_EXPORT bool is_buffer(PyObject* buffer); +ARROW_PYTHON_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_buffer(const std::shared_ptr& buffer); + +ARROW_PYTHON_EXPORT bool is_data_type(PyObject* data_type); +ARROW_PYTHON_EXPORT Status unwrap_data_type(PyObject* data_type, + std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_data_type(const std::shared_ptr& type); + +ARROW_PYTHON_EXPORT bool is_field(PyObject* field); +ARROW_PYTHON_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_field(const std::shared_ptr& field); + +ARROW_PYTHON_EXPORT bool is_schema(PyObject* schema); +ARROW_PYTHON_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_schema(const std::shared_ptr& schema); + +ARROW_PYTHON_EXPORT bool is_array(PyObject* array); +ARROW_PYTHON_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_array(const std::shared_ptr& array); + +ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor); +ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr& tensor); + +ARROW_PYTHON_EXPORT bool is_column(PyObject* column); +ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr& column); + +ARROW_PYTHON_EXPORT bool is_table(PyObject* table); +ARROW_PYTHON_EXPORT Status 
unwrap_table(PyObject* table, std::shared_ptr
* out); +ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr
& table); + +ARROW_PYTHON_EXPORT bool is_record_batch(PyObject* batch); +ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch, + std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_record_batch( + const std::shared_ptr& batch); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_PYARROW_H diff --git a/r/R/inst/include/arrow/python/pyarrow_api.h b/r/R/inst/include/arrow/python/pyarrow_api.h new file mode 100644 index 00000000000..f6a211290e5 --- /dev/null +++ b/r/R/inst/include/arrow/python/pyarrow_api.h @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// DO NOT EDIT THIS FILE. 
Update from pyarrow/lib_api.h after pyarrow build + +/* Generated by Cython 0.29 */ + +#ifndef __PYX_HAVE_API__pyarrow__lib +#define __PYX_HAVE_API__pyarrow__lib +#include "Python.h" +#include "pyarrow_lib.h" + +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0; +#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0; +#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; +#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column)(std::shared_ptr< arrow::Column> const &) = 0; +#define pyarrow_wrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0; +#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0; +#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer)(std::shared_ptr< arrow::ResizableBuffer> const &) = 0; +#define pyarrow_wrap_resizable_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0; +#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0; +#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table +static PyObject 
*(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0; +#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor +static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0; +#define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array +static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0; +#define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch +static std::shared_ptr< arrow::Buffer> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0; +#define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer +static std::shared_ptr< arrow::Column> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column)(PyObject *) = 0; +#define pyarrow_unwrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column +static std::shared_ptr< arrow::DataType> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0; +#define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type +static std::shared_ptr< arrow::Field> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0; +#define pyarrow_unwrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field +static std::shared_ptr< arrow::Schema> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema)(PyObject *) = 0; +#define pyarrow_unwrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema +static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0; +#define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table +static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0; +#define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0; +#define pyarrow_is_buffer 
__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0; +#define pyarrow_is_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_field)(PyObject *) = 0; +#define pyarrow_is_field __pyx_api_f_7pyarrow_3lib_pyarrow_is_field +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0; +#define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0; +#define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array +static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0; +#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0; +#define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0; +#define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0; +#define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table +static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0; +#define pyarrow_is_batch __pyx_api_f_7pyarrow_3lib_pyarrow_is_batch +#if !defined(__Pyx_PyIdentifier_FromString) +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) +#else + #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) +#endif +#endif + +#ifndef __PYX_HAVE_RT_ImportFunction +#define __PYX_HAVE_RT_ImportFunction +static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) { + PyObject *d = 0; + PyObject *cobj = 0; + union { + void (*fp)(void); + void *p; + } tmp; + d = PyObject_GetAttrString(module, (char *)"__pyx_capi__"); 
+ if (!d) + goto bad; + cobj = PyDict_GetItemString(d, funcname); + if (!cobj) { + PyErr_Format(PyExc_ImportError, + "%.200s does not export expected C function %.200s", + PyModule_GetName(module), funcname); + goto bad; + } +#if PY_VERSION_HEX >= 0x02070000 + if (!PyCapsule_IsValid(cobj, sig)) { + PyErr_Format(PyExc_TypeError, + "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", + PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj)); + goto bad; + } + tmp.p = PyCapsule_GetPointer(cobj, sig); +#else + {const char *desc, *s1, *s2; + desc = (const char *)PyCObject_GetDesc(cobj); + if (!desc) + goto bad; + s1 = desc; s2 = sig; + while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; } + if (*s1 != *s2) { + PyErr_Format(PyExc_TypeError, + "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", + PyModule_GetName(module), funcname, sig, desc); + goto bad; + } + tmp.p = PyCObject_AsVoidPtr(cobj);} +#endif + *f = tmp.fp; + if (!(*f)) + goto bad; + Py_DECREF(d); + return 0; +bad: + Py_XDECREF(d); + return -1; +} +#endif + + +static int import_pyarrow__lib(void) { + PyObject *module = 0; + module = PyImport_ImportModule("pyarrow.lib"); + if (!module) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad; + 
if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column, "std::shared_ptr< arrow::Column> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, 
"pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void 
(**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; + if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad; + Py_DECREF(module); module = 0; + return 0; + bad: + Py_XDECREF(module); + return -1; +} + +#endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/r/R/inst/include/arrow/python/pyarrow_lib.h b/r/R/inst/include/arrow/python/pyarrow_lib.h new file mode 100644 index 00000000000..4a99a073b50 --- /dev/null +++ b/r/R/inst/include/arrow/python/pyarrow_lib.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// DO NOT EDIT THIS FILE. 
Update from pyarrow/lib.h after pyarrow build + +/* Generated by Cython 0.29 */ + +#ifndef __PYX_HAVE__pyarrow__lib +#define __PYX_HAVE__pyarrow__lib + + +#ifndef __PYX_HAVE_API__pyarrow__lib + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#ifndef DL_IMPORT + #define DL_IMPORT(_T) _T +#endif + +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_array(std::shared_ptr< arrow::Array> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_column(std::shared_ptr< arrow::Column> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_data_type(std::shared_ptr< arrow::DataType> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_field(std::shared_ptr< arrow::Field> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std::shared_ptr< arrow::ResizableBuffer> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &); +__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &); +__PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Column> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_column(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::DataType> 
__pyx_f_7pyarrow_3lib_pyarrow_unwrap_data_type(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_field(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *); +__PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); +__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *); +__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *); +__PYX_EXTERN_C int pyarrow_is_field(PyObject *); +__PYX_EXTERN_C int pyarrow_is_schema(PyObject *); +__PYX_EXTERN_C int pyarrow_is_array(PyObject *); +__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); +__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *); +__PYX_EXTERN_C int pyarrow_is_column(PyObject *); +__PYX_EXTERN_C int pyarrow_is_table(PyObject *); +__PYX_EXTERN_C int pyarrow_is_batch(PyObject *); + +#endif /* !__PYX_HAVE_API__pyarrow__lib */ + +/* WARNING: the interface of the module init function changed in CPython 3.5. */ +/* It now returns a PyModuleDef instance instead of a PyModule instance. */ + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC initlib(void); +#else +PyMODINIT_FUNC PyInit_lib(void); +#endif + +#endif /* !__PYX_HAVE__pyarrow__lib */ diff --git a/r/R/inst/include/arrow/python/python_to_arrow.h b/r/R/inst/include/arrow/python/python_to_arrow.h new file mode 100644 index 00000000000..f9d97569ef4 --- /dev/null +++ b/r/R/inst/include/arrow/python/python_to_arrow.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef ARROW_PYTHON_ADAPTERS_BUILTIN_H +#define ARROW_PYTHON_ADAPTERS_BUILTIN_H + +#include "arrow/python/platform.h" + +#include +#include + +#include "arrow/python/visibility.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" + +#include "arrow/python/common.h" + +namespace arrow { + +class Array; +class Status; + +namespace py { + +struct PyConversionOptions { + PyConversionOptions() : type(NULLPTR), size(-1), pool(NULLPTR), from_pandas(false) {} + // Honors the supplied pool; a null pool falls back to default_memory_pool() + PyConversionOptions(const std::shared_ptr& type, int64_t size, + MemoryPool* pool, bool from_pandas) + : type(type), size(size), pool(pool != NULLPTR ? pool : default_memory_pool()), from_pandas(from_pandas) {} + + // Target type; set to null if it is to be inferred + std::shared_ptr type; + + // Default is -1: infer from data + int64_t size; + + // Memory pool to use for allocations (null when default-constructed) + MemoryPool* pool; + + // Default false + bool from_pandas; +}; + +/// \brief Convert sequence (list, generator, NumPy array with dtype object) of +/// Python objects. +/// \param[in] obj the sequence to convert +/// \param[in] mask a NumPy array of true/false values to indicate whether +/// values in the sequence are null (true) or not null (false).
This parameter +/// may be null +/// \param[in] options various conversion options +/// \param[out] out a ChunkedArray containing one or more chunks +/// \return Status +ARROW_PYTHON_EXPORT +Status ConvertPySequence(PyObject* obj, PyObject* mask, + const PyConversionOptions& options, + std::shared_ptr* out); + +ARROW_PYTHON_EXPORT +Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, + std::shared_ptr* out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ADAPTERS_BUILTIN_H diff --git a/r/R/inst/include/arrow/python/serialize.h b/r/R/inst/include/arrow/python/serialize.h new file mode 100644 index 00000000000..6cdbbe5053f --- /dev/null +++ b/r/R/inst/include/arrow/python/serialize.h @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_PYTHON_PYTHON_TO_ARROW_H +#define ARROW_PYTHON_PYTHON_TO_ARROW_H + +#include +#include + +#include "arrow/python/visibility.h" +#include "arrow/status.h" + +// Forward declaring PyObject, see +// https://mail.python.org/pipermail/python-dev/2003-August/037601.html +#ifndef PyObject_HEAD +struct _object; +typedef _object PyObject; +#endif + +namespace arrow { + +class Buffer; +class DataType; +class MemoryPool; +class RecordBatch; +class Tensor; + +namespace io { + +class OutputStream; + +} // namespace io + +namespace py { + +struct ARROW_PYTHON_EXPORT SerializedPyObject { + std::shared_ptr batch; + std::vector> tensors; + std::vector> ndarrays; + std::vector> buffers; + + /// \brief Write serialized Python object to OutputStream + /// \param[in,out] dst an OutputStream + /// \return Status + Status WriteTo(io::OutputStream* dst); + + /// \brief Convert SerializedPyObject to a dict containing the message + /// components as Buffer instances with minimal memory allocation + /// + /// { + /// 'num_tensors': N, + /// 'num_buffers': K, + /// 'data': [Buffer] + /// } + /// + /// Each tensor is written as two buffers, one for the metadata and one for + /// the body. Therefore, the number of buffers in 'data' is 2 * N + K + 1, + /// with the first buffer containing the serialized record batch containing + /// the UnionArray that describes the whole object + Status GetComponents(MemoryPool* pool, PyObject** out); +}; + +/// \brief Serialize Python sequence as a SerializedPyObject. +/// \param[in] context Serialization context which contains custom serialization +/// and deserialization callbacks. Can be any Python object with a +/// _serialize_callback method for serialization and a _deserialize_callback +/// method for deserialization. If context is None, no custom serialization +/// will be attempted. 
+/// \param[in] sequence A Python sequence object to serialize to Arrow data +/// structures +/// \param[out] out The serialized representation +/// \return Status +/// +/// Release GIL before calling +ARROW_PYTHON_EXPORT +Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); + +/// \brief Serialize an Arrow Tensor as a SerializedPyObject. +/// \param[in] tensor Tensor to be serialized +/// \param[out] out The serialized representation +/// \return Status +ARROW_PYTHON_EXPORT +Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* out); + +/// \brief Write the Tensor metadata header to an OutputStream. +/// \param[in] dtype DataType of the Tensor +/// \param[in] shape The shape of the tensor +/// \param[in] tensor_num_bytes The length of the Tensor data in bytes +/// \param[in] dst The OutputStream to write the Tensor header to +/// \return Status +ARROW_PYTHON_EXPORT +Status WriteNdarrayHeader(std::shared_ptr dtype, + const std::vector& shape, int64_t tensor_num_bytes, + io::OutputStream* dst); + +struct PythonType { + enum type { + BOOL, + INT, + PY2INT, + BYTES, + STRING, + HALF_FLOAT, + FLOAT, + DOUBLE, + DATE64, + LIST, + DICT, + TUPLE, + SET, + TENSOR, + NDARRAY, + BUFFER, + NUM_PYTHON_TYPES + }; +}; + +} // namespace py + +} // namespace arrow + +#endif // ARROW_PYTHON_PYTHON_TO_ARROW_H diff --git a/r/R/inst/include/arrow/python/type_traits.h b/r/R/inst/include/arrow/python/type_traits.h new file mode 100644 index 00000000000..bc71ec4e90b --- /dev/null +++ b/r/R/inst/include/arrow/python/type_traits.h @@ -0,0 +1,302 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Internal header + +#include "arrow/python/platform.h" + +#include +#include + +#include "arrow/python/numpy_interop.h" + +#include + +#include "arrow/builder.h" +#include "arrow/type.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace py { +namespace internal { + +// +// Type traits for Numpy -> Arrow equivalence +// +template +struct npy_traits {}; + +template <> +struct npy_traits { + typedef uint8_t value_type; + using TypeClass = BooleanType; + using BuilderClass = BooleanBuilder; + + static constexpr bool supports_nulls = false; + static inline bool isnull(uint8_t v) { return false; } +}; + +#define NPY_INT_DECL(TYPE, CapType, T) \ + template <> \ + struct npy_traits { \ + typedef T value_type; \ + using TypeClass = CapType##Type; \ + using BuilderClass = CapType##Builder; \ + \ + static constexpr bool supports_nulls = false; \ + static inline bool isnull(T v) { return false; } \ + }; + +NPY_INT_DECL(INT8, Int8, int8_t); +NPY_INT_DECL(INT16, Int16, int16_t); +NPY_INT_DECL(INT32, Int32, int32_t); +NPY_INT_DECL(INT64, Int64, int64_t); + +NPY_INT_DECL(UINT8, UInt8, uint8_t); +NPY_INT_DECL(UINT16, UInt16, uint16_t); +NPY_INT_DECL(UINT32, UInt32, uint32_t); +NPY_INT_DECL(UINT64, UInt64, uint64_t); + +#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 +NPY_INT_DECL(INT, Int32, int32_t); +NPY_INT_DECL(UINT, UInt32, uint32_t); +#endif +#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 +NPY_INT_DECL(LONGLONG, Int64, int64_t); +NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); +#endif + +template <> +struct npy_traits { + typedef 
npy_half value_type; + using TypeClass = HalfFloatType; + using BuilderClass = HalfFloatBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; } +}; + +template <> +struct npy_traits { + typedef float value_type; + using TypeClass = FloatType; + using BuilderClass = FloatBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(float v) { return v != v; } +}; + +template <> +struct npy_traits { + typedef double value_type; + using TypeClass = DoubleType; + using BuilderClass = DoubleBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(double v) { return v != v; } +}; + +template <> +struct npy_traits { + typedef int64_t value_type; + using TypeClass = TimestampType; + using BuilderClass = TimestampBuilder; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(int64_t v) { + // NaT = -2**63 + // = -0x8000000000000000 + // = -9223372036854775808; + // = std::numeric_limits::min() + return v == std::numeric_limits::min(); + } +}; + +template <> +struct npy_traits { + typedef PyObject* value_type; + static constexpr bool supports_nulls = true; + + static inline bool isnull(PyObject* v) { return v == Py_None; } +}; + +// +// Type traits for Arrow -> Numpy equivalence +// Note *supports_nulls* means the equivalent Numpy type support nulls +// +template +struct arrow_traits {}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_BOOL; + static constexpr bool supports_nulls = false; + typedef typename npy_traits::value_type T; +}; + +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + typedef typename npy_traits::value_type T; \ + }; + +INT_DECL(INT8); +INT_DECL(INT16); +INT_DECL(INT32); +INT_DECL(INT64); +INT_DECL(UINT8); 
+INT_DECL(UINT16); +INT_DECL(UINT32); +INT_DECL(UINT64); + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT16; + static constexpr bool supports_nulls = true; + static constexpr uint16_t na_value = NPY_HALF_NAN; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT32; + static constexpr bool supports_nulls = true; + static constexpr float na_value = NAN; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT64; + static constexpr bool supports_nulls = true; + static constexpr double na_value = NAN; + typedef typename npy_traits::value_type T; +}; + +static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); + +constexpr int64_t kNanosecondsInDay = 86400000000000LL; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_DATETIME; + static constexpr int64_t npy_shift = 1; + + static constexpr bool supports_nulls = true; + static constexpr int64_t na_value = kPandasTimestampNull; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + // Data stores as FR_D day unit + static constexpr int npy_type = NPY_DATETIME; + static constexpr int64_t npy_shift = 1; + + static constexpr bool supports_nulls = true; + typedef typename npy_traits::value_type T; + + static constexpr int64_t na_value = kPandasTimestampNull; + static inline bool isnull(int64_t v) { return npy_traits::isnull(v); } +}; + +template <> +struct arrow_traits { + // Data stores as FR_D day unit + static constexpr int npy_type = NPY_DATETIME; + + // There are 1000 * 60 * 60 * 24 = 86400000ms in a day + static constexpr int64_t npy_shift = 86400000; + + static constexpr bool supports_nulls = true; + typedef typename npy_traits::value_type T; + + static constexpr int64_t na_value = kPandasTimestampNull; + static inline bool isnull(int64_t v) { return 
npy_traits::isnull(v); } +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr int64_t na_value = kPandasTimestampNull; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; +}; + +static inline int NumPyTypeSize(int npy_type) { + npy_type = fix_numpy_type_num(npy_type); + + switch (npy_type) { + case NPY_BOOL: + case NPY_INT8: + case NPY_UINT8: + return 1; + case NPY_INT16: + case NPY_UINT16: + return 2; + case NPY_INT32: + case NPY_UINT32: + return 4; + case NPY_INT64: + case NPY_UINT64: + return 8; + case NPY_FLOAT16: + return 2; + case NPY_FLOAT32: + return 4; + case NPY_FLOAT64: + return 8; + case NPY_DATETIME: + return 8; + case NPY_OBJECT: + return sizeof(void*); + default: + DCHECK(false) << "unhandled numpy type"; + break; + } + return -1; +} + +} // namespace internal +} // namespace py +} // namespace arrow diff --git a/r/R/inst/include/arrow/python/util/datetime.h b/r/R/inst/include/arrow/python/util/datetime.h new file mode 100644 index 00000000000..a6e9c87f4e2 --- /dev/null +++ b/r/R/inst/include/arrow/python/util/datetime.h @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_UTIL_DATETIME_H +#define PYARROW_UTIL_DATETIME_H + +#include + +#include +#include "arrow/python/platform.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace py { + +// The following code is adapted from +// https://github.com/numpy/numpy/blob/master/numpy/core/src/multiarray/datetime.c + +// Days per month, regular year and leap year +static int64_t _days_per_month_table[2][12] = { + {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; + +static bool is_leapyear(int64_t year) { + return (year & 0x3) == 0 && // year % 4 == 0 + ((year % 100) != 0 || (year % 400) == 0); +} + +// Calculates the days offset from the 1970 epoch. +static int64_t get_days_from_date(int64_t date_year, int64_t date_month, + int64_t date_day) { + int64_t i, month; + int64_t year, days = 0; + int64_t* month_lengths; + + year = date_year - 1970; + days = year * 365; + + // Adjust for leap years + if (days >= 0) { + // 1968 is the closest leap year before 1970. + // Exclude the current year, so add 1. + year += 1; + // Add one day for each 4 years + days += year / 4; + // 1900 is the closest previous year divisible by 100 + year += 68; + // Subtract one day for each 100 years + days -= year / 100; + // 1600 is the closest previous year divisible by 400 + year += 300; + // Add one day for each 400 years + days += year / 400; + } else { + // 1972 is the closest later year after 1970. + // Include the current year, so subtract 2. 
+ year -= 2; + // Subtract one day for each 4 years + days += year / 4; + // 2000 is the closest later year divisible by 100 + year -= 28; + // Add one day for each 100 years + days -= year / 100; + // 2000 is also the closest later year divisible by 400 + // Subtract one day for each 400 years + days += year / 400; + } + + month_lengths = _days_per_month_table[is_leapyear(date_year)]; + month = date_month - 1; + + // Add the months + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + // Add the days + days += date_day - 1; + + return days; +} + +// Modifies '*days_' to be the day offset within the year, +// and returns the year. +static int64_t days_to_yearsdays(int64_t* days_) { + const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1); + // Adjust so it's relative to the year 2000 (divisible by 400) + int64_t days = (*days_) - (365 * 30 + 7); + int64_t year; + + // Break down the 400 year cycle to get the year and day within the year + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + // Work out the year/day within the 400 year cycle + if (days >= 366) { + year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); + days = (days - 1) % (100 * 365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days + 1) / (4 * 365 + 1)); + days = (days + 1) % (4 * 365 + 1); + if (days >= 366) { + year += (days - 1) / 365; + days = (days - 1) % 365; + } + } + } + + *days_ = days; + return year + 2000; +} + +// Extracts the month and year and day number from a number of days +static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month, + int64_t* date_day) { + int64_t *month_lengths, i; + + *date_year = days_to_yearsdays(&days); + month_lengths = _days_per_month_table[is_leapyear(*date_year)]; + + for (i = 0; i < 12; ++i) { + 
if (days < month_lengths[i]) { + *date_month = i + 1; + *date_day = days + 1; + return; + } else { + days -= month_lengths[i]; + } + } + + // Should never get here + return; +} + +static inline int64_t PyTime_to_us(PyObject* pytime) { + return (static_cast(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL + + static_cast(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL + + static_cast(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL + + PyDateTime_TIME_GET_MICROSECOND(pytime)); +} + +static inline int64_t PyTime_to_s(PyObject* pytime) { + return PyTime_to_us(pytime) / 1000000; +} + +static inline int64_t PyTime_to_ms(PyObject* pytime) { + return PyTime_to_us(pytime) / 1000; +} + +static inline int64_t PyTime_to_ns(PyObject* pytime) { + return PyTime_to_us(pytime) * 1000; +} + +// Splitting time quantities, for example splitting total seconds into +// minutes and remaining seconds. After we run +// int64_t remaining = split_time(total, quotient, &next) +// we have +// total = next * quotient + remaining. Handles negative values by propagating +// them: If total is negative, next will be negative and remaining will +// always be non-negative. 
+static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { + int64_t r = total % quotient; + if (r < 0) { + *next = total / quotient - 1; + return r + quotient; + } else { + *next = total / quotient; + return r; + } +} + +static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, + int64_t* hour, int64_t* minute, int64_t* second, + int64_t* microsecond) { + switch (unit) { + case TimeUnit::NANO: + if (val % 1000 != 0) { + return Status::Invalid("Value ", val, " has non-zero nanoseconds"); + } + val /= 1000; + // fall through + case TimeUnit::MICRO: + *microsecond = split_time(val, 1000000LL, &val); + *second = split_time(val, 60, &val); + *minute = split_time(val, 60, hour); + break; + case TimeUnit::MILLI: + *microsecond = split_time(val, 1000, &val) * 1000; + // fall through + case TimeUnit::SECOND: + *second = split_time(val, 60, &val); + *minute = split_time(val, 60, hour); + break; + default: + break; + } + return Status::OK(); +} + +static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, + int64_t* month, int64_t* day) { + switch (unit) { + case DateUnit::MILLI: + val /= 86400000LL; // fall through + case DateUnit::DAY: + get_date_from_days(val, year, month, day); + default: + break; + } + return Status::OK(); +} + +static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, + PyObject** out) { + int64_t hour = 0, minute = 0, second = 0, microsecond = 0; + RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond)); + *out = PyTime_FromTime(static_cast(hour), static_cast(minute), + static_cast(second), static_cast(microsecond)); + return Status::OK(); +} + +static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { + int64_t year = 0, month = 0, day = 0; + RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); + *out = PyDate_FromDate(static_cast(year), static_cast(month), + static_cast(day)); + return
Status::OK(); +} + +static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, + PyObject** out) { + int64_t hour = 0, minute = 0, second = 0, microsecond = 0; + RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond)); + int64_t total_days = 0; + hour = split_time(hour, 24, &total_days); + int64_t year = 0, month = 0, day = 0; + get_date_from_days(total_days, &year, &month, &day); + *out = PyDateTime_FromDateAndTime( + static_cast(year), static_cast(month), static_cast(day), + static_cast(hour), static_cast(minute), + static_cast(second), static_cast(microsecond)); + return Status::OK(); +} + +static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) { + return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), + PyDateTime_GET_DAY(pydate)); +} + +static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { + int64_t total_seconds = 0; + total_seconds += PyDateTime_DATE_GET_SECOND(pydate); + total_seconds += PyDateTime_DATE_GET_MINUTE(pydate) * 60; + total_seconds += PyDateTime_DATE_GET_HOUR(pydate) * 3600; + int64_t days = + get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), + PyDateTime_GET_DAY(pydate)); + total_seconds += days * 24 * 3600; + return total_seconds * 1000; +} + +static inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { + return PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL; +} + +static inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { + int64_t date_ms = PyDate_to_ms(reinterpret_cast(pydatetime)); + int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000; + return date_ms + ms; +} + +static inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { + int64_t ms = PyDate_to_ms(reinterpret_cast(pydatetime)); + int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); + return ms * 1000 + us; +} + +static inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { + return
PyDateTime_to_us(pydatetime) * 1000; +} + +} // namespace py +} // namespace arrow + +#endif // PYARROW_UTIL_DATETIME_H diff --git a/r/R/inst/include/arrow/python/visibility.h b/r/R/inst/include/arrow/python/visibility.h new file mode 100644 index 00000000000..c0b343c70e9 --- /dev/null +++ b/r/R/inst/include/arrow/python/visibility.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) // Windows +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_STATIC +#define ARROW_PYTHON_EXPORT +#elif defined(ARROW_PYTHON_EXPORTING) +#define ARROW_PYTHON_EXPORT __declspec(dllexport) +#else +#define ARROW_PYTHON_EXPORT __declspec(dllimport) +#endif + +#else // Not Windows +#ifndef ARROW_PYTHON_EXPORT +#define ARROW_PYTHON_EXPORT __attribute__((visibility("default"))) +#endif +#endif // Non-Windows diff --git a/r/R/inst/include/arrow/record_batch.h b/r/R/inst/include/arrow/record_batch.h new file mode 100644 index 00000000000..f80d4ed7683 --- /dev/null +++ b/r/R/inst/include/arrow/record_batch.h @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_RECORD_BATCH_H +#define ARROW_RECORD_BATCH_H + +#include +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \class RecordBatch +/// \brief Collection of equal-length arrays matching a particular Schema +/// +/// A record batch is table-like data structure that is semantically a sequence +/// of fields, each a contiguous Arrow array +class ARROW_EXPORT RecordBatch { + public: + virtual ~RecordBatch() = default; + + /// \param[in] schema The record batch schema + /// \param[in] num_rows length of fields in the record batch. Each array + /// should have the same length as num_rows + /// \param[in] columns the record batch fields as vector of arrays + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns); + + /// \brief Move-based constructor for a vector of Array instances + static std::shared_ptr Make(const std::shared_ptr& schema, + int64_t num_rows, + std::vector>&& columns); + + /// \brief Construct record batch from vector of internal data structures + /// \since 0.5.0 + /// + /// This class is only provided with an rvalue-reference for the input data, + /// and is intended for internal use, or advanced users. + /// + /// \param schema the record batch schema + /// \param num_rows the number of semantic rows in the record batch. 
This + /// should be equal to the length of each field + /// \param columns the data for the batch's columns + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + std::vector>&& columns); + + /// \brief Construct record batch by copying vector of array data + /// \since 0.5.0 + static std::shared_ptr Make( + const std::shared_ptr& schema, int64_t num_rows, + const std::vector>& columns); + + /// \brief Determine if two record batches are exactly equal + /// \return true if batches are equal + bool Equals(const RecordBatch& other) const; + + /// \brief Determine if two record batches are approximately equal + bool ApproxEquals(const RecordBatch& other) const; + + /// \brief Access the schema of this record batch + /// \return the record batch's schema + std::shared_ptr schema() const { return schema_; } + + /// \brief Retrieve an array from the record batch + /// \param[in] i field index, does not boundscheck + /// \return an Array object + virtual std::shared_ptr column(int i) const = 0; + + /// \brief Retrieve an array from the record batch + /// \param[in] name field name + /// \return an Array or null if no field was found + std::shared_ptr GetColumnByName(const std::string& name) const; + + /// \brief Retrieve an array's internal data from the record batch + /// \param[in] i field index, does not boundscheck + /// \return an internal ArrayData object + virtual std::shared_ptr column_data(int i) const = 0; + + /// \brief Add column to the record batch, producing a new RecordBatch + /// + /// \param[in] i field index, which will be boundschecked + /// \param[in] field field to be added + /// \param[in] column column to be added + /// \param[out] out record batch with column added + virtual Status AddColumn(int i, const std::shared_ptr& field, + const std::shared_ptr& column, + std::shared_ptr* out) const = 0; + + /// \brief Add new nullable column to the record batch, producing a new + /// RecordBatch.
+ /// + /// For non-nullable columns, use the Field-based version of this method. + /// + /// \param[in] i field index, which will be boundschecked + /// \param[in] field_name name of field to be added + /// \param[in] column column to be added + /// \param[out] out record batch with column added + virtual Status AddColumn(int i, const std::string& field_name, + const std::shared_ptr& column, + std::shared_ptr* out) const; + + /// \brief Remove column from the record batch, producing a new RecordBatch + /// + /// \param[in] i field index, does boundscheck + /// \param[out] out record batch with column removed + virtual Status RemoveColumn(int i, std::shared_ptr* out) const = 0; + + virtual std::shared_ptr ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const = 0; + + /// \brief Name of the i-th column + const std::string& column_name(int i) const; + + /// \return the number of columns in the record batch + int num_columns() const; + + /// \return the number of rows (the corresponding length of each column) + int64_t num_rows() const { return num_rows_; } + + /// \brief Slice each of the arrays in the record batch + /// \param[in] offset the starting offset to slice, through end of batch + /// \return new record batch + virtual std::shared_ptr Slice(int64_t offset) const; + + /// \brief Slice each of the arrays in the record batch + /// \param[in] offset the starting offset to slice + /// \param[in] length the number of elements to slice from offset + /// \return new record batch + virtual std::shared_ptr Slice(int64_t offset, int64_t length) const = 0; + + /// \brief Check for schema or length inconsistencies + /// \return Status + virtual Status Validate() const; + + protected: + RecordBatch(const std::shared_ptr& schema, int64_t num_rows); + + std::shared_ptr schema_; + int64_t num_rows_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch); +}; + +/// \brief Abstract interface for reading a stream of record batches +class ARROW_EXPORT RecordBatchReader { +
public: + virtual ~RecordBatchReader(); + + /// \return the shared schema of the record batches in the stream + virtual std::shared_ptr schema() const = 0; + + /// \brief Read the next record batch in the stream. Return null for batch + /// when reaching end of stream + /// + /// \param[out] batch the next loaded batch, null at end of stream + /// \return Status + virtual Status ReadNext(std::shared_ptr* batch) = 0; + + /// \brief Consume entire stream as a vector of record batches + Status ReadAll(std::vector>* batches); + + /// \brief Read all batches and concatenate as arrow::Table + Status ReadAll(std::shared_ptr
* table); +}; + +} // namespace arrow + +#endif // ARROW_RECORD_BATCH_H diff --git a/r/R/inst/include/arrow/scalar.h b/r/R/inst/include/arrow/scalar.h new file mode 100644 index 00000000000..51b5e71c345 --- /dev/null +++ b/r/R/inst/include/arrow/scalar.h @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Object model for scalar (non-Array) values. 
Not intended for use with large +// amounts of data +// +// NOTE: This API is experimental as of the 0.13 version and subject to change +// without deprecation warnings + +#pragma once + +#include +#include + +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +/// \brief Base class for scalar values, representing a single value occupying +/// an array "slot" +struct ARROW_EXPORT Scalar { + virtual ~Scalar() = default; + + /// \brief The type of the scalar value + std::shared_ptr type; + + /// \brief Whether the value is valid (not null) or not + bool is_valid; + + bool Equals(const Scalar& other) const; + bool Equals(const std::shared_ptr& other) const { + if (other) return Equals(*other); + return false; + } + + protected: + Scalar(const std::shared_ptr& type, bool is_valid) + : type(type), is_valid(is_valid) {} +}; + +/// \brief A scalar value for NullType. 
Never valid +struct ARROW_EXPORT NullScalar : public Scalar { + public: + NullScalar() : Scalar{null(), false} {} +}; + +namespace internal { + +struct ARROW_EXPORT PrimitiveScalar : public Scalar { + using Scalar::Scalar; +}; + +} // namespace internal + +struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar { + bool value; + explicit BooleanScalar(bool value, bool is_valid = true) + : internal::PrimitiveScalar{boolean(), is_valid}, value(value) {} +}; + +template +struct NumericScalar : public internal::PrimitiveScalar { + using T = typename Type::c_type; + T value; + + explicit NumericScalar(T value, bool is_valid = true) + : NumericScalar(value, TypeTraits::type_singleton(), is_valid) {} + + protected: + explicit NumericScalar(T value, const std::shared_ptr& type, bool is_valid) + : internal::PrimitiveScalar{type, is_valid}, value(value) {} +}; + +struct ARROW_EXPORT BinaryScalar : public Scalar { + std::shared_ptr value; + explicit BinaryScalar(const std::shared_ptr& value, bool is_valid = true) + : BinaryScalar(value, binary(), is_valid) {} + + protected: + BinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true) + : Scalar{type, is_valid}, value(value) {} +}; + +struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { + FixedSizeBinaryScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true); +}; + +struct ARROW_EXPORT StringScalar : public BinaryScalar { + explicit StringScalar(const std::shared_ptr& value, bool is_valid = true) + : BinaryScalar(value, utf8(), is_valid) {} +}; + +class ARROW_EXPORT Date32Scalar : public NumericScalar { + public: + using NumericScalar::NumericScalar; +}; + +class ARROW_EXPORT Date64Scalar : public NumericScalar { + public: + using NumericScalar::NumericScalar; +}; + +class ARROW_EXPORT Time32Scalar : public internal::PrimitiveScalar { + public: + int32_t value; + Time32Scalar(int32_t value, const std::shared_ptr& type, + bool 
is_valid = true); +}; + +class ARROW_EXPORT Time64Scalar : public internal::PrimitiveScalar { + public: + int64_t value; + Time64Scalar(int64_t value, const std::shared_ptr& type, + bool is_valid = true); +}; + +class ARROW_EXPORT TimestampScalar : public internal::PrimitiveScalar { + public: + int64_t value; + TimestampScalar(int64_t value, const std::shared_ptr& type, + bool is_valid = true); +}; + +class ARROW_EXPORT DurationScalar : public internal::PrimitiveScalar { + public: + int64_t value; + DurationScalar(int64_t value, const std::shared_ptr& type, + bool is_valid = true); +}; + +class ARROW_EXPORT MonthIntervalScalar : public internal::PrimitiveScalar { + public: + int32_t value; + MonthIntervalScalar(int32_t value, const std::shared_ptr& type, + bool is_valid = true); +}; + +class ARROW_EXPORT DayTimeIntervalScalar : public Scalar { + public: + DayTimeIntervalType::DayMilliseconds value; + DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, + const std::shared_ptr& type, bool is_valid = true); +}; + +struct ARROW_EXPORT Decimal128Scalar : public Scalar { + Decimal128 value; + Decimal128Scalar(const Decimal128& value, const std::shared_ptr& type, + bool is_valid = true); +}; + +struct ARROW_EXPORT ListScalar : public Scalar { + std::shared_ptr value; + + ListScalar(const std::shared_ptr& value, const std::shared_ptr& type, + bool is_valid = true); + + explicit ListScalar(const std::shared_ptr& value, bool is_valid = true); +}; + +struct ARROW_EXPORT FixedSizeListScalar : public Scalar { + std::shared_ptr value; + + FixedSizeListScalar(const std::shared_ptr& value, + const std::shared_ptr& type, bool is_valid = true); + + explicit FixedSizeListScalar(const std::shared_ptr& value, bool is_valid = true); +}; + +struct ARROW_EXPORT StructScalar : public Scalar { + std::vector> value; +}; + +class ARROW_EXPORT UnionScalar : public Scalar {}; +class ARROW_EXPORT DictionaryScalar : public Scalar {}; +class ARROW_EXPORT ExtensionScalar : public 
Scalar {}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/sparse_tensor.h b/r/R/inst/include/arrow/sparse_tensor.h new file mode 100644 index 00000000000..e622245d633 --- /dev/null +++ b/r/R/inst/include/arrow/sparse_tensor.h @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_SPARSE_TENSOR_H +#define ARROW_SPARSE_TENSOR_H + +#include +#include +#include + +#include "arrow/tensor.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// SparseIndex class + +struct SparseTensorFormat { + /// EXPERIMENTAL: The index format type of SparseTensor + enum type { COO, CSR }; +}; + +/// \brief EXPERIMENTAL: The base class for the index of a sparse tensor +/// +/// SparseIndex describes where the non-zero elements are within a SparseTensor. +/// +/// There are several ways to represent this. The format_id is used to +/// distinguish what kind of representation is used. Each possible value of +/// format_id must have only one corresponding concrete subclass of SparseIndex. 
+class ARROW_EXPORT SparseIndex { + public: + explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) + : format_id_(format_id), non_zero_length_(non_zero_length) {} + + virtual ~SparseIndex() = default; + + /// \brief Return the identifier of the format type + SparseTensorFormat::type format_id() const { return format_id_; } + + /// \brief Return the number of non zero values in the sparse tensor related + /// to this sparse index + int64_t non_zero_length() const { return non_zero_length_; } + + /// \brief Return the string representation of the sparse index + virtual std::string ToString() const = 0; + + protected: + SparseTensorFormat::type format_id_; + int64_t non_zero_length_; +}; + +namespace internal { +template +class SparseIndexBase : public SparseIndex { + public: + explicit SparseIndexBase(int64_t non_zero_length) + : SparseIndex(SparseIndexType::format_id, non_zero_length) {} +}; +} // namespace internal + +// ---------------------------------------------------------------------- +// SparseCOOIndex class + +/// \brief EXPERIMENTAL: The index data for a COO sparse tensor +/// +/// A COO sparse index manages the location of its non-zero values by their +/// coordinates. 
+class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase { + public: + using CoordsTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; + + // Constructor with a column-major NumericTensor + explicit SparseCOOIndex(const std::shared_ptr& coords); + + /// \brief Return a tensor that has the coordinates of the non-zero values + const std::shared_ptr& indices() const { return coords_; } + + /// \brief Return a string representation of the sparse index + std::string ToString() const override; + + /// \brief Return whether the COO indices are equal + bool Equals(const SparseCOOIndex& other) const { + return indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr coords_; +}; + +// ---------------------------------------------------------------------- +// SparseCSRIndex class + +/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix +/// +/// A CSR sparse index manages the location of its non-zero values by two +/// vectors. +/// +/// The first vector, called indptr, represents the range of the rows; the i-th +/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. +/// So the length of an indptr vector is the number of rows + 1. +/// +/// The other vector, called indices, represents the column indices of the +/// corresponding non-zero values. So the length of an indices vector is same +/// as the number of non-zero-values. 
+class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase { + public: + using IndexTensor = NumericTensor; + + static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; + + // Constructor with two index vectors + explicit SparseCSRIndex(const std::shared_ptr& indptr, + const std::shared_ptr& indices); + + /// \brief Return a 1D tensor of indptr vector + const std::shared_ptr& indptr() const { return indptr_; } + + /// \brief Return a 1D tensor of indices vector + const std::shared_ptr& indices() const { return indices_; } + + /// \brief Return a string representation of the sparse index + std::string ToString() const override; + + /// \brief Return whether the CSR indices are equal + bool Equals(const SparseCSRIndex& other) const { + return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); + } + + protected: + std::shared_ptr indptr_; + std::shared_ptr indices_; +}; + +// ---------------------------------------------------------------------- +// SparseTensor class + +/// \brief EXPERIMENTAL: The base class of sparse tensor container +class ARROW_EXPORT SparseTensor { + public: + virtual ~SparseTensor() = default; + + SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } + + /// \brief Return a value type of the sparse tensor + std::shared_ptr type() const { return type_; } + + /// \brief Return a buffer that contains the value vector of the sparse tensor + std::shared_ptr data() const { return data_; } + + /// \brief Return an immutable raw data pointer + const uint8_t* raw_data() const { return data_->data(); } + + /// \brief Return a mutable raw data pointer + uint8_t* raw_mutable_data() const { return data_->mutable_data(); } + + /// \brief Return a shape vector of the sparse tensor + const std::vector& shape() const { return shape_; } + + /// \brief Return a sparse index of the sparse tensor + const std::shared_ptr& sparse_index() const { return sparse_index_; } + + /// \brief 
Return a number of dimensions of the sparse tensor + int ndim() const { return static_cast(shape_.size()); } + + /// \brief Return a vector of dimension names + const std::vector& dim_names() const { return dim_names_; } + + /// \brief Return the name of the i-th dimension + const std::string& dim_name(int i) const; + + /// \brief Total number of value cells in the sparse tensor + int64_t size() const; + + /// \brief Return true if the underlying data buffer is mutable + bool is_mutable() const { return data_->is_mutable(); } + + /// \brief Total number of non-zero cells in the sparse tensor + int64_t non_zero_length() const { + return sparse_index_ ? sparse_index_->non_zero_length() : 0; + } + + /// \brief Return whether sparse tensors are equal + bool Equals(const SparseTensor& other) const; + + protected: + // Constructor with all attributes + SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, + const std::shared_ptr& sparse_index, + const std::vector& dim_names); + + std::shared_ptr type_; + std::shared_ptr data_; + std::vector shape_; + std::shared_ptr sparse_index_; + + // These names are optional + std::vector dim_names_; +}; + +// ---------------------------------------------------------------------- +// SparseTensorImpl class + +/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index +/// type +template +class ARROW_EXPORT SparseTensorImpl : public SparseTensor { + public: + virtual ~SparseTensorImpl() = default; + + // Constructor with all attributes + SparseTensorImpl(const std::shared_ptr& sparse_index, + const std::shared_ptr& type, + const std::shared_ptr& data, const std::vector& shape, + const std::vector& dim_names) + : SparseTensor(type, data, shape, sparse_index, dim_names) {} + + // Constructor for empty sparse tensor + SparseTensorImpl(const std::shared_ptr& type, + const std::vector& shape, + const std::vector& dim_names = {}); + + // Constructor with a dense 
numeric tensor + template + explicit SparseTensorImpl(const NumericTensor& tensor); + + // Constructor with a dense tensor + explicit SparseTensorImpl(const Tensor& tensor); + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); +}; + +/// \brief EXPERIMENTAL: Type alias for COO sparse tensor +using SparseTensorCOO = SparseTensorImpl; + +/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix +using SparseTensorCSR = SparseTensorImpl; +using SparseMatrixCSR = SparseTensorImpl; + +} // namespace arrow + +#endif // ARROW_SPARSE_TENSOR_H diff --git a/r/R/inst/include/arrow/status.h b/r/R/inst/include/arrow/status.h new file mode 100644 index 00000000000..790d9b71d23 --- /dev/null +++ b/r/R/inst/include/arrow/status.h @@ -0,0 +1,424 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +// Adapted from Apache Kudu, TensorFlow + +#ifndef ARROW_STATUS_H_ +#define ARROW_STATUS_H_ + +#include +#include +#include +#include + +#ifdef ARROW_EXTRA_ERROR_CONTEXT +#include +#endif + +#include "arrow/util/macros.h" +#include "arrow/util/string_builder.h" +#include "arrow/util/visibility.h" + +#ifdef ARROW_EXTRA_ERROR_CONTEXT + +/// \brief Return with given status if condition is met. 
+#define ARROW_RETURN_IF_(condition, status, expr) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + ::arrow::Status _s = (status); \ + std::stringstream ss; \ + ss << _s.message() << "\n" << __FILE__ << ":" << __LINE__ << " code: " << expr; \ + return ::arrow::Status(_s.code(), ss.str()); \ + } \ + } while (0) + +#else + +#define ARROW_RETURN_IF_(condition, status, _) \ + do { \ + if (ARROW_PREDICT_FALSE(condition)) { \ + return (status); \ + } \ + } while (0) + +#endif // ARROW_EXTRA_ERROR_CONTEXT + +#define ARROW_RETURN_IF(condition, status) \ + ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status)) + +/// \brief Propagate any non-successful Status to the caller +#define ARROW_RETURN_NOT_OK(status) \ + do { \ + ::arrow::Status __s = (status); \ + ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \ + } while (false) + +#define RETURN_NOT_OK_ELSE(s, else_) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { \ + else_; \ + return _s; \ + } \ + } while (false) + +// This is an internal-use macro and should not be used in public headers. +#ifndef RETURN_NOT_OK +#define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) +#endif + +namespace arrow { + +enum class StatusCode : char { + OK = 0, + OutOfMemory = 1, + KeyError = 2, + TypeError = 3, + Invalid = 4, + IOError = 5, + CapacityError = 6, + IndexError = 7, + UnknownError = 9, + NotImplemented = 10, + SerializationError = 11, + PythonError = 12, + RError = 13, + PlasmaObjectExists = 20, + PlasmaObjectNonexistent = 21, + PlasmaStoreFull = 22, + PlasmaObjectAlreadySealed = 23, + StillExecuting = 24, + // Gandiva range of errors + CodeGenError = 40, + ExpressionValidationError = 41, + ExecutionError = 42 +}; + +#if defined(__clang__) +// Only clang supports warn_unused_result as a type annotation. +class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; +#endif + +/// \brief Status outcome object (success or error) +/// +/// The Status object is an object holding the outcome of an operation. 
+/// The outcome is represented as a StatusCode, either success +/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values). +/// +/// Additionally, if an error occurred, a specific error message is generally +/// attached. +class ARROW_EXPORT Status { + public: + // Create a success status. + Status() noexcept : state_(NULLPTR) {} + ~Status() noexcept { + // ARROW-2400: On certain compilers, splitting off the slow path improves + // performance significantly. + if (ARROW_PREDICT_FALSE(state_ != NULL)) { + DeleteState(); + } + } + + Status(StatusCode code, const std::string& msg); + + // Copy the specified status. + inline Status(const Status& s); + inline Status& operator=(const Status& s); + + // Move the specified status. + inline Status(Status&& s) noexcept; + inline Status& operator=(Status&& s) noexcept; + + // AND the statuses. + inline Status operator&(const Status& s) const noexcept; + inline Status operator&(Status&& s) const noexcept; + inline Status& operator&=(const Status& s) noexcept; + inline Status& operator&=(Status&& s) noexcept; + + /// Return a success status + static Status OK() { return Status(); } + + /// Return a success status with a specific message + template + static Status OK(Args&&... args) { + return Status(StatusCode::OK, util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status for out-of-memory conditions + template + static Status OutOfMemory(Args&&... args) { + return Status(StatusCode::OutOfMemory, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status for failed key lookups (e.g. column name in a table) + template + static Status KeyError(Args&&... args) { + return Status(StatusCode::KeyError, util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status for type errors (such as mismatching data types) + template + static Status TypeError(Args&&... 
args) { + return Status(StatusCode::TypeError, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status for unknown errors + template + static Status UnknownError(Args&&... args) { + return Status(StatusCode::UnknownError, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status when an operation or a combination of operation and + /// data types is unimplemented + template + static Status NotImplemented(Args&&... args) { + return Status(StatusCode::NotImplemented, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status for invalid data (for example a string that fails parsing) + template + static Status Invalid(Args&&... args) { + return Status(StatusCode::Invalid, util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status when an index is out of bounds + template + static Status IndexError(Args&&... args) { + return Status(StatusCode::IndexError, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status when a container's capacity would exceed its limits + template + static Status CapacityError(Args&&... args) { + return Status(StatusCode::CapacityError, + util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status when some IO-related operation failed + template + static Status IOError(Args&&... args) { + return Status(StatusCode::IOError, util::StringBuilder(std::forward(args)...)); + } + + /// Return an error status when some (de)serialization operation failed + template + static Status SerializationError(Args&&... args) { + return Status(StatusCode::SerializationError, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status RError(Args&&... args) { + return Status(StatusCode::RError, util::StringBuilder(std::forward(args)...)); + } + + template + static Status PlasmaObjectExists(Args&&... 
args) { + return Status(StatusCode::PlasmaObjectExists, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status PlasmaObjectNonexistent(Args&&... args) { + return Status(StatusCode::PlasmaObjectNonexistent, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status PlasmaObjectAlreadySealed(Args&&... args) { + return Status(StatusCode::PlasmaObjectAlreadySealed, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status PlasmaStoreFull(Args&&... args) { + return Status(StatusCode::PlasmaStoreFull, + util::StringBuilder(std::forward(args)...)); + } + + static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } + + template + static Status CodeGenError(Args&&... args) { + return Status(StatusCode::CodeGenError, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status ExpressionValidationError(Args&&... args) { + return Status(StatusCode::ExpressionValidationError, + util::StringBuilder(std::forward(args)...)); + } + + template + static Status ExecutionError(Args&&... args) { + return Status(StatusCode::ExecutionError, + util::StringBuilder(std::forward(args)...)); + } + + /// Return true iff the status indicates success. + bool ok() const { return (state_ == NULLPTR); } + + /// Return true iff the status indicates an out-of-memory error. + bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + /// Return true iff the status indicates a key lookup error. + bool IsKeyError() const { return code() == StatusCode::KeyError; } + /// Return true iff the status indicates invalid data. + bool IsInvalid() const { return code() == StatusCode::Invalid; } + /// Return true iff the status indicates an IO-related failure. + bool IsIOError() const { return code() == StatusCode::IOError; } + /// Return true iff the status indicates a container reaching capacity limits. 
+ bool IsCapacityError() const { return code() == StatusCode::CapacityError; } + /// Return true iff the status indicates an out of bounds index. + bool IsIndexError() const { return code() == StatusCode::IndexError; } + /// Return true iff the status indicates a type error. + bool IsTypeError() const { return code() == StatusCode::TypeError; } + /// Return true iff the status indicates an unknown error. + bool IsUnknownError() const { return code() == StatusCode::UnknownError; } + /// Return true iff the status indicates an unimplemented operation. + bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } + /// Return true iff the status indicates a (de)serialization failure + bool IsSerializationError() const { return code() == StatusCode::SerializationError; } + /// Return true iff the status indicates a R-originated error. + bool IsRError() const { return code() == StatusCode::RError; } + /// Return true iff the status indicates a Python-originated error. + bool IsPythonError() const { return code() == StatusCode::PythonError; } + /// Return true iff the status indicates an already existing Plasma object. + bool IsPlasmaObjectExists() const { return code() == StatusCode::PlasmaObjectExists; } + /// Return true iff the status indicates a non-existent Plasma object. + bool IsPlasmaObjectNonexistent() const { + return code() == StatusCode::PlasmaObjectNonexistent; + } + /// Return true iff the status indicates an already sealed Plasma object. + bool IsPlasmaObjectAlreadySealed() const { + return code() == StatusCode::PlasmaObjectAlreadySealed; + } + /// Return true iff the status indicates the Plasma store reached its capacity limit. 
+ bool IsPlasmaStoreFull() const { return code() == StatusCode::PlasmaStoreFull; } + + bool IsStillExecuting() const { return code() == StatusCode::StillExecuting; } + + bool IsCodeGenError() const { return code() == StatusCode::CodeGenError; } + + bool IsExpressionValidationError() const { + return code() == StatusCode::ExpressionValidationError; + } + + bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } + + /// \brief Return a string representation of this status suitable for printing. + /// + /// The string "OK" is returned for success. + std::string ToString() const; + + /// \brief Return a string representation of the status code, without the message + /// text or POSIX code information. + std::string CodeAsString() const; + + /// \brief Return the StatusCode value attached to this status. + StatusCode code() const { return ok() ? StatusCode::OK : state_->code; } + + /// \brief Return the specific error message attached to this status. + std::string message() const { return ok() ? "" : state_->msg; } + + [[noreturn]] void Abort() const; + [[noreturn]] void Abort(const std::string& message) const; + + private: + struct State { + StatusCode code; + std::string msg; + }; + // OK status has a `NULL` state_. Otherwise, `state_` points to + // a `State` structure containing the error code and message(s) + State* state_; + + void DeleteState() { + delete state_; + state_ = NULLPTR; + } + void CopyFrom(const Status& s); + inline void MoveFrom(Status& s); +}; + +static inline std::ostream& operator<<(std::ostream& os, const Status& x) { + os << x.ToString(); + return os; +} + +void Status::MoveFrom(Status& s) { + delete state_; + state_ = s.state_; + s.state_ = NULLPTR; +} + +Status::Status(const Status& s) + : state_((s.state_ == NULLPTR) ? 
NULLPTR : new State(*s.state_)) {} + +Status& Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + CopyFrom(s); + } + return *this; +} + +Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULLPTR; } + +Status& Status::operator=(Status&& s) noexcept { + MoveFrom(s); + return *this; +} + +/// \cond FALSE +// (note: emits warnings on Doxygen < 1.8.15, +// see https://github.com/doxygen/doxygen/issues/6295) +Status Status::operator&(const Status& s) const noexcept { + if (ok()) { + return s; + } else { + return *this; + } +} + +Status Status::operator&(Status&& s) const noexcept { + if (ok()) { + return std::move(s); + } else { + return *this; + } +} + +Status& Status::operator&=(const Status& s) noexcept { + if (ok() && !s.ok()) { + CopyFrom(s); + } + return *this; +} + +Status& Status::operator&=(Status&& s) noexcept { + if (ok() && !s.ok()) { + MoveFrom(s); + } + return *this; +} +/// \endcond + +} // namespace arrow + +#endif // ARROW_STATUS_H_ diff --git a/r/R/inst/include/arrow/stl.h b/r/R/inst/include/arrow/stl.h new file mode 100644 index 00000000000..d641e39955b --- /dev/null +++ b/r/R/inst/include/arrow/stl.h @@ -0,0 +1,373 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_STL_H +#define ARROW_STL_H + +#include +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/compute/api.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +class Schema; + +namespace stl { + +/// Traits meta class to map standard C/C++ types to equivalent Arrow types. +template +struct ConversionTraits {}; + +#define ARROW_STL_CONVERSION(c_type, ArrowType_) \ + template <> \ + struct ConversionTraits : public CTypeTraits { \ + static Status AppendRow(typename TypeTraits::BuilderType& builder, \ + c_type cell) { \ + return builder.Append(cell); \ + } \ + static c_type GetEntry(const typename TypeTraits::ArrayType& array, \ + size_t j) { \ + return array.Value(j); \ + } \ + constexpr static bool nullable = false; \ + }; + +ARROW_STL_CONVERSION(bool, BooleanType) +ARROW_STL_CONVERSION(int8_t, Int8Type) +ARROW_STL_CONVERSION(int16_t, Int16Type) +ARROW_STL_CONVERSION(int32_t, Int32Type) +ARROW_STL_CONVERSION(int64_t, Int64Type) +ARROW_STL_CONVERSION(uint8_t, UInt8Type) +ARROW_STL_CONVERSION(uint16_t, UInt16Type) +ARROW_STL_CONVERSION(uint32_t, UInt32Type) +ARROW_STL_CONVERSION(uint64_t, UInt64Type) +ARROW_STL_CONVERSION(float, FloatType) +ARROW_STL_CONVERSION(double, DoubleType) + +template <> +struct ConversionTraits : public CTypeTraits { + static Status AppendRow(StringBuilder& builder, const std::string& cell) { + return builder.Append(cell); + } + static std::string GetEntry(const StringArray& array, size_t j) { + 
return array.GetString(j); + } + constexpr static bool nullable = false; +}; + +template +struct ConversionTraits> + : public CTypeTraits> { + static Status AppendRow(ListBuilder& builder, std::vector cell) { + using ElementBuilderType = typename TypeTraits< + typename ConversionTraits::ArrowType>::BuilderType; + ARROW_RETURN_NOT_OK(builder.Append()); + ElementBuilderType& value_builder = + ::arrow::internal::checked_cast(*builder.value_builder()); + for (auto const& value : cell) { + ARROW_RETURN_NOT_OK( + ConversionTraits::AppendRow(value_builder, value)); + } + return Status::OK(); + } + + static std::vector GetEntry(const ListArray& array, size_t j) { + using ElementArrayType = typename TypeTraits< + typename ConversionTraits::ArrowType>::ArrayType; + + const ElementArrayType& value_array = + ::arrow::internal::checked_cast(*array.values()); + + std::vector vec(array.value_length(j)); + for (int64_t i = 0; i < array.value_length(j); i++) { + vec[i] = ConversionTraits::GetEntry(value_array, + array.value_offset(j) + i); + } + return vec; + } + + constexpr static bool nullable = false; +}; + +/// Build an arrow::Schema based upon the types defined in a std::tuple-like structure. +/// +/// While the type information is available at compile-time, we still need to add the +/// column names at runtime, thus these methods are not constexpr. +template ::value> +struct SchemaFromTuple { + using Element = typename std::tuple_element::type; + + // Implementations that take a vector-like object for the column names. + + /// Recursively build a vector of arrow::Field from the defined types. + /// + /// In most cases MakeSchema is the better entrypoint for the Schema creation. 
+ static std::vector> MakeSchemaRecursion( + const std::vector& names) { + std::vector> ret = + SchemaFromTuple::MakeSchemaRecursion(names); + std::shared_ptr type = CTypeTraits::type_singleton(); + ret.push_back(field(names[N - 1], type, false /* nullable */)); + return ret; + } + + /// Build a Schema from the types of the tuple-like structure passed in as template + /// parameter assign the column names at runtime. + /// + /// An example usage of this API can look like the following: + /// + /// \code{.cpp} + /// using TupleType = std::tuple>; + /// std::shared_ptr schema = + /// SchemaFromTuple::MakeSchema({"int_column", "list_of_strings_column"}); + /// \endcode + static std::shared_ptr MakeSchema(const std::vector& names) { + return std::make_shared(MakeSchemaRecursion(names)); + } + + // Implementations that take a tuple-like object for the column names. + + /// Recursively build a vector of arrow::Field from the defined types. + /// + /// In most cases MakeSchema is the better entrypoint for the Schema creation. + template + static std::vector> MakeSchemaRecursionT( + const NamesTuple& names) { + using std::get; + + std::vector> ret = + SchemaFromTuple::MakeSchemaRecursionT(names); + std::shared_ptr type = ConversionTraits::type_singleton(); + ret.push_back(field(get(names), type, ConversionTraits::nullable)); + return ret; + } + + /// Build a Schema from the types of the tuple-like structure passed in as template + /// parameter assign the column names at runtime. 
+ /// + /// An example usage of this API can look like the following: + /// + /// \code{.cpp} + /// using TupleType = std::tuple>; + /// std::shared_ptr schema = + /// SchemaFromTuple::MakeSchema({"int_column", "list_of_strings_column"}); + /// \endcode + template + static std::shared_ptr MakeSchema(const NamesTuple& names) { + return std::make_shared(MakeSchemaRecursionT(names)); + } +}; + +template +struct SchemaFromTuple { + static std::vector> MakeSchemaRecursion( + const std::vector& names) { + std::vector> ret; + ret.reserve(names.size()); + return ret; + } + + template + static std::vector> MakeSchemaRecursionT( + const NamesTuple& names) { + std::vector> ret; + ret.reserve(std::tuple_size::value); + return ret; + } +}; + +namespace internal { +template ::value> +struct CreateBuildersRecursive { + static Status Make(MemoryPool* pool, + std::vector>* builders) { + using Element = typename std::tuple_element::type; + std::shared_ptr type = ConversionTraits::type_singleton(); + ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builders->at(N - 1))); + + return CreateBuildersRecursive::Make(pool, builders); + } +}; + +template +struct CreateBuildersRecursive { + static Status Make(MemoryPool*, std::vector>*) { + return Status::OK(); + } +}; + +template ::value> +struct RowIterator { + static Status Append(const std::vector>& builders, + const Tuple& row) { + using std::get; + using Element = typename std::tuple_element::type; + using BuilderType = + typename TypeTraits::ArrowType>::BuilderType; + + BuilderType& builder = + ::arrow::internal::checked_cast(*builders[N - 1]); + ARROW_RETURN_NOT_OK(ConversionTraits::AppendRow(builder, get(row))); + + return RowIterator::Append(builders, row); + } +}; + +template +struct RowIterator { + static Status Append(const std::vector>& builders, + const Tuple& row) { + return Status::OK(); + } +}; + +template ::value> +struct EnsureColumnTypes { + static Status Cast(const Table& table, std::shared_ptr
* table_owner, + const compute::CastOptions& cast_options, + compute::FunctionContext* ctx, + std::reference_wrapper* result) { + using Element = typename std::tuple_element::type; + std::shared_ptr expected_type = ConversionTraits::type_singleton(); + + if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) { + compute::Datum casted; + ARROW_RETURN_NOT_OK(compute::Cast(ctx, compute::Datum(table.column(N - 1)->data()), + expected_type, cast_options, &casted)); + std::shared_ptr new_column = std::make_shared( + table.schema()->field(N - 1)->WithType(expected_type), casted.chunked_array()); + ARROW_RETURN_NOT_OK(table.SetColumn(N - 1, new_column, table_owner)); + *result = **table_owner; + } + + return EnsureColumnTypes::Cast(result->get(), table_owner, cast_options, + ctx, result); + } +}; + +template +struct EnsureColumnTypes { + static Status Cast(const Table& table, std::shared_ptr
* table_ownder, + const compute::CastOptions& cast_options, + compute::FunctionContext* ctx, + std::reference_wrapper* result) { + return Status::OK(); + } +}; + +template ::value> +struct TupleSetter { + static void Fill(const Table& table, Range* rows) { + using std::get; + using Element = typename std::tuple_element::type; + using ArrayType = + typename TypeTraits::ArrowType>::ArrayType; + + auto iter = rows->begin(); + const ChunkedArray& chunked_array = *table.column(N - 1)->data(); + for (int i = 0; i < chunked_array.num_chunks(); i++) { + const ArrayType& array = + ::arrow::internal::checked_cast(*chunked_array.chunk(i)); + for (int64_t j = 0; j < array.length(); j++) { + get(*iter++) = ConversionTraits::GetEntry(array, j); + } + } + + return TupleSetter::Fill(table, rows); + } +}; + +template +struct TupleSetter { + static void Fill(const Table& table, Range* rows) {} +}; + +} // namespace internal + +template +Status TableFromTupleRange(MemoryPool* pool, const Range& rows, + const std::vector& names, + std::shared_ptr
* table) { + using row_type = typename std::iterator_traits::value_type; + constexpr std::size_t n_columns = std::tuple_size::value; + + std::shared_ptr schema = SchemaFromTuple::MakeSchema(names); + + std::vector> builders(n_columns); + ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive::Make(pool, &builders)); + + for (auto const& row : rows) { + ARROW_RETURN_NOT_OK(internal::RowIterator::Append(builders, row)); + } + + std::vector> arrays; + for (auto const& builder : builders) { + std::shared_ptr array; + ARROW_RETURN_NOT_OK(builder->Finish(&array)); + arrays.emplace_back(array); + } + + *table = Table::Make(schema, arrays); + + return Status::OK(); +} + +template +Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_options, + compute::FunctionContext* ctx, Range* rows) { + using row_type = typename std::decay::type; + constexpr std::size_t n_columns = std::tuple_size::value; + + if (table.schema()->num_fields() != n_columns) { + std::stringstream ss; + ss << "Number of columns in the table does not match the width of the target: "; + ss << table.schema()->num_fields() << " != " << n_columns; + return Status::Invalid(ss.str()); + } + + // TODO: Use std::size with C++17 + if (rows->size() != static_cast(table.num_rows())) { + std::stringstream ss; + ss << "Number of rows in the table does not match the size of the target: "; + ss << table.num_rows() << " != " << rows->size(); + return Status::Invalid(ss.str()); + } + + // Check that all columns have the correct type, otherwise cast them. + std::shared_ptr
table_owner; + std::reference_wrapper current_table(table); + + ARROW_RETURN_NOT_OK(internal::EnsureColumnTypes::Cast( + table, &table_owner, cast_options, ctx, ¤t_table)); + + internal::TupleSetter::Fill(current_table.get(), rows); + + return Status::OK(); +} + +} // namespace stl +} // namespace arrow + +#endif // ARROW_STL_H diff --git a/r/R/inst/include/arrow/table.h b/r/R/inst/include/arrow/table.h new file mode 100644 index 00000000000..8016371d808 --- /dev/null +++ b/r/R/inst/include/arrow/table.h @@ -0,0 +1,377 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TABLE_H +#define ARROW_TABLE_H + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/record_batch.h" +#include "arrow/type.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class Status; + +/// \class ChunkedArray +/// \brief A data structure managing a list of primitive Arrow arrays logically +/// as one large array +class ARROW_EXPORT ChunkedArray { + public: + /// \brief Construct a chunked array from a vector of arrays + /// + /// The vector should be non-empty and all its elements should have the same + /// data type. 
+ explicit ChunkedArray(const ArrayVector& chunks); + + /// \brief Construct a chunked array from a single Array + explicit ChunkedArray(const std::shared_ptr& chunk) + : ChunkedArray(ArrayVector({chunk})) {} + + /// \brief Construct a chunked array from a vector of arrays and a data type + /// + /// As the data type is passed explicitly, the vector may be empty. + ChunkedArray(const ArrayVector& chunks, const std::shared_ptr& type); + + /// \return the total length of the chunked array; computed on construction + int64_t length() const { return length_; } + + /// \return the total number of nulls among all chunks + int64_t null_count() const { return null_count_; } + + int num_chunks() const { return static_cast(chunks_.size()); } + + /// \return chunk a particular chunk from the chunked array + std::shared_ptr chunk(int i) const { return chunks_[i]; } + + const ArrayVector& chunks() const { return chunks_; } + + /// \brief Construct a zero-copy slice of the chunked array with the + /// indicated offset and length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. If there are not enough + /// elements in the chunked array, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const; + + /// \brief Slice from offset until end of the chunked array + std::shared_ptr Slice(int64_t offset) const; + + /// \brief Flatten this chunked array as a vector of chunked arrays, one + /// for each struct field + /// + /// \param[in] pool The pool for buffer allocations, if any + /// \param[out] out The resulting vector of arrays + Status Flatten(MemoryPool* pool, std::vector>* out) const; + + std::shared_ptr type() const { return type_; } + + /// \brief Determine if two chunked arrays are equal. + /// + /// Two chunked arrays can be equal only if they have equal datatypes. 
+ /// However, they may be equal even if they have different chunkings. + bool Equals(const ChunkedArray& other) const; + /// \brief Determine if two chunked arrays are equal. + bool Equals(const std::shared_ptr& other) const; + + protected: + ArrayVector chunks_; + int64_t length_; + int64_t null_count_; + std::shared_ptr type_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); +}; + +/// \class Column +/// \brief An immutable column data structure consisting of a field (type +/// metadata) and a chunked data array +class ARROW_EXPORT Column { + public: + /// \brief Construct a column from a vector of arrays + /// + /// The array chunks' datatype must match the field's datatype. + Column(const std::shared_ptr& field, const ArrayVector& chunks); + /// \brief Construct a column from a chunked array + /// + /// The chunked array's datatype must match the field's datatype. + Column(const std::shared_ptr& field, const std::shared_ptr& data); + /// \brief Construct a column from a single array + /// + /// The array's datatype must match the field's datatype. + Column(const std::shared_ptr& field, const std::shared_ptr& data); + + /// \brief Construct a column from a name and an array + /// + /// A field with the given name and the array's datatype is automatically created. + Column(const std::string& name, const std::shared_ptr& data); + /// \brief Construct a column from a name and a chunked array + /// + /// A field with the given name and the array's datatype is automatically created. 
+ Column(const std::string& name, const std::shared_ptr& data); + + int64_t length() const { return data_->length(); } + + int64_t null_count() const { return data_->null_count(); } + + std::shared_ptr field() const { return field_; } + + /// \brief The column name + /// \return the column's name in the passed metadata + const std::string& name() const { return field_->name(); } + + /// \brief The column type + /// \return the column's type according to the metadata + std::shared_ptr type() const { return field_->type(); } + + /// \brief The column data as a chunked array + /// \return the column's data as a chunked logical array + std::shared_ptr data() const { return data_; } + + /// \brief Construct a zero-copy slice of the column with the indicated + /// offset and length + /// + /// \param[in] offset the position of the first element in the constructed + /// slice + /// \param[in] length the length of the slice. If there are not enough + /// elements in the column, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + std::shared_ptr Slice(int64_t offset, int64_t length) const { + return std::make_shared(field_, data_->Slice(offset, length)); + } + + /// \brief Slice from offset until end of the column + std::shared_ptr Slice(int64_t offset) const { + return std::make_shared(field_, data_->Slice(offset)); + } + + /// \brief Flatten this column as a vector of columns + /// + /// \param[in] pool The pool for buffer allocations, if any + /// \param[out] out The resulting vector of arrays + Status Flatten(MemoryPool* pool, std::vector>* out) const; + + /// \brief Determine if two columns are equal. + /// + /// Two columns can be equal only if they have equal datatypes. + /// However, they may be equal even if they have different chunkings. + bool Equals(const Column& other) const; + /// \brief Determine if the two columns are equal. 
+ bool Equals(const std::shared_ptr& other) const; + + /// \brief Verify that the column's array data is consistent with the passed + /// field's metadata + Status ValidateData(); + + protected: + std::shared_ptr field_; + std::shared_ptr data_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Column); +}; + +/// \class Table +/// \brief Logical table as sequence of chunked arrays +class ARROW_EXPORT Table { + public: + virtual ~Table() = default; + + /// \brief Construct a Table from schema and columns + /// If columns is zero-length, the table's number of rows is zero + /// \param schema The table schema (column types) + /// \param columns The table's columns + /// \param num_rows number of rows in table, -1 (default) to infer from columns + static std::shared_ptr
Make(const std::shared_ptr& schema, + const std::vector>& columns, + int64_t num_rows = -1); + + /// \brief Construct a Table from columns, schema is assembled from column fields + /// If columns is zero-length, the table's number of rows is zero + /// \param columns The table's columns + /// \param num_rows number of rows in table, -1 (default) to infer from columns + static std::shared_ptr
Make(const std::vector>& columns, + int64_t num_rows = -1); + + /// \brief Construct a Table from schema and arrays + /// \param schema The table schema (column types) + /// \param arrays The table's columns as arrays + /// \param num_rows number of rows in table, -1 (default) to infer from columns + static std::shared_ptr
Make(const std::shared_ptr& schema, + const std::vector>& arrays, + int64_t num_rows = -1); + + /// \brief Construct a Table from RecordBatches, using schema supplied by the first + /// RecordBatch. + /// + /// \param[in] batches a std::vector of record batches + /// \param[out] table the returned table + /// \return Status Returns Status::Invalid if there is some problem + static Status FromRecordBatches( + const std::vector>& batches, + std::shared_ptr
* table); + + /// \brief Construct a Table from RecordBatches, using supplied schema. There may be + /// zero record batches + /// + /// \param[in] schema the arrow::Schema for each batch + /// \param[in] batches a std::vector of record batches + /// \param[out] table the returned table + /// \return Status + static Status FromRecordBatches( + const std::shared_ptr& schema, + const std::vector>& batches, + std::shared_ptr
* table); + + /// \brief Construct a Table from a chunked StructArray. One column will be produced + /// for each field of the StructArray. + /// + /// \param[in] array a chunked StructArray + /// \param[out] table the returned table + /// \return Status + static Status FromChunkedStructArray(const std::shared_ptr& array, + std::shared_ptr
* table); + + /// Return the table schema + std::shared_ptr schema() const { return schema_; } + + /// Return a column by index + virtual std::shared_ptr column(int i) const = 0; + + /// \brief Construct a zero-copy slice of the table with the + /// indicated offset and length + /// + /// \param[in] offset the index of the first row in the constructed + /// slice + /// \param[in] length the number of rows of the slice. If there are not enough + /// rows in the table, the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr
+ virtual std::shared_ptr
Slice(int64_t offset, int64_t length) const = 0; + + /// \brief Slice from first row at offset until end of the table + std::shared_ptr
Slice(int64_t offset) const { return Slice(offset, num_rows_); } + + /// \brief Return a column by name + /// \param[in] name field name + /// \return the matching column, or null if no field was found + std::shared_ptr GetColumnByName(const std::string& name) const { + auto i = schema_->GetFieldIndex(name); + return i == -1 ? NULLPTR : column(i); + } + + /// \brief Remove column from the table, producing a new Table + virtual Status RemoveColumn(int i, std::shared_ptr
* out) const = 0; + + /// \brief Add column to the table, producing a new Table + virtual Status AddColumn(int i, const std::shared_ptr& column, + std::shared_ptr
* out) const = 0; + + /// \brief Replace a column in the table, producing a new Table + virtual Status SetColumn(int i, const std::shared_ptr& column, + std::shared_ptr
* out) const = 0; + + /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) + /// \since 0.5.0 + /// + /// \param[in] metadata new KeyValueMetadata + /// \return new Table + virtual std::shared_ptr
ReplaceSchemaMetadata( + const std::shared_ptr& metadata) const = 0; + + /// \brief Flatten the table, producing a new Table. Any column with a + /// struct type will be flattened into multiple columns + /// + /// \param[in] pool The pool for buffer allocations, if any + /// \param[out] out The returned table + virtual Status Flatten(MemoryPool* pool, std::shared_ptr
* out) const = 0; + + /// \brief Perform any checks to validate the input arguments + virtual Status Validate() const = 0; + + /// \brief Return the number of columns in the table + int num_columns() const { return schema_->num_fields(); } + + /// \brief Return the number of rows (equal to each column's logical length) + int64_t num_rows() const { return num_rows_; } + + /// \brief Determine if tables are equal + /// + /// Two tables can be equal only if they have equal schemas. + /// However, they may be equal even if they have different chunkings. + bool Equals(const Table& other) const; + + protected: + Table(); + + std::shared_ptr schema_; + int64_t num_rows_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Table); +}; + +/// \brief Compute a stream of record batches from a (possibly chunked) Table +/// +/// The conversion is zero-copy: each record batch is a view over a slice +/// of the table's columns. +class ARROW_EXPORT TableBatchReader : public RecordBatchReader { + public: + ~TableBatchReader() override; + + /// \brief Construct a TableBatchReader for the given table + explicit TableBatchReader(const Table& table); + + std::shared_ptr schema() const override; + + Status ReadNext(std::shared_ptr* out) override; + + /// \brief Set the desired maximum chunk size of record batches + /// + /// The actual chunk size of each record batch may be smaller, depending + /// on actual chunking characteristics of each table column. + void set_chunksize(int64_t chunksize); + + private: + class TableBatchReaderImpl; + std::unique_ptr impl_; +}; + +/// \brief Construct table from multiple input tables. +/// +/// The tables are concatenated vertically. Therefore, all tables should +/// have the same schema. Each column in the output table is the result +/// of concatenating the corresponding columns in all input tables. +ARROW_EXPORT +Status ConcatenateTables(const std::vector>& tables, + std::shared_ptr
* table); + +} // namespace arrow + +#endif // ARROW_TABLE_H diff --git a/r/R/inst/include/arrow/table_builder.h b/r/R/inst/include/arrow/table_builder.h new file mode 100644 index 00000000000..8e7dfc1e5b3 --- /dev/null +++ b/r/R/inst/include/arrow/table_builder.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TABLE_BUILDER_H +#define ARROW_TABLE_BUILDER_H + +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; +class RecordBatch; + +/// \class RecordBatchBuilder +/// \brief Helper class for creating record batches iteratively given a known +/// schema +class ARROW_EXPORT RecordBatchBuilder { + public: + /// \brief Create and initialize a RecordBatchBuilder + /// \param[in] schema The schema for the record batch + /// \param[in] pool A MemoryPool to use for allocations + /// \param[out] builder the created builder instance + static Status Make(const std::shared_ptr& schema, MemoryPool* pool, + std::unique_ptr* builder); + + /// \brief Create and initialize a RecordBatchBuilder + /// \param[in] schema The schema for the record batch + /// \param[in] pool A MemoryPool to use for allocations + /// \param[in] initial_capacity The initial capacity for the builders + /// \param[out] builder the created builder instance + static Status Make(const std::shared_ptr& schema, MemoryPool* pool, + int64_t initial_capacity, + std::unique_ptr* builder); + + /// \brief Get base pointer to field builder + /// \param i the field index + /// \return pointer to ArrayBuilder + ArrayBuilder* GetField(int i) { return raw_field_builders_[i]; } + + /// \brief Return field builder cast to indicated specific builder type + /// \param i the field index + /// \return pointer to template type + template + T* GetFieldAs(int i) { + return internal::checked_cast(raw_field_builders_[i]); + } + + /// \brief Finish current batch and optionally reset + /// \param[in] reset_builders whether to reset the builders afterwards + /// \param[out] batch the resulting RecordBatch + /// \return Status + Status Flush(bool reset_builders, std::shared_ptr* batch); + + /// \brief Finish current batch and reset + /// 
\param[out] batch the resulting RecordBatch + /// \return Status + Status Flush(std::shared_ptr* batch); + + /// \brief Set the initial capacity for new builders + void SetInitialCapacity(int64_t capacity); + + /// \brief The initial capacity for builders + int64_t initial_capacity() const { return initial_capacity_; } + + /// \brief The number of fields in the schema + int num_fields() const { return schema_->num_fields(); } + + /// \brief The schema this builder was created with + std::shared_ptr schema() const { return schema_; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchBuilder); + + RecordBatchBuilder(const std::shared_ptr& schema, MemoryPool* pool, + int64_t initial_capacity); + + Status CreateBuilders(); + Status InitBuilders(); + + std::shared_ptr schema_; + int64_t initial_capacity_; + MemoryPool* pool_; + + std::vector> field_builders_; + std::vector raw_field_builders_; +}; + +} // namespace arrow + +#endif // ARROW_TABLE_BUILDER_H diff --git a/r/R/inst/include/arrow/tensor.h b/r/R/inst/include/arrow/tensor.h new file mode 100644 index 00000000000..317150234e3 --- /dev/null +++ b/r/R/inst/include/arrow/tensor.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TENSOR_H +#define ARROW_TENSOR_H + +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +static inline bool is_tensor_supported(Type::type type_id) { + switch (type_id) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::UINT64: + case Type::INT64: + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + return true; + default: + break; + } + return false; +} + +template +class SparseTensorImpl; + +class ARROW_EXPORT Tensor { + public: + virtual ~Tensor() = default; + + /// Constructor with no dimension names or strides, data assumed to be row-major + Tensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape); + + /// Constructor with non-negative strides + Tensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, const std::vector& strides); + + /// Constructor with non-negative strides and dimension names + Tensor(const std::shared_ptr& type, const std::shared_ptr& data, + const std::vector& shape, const std::vector& strides, + const std::vector& dim_names); + + std::shared_ptr type() const { return type_; } + std::shared_ptr data() const { return data_; } + + const uint8_t* raw_data() const { return data_->data(); } + uint8_t* raw_mutable_data() { return data_->mutable_data(); } + + const std::vector& shape() const { return shape_; } + const std::vector& strides() const { return strides_; } + + int ndim() const { return static_cast(shape_.size()); } + + const std::vector& dim_names() const { return dim_names_; } + const std::string& dim_name(int i) const; + + /// Total number of value cells in the tensor + int64_t size() const; + + /// Return true if the underlying data buffer is mutable + bool is_mutable() const { return 
data_->is_mutable(); } + + /// Either row major or column major + bool is_contiguous() const; + + /// AKA "C order" + bool is_row_major() const; + + /// AKA "Fortran order" + bool is_column_major() const; + + Type::type type_id() const; + + bool Equals(const Tensor& other) const; + + /// Compute the number of non-zero values in the tensor + Status CountNonZero(int64_t* result) const; + + protected: + Tensor() {} + + std::shared_ptr type_; + std::shared_ptr data_; + std::vector shape_; + std::vector strides_; + + /// These names are optional + std::vector dim_names_; + + template + friend class SparseTensorImpl; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor); +}; + +template +class NumericTensor : public Tensor { + public: + using TypeClass = TYPE; + using value_type = typename TypeClass::c_type; + + /// Constructor with non-negative strides and dimension names + NumericTensor(const std::shared_ptr& data, const std::vector& shape, + const std::vector& strides, + const std::vector& dim_names) + : Tensor(TypeTraits::type_singleton(), data, shape, strides, dim_names) {} + + /// Constructor with no dimension names or strides, data assumed to be row-major + NumericTensor(const std::shared_ptr& data, const std::vector& shape) + : NumericTensor(data, shape, {}, {}) {} + + /// Constructor with non-negative strides + NumericTensor(const std::shared_ptr& data, const std::vector& shape, + const std::vector& strides) + : NumericTensor(data, shape, strides, {}) {} + + const value_type& Value(const std::vector& index) const { + int64_t offset = CalculateValueOffset(index); + const value_type* ptr = reinterpret_cast(raw_data() + offset); + return *ptr; + } + + protected: + int64_t CalculateValueOffset(const std::vector& index) const { + int64_t offset = 0; + for (size_t i = 0; i < index.size(); ++i) { + offset += index[i] * strides_[i]; + } + return offset; + } +}; + +} // namespace arrow + +#endif // ARROW_TENSOR_H diff --git a/r/R/inst/include/arrow/testing/gtest_common.h 
b/r/R/inst/include/arrow/testing/gtest_common.h new file mode 100644 index 00000000000..d0221de4b49 --- /dev/null +++ b/r/R/inst/include/arrow/testing/gtest_common.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TEST_COMMON_H +#define ARROW_TEST_COMMON_H + +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/memory_pool.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" + +namespace arrow { + +class TestBase : public ::testing::Test { + public: + void SetUp() { + pool_ = default_memory_pool(); + random_seed_ = 0; + } + + std::shared_ptr MakeRandomNullBitmap(int64_t length, int64_t null_count) { + const int64_t null_nbytes = BitUtil::BytesForBits(length); + + std::shared_ptr null_bitmap; + ARROW_EXPECT_OK(AllocateBuffer(pool_, null_nbytes, &null_bitmap)); + memset(null_bitmap->mutable_data(), 255, null_nbytes); + for (int64_t i = 0; i < null_count; i++) { + BitUtil::ClearBit(null_bitmap->mutable_data(), i * (length / null_count)); + } + return null_bitmap; + } + + template + inline std::shared_ptr MakeRandomArray(int64_t length, int64_t null_count = 
0); + + protected: + uint32_t random_seed_; + MemoryPool* pool_; +}; + +template +std::shared_ptr TestBase::MakeRandomArray(int64_t length, int64_t null_count) { + const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type); + std::shared_ptr data; + ARROW_EXPECT_OK(AllocateBuffer(pool_, data_nbytes, &data)); + + // Fill with random data + random_bytes(data_nbytes, random_seed_++, data->mutable_data()); + std::shared_ptr null_bitmap = MakeRandomNullBitmap(length, null_count); + + return std::make_shared(length, data, null_bitmap, null_count); +} + +template <> +inline std::shared_ptr TestBase::MakeRandomArray(int64_t length, + int64_t null_count) { + return std::make_shared(length); +} + +template <> +inline std::shared_ptr TestBase::MakeRandomArray( + int64_t length, int64_t null_count) { + const int byte_width = 10; + std::shared_ptr null_bitmap = MakeRandomNullBitmap(length, null_count); + std::shared_ptr data; + ARROW_EXPECT_OK(AllocateBuffer(pool_, byte_width * length, &data)); + + ::arrow::random_bytes(data->size(), 0, data->mutable_data()); + return std::make_shared(fixed_size_binary(byte_width), length, + data, null_bitmap, null_count); +} + +template <> +inline std::shared_ptr TestBase::MakeRandomArray(int64_t length, + int64_t null_count) { + std::vector valid_bytes(length, 1); + for (int64_t i = 0; i < null_count; i++) { + valid_bytes[i * 2] = 0; + } + BinaryBuilder builder(pool_); + + const int kBufferSize = 10; + uint8_t buffer[kBufferSize]; + for (int64_t i = 0; i < length; i++) { + if (!valid_bytes[i]) { + ARROW_EXPECT_OK(builder.AppendNull()); + } else { + ::arrow::random_bytes(kBufferSize, static_cast(i), buffer); + ARROW_EXPECT_OK(builder.Append(buffer, kBufferSize)); + } + } + + std::shared_ptr out; + ARROW_EXPECT_OK(builder.Finish(&out)); + return out; +} + +class TestBuilder : public ::testing::Test { + public: + void SetUp() { pool_ = default_memory_pool(); } + + protected: + MemoryPool* pool_; + std::shared_ptr type_; +}; + +} 
// namespace arrow + +#endif // ARROW_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/testing/gtest_util.h b/r/R/inst/include/arrow/testing/gtest_util.h new file mode 100644 index 00000000000..c44bb17653b --- /dev/null +++ b/r/R/inst/include/arrow/testing/gtest_util.h @@ -0,0 +1,302 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +#define ASSERT_RAISES(ENUM, expr) \ + do { \ + ::arrow::Status _st = (expr); \ + if (!_st.Is##ENUM()) { \ + FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ + ENUM) ", but got " \ + << _st.ToString(); \ + } \ + } while (false) + +#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ + do { \ + ::arrow::Status _st = (expr); \ + if (!_st.Is##ENUM()) { \ + FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ + ENUM) ", but got " \ + << _st.ToString(); \ + } \ + ASSERT_EQ((message), _st.ToString()); \ + } while (false) + +#define ASSERT_OK(expr) \ + do { \ + ::arrow::Status _st = (expr); \ + if (!_st.ok()) { \ + FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \ + } \ + } while (false) + +#define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) + +#define ARROW_EXPECT_OK(expr) \ + do { \ + ::arrow::Status _st = (expr); \ + EXPECT_TRUE(_st.ok()); \ + } while (false) + +#define ABORT_NOT_OK(s) \ + do { \ + ::arrow::Status _st = (s); \ + if (ARROW_PREDICT_FALSE(!_st.ok())) { \ + _st.Abort(); \ + } \ + } while (false); + +namespace arrow { + +// ---------------------------------------------------------------------- +// Useful testing::Types declarations + +typedef ::testing::Types + NumericArrowTypes; + +class Array; +class ChunkedArray; +class Column; +class RecordBatch; +class Table; + +namespace compute { +struct Datum; +} + +using Datum = compute::Datum; + +using ArrayVector = std::vector>; + +#define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) +#define 
ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) + +ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); +ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected, + const RecordBatch& actual); +ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, + const ChunkedArray& actual); +ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& actual, + const ArrayVector& expected); +ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, + const std::vector& expected); +ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const std::string& expected); +ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const Buffer& expected); +ARROW_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs); + +ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual, + bool same_chunk_layout = true); + +ARROW_EXPORT void AssertDatumsEqual(const Datum& expected, const Datum& actual); + +template +void AssertNumericDataEqual(const C_TYPE* raw_data, + const std::vector& expected_values) { + for (auto expected : expected_values) { + ASSERT_EQ(expected, *raw_data); + ++raw_data; + } +} + +ARROW_EXPORT void CompareBatch(const RecordBatch& left, const RecordBatch& right, + bool compare_metadata = true); + +// Check if the padding of the buffers of the array is zero. +// Also cause valgrind warnings if the padding bytes are uninitialized. +ARROW_EXPORT void AssertZeroPadded(const Array& array); + +// Check if the valid buffer bytes are initialized +// and cause valgrind warnings otherwise. 
+ARROW_EXPORT void TestInitialized(const Array& array); + +template +void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr* out) { + ASSERT_OK(builder->Finish(out)); + AssertZeroPadded(**out); + TestInitialized(**out); +} + +#define DECL_T() typedef typename TestFixture::T T; + +#define DECL_TYPE() typedef typename TestFixture::Type Type; + +// ArrayFromJSON: construct an Array from a simple JSON representation + +ARROW_EXPORT +std::shared_ptr ArrayFromJSON(const std::shared_ptr&, + const std::string& json); + +// ArrayFromVector: construct an Array from vectors of C values + +template +void ArrayFromVector(const std::shared_ptr& type, + const std::vector& is_valid, const std::vector& values, + std::shared_ptr* out) { + DCHECK_EQ(TYPE::type_id, type->id()) + << "template parameter and concrete DataType instance don't agree"; + + std::unique_ptr builder_ptr; + ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); + // Get the concrete builder class to access its Append() specializations + auto& builder = dynamic_cast::BuilderType&>(*builder_ptr); + + for (size_t i = 0; i < values.size(); ++i) { + if (is_valid[i]) { + ASSERT_OK(builder.Append(values[i])); + } else { + ASSERT_OK(builder.AppendNull()); + } + } + ASSERT_OK(builder.Finish(out)); +} + +template +void ArrayFromVector(const std::shared_ptr& type, + const std::vector& values, std::shared_ptr* out) { + DCHECK_EQ(TYPE::type_id, type->id()) + << "template parameter and concrete DataType instance don't agree"; + + std::unique_ptr builder_ptr; + ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); + // Get the concrete builder class to access its Append() specializations + auto& builder = dynamic_cast::BuilderType&>(*builder_ptr); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_OK(builder.Append(values[i])); + } + ASSERT_OK(builder.Finish(out)); +} + +// Overloads without a DataType argument, for parameterless types + +template +void ArrayFromVector(const 
std::vector& is_valid, const std::vector& values, + std::shared_ptr* out) { + auto type = TypeTraits::type_singleton(); + ArrayFromVector(type, is_valid, values, out); +} + +template +void ArrayFromVector(const std::vector& values, std::shared_ptr* out) { + auto type = TypeTraits::type_singleton(); + ArrayFromVector(type, values, out); +} + +// ChunkedArrayFromVector: construct a ChunkedArray from vectors of C values + +template +void ChunkedArrayFromVector(const std::shared_ptr& type, + const std::vector>& is_valid, + const std::vector>& values, + std::shared_ptr* out) { + ArrayVector chunks; + DCHECK_EQ(is_valid.size(), values.size()); + for (size_t i = 0; i < values.size(); ++i) { + std::shared_ptr array; + ArrayFromVector(type, is_valid[i], values[i], &array); + chunks.push_back(array); + } + *out = std::make_shared(chunks); +} + +template +void ChunkedArrayFromVector(const std::shared_ptr& type, + const std::vector>& values, + std::shared_ptr* out) { + ArrayVector chunks; + for (size_t i = 0; i < values.size(); ++i) { + std::shared_ptr array; + ArrayFromVector(type, values[i], &array); + chunks.push_back(array); + } + *out = std::make_shared(chunks); +} + +// Overloads without a DataType argument, for parameterless types + +template +void ChunkedArrayFromVector(const std::vector>& is_valid, + const std::vector>& values, + std::shared_ptr* out) { + auto type = TypeTraits::type_singleton(); + ChunkedArrayFromVector(type, is_valid, values, out); +} + +template +void ChunkedArrayFromVector(const std::vector>& values, + std::shared_ptr* out) { + auto type = TypeTraits::type_singleton(); + ChunkedArrayFromVector(type, values, out); +} + +template +static inline Status GetBitmapFromVector(const std::vector& is_valid, + std::shared_ptr* result) { + size_t length = is_valid.size(); + + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateEmptyBitmap(length, &buffer)); + + uint8_t* bitmap = buffer->mutable_data(); + for (size_t i = 0; i < static_cast(length); ++i) { + if 
(is_valid[i]) { + BitUtil::SetBit(bitmap, i); + } + } + + *result = buffer; + return Status::OK(); +} + +template +inline void BitmapFromVector(const std::vector& is_valid, + std::shared_ptr* out) { + ASSERT_OK(GetBitmapFromVector(is_valid, out)); +} + +template +void AssertSortedEquals(std::vector u, std::vector v) { + std::sort(u.begin(), u.end()); + std::sort(v.begin(), v.end()); + ASSERT_EQ(u, v); +} + +} // namespace arrow diff --git a/r/R/inst/include/arrow/testing/random.h b/r/R/inst/include/arrow/testing/random.h new file mode 100644 index 00000000000..6b188fd573b --- /dev/null +++ b/r/R/inst/include/arrow/testing/random.h @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace random { + +using SeedType = std::random_device::result_type; +constexpr SeedType kSeedMax = std::numeric_limits::max(); + +class ARROW_EXPORT RandomArrayGenerator { + public: + explicit RandomArrayGenerator(SeedType seed) + : seed_distribution_(static_cast(1), kSeedMax), seed_rng_(seed) {} + + /// \brief Generates a random BooleanArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] probability the estimated number of active bits + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Boolean(int64_t size, double probability, + double null_probability); + + /// \brief Generates a random UInt8Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr UInt8(int64_t size, uint8_t min, uint8_t max, + double null_probability); + + /// \brief Generates a random Int8Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Int8(int64_t size, int8_t min, int8_t max, + double null_probability); + + /// \brief Generates a random UInt16Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] 
null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr UInt16(int64_t size, uint16_t min, uint16_t max, + double null_probability); + + /// \brief Generates a random Int16Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Int16(int64_t size, int16_t min, int16_t max, + double null_probability); + + /// \brief Generates a random UInt32Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr UInt32(int64_t size, uint32_t min, uint32_t max, + double null_probability); + + /// \brief Generates a random Int32Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Int32(int64_t size, int32_t min, int32_t max, + double null_probability); + + /// \brief Generates a random UInt64Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr UInt64(int64_t size, uint64_t min, uint64_t max, + double null_probability); + + /// \brief Generates a random Int64Array + /// + /// 
\param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Int64(int64_t size, int64_t min, int64_t max, + double null_probability); + + /// \brief Generates a random FloatArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Float32(int64_t size, float min, float max, + double null_probability); + + /// \brief Generates a random DoubleArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr Float64(int64_t size, double min, double max, + double null_probability); + + template + std::shared_ptr Numeric(int64_t size, CType min, CType max, + double null_probability) { + switch (ArrowType::type_id) { + case Type::UINT8: + return UInt8(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT8: + return Int8(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT16: + return UInt16(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT16: + return Int16(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT32: + return UInt32(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT32: + return Int32(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT64: 
+ return UInt64(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT64: + return Int64(size, static_cast(min), static_cast(max), + null_probability); + case Type::FLOAT: + return Float32(size, static_cast(min), static_cast(max), + null_probability); + case Type::DOUBLE: + return Float64(size, static_cast(min), static_cast(max), + null_probability); + default: + return nullptr; + } + } + + /// \brief Generates a random StringArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr String(int64_t size, int32_t min_length, + int32_t max_length, double null_probability); + + /// \brief Generates a random StringArray with repeated values + /// + /// \param[in] size the size of the array to generate + /// \param[in] unique the number of unique string values used + /// to populate the array + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr StringWithRepeats(int64_t size, int64_t unique, + int32_t min_length, int32_t max_length, + double null_probability); + + private: + SeedType seed() { return seed_distribution_(seed_rng_); } + + std::uniform_int_distribution seed_distribution_; + std::default_random_engine seed_rng_; +}; + +} // namespace random + +// +// Assorted functions +// + +template +void randint(int64_t N, T lower, T upper, std::vector* out) { + const int random_seed = 0; + 
std::default_random_engine gen(random_seed); + std::uniform_int_distribution d(lower, upper); + out->resize(N, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); +} + +template +void random_real(int64_t n, uint32_t seed, T min_value, T max_value, + std::vector* out) { + std::default_random_engine gen(seed); + std::uniform_real_distribution d(min_value, max_value); + out->resize(n, static_cast(0)); + std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); +} + +template +void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) { + assert(out || (n == 0)); + std::default_random_engine gen(seed); + std::uniform_int_distribution d(min_value, max_value); + std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); +} + +} // namespace arrow diff --git a/r/R/inst/include/arrow/testing/util.h b/r/R/inst/include/arrow/testing/util.h new file mode 100644 index 00000000000..d12f57e3b7e --- /dev/null +++ b/r/R/inst/include/arrow/testing/util.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/record_batch.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class Column; +class MemoryPool; +class RecordBatch; +class Table; + +using ArrayVector = std::vector>; + +template +Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, + std::shared_ptr* result) { + int64_t nbytes = static_cast(values.size()) * sizeof(T); + + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool, nbytes, &buffer)); + auto immutable_data = reinterpret_cast(values.data()); + std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data()); + memset(buffer->mutable_data() + nbytes, 0, + static_cast(buffer->capacity() - nbytes)); + + *result = buffer; + return Status::OK(); +} + +// Sets approximately pct_null of the first n bytes in null_bytes to zero +// and the rest to non-zero (true) values. 
+ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes); +ARROW_EXPORT void random_is_valid(int64_t n, double pct_null, std::vector* is_valid, + int random_seed = 0); +ARROW_EXPORT void random_bytes(int64_t n, uint32_t seed, uint8_t* out); +ARROW_EXPORT int32_t DecimalSize(int32_t precision); +ARROW_EXPORT void random_decimals(int64_t n, uint32_t seed, int32_t precision, + uint8_t* out); +ARROW_EXPORT void random_ascii(int64_t n, uint32_t seed, uint8_t* out); +ARROW_EXPORT int64_t CountNulls(const std::vector& valid_bytes); + +ARROW_EXPORT Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool, + std::shared_ptr* out, + uint32_t seed = 0); + +ARROW_EXPORT uint64_t random_seed(); + +template +Status MakeArray(const std::vector& valid_bytes, const std::vector& values, + int64_t size, Builder* builder, std::shared_ptr* out) { + // Append the first `size` values, or a null where valid_bytes[i] is 0 + for (int64_t i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + RETURN_NOT_OK(builder->Append(values[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return builder->Finish(out); +} + +#define DECL_T() typedef typename TestFixture::T T; + +#define DECL_TYPE() typedef typename TestFixture::Type Type; + +// ---------------------------------------------------------------------- +// A RecordBatchReader for serving a sequence of in-memory record batches + +class BatchIterator : public RecordBatchReader { + public: + BatchIterator(const std::shared_ptr& schema, + const std::vector>& batches) + : schema_(schema), batches_(batches), position_(0) {} + + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* out) override { + if (position_ >= batches_.size()) { + *out = nullptr; + } else { + *out = batches_[position_++]; + } + return Status::OK(); + } + + private: + std::shared_ptr schema_; + std::vector> batches_; + size_t position_; +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/type.h
b/r/R/inst/include/arrow/type.h new file mode 100644 index 00000000000..b5eef6ffc28 --- /dev/null +++ b/r/R/inst/include/arrow/type.h @@ -0,0 +1,1104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPE_H +#define ARROW_TYPE_H + +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/type_fwd.h" // IWYU pragma: export +#include "arrow/util/checked_cast.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" +#include "arrow/visitor.h" // IWYU pragma: keep + +namespace arrow { + +class Array; +class Field; +class MemoryPool; + +struct Type { + /// \brief Main data type enumeration + /// + /// This enumeration provides a quick way to interrogate the category + /// of a DataType instance. 
+ enum type { + /// A NULL type having no physical storage + NA, + + /// Boolean as 1 bit, LSB bit-packed ordering + BOOL, + + /// Unsigned 8-bit little-endian integer + UINT8, + + /// Signed 8-bit little-endian integer + INT8, + + /// Unsigned 16-bit little-endian integer + UINT16, + + /// Signed 16-bit little-endian integer + INT16, + + /// Unsigned 32-bit little-endian integer + UINT32, + + /// Signed 32-bit little-endian integer + INT32, + + /// Unsigned 64-bit little-endian integer + UINT64, + + /// Signed 64-bit little-endian integer + INT64, + + /// 2-byte floating point value + HALF_FLOAT, + + /// 4-byte floating point value + FLOAT, + + /// 8-byte floating point value + DOUBLE, + + /// UTF8 variable-length string as List + STRING, + + /// Variable-length bytes (no guarantee of UTF8-ness) + BINARY, + + /// Fixed-size binary. Each value occupies the same number of bytes + FIXED_SIZE_BINARY, + + /// int32_t days since the UNIX epoch + DATE32, + + /// int64_t milliseconds since the UNIX epoch + DATE64, + + /// Exact timestamp encoded with int64 since UNIX epoch + /// Default unit millisecond + TIMESTAMP, + + /// Time as signed 32-bit integer, representing either seconds or + /// milliseconds since midnight + TIME32, + + /// Time as signed 64-bit integer, representing either microseconds or + /// nanoseconds since midnight + TIME64, + + /// YEAR_MONTH or DAY_TIME interval in SQL style + INTERVAL, + + /// Precision- and scale-based decimal type. Storage type depends on the + /// parameters. + DECIMAL, + + /// A list of some logical data type + LIST, + + /// Struct of logical types + STRUCT, + + /// Unions of logical types + UNION, + + /// Dictionary-encoded type, also called "categorical" or "factor" + /// in other programming languages. 
Holds the dictionary value + /// type but not the dictionary itself, which is part of the + /// ArrayData struct + DICTIONARY, + + /// Map, a repeated struct logical type + MAP, + + /// Custom data type, implemented by user + EXTENSION, + + /// Fixed size list of some logical type + FIXED_SIZE_LIST, + + /// Measure of elapsed time in either seconds, milliseconds, microseconds + /// or nanoseconds. + DURATION + }; +}; + +/// \brief Base class for all data types +/// +/// Data types in this library are all *logical*. They can be expressed as +/// either a primitive physical type (bytes or bits of some fixed size), a +/// nested type consisting of other data types, or another data type (e.g. a +/// timestamp encoded as an int64). +/// +/// Simple datatypes may be entirely described by their Type::type id, but +/// complex datatypes are usually parametric. +class ARROW_EXPORT DataType { + public: + explicit DataType(Type::type id) : id_(id) {} + virtual ~DataType(); + + /// \brief Return whether the types are equal + /// + /// Types that are logically convertible from one to another (e.g. List + /// and Binary) are NOT equal. 
+ bool Equals(const DataType& other, bool check_metadata = true) const; + + /// \brief Return whether the types are equal + bool Equals(const std::shared_ptr& other) const; + + std::shared_ptr child(int i) const { return children_[i]; } + + const std::vector>& children() const { return children_; } + + int num_children() const { return static_cast(children_.size()); } + + Status Accept(TypeVisitor* visitor) const; + + /// \brief A string representation of the type, including any children + virtual std::string ToString() const = 0; + + /// \brief A string name of the type, omitting any child fields + /// + /// \note Experimental API + /// \since 0.7.0 + virtual std::string name() const = 0; + + /// \brief Return the type category + Type::type id() const { return id_; } + + protected: + Type::type id_; + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); +}; + +std::ostream& operator<<(std::ostream& os, const DataType& type); + +/// \brief Base class for all fixed-width data types +class ARROW_EXPORT FixedWidthType : public DataType { + public: + using DataType::DataType; + + virtual int bit_width() const = 0; +}; + +/// \brief Base class for all data types representing primitive values +class ARROW_EXPORT PrimitiveCType : public FixedWidthType { + public: + using FixedWidthType::FixedWidthType; +}; + +/// \brief Base class for all numeric data types +class ARROW_EXPORT NumberType : public PrimitiveCType { + public: + using PrimitiveCType::PrimitiveCType; +}; + +/// \brief Base class for all integral data types +class ARROW_EXPORT IntegerType : public NumberType { + public: + using NumberType::NumberType; + virtual bool is_signed() const = 0; +}; + +/// \brief Base class for all floating-point data types +class ARROW_EXPORT FloatingPointType : public NumberType { + public: + using NumberType::NumberType; + enum Precision { HALF, SINGLE, DOUBLE }; + virtual Precision precision() const = 0; +}; + +/// \brief Base class for all parametric 
data types +class ParametricType {}; + +class ARROW_EXPORT NestedType : public DataType, public ParametricType { + public: + using DataType::DataType; +}; + +class NoExtraMeta {}; + +/// \brief The combination of a field name and data type, with optional metadata +/// +/// Fields are used to describe the individual constituents of a +/// nested DataType or a Schema. +/// +/// A field's metadata is represented by a KeyValueMetadata instance, +/// which holds arbitrary key-value pairs. +class ARROW_EXPORT Field { + public: + Field(const std::string& name, const std::shared_ptr& type, + bool nullable = true, + const std::shared_ptr& metadata = NULLPTR) + : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} + + /// \brief Return the field's attached metadata + std::shared_ptr metadata() const { return metadata_; } + + /// \brief Return whether the field has non-empty metadata + bool HasMetadata() const; + + /// \brief Return a copy of this field with the given metadata attached to it + std::shared_ptr AddMetadata( + const std::shared_ptr& metadata) const; + /// \brief Return a copy of this field without any metadata attached to it + std::shared_ptr RemoveMetadata() const; + + /// \brief Return a copy of this field with the replaced type. 
+ std::shared_ptr WithType(const std::shared_ptr& type) const; + + std::vector> Flatten() const; + + bool Equals(const Field& other, bool check_metadata = true) const; + bool Equals(const std::shared_ptr& other, bool check_metadata = true) const; + + /// \brief Return a string representation of the field + std::string ToString() const; + + /// \brief Return the field name + const std::string& name() const { return name_; } + /// \brief Return the field data type + std::shared_ptr type() const { return type_; } + /// \brief Return whether the field is nullable + bool nullable() const { return nullable_; } + + std::shared_ptr Copy() const; + + private: + // Field name + std::string name_; + + // The field's data type + std::shared_ptr type_; + + // Fields can be nullable + bool nullable_; + + // The field's metadata, if any + std::shared_ptr metadata_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(Field); +}; + +namespace detail { + +template +class ARROW_EXPORT CTypeImpl : public BASE { + public: + using c_type = C_TYPE; + static constexpr Type::type type_id = TYPE_ID; + + CTypeImpl() : BASE(TYPE_ID) {} + + int bit_width() const override { return static_cast(sizeof(C_TYPE) * CHAR_BIT); } + + std::string ToString() const override { return this->name(); } +}; + +template +class IntegerTypeImpl : public detail::CTypeImpl { + bool is_signed() const override { return std::is_signed::value; } +}; + +} // namespace detail + +/// Concrete type class for always-null data +class ARROW_EXPORT NullType : public DataType, public NoExtraMeta { + public: + static constexpr Type::type type_id = Type::NA; + + NullType() : DataType(Type::NA) {} + + std::string ToString() const override; + + std::string name() const override { return "null"; } +}; + +/// Concrete type class for boolean data +class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { + public: + static constexpr Type::type type_id = Type::BOOL; + + BooleanType() : FixedWidthType(Type::BOOL) {} + + std::string
ToString() const override; + + int bit_width() const override { return 1; } + std::string name() const override { return "bool"; } +}; + +/// Concrete type class for unsigned 8-bit integer data +class ARROW_EXPORT UInt8Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "uint8"; } +}; + +/// Concrete type class for signed 8-bit integer data +class ARROW_EXPORT Int8Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "int8"; } +}; + +/// Concrete type class for unsigned 16-bit integer data +class ARROW_EXPORT UInt16Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "uint16"; } +}; + +/// Concrete type class for signed 16-bit integer data +class ARROW_EXPORT Int16Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "int16"; } +}; + +/// Concrete type class for unsigned 32-bit integer data +class ARROW_EXPORT UInt32Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "uint32"; } +}; + +/// Concrete type class for signed 32-bit integer data +class ARROW_EXPORT Int32Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "int32"; } +}; + +/// Concrete type class for unsigned 64-bit integer data +class ARROW_EXPORT UInt64Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "uint64"; } +}; + +/// Concrete type class for signed 64-bit integer data +class ARROW_EXPORT Int64Type + : public detail::IntegerTypeImpl { + public: + std::string name() const override { return "int64"; } +}; + +/// Concrete type class for 16-bit floating-point data +class ARROW_EXPORT HalfFloatType + : public detail::CTypeImpl { + public: + Precision precision() const override; + std::string name() const override { return "halffloat"; } +}; + +/// Concrete type class for 32-bit floating-point data 
(C "float") +class ARROW_EXPORT FloatType + : public detail::CTypeImpl { + public: + Precision precision() const override; + std::string name() const override { return "float"; } +}; + +/// Concrete type class for 64-bit floating-point data (C "double") +class ARROW_EXPORT DoubleType + : public detail::CTypeImpl { + public: + Precision precision() const override; + std::string name() const override { return "double"; } +}; + +/// \brief Concrete type class for list data +/// +/// List data is nested data where each value is a variable number of +/// child items. Lists can be recursively nested, for example +/// list(list(int32)). +class ARROW_EXPORT ListType : public NestedType { + public: + static constexpr Type::type type_id = Type::LIST; + + // List can contain any other logical value type + explicit ListType(const std::shared_ptr& value_type) + : ListType(std::make_shared("item", value_type)) {} + + explicit ListType(const std::shared_ptr& value_field) : NestedType(Type::LIST) { + children_ = {value_field}; + } + + std::shared_ptr value_field() const { return children_[0]; } + + std::shared_ptr value_type() const { return children_[0]->type(); } + + std::string ToString() const override; + + std::string name() const override { return "list"; } +}; + +/// \brief Concrete type class for fixed size list data +class ARROW_EXPORT FixedSizeListType : public NestedType { + public: + static constexpr Type::type type_id = Type::FIXED_SIZE_LIST; + + // List can contain any other logical value type + explicit FixedSizeListType(const std::shared_ptr& value_type, + int32_t list_size) + : FixedSizeListType(std::make_shared("item", value_type), list_size) {} + + explicit FixedSizeListType(const std::shared_ptr& value_field, int32_t list_size) + : NestedType(Type::FIXED_SIZE_LIST), list_size_(list_size) { + children_ = {value_field}; + } + + std::shared_ptr value_field() const { return children_[0]; } + + std::shared_ptr value_type() const { return children_[0]->type(); } + + 
std::string ToString() const override; + + std::string name() const override { return "fixed_size_list"; } + + int32_t list_size() const { return list_size_; } + + protected: + int32_t list_size_; +}; + +/// \brief Concrete type class for variable-size binary data +class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { + public: + static constexpr Type::type type_id = Type::BINARY; + + BinaryType() : BinaryType(Type::BINARY) {} + + std::string ToString() const override; + std::string name() const override { return "binary"; } + + protected: + // Allow subclasses to change the logical type. + explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} +}; + +/// \brief Concrete type class for fixed-size binary data +class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public ParametricType { + public: + static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY; + + explicit FixedSizeBinaryType(int32_t byte_width) + : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} + explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id) + : FixedWidthType(override_type_id), byte_width_(byte_width) {} + + std::string ToString() const override; + std::string name() const override { return "fixed_size_binary"; } + + int32_t byte_width() const { return byte_width_; } + int bit_width() const override; + + protected: + int32_t byte_width_; +}; + +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringType : public BinaryType { + public: + static constexpr Type::type type_id = Type::STRING; + + StringType() : BinaryType(Type::STRING) {} + + std::string ToString() const override; + std::string name() const override { return "utf8"; } +}; + +/// \brief Concrete type class for struct data +class ARROW_EXPORT StructType : public NestedType { + public: + static constexpr Type::type type_id = Type::STRUCT; + + explicit StructType(const std::vector>& fields); + + 
~StructType() override; + + std::string ToString() const override; + std::string name() const override { return "struct"; } + + /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Return all fields having this name + std::vector> GetAllFieldsByName(const std::string& name) const; + + /// Returns -1 if name not found or if there are multiple fields having the + /// same name + int GetFieldIndex(const std::string& name) const; + + /// Return the indices of all fields having this name + std::vector GetAllFieldIndices(const std::string& name) const; + + ARROW_DEPRECATED("Use GetFieldByName") + std::shared_ptr GetChildByName(const std::string& name) const; + + ARROW_DEPRECATED("Use GetFieldIndex") + int GetChildIndex(const std::string& name) const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +/// \brief Base type class for (fixed-size) decimal data +class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { + public: + explicit DecimalType(int32_t byte_width, int32_t precision, int32_t scale) + : FixedSizeBinaryType(byte_width, Type::DECIMAL), + precision_(precision), + scale_(scale) {} + + int32_t precision() const { return precision_; } + int32_t scale() const { return scale_; } + + protected: + int32_t precision_; + int32_t scale_; +}; + +/// \brief Concrete type class for 128-bit decimal data +class ARROW_EXPORT Decimal128Type : public DecimalType { + public: + static constexpr Type::type type_id = Type::DECIMAL; + + explicit Decimal128Type(int32_t precision, int32_t scale); + + std::string ToString() const override; + std::string name() const override { return "decimal"; } +}; + +struct UnionMode { + enum type { SPARSE, DENSE }; +}; + +/// \brief Concrete type class for union data +class ARROW_EXPORT UnionType : public NestedType { + public: + static constexpr Type::type type_id = Type::UNION; + + UnionType(const std::vector>& fields, + const std::vector& type_codes, + UnionMode::type mode = 
UnionMode::SPARSE); + + std::string ToString() const override; + std::string name() const override { return "union"; } + + const std::vector& type_codes() const { return type_codes_; } + + UnionMode::type mode() const { return mode_; } + + private: + UnionMode::type mode_; + + // The type id used in the data to indicate each data type in the union. For + // example, the first type in the union might be denoted by the id 5 (instead + // of 0). + std::vector type_codes_; +}; + +// ---------------------------------------------------------------------- +// Date and time types + +enum class DateUnit : char { DAY = 0, MILLI = 1 }; + +/// \brief Base type for all date and time types +class ARROW_EXPORT TemporalType : public FixedWidthType { + public: + using FixedWidthType::FixedWidthType; +}; + +/// \brief Base type class for date data +class ARROW_EXPORT DateType : public TemporalType { + public: + virtual DateUnit unit() const = 0; + + protected: + explicit DateType(Type::type type_id); +}; + +/// Concrete type class for 32-bit date data (as number of days since UNIX epoch) +class ARROW_EXPORT Date32Type : public DateType { + public: + static constexpr Type::type type_id = Type::DATE32; + static constexpr DateUnit UNIT = DateUnit::DAY; + + using c_type = int32_t; + + Date32Type(); + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + std::string ToString() const override; + + std::string name() const override { return "date32"; } + DateUnit unit() const override { return UNIT; } +}; + +/// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) +class ARROW_EXPORT Date64Type : public DateType { + public: + static constexpr Type::type type_id = Type::DATE64; + static constexpr DateUnit UNIT = DateUnit::MILLI; + + using c_type = int64_t; + + Date64Type(); + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + std::string ToString() const override; + + std::string name() 
const override { return "date64"; } + DateUnit unit() const override { return UNIT; } +}; + +struct TimeUnit { + /// The unit for a time or timestamp DataType + enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; +}; + +std::ostream& operator<<(std::ostream& os, TimeUnit::type unit); + +/// \brief Base type class for time data +class ARROW_EXPORT TimeType : public TemporalType, public ParametricType { + public: + TimeUnit::type unit() const { return unit_; } + + protected: + TimeType(Type::type type_id, TimeUnit::type unit); + TimeUnit::type unit_; +}; + +class ARROW_EXPORT Time32Type : public TimeType { + public: + static constexpr Type::type type_id = Type::TIME32; + using c_type = int32_t; + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI); + + std::string ToString() const override; + + std::string name() const override { return "time32"; } +}; + +class ARROW_EXPORT Time64Type : public TimeType { + public: + static constexpr Type::type type_id = Type::TIME64; + using c_type = int64_t; + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO); + + std::string ToString() const override; + + std::string name() const override { return "time64"; } +}; + +class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType { + public: + using Unit = TimeUnit; + + typedef int64_t c_type; + static constexpr Type::type type_id = Type::TIMESTAMP; + + int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } + + explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI) + : TemporalType(Type::TIMESTAMP), unit_(unit) {} + + explicit TimestampType(TimeUnit::type unit, const std::string& timezone) + : TemporalType(Type::TIMESTAMP), unit_(unit), timezone_(timezone) {} + + std::string ToString() const override; + std::string name() const override { return 
"timestamp"; } + + TimeUnit::type unit() const { return unit_; } + const std::string& timezone() const { return timezone_; } + + private: + TimeUnit::type unit_; + std::string timezone_; +}; + +/// \brief Base class for the different kinds of intervals. +class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { + public: + enum type { MONTHS, DAY_TIME }; + IntervalType() : TemporalType(Type::INTERVAL) {} + + virtual type interval_type() const = 0; + virtual ~IntervalType() = default; +}; + +/// \brief Represents some number of months. +/// +/// Type representing a number of months. Corresponds to YearMonth type +/// in Schema.fbs (Years are defined as 12 months). +class ARROW_EXPORT MonthIntervalType : public IntervalType { + public: + using c_type = int32_t; + static constexpr Type::type type_id = Type::INTERVAL; + + IntervalType::type interval_type() const override { return IntervalType::MONTHS; } + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + MonthIntervalType() : IntervalType() {} + + std::string ToString() const override { return name(); } + std::string name() const override { return "month_interval"; } +}; + +/// \brief Represents a number of days and milliseconds (fraction of day). 
+class ARROW_EXPORT DayTimeIntervalType : public IntervalType { + public: + struct DayMilliseconds { + int32_t days; + int32_t milliseconds; + bool operator==(DayMilliseconds other) const { + return this->days == other.days && this->milliseconds == other.milliseconds; + } + bool operator!=(DayMilliseconds other) const { return !(*this == other); } + }; + using c_type = DayMilliseconds; + static_assert(sizeof(DayMilliseconds) == 8, + "DayMilliseconds struct assumed to be of size 8 bytes"); + static constexpr Type::type type_id = Type::INTERVAL; + IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; } + + DayTimeIntervalType() : IntervalType() {} + + int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } + + std::string ToString() const override { return name(); } + std::string name() const override { return "day_time_interval"; } +}; + +/// \brief Represents an amount of elapsed time without any relation to a calendar +/// artifact. +class ARROW_EXPORT DurationType : public TemporalType, public ParametricType { + public: + using Unit = TimeUnit; + + static constexpr Type::type type_id = Type::DURATION; + using c_type = int64_t; + + int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } + + explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI) + : TemporalType(Type::DURATION), unit_(unit) {} + + std::string ToString() const override; + std::string name() const override { return "duration"; } + + TimeUnit::type unit() const { return unit_; } + + private: + TimeUnit::type unit_; +}; + +// ---------------------------------------------------------------------- +// Dictionary type (for representing categorical or dictionary-encoded +// data in memory) + +/// \brief Dictionary-encoded value type with data-dependent +/// dictionary +class ARROW_EXPORT DictionaryType : public FixedWidthType { + public: + static constexpr Type::type type_id = Type::DICTIONARY; + + DictionaryType(const 
std::shared_ptr& 
index_type, + const std::shared_ptr& value_type, bool ordered = false); + + std::string ToString() const override; + std::string name() const override { return "dictionary"; } + + int bit_width() const override; + + std::shared_ptr index_type() const { return index_type_; } + std::shared_ptr value_type() const { return value_type_; } + + bool ordered() const { return ordered_; } + + /// \brief Unify dictionary types + /// + /// Compute a resulting dictionary that will allow the union of values + /// of all input dictionary types. The input types must all have the + /// same value type. + /// \param[in] pool Memory pool to allocate dictionary values from + /// \param[in] types A sequence of input dictionary types + /// \param[in] dictionaries A sequence of input dictionaries + /// corresponding to each type + /// \param[out] out_type The unified dictionary type + /// \param[out] out_dictionary The unified dictionary + /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, + /// one per input type. Each integer vector represents the transposition + /// of input type indices into unified type indices. + // XXX Should we return something special (an empty transpose map?) when + // the transposition is the identity function? 
+ static Status Unify(MemoryPool* pool, const std::vector& types, + const std::vector& dictionaries, + std::shared_ptr* out_type, + std::shared_ptr* out_dictionary, + std::vector>* out_transpose_maps = NULLPTR); + + protected: + // Must be an integer type (not currently checked) + std::shared_ptr index_type_; + std::shared_ptr value_type_; + bool ordered_; +}; + +// ---------------------------------------------------------------------- +// Schema + +/// \class Schema +/// \brief Sequence of arrow::Field objects describing the columns of a record +/// batch or table data structure +class ARROW_EXPORT Schema { + public: + explicit Schema(const std::vector>& fields, + const std::shared_ptr& metadata = NULLPTR); + + explicit Schema(std::vector>&& fields, + const std::shared_ptr& metadata = NULLPTR); + + Schema(const Schema&); + + virtual ~Schema(); + + /// Returns true if all of the schema fields are equal + bool Equals(const Schema& other, bool check_metadata = true) const; + + /// \brief Return the number of fields (columns) in the schema + int num_fields() const; + + /// Return the ith schema element. 
Does not boundscheck + std::shared_ptr field(int i) const; + + const std::vector>& fields() const; + + std::vector field_names() const; + + /// Returns null if name not found + std::shared_ptr GetFieldByName(const std::string& name) const; + + /// Return all fields having this name + std::vector> GetAllFieldsByName(const std::string& name) const; + + /// Returns -1 if name not found + int GetFieldIndex(const std::string& name) const; + + /// Return the indices of all fields having this name + std::vector GetAllFieldIndices(const std::string& name) const; + + /// \brief The custom key-value metadata, if any + /// + /// \return metadata may be null + std::shared_ptr metadata() const; + + /// \brief Render a string representation of the schema suitable for debugging + std::string ToString() const; + + Status AddField(int i, const std::shared_ptr& field, + std::shared_ptr* out) const; + Status RemoveField(int i, std::shared_ptr* out) const; + Status SetField(int i, const std::shared_ptr& field, + std::shared_ptr* out) const; + + /// \brief Replace key-value metadata with new metadata + /// + /// \param[in] metadata new KeyValueMetadata + /// \return new Schema + std::shared_ptr AddMetadata( + const std::shared_ptr& metadata) const; + + /// \brief Return copy of Schema without the KeyValueMetadata + std::shared_ptr RemoveMetadata() const; + + /// \brief Indicates that Schema has non-empty KeyValueMetadata + bool HasMetadata() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +// ---------------------------------------------------------------------- +// Parametric factory functions +// Other factory functions are in type_fwd.h + +/// \addtogroup type-factories +/// @{ + +/// \brief Create a FixedSizeBinaryType instance. 
+ARROW_EXPORT +std::shared_ptr fixed_size_binary(int32_t byte_width); + +/// \brief Create a Decimal128Type instance +ARROW_EXPORT +std::shared_ptr decimal(int32_t precision, int32_t scale); + +/// \brief Create a ListType instance from its child Field type +ARROW_EXPORT +std::shared_ptr list(const std::shared_ptr& value_type); + +/// \brief Create a ListType instance from its child DataType +ARROW_EXPORT +std::shared_ptr list(const std::shared_ptr& value_type); + +/// \brief Create a FixedSizeListType instance from its child Field type +ARROW_EXPORT +std::shared_ptr fixed_size_list(const std::shared_ptr& value_type, + int32_t list_size); + +/// \brief Create a FixedSizeListType instance from its child DataType +ARROW_EXPORT +std::shared_ptr fixed_size_list(const std::shared_ptr& value_type, + int32_t list_size); +/// \brief Return a Duration instance (naming use _type to avoid namespace conflict with +/// built in time classes). +std::shared_ptr ARROW_EXPORT duration(TimeUnit::type unit); + +/// \brief Return a DayTimeIntervalType instance +std::shared_ptr ARROW_EXPORT day_time_interval(); + +/// \brief Return a MonthIntervalType instance +std::shared_ptr ARROW_EXPORT month_interval(); + +/// \brief Create a TimestampType instance from its unit +ARROW_EXPORT +std::shared_ptr timestamp(TimeUnit::type unit); + +/// \brief Create a TimestampType instance from its unit and timezone +ARROW_EXPORT +std::shared_ptr timestamp(TimeUnit::type unit, const std::string& timezone); + +/// \brief Create a 32-bit time type instance +/// +/// Unit can be either SECOND or MILLI +std::shared_ptr ARROW_EXPORT time32(TimeUnit::type unit); + +/// \brief Create a 64-bit time type instance +/// +/// Unit can be either MICRO or NANO +std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); + +/// \brief Create a StructType instance +std::shared_ptr ARROW_EXPORT +struct_(const std::vector>& fields); + +/// \brief Create a UnionType instance +std::shared_ptr ARROW_EXPORT +union_(const 
std::vector>& child_fields, + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create a UnionType instance +std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + const std::vector& field_names, + const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); + +/// \brief Create a UnionType instance +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + const std::vector& field_names, + UnionMode::type mode = UnionMode::SPARSE) { + return union_(children, field_names, {}, mode); +} + +/// \brief Create a UnionType instance +inline std::shared_ptr ARROW_EXPORT +union_(const std::vector>& children, + UnionMode::type mode = UnionMode::SPARSE) { + return union_(children, {}, {}, mode); +} + +/// \brief Create a DictionaryType instance +/// \param[in] index_type the type of the dictionary indices (must be +/// a signed integer) +/// \param[in] dict_type the type of the values in the variable dictionary +/// \param[in] ordered true if the order of the dictionary values has +/// semantic meaning and should be preserved where possible +ARROW_EXPORT +std::shared_ptr dictionary(const std::shared_ptr& index_type, + const std::shared_ptr& dict_type, + bool ordered = false); + +/// @} + +/// \defgroup schema-factories Factory functions for fields and schemas +/// +/// Factory functions for fields and schemas +/// @{ + +/// \brief Create a Field instance +/// +/// \param name the field name +/// \param type the field value type +/// \param nullable whether the values are nullable, default true +/// \param metadata any custom key-value metadata, default null +std::shared_ptr ARROW_EXPORT field( + const std::string& name, const std::shared_ptr& type, bool nullable = true, + const std::shared_ptr& metadata = NULLPTR); + +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to 
Schema +ARROW_EXPORT +std::shared_ptr schema( + const std::vector>& fields, + const std::shared_ptr& metadata = NULLPTR); + +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields (rvalue reference) +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to Schema +ARROW_EXPORT +std::shared_ptr schema( + std::vector>&& fields, + const std::shared_ptr& metadata = NULLPTR); + +/// @} + +} // namespace arrow + +#endif // ARROW_TYPE_H diff --git a/r/R/inst/include/arrow/type_fwd.h b/r/R/inst/include/arrow/type_fwd.h new file mode 100644 index 00000000000..040ccf2ffb4 --- /dev/null +++ b/r/R/inst/include/arrow/type_fwd.h @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TYPE_FWD_H +#define ARROW_TYPE_FWD_H + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +class DataType; +class KeyValueMetadata; +class Array; +struct ArrayData; +class ArrayBuilder; +class Field; +class Tensor; + +class ChunkedArray; +class Column; +class RecordBatch; +class Table; + +class Buffer; +class MemoryPool; +class RecordBatch; +class Schema; + +class DictionaryType; +class DictionaryArray; +class DictionaryScalar; + +class NullType; +class NullArray; +class NullBuilder; +struct NullScalar; + +class BooleanType; +class BooleanArray; +class BooleanBuilder; +struct BooleanScalar; + +class BinaryType; +class BinaryArray; +class BinaryBuilder; +struct BinaryScalar; + +class FixedSizeBinaryType; +class FixedSizeBinaryArray; +class FixedSizeBinaryBuilder; +struct FixedSizeBinaryScalar; + +class StringType; +class StringArray; +class StringBuilder; +struct StringScalar; + +class ListType; +class ListArray; +class ListBuilder; +struct ListScalar; + +class FixedSizeListType; +class FixedSizeListArray; +class FixedSizeListBuilder; +struct FixedSizeListScalar; + +class StructType; +class StructArray; +class StructBuilder; +struct StructScalar; + +class Decimal128Type; +class Decimal128Array; +class Decimal128Builder; +struct Decimal128Scalar; + +class UnionType; +class UnionArray; +class UnionScalar; + +template +class NumericArray; + +template +class NumericBuilder; + +template +class NumericTensor; + +template +struct NumericScalar; + +#define _NUMERIC_TYPE_DECL(KLASS) \ + class KLASS##Type; \ + using KLASS##Array = NumericArray; \ + using KLASS##Builder = NumericBuilder; \ + using KLASS##Scalar = NumericScalar; \ + using KLASS##Tensor = NumericTensor; + +_NUMERIC_TYPE_DECL(Int8) +_NUMERIC_TYPE_DECL(Int16) +_NUMERIC_TYPE_DECL(Int32) +_NUMERIC_TYPE_DECL(Int64) +_NUMERIC_TYPE_DECL(UInt8) +_NUMERIC_TYPE_DECL(UInt16) +_NUMERIC_TYPE_DECL(UInt32) +_NUMERIC_TYPE_DECL(UInt64) +_NUMERIC_TYPE_DECL(HalfFloat) 
+_NUMERIC_TYPE_DECL(Float) +_NUMERIC_TYPE_DECL(Double) + +#undef _NUMERIC_TYPE_DECL + +class Date64Type; +using Date64Array = NumericArray; +using Date64Builder = NumericBuilder; +class Date64Scalar; + +class Date32Type; +using Date32Array = NumericArray; +using Date32Builder = NumericBuilder; +class Date32Scalar; + +class Time32Type; +using Time32Array = NumericArray; +using Time32Builder = NumericBuilder; +class Time32Scalar; + +class Time64Type; +using Time64Array = NumericArray; +using Time64Builder = NumericBuilder; +class Time64Scalar; + +class TimestampType; +using TimestampArray = NumericArray; +using TimestampBuilder = NumericBuilder; +class TimestampScalar; + +class MonthIntervalType; +using MonthIntervalArray = NumericArray; +using MonthIntervalBuilder = NumericBuilder; +class MonthIntervalScalar; + +class DayTimeIntervalType; +class DayTimeIntervalArray; +class DayTimeIntervalBuilder; +class DayTimeIntervalScalar; + +class DurationType; +using DurationArray = NumericArray; +using DurationBuilder = NumericBuilder; +class DurationScalar; + +class ExtensionType; +class ExtensionArray; +class ExtensionScalar; + +// ---------------------------------------------------------------------- +// (parameter-free) Factory functions +// Other factory functions are in type.h + +/// \defgroup type-factories Factory functions for creating data types +/// +/// Factory functions for creating data types +/// @{ + +/// \brief Return a NullType instance +std::shared_ptr ARROW_EXPORT null(); +/// \brief Return a BooleanType instance +std::shared_ptr ARROW_EXPORT boolean(); +/// \brief Return a Int8Type instance +std::shared_ptr ARROW_EXPORT int8(); +/// \brief Return a Int16Type instance +std::shared_ptr ARROW_EXPORT int16(); +/// \brief Return a Int32Type instance +std::shared_ptr ARROW_EXPORT int32(); +/// \brief Return a Int64Type instance +std::shared_ptr ARROW_EXPORT int64(); +/// \brief Return a UInt8Type instance +std::shared_ptr ARROW_EXPORT uint8(); +/// \brief 
Return a UInt16Type instance +std::shared_ptr ARROW_EXPORT uint16(); +/// \brief Return a UInt32Type instance +std::shared_ptr ARROW_EXPORT uint32(); +/// \brief Return a UInt64Type instance +std::shared_ptr ARROW_EXPORT uint64(); +/// \brief Return a HalfFloatType instance +std::shared_ptr ARROW_EXPORT float16(); +/// \brief Return a FloatType instance +std::shared_ptr ARROW_EXPORT float32(); +/// \brief Return a DoubleType instance +std::shared_ptr ARROW_EXPORT float64(); +/// \brief Return a StringType instance +std::shared_ptr ARROW_EXPORT utf8(); +/// \brief Return a BinaryType instance +std::shared_ptr ARROW_EXPORT binary(); +/// \brief Return a Date32Type instance +std::shared_ptr ARROW_EXPORT date32(); +/// \brief Return a Date64Type instance +std::shared_ptr ARROW_EXPORT date64(); + +/// @} + +} // namespace arrow + +#endif // ARROW_TYPE_FWD_H diff --git a/r/R/inst/include/arrow/type_traits.h b/r/R/inst/include/arrow/type_traits.h new file mode 100644 index 00000000000..49c8ff86486 --- /dev/null +++ b/r/R/inst/include/arrow/type_traits.h @@ -0,0 +1,590 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TYPE_TRAITS_H +#define ARROW_TYPE_TRAITS_H + +#include +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/bit-util.h" + +namespace arrow { + +// +// Per-type type traits +// + +template +struct TypeTraits {}; + +template +struct CTypeTraits {}; + +template <> +struct TypeTraits { + using ArrayType = NullArray; + using BuilderType = NullBuilder; + using ScalarType = NullScalar; + + static constexpr int64_t bytes_required(int64_t) { return 0; } + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return null(); } +}; + +template <> +struct TypeTraits { + using ArrayType = BooleanArray; + using BuilderType = BooleanBuilder; + using ScalarType = BooleanScalar; + using CType = bool; + + static constexpr int64_t bytes_required(int64_t elements) { + return BitUtil::BytesForBits(elements); + } + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return boolean(); } +}; + +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BooleanType; +}; + +#define PRIMITIVE_TYPE_TRAITS_DEF_(CType_, ArrowType_, ArrowArrayType, ArrowBuilderType, \ + ArrowScalarType, ArrowTensorType, SingletonFn) \ + template <> \ + struct TypeTraits { \ + using ArrayType = ArrowArrayType; \ + using BuilderType = ArrowBuilderType; \ + using ScalarType = ArrowScalarType; \ + using TensorType = ArrowTensorType; \ + using CType = ArrowType_::c_type; \ + static constexpr int64_t bytes_required(int64_t elements) { \ + return elements * static_cast(sizeof(CType)); \ + } \ + constexpr static bool is_parameter_free = true; \ + static inline std::shared_ptr type_singleton() { return SingletonFn(); } \ + }; \ + \ + template <> \ + struct CTypeTraits : public TypeTraits { \ + using ArrowType = ArrowType_; \ + }; + +#define PRIMITIVE_TYPE_TRAITS_DEF(CType, ArrowShort, SingletonFn) \ + PRIMITIVE_TYPE_TRAITS_DEF_( \ + CType, ARROW_CONCAT(ArrowShort, Type), 
ARROW_CONCAT(ArrowShort, Array), \ + ARROW_CONCAT(ArrowShort, Builder), ARROW_CONCAT(ArrowShort, Scalar), \ + ARROW_CONCAT(ArrowShort, Tensor), SingletonFn) + +PRIMITIVE_TYPE_TRAITS_DEF(uint8_t, UInt8, uint8) +PRIMITIVE_TYPE_TRAITS_DEF(int8_t, Int8, int8) +PRIMITIVE_TYPE_TRAITS_DEF(uint16_t, UInt16, uint16) +PRIMITIVE_TYPE_TRAITS_DEF(int16_t, Int16, int16) +PRIMITIVE_TYPE_TRAITS_DEF(uint32_t, UInt32, uint32) +PRIMITIVE_TYPE_TRAITS_DEF(int32_t, Int32, int32) +PRIMITIVE_TYPE_TRAITS_DEF(uint64_t, UInt64, uint64) +PRIMITIVE_TYPE_TRAITS_DEF(int64_t, Int64, int64) +PRIMITIVE_TYPE_TRAITS_DEF(float, Float, float32) +PRIMITIVE_TYPE_TRAITS_DEF(double, Double, float64) + +#undef PRIMITIVE_TYPE_TRAITS_DEF +#undef PRIMITIVE_TYPE_TRAITS_DEF_ + +template <> +struct TypeTraits { + using ArrayType = Date64Array; + using BuilderType = Date64Builder; + using ScalarType = Date64Scalar; + using CType = Date64Type::c_type; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int64_t)); + } + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return date64(); } +}; + +template <> +struct TypeTraits { + using ArrayType = Date32Array; + using BuilderType = Date32Builder; + using ScalarType = Date32Scalar; + using CType = Date32Type::c_type; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int32_t)); + } + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return date32(); } +}; + +template <> +struct TypeTraits { + using ArrayType = TimestampArray; + using BuilderType = TimestampBuilder; + using ScalarType = TimestampScalar; + using CType = TimestampType::c_type; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int64_t)); + } + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = 
DurationArray; + using BuilderType = DurationBuilder; + using ScalarType = DurationScalar; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int64_t)); + } + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = DayTimeIntervalArray; + using BuilderType = DayTimeIntervalBuilder; + using ScalarType = DayTimeIntervalScalar; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(DayTimeIntervalType::DayMilliseconds)); + } + constexpr static bool is_parameter_free = true; +}; + +template <> +struct TypeTraits { + using ArrayType = MonthIntervalArray; + using BuilderType = MonthIntervalBuilder; + using ScalarType = MonthIntervalScalar; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int32_t)); + } + constexpr static bool is_parameter_free = true; +}; + +template <> +struct TypeTraits { + using ArrayType = Time32Array; + using BuilderType = Time32Builder; + using ScalarType = Time32Scalar; + using CType = Time32Type::c_type; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int32_t)); + } + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = Time64Array; + using BuilderType = Time64Builder; + using ScalarType = Time64Scalar; + using CType = Time64Type::c_type; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(int64_t)); + } + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = HalfFloatArray; + using BuilderType = HalfFloatBuilder; + using ScalarType = HalfFloatScalar; + using TensorType = HalfFloatTensor; + + static constexpr int64_t bytes_required(int64_t elements) { + return elements * static_cast(sizeof(uint16_t)); + } + constexpr static bool 
is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return float16(); } +}; + +template <> +struct TypeTraits { + using ArrayType = Decimal128Array; + using BuilderType = Decimal128Builder; + using ScalarType = Decimal128Scalar; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = BinaryArray; + using BuilderType = BinaryBuilder; + using ScalarType = BinaryScalar; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary(); } +}; + +template <> +struct TypeTraits { + using ArrayType = FixedSizeBinaryArray; + using BuilderType = FixedSizeBinaryBuilder; + using ScalarType = FixedSizeBinaryScalar; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = StringArray; + using BuilderType = StringBuilder; + using ScalarType = StringScalar; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8(); } +}; + +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = StringType; +}; + +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = StringType; +}; + +template <> +struct TypeTraits { + using ArrayType = ListArray; + using BuilderType = ListBuilder; + using ScalarType = ListScalar; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = FixedSizeListArray; + using BuilderType = FixedSizeListBuilder; + using ScalarType = FixedSizeListScalar; + constexpr static bool is_parameter_free = false; +}; + +template +struct CTypeTraits> : public TypeTraits { + using ArrowType = ListType; + + static inline std::shared_ptr type_singleton() { + return list(CTypeTraits::type_singleton()); + } +}; + +template <> +struct TypeTraits { + using ArrayType = StructArray; + using BuilderType = StructBuilder; + using ScalarType = StructScalar; + constexpr 
static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = UnionArray; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = DictionaryArray; + using ScalarType = DictionaryScalar; + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = ExtensionArray; + constexpr static bool is_parameter_free = false; +}; + +// +// Useful type predicates +// + +template +using is_number_type = std::is_base_of; + +template +using is_integer_type = std::is_base_of; + +template +using is_floating_type = std::is_base_of; + +template +using is_temporal_type = std::is_base_of; + +template +struct has_c_type { + static constexpr bool value = + (std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value); +}; + +template +struct is_8bit_int { + static constexpr bool value = + (std::is_same::value || std::is_same::value); +}; + +template +using enable_if_8bit_int = typename std::enable_if::value, R>::type; + +template +using enable_if_primitive_ctype = + typename std::enable_if::value, R>::type; + +template +using enable_if_integer = typename std::enable_if::value, R>::type; + +template +using is_signed_integer = + std::integral_constant::value && + std::is_signed::value>; + +template +using enable_if_signed_integer = + typename std::enable_if::value, R>::type; + +template +using enable_if_unsigned_integer = typename std::enable_if< + is_integer_type::value && std::is_unsigned::value, R>::type; + +template +using enable_if_floating_point = + typename std::enable_if::value, R>::type; + +template +using is_date = std::is_base_of; + +template +using enable_if_date = typename std::enable_if::value, R>::type; + +template +using is_time = std::is_base_of; + +template +using enable_if_time = typename std::enable_if::value, R>::type; + +template 
+using is_timestamp = std::is_base_of; + +template +using enable_if_timestamp = typename std::enable_if::value, R>::type; + +template +using enable_if_has_c_type = typename std::enable_if::value, R>::type; + +template +using enable_if_null = typename std::enable_if::value, R>::type; + +template +using enable_if_binary = + typename std::enable_if::value, R>::type; + +template +using enable_if_boolean = + typename std::enable_if::value, R>::type; + +template +using enable_if_binary_like = + typename std::enable_if::value || + std::is_base_of::value, + R>::type; + +template +using enable_if_fixed_size_binary = + typename std::enable_if::value, R>::type; + +template +using enable_if_list = + typename std::enable_if::value, R>::type; + +template +using enable_if_fixed_size_list = + typename std::enable_if::value, R>::type; + +template +using enable_if_number = typename std::enable_if::value, R>::type; + +namespace detail { + +// Not all type classes have a c_type +template +struct as_void { + using type = void; +}; + +// The partial specialization will match if T has the ATTR_NAME member +#define GET_ATTR(ATTR_NAME, DEFAULT) \ + template \ + struct GetAttr_##ATTR_NAME { \ + using type = DEFAULT; \ + }; \ + \ + template \ + struct GetAttr_##ATTR_NAME::type> { \ + using type = typename T::ATTR_NAME; \ + }; + +GET_ATTR(c_type, void) +GET_ATTR(TypeClass, void) + +#undef GET_ATTR + +} // namespace detail + +#define PRIMITIVE_TRAITS(T) \ + using TypeClass = \ + typename std::conditional::value, T, \ + typename detail::GetAttr_TypeClass::type>::type; \ + using c_type = typename detail::GetAttr_c_type::type + +template +struct IsUnsignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_unsigned::value; +}; + +template +struct IsSignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_signed::value; +}; + +template +struct IsInteger { + PRIMITIVE_TRAITS(T); + static constexpr bool 
value = std::is_integral::value; +}; + +template +struct IsFloatingPoint { + PRIMITIVE_TRAITS(T); + static constexpr bool value = std::is_floating_point::value; +}; + +template +struct IsNumeric { + PRIMITIVE_TRAITS(T); + static constexpr bool value = std::is_arithmetic::value; +}; + +static inline bool is_integer(Type::type type_id) { + switch (type_id) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::UINT64: + case Type::INT64: + return true; + default: + break; + } + return false; +} + +static inline bool is_floating(Type::type type_id) { + switch (type_id) { + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + return true; + default: + break; + } + return false; +} + +static inline bool is_primitive(Type::type type_id) { + switch (type_id) { + case Type::NA: + case Type::BOOL: + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::UINT64: + case Type::INT64: + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + case Type::DATE32: + case Type::DATE64: + case Type::TIME32: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::INTERVAL: + return true; + default: + break; + } + return false; +} + +static inline bool is_binary_like(Type::type type_id) { + switch (type_id) { + case Type::BINARY: + case Type::STRING: + return true; + default: + break; + } + return false; +} + +static inline bool is_dictionary(Type::type type_id) { + return type_id == Type::DICTIONARY; +} + +static inline bool is_fixed_size_binary(Type::type type_id) { + switch (type_id) { + case Type::DECIMAL: + case Type::FIXED_SIZE_BINARY: + return true; + default: + break; + } + return false; +} + +static inline bool is_fixed_width(Type::type type_id) { + return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id); +} + +} // namespace arrow + +#endif // ARROW_TYPE_TRAITS_H 
diff --git a/r/R/inst/include/arrow/util/basic_decimal.h b/r/R/inst/include/arrow/util/basic_decimal.h new file mode 100644 index 00000000000..2e5857c3012 --- /dev/null +++ b/r/R/inst/include/arrow/util/basic_decimal.h @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +enum class DecimalStatus { + kSuccess, + kDivideByZero, + kOverflow, + kRescaleDataLoss, +}; + +/// Represents a signed 128-bit integer in two's complement. +/// +/// This class is also compiled into LLVM IR - so, it should not have cpp references like +/// streams and boost. +class ARROW_EXPORT BasicDecimal128 { + public: + /// \brief Create a BasicDecimal128 from the two's complement representation. + constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept + : low_bits_(low), high_bits_(high) {} + + /// \brief Empty constructor creates a BasicDecimal128 with a value of 0. + constexpr BasicDecimal128() noexcept : BasicDecimal128(0, 0) {} + + /// \brief Convert any integer value into a BasicDecimal128. 
+ template ::value, T>::type> + constexpr BasicDecimal128(T value) noexcept + : BasicDecimal128(static_cast(value) >= 0 ? 0 : -1, + static_cast(value)) {} + + /// \brief Create a BasicDecimal128 from an array of bytes. Bytes are assumed to be in + /// little-endian byte order. + explicit BasicDecimal128(const uint8_t* bytes); + + /// \brief Negate the current value (in-place) + BasicDecimal128& Negate(); + + /// \brief Absolute value (in-place) + BasicDecimal128& Abs(); + + /// \brief Absolute value + static BasicDecimal128 Abs(const BasicDecimal128& left); + + /// \brief Add a number to this one. The result is truncated to 128 bits. + BasicDecimal128& operator+=(const BasicDecimal128& right); + + /// \brief Subtract a number from this one. The result is truncated to 128 bits. + BasicDecimal128& operator-=(const BasicDecimal128& right); + + /// \brief Multiply this number by another number. The result is truncated to 128 bits. + BasicDecimal128& operator*=(const BasicDecimal128& right); + + /// Divide this number by right and return the result. + /// + /// This operation is not destructive. + /// The answer rounds to zero. Signs work like: + /// 21 / 5 -> 4, 1 + /// -21 / 5 -> -4, -1 + /// 21 / -5 -> -4, 1 + /// -21 / -5 -> 4, -1 + /// \param[in] divisor the number to divide by + /// \param[out] result the quotient + /// \param[out] remainder the remainder after the division + DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result, + BasicDecimal128* remainder) const; + + /// \brief In-place division. + BasicDecimal128& operator/=(const BasicDecimal128& right); + + /// \brief Bitwise "or" between two BasicDecimal128. + BasicDecimal128& operator|=(const BasicDecimal128& right); + + /// \brief Bitwise "and" between two BasicDecimal128. + BasicDecimal128& operator&=(const BasicDecimal128& right); + + /// \brief Shift left by the given number of bits. 
+ BasicDecimal128& operator<<=(uint32_t bits); + + /// \brief Shift right by the given number of bits. Negative values will + BasicDecimal128& operator>>=(uint32_t bits); + + /// \brief Get the high bits of the two's complement representation of the number. + inline int64_t high_bits() const { return high_bits_; } + + /// \brief Get the low bits of the two's complement representation of the number. + inline uint64_t low_bits() const { return low_bits_; } + + /// \brief Return the raw bytes of the value in little-endian byte order. + std::array ToBytes() const; + void ToBytes(uint8_t* out) const; + + /// \brief seperate the integer and fractional parts for the given scale. + void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole, + BasicDecimal128* fraction) const; + + /// \brief Scale multiplier for given scale value. + static const BasicDecimal128& GetScaleMultiplier(int32_t scale); + + /// \brief Convert BasicDecimal128 from one scale to another + DecimalStatus Rescale(int32_t original_scale, int32_t new_scale, + BasicDecimal128* out) const; + + /// \brief Scale up. + BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const; + + /// \brief Scale down. + /// - If 'round' is true, the right-most digits are dropped and the result value is + /// rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits + /// (>= 10^reduce_by / 2). + /// - If 'round' is false, the right-most digits are simply dropped. + BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const; + + // returns 1 for positive and zero decimal values, -1 for negative decimal values. + inline int64_t Sign() const { return 1 | (high_bits_ >> 63); } + + /// \brief count the number of leading binary zeroes. + int32_t CountLeadingBinaryZeros() const; + + /// \brief Get the maximum valid unscaled decimal value. 
+ static const BasicDecimal128& GetMaxValue(); + + private: + uint64_t low_bits_; + int64_t high_bits_; +}; + +ARROW_EXPORT bool operator==(const BasicDecimal128& left, const BasicDecimal128& right); +ARROW_EXPORT bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right); +ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right); +ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right); +ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right); +ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right); + +ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand); +ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand); +ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left, + const BasicDecimal128& right); +ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left, + const BasicDecimal128& right); +ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left, + const BasicDecimal128& right); +ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left, + const BasicDecimal128& right); +ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left, + const BasicDecimal128& right); + +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/bit-stream-utils.h b/r/R/inst/include/arrow/util/bit-stream-utils.h new file mode 100644 index 00000000000..ad86ee87c9f --- /dev/null +++ b/r/R/inst/include/arrow/util/bit-stream-utils.h @@ -0,0 +1,416 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala (incubating) as of 2016-01-29 + +#ifndef ARROW_UTIL_BIT_STREAM_UTILS_H +#define ARROW_UTIL_BIT_STREAM_UTILS_H + +#include +#include +#include + +#include "arrow/util/bit-util.h" +#include "arrow/util/bpacking.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace BitUtil { + +/// Utility class to write bit/byte streams. This class can write data to either be +/// bit packed or byte aligned (and a single stream that has a mix of both). +/// This class does not allocate memory. +class BitWriter { + public: + /// buffer: buffer to write bits to. Buffer should be preallocated with + /// 'buffer_len' bytes. + BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { + Clear(); + } + + void Clear() { + buffered_values_ = 0; + byte_offset_ = 0; + bit_offset_ = 0; + } + + /// The number of current bytes written, including the current byte (i.e. may include a + /// fraction of a byte). Includes buffered values. + int bytes_written() const { + return byte_offset_ + static_cast(BitUtil::BytesForBits(bit_offset_)); + } + uint8_t* buffer() const { return buffer_; } + int buffer_len() const { return max_bytes_; } + + /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + /// packed. Returns false if there was not enough space. num_bits must be <= 32. + bool PutValue(uint64_t v, int num_bits); + + /// Writes v to the next aligned byte using num_bytes. If T is larger than + /// num_bytes, the extra high-order bytes will be ignored. 
Returns false if + /// there was not enough space. + template + bool PutAligned(T v, int num_bytes); + + /// Write a Vlq encoded int to the buffer. Returns false if there was not enough + /// room. The value is written byte aligned. + /// For more details on vlq: + /// en.wikipedia.org/wiki/Variable-length_quantity + bool PutVlqInt(uint32_t v); + + // Writes an int zigzag encoded. + bool PutZigZagVlqInt(int32_t v); + + /// Get a pointer to the next aligned byte and advance the underlying buffer + /// by num_bytes. + /// Returns NULL if there was not enough space. + uint8_t* GetNextBytePtr(int num_bytes = 1); + + /// Flushes all buffered values to the buffer. Call this when done writing to + /// the buffer. If 'align' is true, buffered_values_ is reset and any future + /// writes will be written to the next byte boundary. + void Flush(bool align = false); + + private: + uint8_t* buffer_; + int max_bytes_; + + /// Bit-packed values are initially written to this variable before being memcpy'd to + /// buffer_. This is faster than writing values byte by byte directly to buffer_. + uint64_t buffered_values_; + + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +/// Utility class to read bit/byte stream. This class can read bits or bytes +/// that are either byte aligned or not. It also has utilities to read multiple +/// bytes in one read (e.g. encoded int). +class BitReader { + public: + /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. 
+ BitReader(const uint8_t* buffer, int buffer_len) + : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) { + int num_bytes = std::min(8, max_bytes_ - byte_offset_); + memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); + } + + BitReader() + : buffer_(NULL), + max_bytes_(0), + buffered_values_(0), + byte_offset_(0), + bit_offset_(0) {} + + void Reset(const uint8_t* buffer, int buffer_len) { + buffer_ = buffer; + max_bytes_ = buffer_len; + byte_offset_ = 0; + bit_offset_ = 0; + int num_bytes = std::min(8, max_bytes_ - byte_offset_); + memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); + } + + /// Gets the next value from the buffer. Returns true if 'v' could be read or false if + /// there are not enough bytes left. num_bits must be <= 32. + template + bool GetValue(int num_bits, T* v); + + /// Get a number of values from the buffer. Return the number of values actually read. + template + int GetBatch(int num_bits, T* v, int batch_size); + + /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T + /// needs to be a little-endian native type and big enough to store + /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will + /// be advanced to the start of the next byte before 'v' is read. Returns + /// false if there are not enough bytes left. + template + bool GetAligned(int num_bytes, T* v); + + /// Reads a vlq encoded int from the stream. The encoded int must start at + /// the beginning of a byte. Return false if there were not enough bytes in + /// the buffer. + bool GetVlqInt(int32_t* v); + + // Reads a zigzag encoded int `into` v. + bool GetZigZagVlqInt(int32_t* v); + + /// Returns the number of bytes left in the stream, not including the current + /// byte (i.e., there may be an additional fraction of a byte). 
+ int bytes_left() { + return max_bytes_ - + (byte_offset_ + static_cast(BitUtil::BytesForBits(bit_offset_))); + } + + /// Maximum byte length of a vlq encoded int + static const int MAX_VLQ_BYTE_LEN = 5; + + private: + const uint8_t* buffer_; + int max_bytes_; + + /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is + /// faster than reading values byte by byte directly from buffer_. + uint64_t buffered_values_; + + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +inline bool BitWriter::PutValue(uint64_t v, int num_bits) { + // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases) + DCHECK_LE(num_bits, 32); + DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; + + if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) + return false; + + buffered_values_ |= v << bit_offset_; + bit_offset_ += num_bits; + + if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) { + // Flush buffered_values_ and write out bits of v that did not fit + memcpy(buffer_ + byte_offset_, &buffered_values_, 8); + buffered_values_ = 0; + byte_offset_ += 8; + bit_offset_ -= 64; + buffered_values_ = v >> (num_bits - bit_offset_); + } + DCHECK_LT(bit_offset_, 64); + return true; +} + +inline void BitWriter::Flush(bool align) { + int num_bytes = static_cast(BitUtil::BytesForBits(bit_offset_)); + DCHECK_LE(byte_offset_ + num_bytes, max_bytes_); + memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes); + + if (align) { + buffered_values_ = 0; + byte_offset_ += num_bytes; + bit_offset_ = 0; + } +} + +inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { + Flush(/* align */ true); + DCHECK_LE(byte_offset_, max_bytes_); + if (byte_offset_ + num_bytes > max_bytes_) return NULL; + uint8_t* ptr = buffer_ + byte_offset_; + byte_offset_ += num_bytes; + return ptr; +} + +template +inline bool BitWriter::PutAligned(T val, int num_bytes) { + uint8_t* 
ptr = GetNextBytePtr(num_bytes); + if (ptr == NULL) return false; + memcpy(ptr, &val, num_bytes); + return true; +} + +inline bool BitWriter::PutVlqInt(uint32_t v) { + bool result = true; + while ((v & 0xFFFFFF80) != 0L) { + result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); + v >>= 7; + } + result &= PutAligned(static_cast(v & 0x7F), 1); + return result; +} + +namespace detail { + +template +inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer, + int* bit_offset, int* byte_offset, uint64_t* buffered_values) { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800) +#endif + *v = static_cast(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> + *bit_offset); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + *bit_offset += num_bits; + if (*bit_offset >= 64) { + *byte_offset += 8; + *bit_offset -= 64; + + int bytes_remaining = max_bytes - *byte_offset; + if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(buffered_values, buffer + *byte_offset, 8); + } else { + memcpy(buffered_values, buffer + *byte_offset, bytes_remaining); + } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800 4805) +#endif + // Read bits of v that crossed into new buffered_values_ + *v = *v | static_cast(BitUtil::TrailingBits(*buffered_values, *bit_offset) + << (num_bits - *bit_offset)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + DCHECK_LE(*bit_offset, 64); + } +} + +} // namespace detail + +template +inline bool BitReader::GetValue(int num_bits, T* v) { + return GetBatch(num_bits, v, 1) == 1; +} + +template +inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { + DCHECK(buffer_ != NULL); + // TODO: revisit this limit if necessary + DCHECK_LE(num_bits, 32); + DCHECK_LE(num_bits, static_cast(sizeof(T) * 8)); + + int bit_offset = bit_offset_; + int byte_offset = byte_offset_; + uint64_t buffered_values = buffered_values_; + int max_bytes = max_bytes_; + const uint8_t* buffer = buffer_; + + 
uint64_t needed_bits = num_bits * batch_size; + uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset; + if (remaining_bits < needed_bits) { + batch_size = static_cast(remaining_bits) / num_bits; + } + + int i = 0; + if (ARROW_PREDICT_FALSE(bit_offset != 0)) { + for (; i < batch_size && bit_offset != 0; ++i) { + detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, + &buffered_values); + } + } + + if (sizeof(T) == 4) { + int num_unpacked = + internal::unpack32(reinterpret_cast(buffer + byte_offset), + reinterpret_cast(v + i), batch_size - i, num_bits); + i += num_unpacked; + byte_offset += num_unpacked * num_bits / 8; + } else { + const int buffer_size = 1024; + uint32_t unpack_buffer[buffer_size]; + while (i < batch_size) { + int unpack_size = std::min(buffer_size, batch_size - i); + int num_unpacked = + internal::unpack32(reinterpret_cast(buffer + byte_offset), + unpack_buffer, unpack_size, num_bits); + if (num_unpacked == 0) { + break; + } + for (int k = 0; k < num_unpacked; ++k) { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4800) +#endif + v[i + k] = static_cast(unpack_buffer[k]); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + i += num_unpacked; + byte_offset += num_unpacked * num_bits / 8; + } + } + + int bytes_remaining = max_bytes - byte_offset; + if (bytes_remaining >= 8) { + memcpy(&buffered_values, buffer + byte_offset, 8); + } else { + memcpy(&buffered_values, buffer + byte_offset, bytes_remaining); + } + + for (; i < batch_size; ++i) { + detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, + &buffered_values); + } + + bit_offset_ = bit_offset; + byte_offset_ = byte_offset; + buffered_values_ = buffered_values; + + return batch_size; +} + +template +inline bool BitReader::GetAligned(int num_bytes, T* v) { + DCHECK_LE(num_bytes, static_cast(sizeof(T))); + int bytes_read = static_cast(BitUtil::BytesForBits(bit_offset_)); + if (ARROW_PREDICT_FALSE(byte_offset_ 
+ bytes_read + num_bytes > max_bytes_)) + return false; + + // Advance byte_offset to next unread byte and read num_bytes + byte_offset_ += bytes_read; + memcpy(v, buffer_ + byte_offset_, num_bytes); + byte_offset_ += num_bytes; + + // Reset buffered_values_ + bit_offset_ = 0; + int bytes_remaining = max_bytes_ - byte_offset_; + if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } + return true; +} + +inline bool BitReader::GetVlqInt(int32_t* v) { + *v = 0; + int shift = 0; + int num_bytes = 0; + uint8_t byte = 0; + do { + if (!GetAligned(1, &byte)) return false; + *v |= (byte & 0x7F) << shift; + shift += 7; + DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); + } while ((byte & 0x80) != 0); + return true; +} + +inline bool BitWriter::PutZigZagVlqInt(int32_t v) { + // Note negative left shift is undefined + uint32_t u = (static_cast(v) << 1) ^ (v >> 31); + return PutVlqInt(u); +} + +inline bool BitReader::GetZigZagVlqInt(int32_t* v) { + int32_t u_signed; + if (!GetVlqInt(&u_signed)) return false; + uint32_t u = static_cast(u_signed); + *reinterpret_cast(v) = (u >> 1) ^ -(static_cast(u & 1)); + return true; +} + +} // namespace BitUtil +} // namespace arrow + +#endif // ARROW_UTIL_BIT_STREAM_UTILS_H diff --git a/r/R/inst/include/arrow/util/bit-util.h b/r/R/inst/include/arrow/util/bit-util.h new file mode 100644 index 00000000000..b7de112b85c --- /dev/null +++ b/r/R/inst/include/arrow/util/bit-util.h @@ -0,0 +1,855 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_BIT_UTIL_H +#define ARROW_UTIL_BIT_UTIL_H + +#ifdef _WIN32 +#define ARROW_LITTLE_ENDIAN 1 +#else +#ifdef __APPLE__ +#include +#else +#include +#endif +# +#ifndef __BYTE_ORDER__ +#error "__BYTE_ORDER__ not defined" +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +#error "__ORDER_LITTLE_ENDIAN__ not defined" +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define ARROW_LITTLE_ENDIAN 1 +#else +#define ARROW_LITTLE_ENDIAN 0 +#endif +#endif + +#if defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanReverse) +#pragma intrinsic(_BitScanForward) +#define ARROW_BYTE_SWAP64 _byteswap_uint64 +#define ARROW_BYTE_SWAP32 _byteswap_ulong +#else +#define ARROW_BYTE_SWAP64 __builtin_bswap64 +#define ARROW_BYTE_SWAP32 __builtin_bswap32 +#endif + +#include +#include +#include +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace detail { + +template +typename std::make_unsigned::type as_unsigned(Integer x) { + return static_cast::type>(x); +} + +} // namespace detail + +namespace BitUtil { + +// The number of set bits in a given unsigned byte value, pre-computed +// +// Generated with the following Python code +// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};' +// popcounts = [str(bin(i).count('1')) for i in range(0, 256)] +// print(output.format(', '.join(popcounts))) +static constexpr uint8_t kBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 
3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +// +// Bit-related computations on integer values +// + +// Returns the ceil of value/divisor +constexpr int64_t CeilDiv(int64_t value, int64_t divisor) { + return value / divisor + (value % divisor != 0); +} + +constexpr int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; } + +// Returns the smallest power of two that contains v. If v is already a +// power of two, it is returned as is. 
static inline int64_t NextPower2(int64_t n) {
  // Bit-smearing round-up, adapted from
  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
  //
  // After subtracting one, every bit below the highest set bit is filled in
  // by OR-ing the value with itself shifted by 1, 2, 4, 8, 16 and 32 bits;
  // adding one back then lands on the next power of two.
  int64_t smeared = n - 1;
  for (int shift = 1; shift <= 32; shift *= 2) {
    smeared |= smeared >> shift;
  }
  return smeared + 1;
}

// True when the six low-order bits of 'n' are all clear.
constexpr bool IsMultipleOf64(int64_t n) { return !(n & 0x3F); }

// True when the three low-order bits of 'n' are all clear.
constexpr bool IsMultipleOf8(int64_t n) { return !(n & 0x07); }

// Rounds 'value' up to the closest multiple of 'factor'.
constexpr int64_t RoundUp(int64_t value, int64_t factor) {
  return ((value + (factor - 1)) / factor) * factor;
}

// Rounds 'value' down to the closest multiple of 'factor'
// (integer division truncates, so the remainder is simply removed).
constexpr int64_t RoundDown(int64_t value, int64_t factor) {
  return value - (value % factor);
}

// Rounds 'value' up to the closest multiple of 'factor', where 'factor' must
// be a power of two so that the rounding reduces to a bit mask.
// The result is undefined when 'value + (factor - 1)' overflows int64_t,
// since the correct answer cannot be represented in that case.
constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
  // Precondition (not checked here):
  // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
  return ~(factor - 1) & (value + (factor - 1));
}

// Rounds 'num' up to the closest multiple of 8.
constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); }

// Rounds 'num' up to the closest multiple of 64.
constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
  return RoundUpToPowerOf2(num, 64);
}

// Number of whole bytes needed to cover a bitmap slice: the slice is widened
// so that it starts and ends on byte boundaries, and the widened length is
// returned in bytes.
//
// Example for a slice with offset=10 and length=9:
//
// 0       8       16     24
// |-------|-------|------|
//           [       ]          (slice)
//         [              ]     (slice widened to byte bounds, length=16)
//
// The widened slice is 16 bits long, hence 2 covering bytes.
constexpr int64_t CoveringBytes(int64_t offset, int64_t length) {
  return (RoundUp(length + offset, 8) - RoundDown(offset, 8)) / 8;
}

// Returns the 'num_bits' least-significant bits of 'v'.
+// num_bits == 0 yields 0 and num_bits >= 64 yields 'v' unchanged. Both cases
+// are answered up front so that the shifts below never shift by 64.
+static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
+  if (ARROW_PREDICT_FALSE(num_bits == 0)) return 0;
+  if (ARROW_PREDICT_FALSE(num_bits >= 64)) return v;
+  // Push the unwanted high bits out of the word, then shift back down.
+  int n = 64 - num_bits;
+  return (v << n) >> n;
+}
+
+/// \brief Count the number of leading zeros in an unsigned integer.
+///
+/// Returns 32 for a zero input (the compiler builtin is not called with 0).
+static inline int CountLeadingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 32;
+  return static_cast<int>(__builtin_clz(value));
+#elif defined(_MSC_VER)
+  unsigned long index;                                               // NOLINT
+  if (_BitScanReverse(&index, static_cast<unsigned long>(value))) {  // NOLINT
+    // 'index' is the position of the highest set bit.
+    return 31 - static_cast<int>(index);
+  } else {
+    return 32;
+  }
+#else
+  // Portable fallback: count how many shifts it takes to clear the value.
+  int bitpos = 0;
+  while (value != 0) {
+    value >>= 1;
+    ++bitpos;
+  }
+  return 32 - bitpos;
+#endif
+}
+
+/// \brief Count the number of leading zeros in a 64-bit unsigned integer.
+///
+/// Returns 64 for a zero input.
+static inline int CountLeadingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 64;
+  return static_cast<int>(__builtin_clzll(value));
+#elif defined(_MSC_VER)
+  unsigned long index;                     // NOLINT
+  if (_BitScanReverse64(&index, value)) {  // NOLINT
+    return 63 - static_cast<int>(index);
+  } else {
+    return 64;
+  }
+#else
+  int bitpos = 0;
+  while (value != 0) {
+    value >>= 1;
+    ++bitpos;
+  }
+  return 64 - bitpos;
+#endif
+}
+
+/// \brief Count the number of trailing zeros in a 32-bit unsigned integer.
+///
+/// Returns 32 for a zero input.
+static inline int CountTrailingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 32;
+  // Use the 'int'-width builtin, matching CountLeadingZeros(uint32_t).
+  return static_cast<int>(__builtin_ctz(value));
+#elif defined(_MSC_VER)
+  unsigned long index;  // NOLINT
+  if (_BitScanForward(&index, value)) {
+    return static_cast<int>(index);
+  } else {
+    return 32;
+  }
+#else
+  int bitpos = 0;
+  if (value) {
+    // The parentheses matter: '==' binds tighter than '&', so the unparenthesized
+    // form 'value & 1 == 0' is always zero and the loop would never run.
+    while ((value & 1) == 0) {
+      value >>= 1;
+      ++bitpos;
+    }
+  } else {
+    bitpos = 32;
+  }
+  return bitpos;
+#endif
+}
+
+/// \brief Count the number of trailing zeros in a 64-bit unsigned integer.
+///
+/// Returns 64 for a zero input.
+static inline int CountTrailingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 64;
+  return static_cast<int>(__builtin_ctzll(value));
+#elif defined(_MSC_VER)
+  unsigned long index;  // NOLINT
+  if (_BitScanForward64(&index, value)) {
+    return static_cast<int>(index);
+  } else {
+    return 64;
+  }
+#else
+  int bitpos = 0;
+  if (value) {
+    // See CountTrailingZeros(uint32_t): the test must be '(value & 1) == 0'.
+    while ((value & 1) == 0) {
+      value >>= 1;
+      ++bitpos;
+    }
+  } else {
+    bitpos = 64;
+  }
+  return bitpos;
+#endif
+}
+
+// Returns the minimum number of bits needed to represent an unsigned value
+// (0 for x == 0, since CountLeadingZeros then returns 64).
+static inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); }
+
+// Returns ceil(log2(x)).
+static inline int Log2(uint64_t x) {
+  // DCHECK_GT(x, 0);
+  return NumRequiredBits(x - 1);
+}
+
+//
+// Byte-swap 16-bit, 32-bit and 64-bit values
+//
+
+// Swap the byte order (i.e. endianness)
+static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
+static inline uint64_t ByteSwap(uint64_t value) {
+  return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
+static inline uint32_t ByteSwap(uint32_t value) {
+  return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+  // Mask after shifting so that sign extension of the high byte is discarded.
+  constexpr auto m = static_cast<int16_t>(0xff);
+  return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+  return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+
+// Write the swapped bytes into dst. Src and dst cannot overlap.
+// Lengths 1, 2, 4 and 8 go through the fixed-width overloads above; any other
+// length is reversed one byte at a time.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+  switch (len) {
+    case 1:
+      *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+      return;
+    case 2:
+      *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+      return;
+    case 4:
+      *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+      return;
+    case 8:
+      *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+      return;
+    default:
+      break;
+  }
+
+  auto d = reinterpret_cast<uint8_t*>(dst);
+  auto s = reinterpret_cast<const uint8_t*>(src);
+  for (int i = 0; i < len; ++i) {
+    d[i] = s[len - i - 1];
+  }
+}
+
+// Convert to little/big endian format from the machine's native endian format.
+// NOTE(review): the template parameter lists in this section were lost in this
+// copy of the patch. They are restored here as a type-restricted
+// 'EnableIfIsOneOf' guard over the 16/32/64-bit integer types that ByteSwap
+// is overloaded for -- confirm against the upstream header.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T ToBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T ToLittleEndian(T value) {
+  return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T ToBigEndian(T value) {
+  return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T ToLittleEndian(T value) {
+  return ByteSwap(value);
+}
+#endif
+
+// Convert from big/little endian format to the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T FromBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T FromLittleEndian(T value) {
+  return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T FromBigEndian(T value) {
+  return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<T, int64_t, uint64_t, int32_t,
+                                                           uint32_t, int16_t, uint16_t>>
+static inline T FromLittleEndian(T value) {
+  return ByteSwap(value);
+}
+#endif
+
+//
+// Utilities for reading and writing individual bits by their index
+// in a memory area.
+//
+
+// Bitmask selecting the k-th bit in a byte
+static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+
+// the bitwise complement version of kBitmask
+static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127};
+
+// Bitmask selecting the k bits preceding the k-th bit in a byte (bits 0..k-1).
+// The "wrapping" variant differs only at k == 0, where it selects all 8 bits.
+static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127};
+static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63, 127};
+
+// the bitwise complement version of kPrecedingBitmask
+static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
+
+// Returns the value of the i-th bit of 'bits' (bit 0 is the least-significant
+// bit of byte 0).
+static inline bool GetBit(const uint8_t* bits, uint64_t i) {
+  return (bits[i >> 3] >> (i & 0x07)) & 1;
+}
+
+// Sets the i-th bit of 'bits' to 0.
+static inline void ClearBit(uint8_t* bits, int64_t i) {
+  bits[i / 8] &= kFlippedBitmask[i % 8];
+}
+
+// Sets the i-th bit of 'bits' to 1.
+static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; }
+
+// Sets the i-th bit of 'bits' to 'bit_is_set' without branching.
+static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
+  // https://graphics.stanford.edu/~seander/bithacks.html
+  // "Conditionally set or clear bits without branching"
+  // NOTE: this seems to confuse Valgrind as it reads from potentially
+  // uninitialized memory
+  bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
+                 kBitmask[i % 8];
+}
+
+/// \brief set or clear a range of bits quickly
+///
+/// Bits [start_offset, start_offset + length) are set to 'bits_are_set'; the
+/// bits around that range in the first and last touched byte are preserved.
+static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length,
+                             bool bits_are_set) {
+  if (length == 0) return;
+
+  const auto i_begin = start_offset;
+  const auto i_end = start_offset + length;
+  // 0xFF when setting, 0x00 when clearing.
+  const uint8_t fill_byte = static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+  const auto bytes_begin = i_begin / 8;
+  const auto bytes_end = i_end / 8 + 1;
+
+  // Masks of the bits to *keep* in the first and last byte respectively.
+  const auto first_byte_mask = kPrecedingBitmask[i_begin % 8];
+  const auto last_byte_mask = kTrailingBitmask[i_end % 8];
+
+  if (bytes_end == bytes_begin + 1) {
+    // set bits within a single byte
+    const auto only_byte_mask =
+        i_end % 8 == 0 ? first_byte_mask
+                       : static_cast<uint8_t>(first_byte_mask | last_byte_mask);
+    bits[bytes_begin] &= only_byte_mask;
+    bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+    return;
+  }
+
+  // set/clear trailing bits of first byte
+  bits[bytes_begin] &= first_byte_mask;
+  bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+  if (bytes_end - bytes_begin > 2) {
+    // set/clear whole bytes
+    std::memset(bits + bytes_begin + 1, fill_byte,
+                static_cast<size_t>(bytes_end - bytes_begin - 2));
+  }
+
+  // The range ends exactly on a byte boundary: there is no partial last byte.
+  if (i_end % 8 == 0) return;
+
+  // set/clear leading bits of last byte
+  bits[bytes_end - 1] &= last_byte_mask;
+  bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
+
+/// \brief Convert vector of bytes to bitmap buffer
+ARROW_EXPORT
+Status BytesToBits(const std::vector<uint8_t>&, MemoryPool*, std::shared_ptr<Buffer>*);
+
+}  // namespace BitUtil
+
+namespace internal {
+
+class BitmapReader {
+  // A sequential bitwise reader: IsSet()/IsNotSet() test the current bit and
+  // Next() advances to the following one. The current byte is cached, and no
+  // byte is loaded once 'length' bits have been consumed.
+
+ public:
+  BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_offset_ = start_offset % 8;
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    }
+  }
+
+  bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
+
+  bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
+
+  void Next() {
+    ++bit_offset_;
+    ++position_;
+    if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
+      bit_offset_ = 0;
+      ++byte_offset_;
+      // Only load the next byte if there are still bits left to read.
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+ private:
+  const uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  int64_t byte_offset_;
+  int64_t bit_offset_;
+};
+
+class BitmapWriter {
+  // A sequential bitwise writer that preserves surrounding bit values.
+  // Set()/Clear() modify the current bit in a cached byte, Next() advances,
+  // and Finish() must be called to flush a partially written last byte.
+
+ public:
+  BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
+
+  void Next() {
+    // The mask is a uint8_t, so shifting the top bit out leaves 0.
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't go past bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+class FirstTimeBitmapWriter {
+  // Like BitmapWriter, but any bit values *following* the bits written
+  // might be clobbered. It is hence faster than BitmapWriter, and can
+  // also avoid false positives with Valgrind.
+
+ public:
+  FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = BitUtil::kBitmask[start_offset % 8];
+    if (length > 0) {
+      // Keep only the bits that precede the start offset in the first byte.
+      current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  // No-op: every bit at or after the write position starts out cleared in
+  // the cached byte.
+  void Clear() {}
+
+  void Next() {
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      current_byte_ = 0;
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't go past bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+// A std::generate() like function to write sequential bits into a bitmap area.
+// Bits preceding the bitmap area are preserved, bits following the bitmap
+// area may be clobbered.
+
+template <class Generator>
+void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
+  if (length == 0) {
+    return;
+  }
+  uint8_t* cur = bitmap + start_offset / 8;
+  uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8];
+  uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8];
+
+  for (int64_t index = 0; index < length; ++index) {
+    const bool bit = g();
+    current_byte = bit ? (current_byte | bit_mask) : current_byte;
+    bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    if (bit_mask == 0) {
+      // Byte complete: flush it and start the next one from zero.
+      bit_mask = 1;
+      *cur++ = current_byte;
+      current_byte = 0;
+    }
+  }
+  // Flush a partially filled last byte.
+  if (bit_mask != 1) {
+    *cur++ = current_byte;
+  }
+}
+
+// Like GenerateBits(), but unrolls its main loop for higher performance.
+// The work is split into three phases: the bits up to the first byte
+// boundary, whole bytes (eight generator calls each), and the leftover bits.
+
+template <class Generator>
+void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
+                          Generator&& g) {
+  if (length == 0) {
+    return;
+  }
+  uint8_t current_byte;
+  uint8_t* cur = bitmap + start_offset / 8;
+  const uint64_t start_bit_offset = start_offset % 8;
+  uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset];
+  int64_t remaining = length;
+
+  if (bit_mask != 0x01) {
+    // Unaligned start: fill the rest of the first byte, keeping the bits
+    // that precede the start offset.
+    current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
+    while (bit_mask != 0 && remaining > 0) {
+      current_byte = g() ? (current_byte | bit_mask) : current_byte;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+      --remaining;
+    }
+    *cur++ = current_byte;
+  }
+
+  int64_t remaining_bytes = remaining / 8;
+  while (remaining_bytes-- > 0) {
+    current_byte = 0;
+    current_byte = g() ? current_byte | 0x01 : current_byte;
+    current_byte = g() ? current_byte | 0x02 : current_byte;
+    current_byte = g() ? current_byte | 0x04 : current_byte;
+    current_byte = g() ? current_byte | 0x08 : current_byte;
+    current_byte = g() ? current_byte | 0x10 : current_byte;
+    current_byte = g() ? current_byte | 0x20 : current_byte;
+    current_byte = g() ? current_byte | 0x40 : current_byte;
+    current_byte = g() ? current_byte | 0x80 : current_byte;
+    *cur++ = current_byte;
+  }
+
+  int64_t remaining_bits = remaining % 8;
+  if (remaining_bits) {
+    current_byte = 0;
+    bit_mask = 0x01;
+    while (remaining_bits-- > 0) {
+      current_byte = g() ? (current_byte | bit_mask) : current_byte;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    }
+    *cur++ = current_byte;
+  }
+}
+
+// ----------------------------------------------------------------------
+// Bitmap utilities
+
+/// Copy a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[out] out the resulting copy
+///
+/// \return Status message
+ARROW_EXPORT
+Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, int64_t length,
+                  std::shared_ptr<Buffer>* out);
+
+/// Copy a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[in] restore_trailing_bits don't clobber bits outside the destination range
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                int64_t dest_offset, bool restore_trailing_bits = true);
+
+/// Invert a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                  int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[out] out the resulting copy
+///
+/// \return Status message
+ARROW_EXPORT
+Status InvertBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset,
+                    int64_t length, std::shared_ptr<Buffer>* out);
+
+/// Compute the number of 1's in the given data array
+///
+/// \param[in] data a packed LSB-ordered bitmap as a byte array
+/// \param[in] bit_offset a bitwise offset into the bitmap
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
+///
+/// \return The number of set (1) bits in the range
+ARROW_EXPORT
+int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
+
+ARROW_EXPORT
+bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                  int64_t right_offset, int64_t bit_length);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Status BitmapAnd(MemoryPool* pool, const uint8_t* left, int64_t left_offset,
+                 const uint8_t* right, int64_t right_offset, int64_t length,
+                 int64_t out_offset, std::shared_ptr<Buffer>* out_buffer);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out_buffer
+/// starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Status BitmapOr(MemoryPool* pool, const uint8_t* left, int64_t left_offset,
+                const uint8_t* right, int64_t right_offset, int64_t length,
+                int64_t out_offset, std::shared_ptr<Buffer>* out_buffer);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out
+/// starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out_buffer starting at the given bit offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Status BitmapXor(MemoryPool* pool, const uint8_t* left, int64_t left_offset,
+                 const uint8_t* right, int64_t right_offset, int64_t length,
+                 int64_t out_offset, std::shared_ptr<Buffer>* out_buffer);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out starting at the given bit offset.
+ARROW_EXPORT
+void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Store a stack of bitsets efficiently. The top bitset may be
+/// accessed and its bits may be modified, but it may not be resized.
+class BitsetStack {
+  // All bitsets share one std::vector<bool>; 'offsets_' records where each
+  // pushed bitset begins, so the top bitset is the tail of 'bits_' starting
+  // at offsets_.back().
+
+ public:
+  using reference = typename std::vector<bool>::reference;
+
+  /// \brief push a bitset onto the stack
+  /// \param size number of bits in the next bitset
+  /// \param value initial value for bits in the pushed bitset
+  void Push(int size, bool value) {
+    offsets_.push_back(bit_count());
+    bits_.resize(bit_count() + size, value);
+  }
+
+  /// \brief number of bits in the bitset at the top of the stack
+  ///
+  /// Returns 0 when the stack is empty.
+  int TopSize() const {
+    if (offsets_.size() == 0) return 0;
+    return bit_count() - offsets_.back();
+  }
+
+  /// \brief pop a bitset off the stack
+  ///
+  /// The stack must not be empty: offsets_.back() is read unconditionally.
+  void Pop() {
+    bits_.resize(offsets_.back());
+    offsets_.pop_back();
+  }
+
+  /// \brief get the value of a bit in the top bitset
+  /// \param i index of the bit to access
+  bool operator[](int i) const { return bits_[offsets_.back() + i]; }
+
+  /// \brief get a mutable reference to a bit in the top bitset
+  /// \param i index of the bit to access
+  reference operator[](int i) { return bits_[offsets_.back() + i]; }
+
+ private:
+  // Total number of bits stored across all bitsets on the stack.
+  int bit_count() const { return static_cast<int>(bits_.size()); }
+  std::vector<bool> bits_;
+  std::vector<int> offsets_;
+};
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_BIT_UTIL_H
diff --git a/r/R/inst/include/arrow/util/bpacking.h b/r/R/inst/include/arrow/util/bpacking.h
new file mode 100644
index 00000000000..14258cff6e4
--- /dev/null
+++ b/r/R/inst/include/arrow/util/bpacking.h
@@ -0,0 +1,3308 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file was modified from its original version for inclusion in parquet-cpp. +// Original source: +// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp +// The original copyright notice follows. + +// This code is released under the +// Apache License Version 2.0 http://www.apache.org/licenses/. +// (c) Daniel Lemire 2013 + +#ifndef ARROW_UTIL_BPACKING_H +#define ARROW_UTIL_BPACKING_H + +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; + out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + *out = ((*in) >> 8) & 1; + out++; + *out = ((*in) >> 9) & 1; + out++; + *out = ((*in) >> 10) & 1; + out++; + *out = ((*in) >> 11) & 1; + out++; + *out = ((*in) >> 12) & 1; + out++; + *out = ((*in) >> 13) & 1; + out++; + *out = ((*in) >> 14) & 1; + out++; + *out = ((*in) >> 15) & 1; + out++; + *out = ((*in) >> 16) & 1; + out++; + *out = ((*in) >> 17) & 1; + out++; + *out = ((*in) >> 18) & 1; + out++; + *out = ((*in) >> 19) & 1; + out++; + *out = ((*in) >> 20) & 1; + out++; + *out = ((*in) >> 21) & 1; + out++; + *out = ((*in) >> 22) & 1; + out++; + *out = ((*in) >> 23) & 1; + out++; + *out = ((*in) >> 24) & 1; + out++; + *out = ((*in) >> 25) & 1; + out++; + *out = ((*in) >> 
26) & 1; + out++; + *out = ((*in) >> 27) & 1; + out++; + *out = ((*in) >> 28) & 1; + out++; + *out = ((*in) >> 29) & 1; + out++; + *out = ((*in) >> 30) & 1; + out++; + *out = ((*in) >> 31); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = 
((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + *out = ((*in) >> 24) % (1U << 3); + out++; + *out = ((*in) >> 27) % (1U << 3); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + out++; + *out = ((*in) >> 1) % (1U << 3); + out++; + *out = ((*in) >> 4) % (1U << 3); + out++; + *out = ((*in) >> 7) % (1U << 3); + out++; + *out = ((*in) >> 10) % (1U << 3); + out++; + *out = ((*in) >> 13) % (1U << 3); + out++; + *out = ((*in) >> 16) % (1U << 3); + out++; + *out = ((*in) >> 19) % (1U << 3); + out++; + *out = ((*in) >> 22) % (1U << 3); + out++; + *out = ((*in) >> 25) % (1U << 3); + out++; + *out = ((*in) >> 28) % (1U << 3); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 2)) << (3 - 2); + out++; + *out = ((*in) >> 2) % (1U << 3); + out++; + *out = ((*in) >> 5) % (1U << 3); + out++; + *out = ((*in) >> 8) % (1U << 3); + out++; + *out = ((*in) >> 11) % (1U << 3); + out++; + *out = ((*in) >> 14) % (1U << 3); + out++; + *out = ((*in) >> 17) % (1U << 3); + out++; + *out = ((*in) >> 20) % (1U << 3); + out++; + *out = ((*in) >> 23) % (1U << 3); + out++; + *out = ((*in) >> 26) % (1U << 3); + out++; + *out = ((*in) >> 29); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) 
>> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = ((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + *out = ((*in) >> 8) % (1U << 5); + out++; + *out = ((*in) >> 13) % (1U << 5); + out++; + *out = ((*in) >> 18) % (1U << 5); + out++; + *out = ((*in) >> 23) % (1U << 5); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + out++; + *out = ((*in) >> 1) % (1U << 5); + out++; + *out = ((*in) >> 6) % (1U << 5); + out++; + *out = ((*in) >> 11) % (1U << 5); + out++; + *out = ((*in) >> 16) % (1U << 5); + out++; + *out = ((*in) >> 21) % (1U << 5); + out++; + *out = ((*in) >> 26) % (1U << 5); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 4)) << (5 - 4); + out++; + *out = ((*in) >> 4) % (1U << 5); + out++; + *out = ((*in) >> 9) % (1U << 5); 
+ out++; + *out = ((*in) >> 14) % (1U << 5); + out++; + *out = ((*in) >> 19) % (1U << 5); + out++; + *out = ((*in) >> 24) % (1U << 5); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 2)) << (5 - 2); + out++; + *out = ((*in) >> 2) % (1U << 5); + out++; + *out = ((*in) >> 7) % (1U << 5); + out++; + *out = ((*in) >> 12) % (1U << 5); + out++; + *out = ((*in) >> 17) % (1U << 5); + out++; + *out = ((*in) >> 22) % (1U << 5); + out++; + *out = ((*in) >> 27); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) 
>> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + *out = ((*in) >> 24) % (1U << 7); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + out++; + *out = ((*in) >> 6) % (1U << 7); + out++; + *out = ((*in) >> 13) % (1U << 7); + out++; + *out = ((*in) >> 20) % (1U << 7); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + out++; + *out = ((*in) >> 2) % (1U << 7); + out++; + *out = ((*in) >> 9) % (1U << 7); + out++; + *out = ((*in) >> 16) % (1U << 7); + out++; + *out = ((*in) >> 23) % (1U << 7); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 5)) << (7 - 5); + out++; + *out = ((*in) >> 5) % (1U << 7); + out++; + *out = ((*in) >> 12) % (1U << 7); + out++; + *out = ((*in) >> 19) % (1U << 7); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 1)) << (7 - 1); + out++; + *out = ((*in) >> 1) % (1U << 7); + out++; + *out = ((*in) >> 8) % (1U << 7); + out++; + *out = ((*in) >> 15) % (1U << 7); + out++; + *out = ((*in) >> 22) % (1U << 7); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 4)) << (7 - 4); + out++; + *out = ((*in) >> 4) % (1U << 7); + out++; + *out = ((*in) >> 11) % (1U << 7); + out++; + *out = ((*in) >> 18) % (1U << 7); + out++; + *out = ((*in) >> 25); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { + 
*out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + *out = ((*in) >> 8) % (1U << 9); + out++; + *out = ((*in) >> 17) % (1U << 9); + 
out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + out++; + *out = ((*in) >> 3) % (1U << 9); + out++; + *out = ((*in) >> 12) % (1U << 9); + out++; + *out = ((*in) >> 21) % (1U << 9); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + out++; + *out = ((*in) >> 7) % (1U << 9); + out++; + *out = ((*in) >> 16) % (1U << 9); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 2)) << (9 - 2); + out++; + *out = ((*in) >> 2) % (1U << 9); + out++; + *out = ((*in) >> 11) % (1U << 9); + out++; + *out = ((*in) >> 20) % (1U << 9); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 6)) << (9 - 6); + out++; + *out = ((*in) >> 6) % (1U << 9); + out++; + *out = ((*in) >> 15) % (1U << 9); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 1)) << (9 - 1); + out++; + *out = ((*in) >> 1) % (1U << 9); + out++; + *out = ((*in) >> 10) % (1U << 9); + out++; + *out = ((*in) >> 19) % (1U << 9); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 5)) << (9 - 5); + out++; + *out = ((*in) >> 5) % (1U << 9); + out++; + *out = ((*in) >> 14) % (1U << 9); + out++; + *out = ((*in) >> 23); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + 
++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + out++; + *out = ((*in) >> 3) % (1U << 11); + out++; + *out = ((*in) >> 14) % (1U << 11); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + out++; + *out = ((*in) >> 4) % (1U << 11); + out++; + *out = ((*in) >> 15) % (1U << 11); + out++; + *out = ((*in) >> 26); + ++in; + *out |= 
((*in) % (1U << 5)) << (11 - 5); + out++; + *out = ((*in) >> 5) % (1U << 11); + out++; + *out = ((*in) >> 16) % (1U << 11); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 6)) << (11 - 6); + out++; + *out = ((*in) >> 6) % (1U << 11); + out++; + *out = ((*in) >> 17) % (1U << 11); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 7)) << (11 - 7); + out++; + *out = ((*in) >> 7) % (1U << 11); + out++; + *out = ((*in) >> 18) % (1U << 11); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 8)) << (11 - 8); + out++; + *out = ((*in) >> 8) % (1U << 11); + out++; + *out = ((*in) >> 19) % (1U << 11); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 9)) << (11 - 9); + out++; + *out = ((*in) >> 9) % (1U << 11); + out++; + *out = ((*in) >> 20) % (1U << 11); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 10)) << (11 - 10); + out++; + *out = ((*in) >> 10) % (1U << 11); + out++; + *out = ((*in) >> 21); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U 
<< 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + *out = ((*in) >> 8) % (1U << 13); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + out++; + *out = ((*in) >> 2) % (1U << 13); + out++; + *out = ((*in) >> 15) % (1U << 13); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + out++; + *out = ((*in) >> 9) % (1U << 13); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) << (13 - 3); + out++; + *out = ((*in) >> 3) % (1U << 13); + out++; + *out = ((*in) >> 16) % (1U << 13); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 
10)) << (13 - 10); + out++; + *out = ((*in) >> 10) % (1U << 13); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 4)) << (13 - 4); + out++; + *out = ((*in) >> 4) % (1U << 13); + out++; + *out = ((*in) >> 17) % (1U << 13); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 11)) << (13 - 11); + out++; + *out = ((*in) >> 11) % (1U << 13); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 5)) << (13 - 5); + out++; + *out = ((*in) >> 5) % (1U << 13); + out++; + *out = ((*in) >> 18) % (1U << 13); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 12)) << (13 - 12); + out++; + *out = ((*in) >> 12) % (1U << 13); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 6)) << (13 - 6); + out++; + *out = ((*in) >> 6) % (1U << 13); + out++; + *out = ((*in) >> 19); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; 
+ *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 15); + out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + out++; + *out = ((*in) >> 7) % (1U << 15); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + out++; + *out = ((*in) >> 5) % (1U << 15); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + out++; + *out = ((*in) >> 3) % (1U << 15); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + out++; + *out = ((*in) >> 1) % (1U << 15); + out++; + *out = ((*in) >> 16) % (1U << 15); + 
out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 14)) << (15 - 14); + out++; + *out = ((*in) >> 14) % (1U << 15); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 12)) << (15 - 12); + out++; + *out = ((*in) >> 12) % (1U << 15); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 10)) << (15 - 10); + out++; + *out = ((*in) >> 10) % (1U << 15); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 8)) << (15 - 8); + out++; + *out = ((*in) >> 8) % (1U << 15); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 6)) << (15 - 6); + out++; + *out = ((*in) >> 6) % (1U << 15); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 4)) << (15 - 4); + out++; + *out = ((*in) >> 4) % (1U << 15); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 2)) << (15 - 2); + out++; + *out = ((*in) >> 2) % (1U << 15); + out++; + *out = ((*in) >> 17); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; 
+ *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + *out = ((*in) >> 8) % (1U << 17); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + out++; + *out = ((*in) >> 10) % (1U << 17); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + out++; + *out = ((*in) >> 12) % (1U << 17); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + out++; + *out = ((*in) >> 14) % (1U << 17); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 1)) << (17 - 1); + out++; + *out = ((*in) >> 1) % (1U << 17); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 3)) << (17 - 3); + out++; + *out = ((*in) >> 3) % (1U << 17); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 5)) << (17 - 5); + out++; + *out = ((*in) >> 5) % (1U << 17); + out++; + *out = ((*in) >> 22); + ++in; + *out |= 
((*in) % (1U << 7)) << (17 - 7); + out++; + *out = ((*in) >> 7) % (1U << 17); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 9)) << (17 - 9); + out++; + *out = ((*in) >> 9) % (1U << 17); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 11)) << (17 - 11); + out++; + *out = ((*in) >> 11) % (1U << 17); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 13)) << (17 - 13); + out++; + *out = ((*in) >> 13) % (1U << 17); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 15)) << (17 - 15); + out++; + *out = ((*in) >> 15); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + 
*out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = ((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + out++; + *out = ((*in) >> 11) % (1U << 19); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + out++; + *out = ((*in) >> 4) % (1U << 19); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + out++; + *out = ((*in) >> 10) % (1U << 19); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= 
((*in) % (1U << 3)) << (19 - 3); + out++; + *out = ((*in) >> 3) % (1U << 19); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 9)) << (19 - 9); + out++; + *out = ((*in) >> 9) % (1U << 19); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 15)) << (19 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 2)) << (19 - 2); + out++; + *out = ((*in) >> 2) % (1U << 19); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 8)) << (19 - 8); + out++; + *out = ((*in) >> 8) % (1U << 19); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 14)) << (19 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 1)) << (19 - 1); + out++; + *out = ((*in) >> 1) % (1U << 19); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 7)) << (19 - 7); + out++; + *out = ((*in) >> 7) % (1U << 19); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 13)) << (19 - 13); + out++; + *out = ((*in) >> 13); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) 
>> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + out++; + *out = ((*in) >> 8) % (1U << 21); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + out++; + *out = ((*in) >> 18); + 
++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + out++; + *out = ((*in) >> 7) % (1U << 21); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + out++; + *out = ((*in) >> 6) % (1U << 21); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 5)) << (21 - 5); + out++; + *out = ((*in) >> 5) % (1U << 21); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 15)) << (21 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 4)) << (21 - 4); + out++; + *out = ((*in) >> 4) % (1U << 21); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 14)) << (21 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 3)) << (21 - 3); + out++; + *out = ((*in) >> 3) % (1U << 21); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 13)) << (21 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 2)) << (21 - 2); + out++; + *out = ((*in) >> 2) % (1U << 21); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 12)) << (21 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 1)) << (21 - 1); + out++; + *out = ((*in) >> 1) % (1U << 21); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 11)) << (21 - 11); + out++; + *out = ((*in) >> 11); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out 
|= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = 
((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + out++; + *out = ((*in) >> 6) % (1U << 23); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + out++; + *out = ((*in) >> 2) % (1U << 23); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 7)) << (23 - 7); + out++; + *out = ((*in) >> 7) % (1U << 23); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 21)) << (23 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 12)) << (23 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 3)) << (23 - 3); + out++; + *out = ((*in) >> 3) % (1U << 23); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 17)) << (23 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 8)) << (23 - 8); + out++; + *out = ((*in) >> 8) % (1U << 23); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 22)) << (23 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 13)) << (23 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 4)) << (23 - 
4); + out++; + *out = ((*in) >> 4) % (1U << 23); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 18)) << (23 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 9)) << (23 - 9); + out++; + *out = ((*in) >> 9); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) 
<< (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + out++; + *out = ((*in) >> 1) % (1U << 25); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + out++; + *out = ((*in) >> 5) % (1U << 25); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 9)) << (25 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 2)) << (25 - 2); + out++; + *out = ((*in) >> 2) % (1U << 25); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 20)) << (25 - 20); + out++; + 
*out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 13)) << (25 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 6)) << (25 - 6); + out++; + *out = ((*in) >> 6) % (1U << 25); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 24)) << (25 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 17)) << (25 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 10)) << (25 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 3)) << (25 - 3); + out++; + *out = ((*in) >> 3) % (1U << 25); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 21)) << (25 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 14)) << (25 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 7)) << (25 - 7); + out++; + *out = ((*in) >> 7); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 
18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + 
*out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + out++; + *out = ((*in) >> 4) % (1U << 27); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 11)) << (27 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 6)) << (27 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 1)) << (27 - 1); + out++; + *out = ((*in) >> 1) % (1U << 27); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 23)) << (27 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 18)) << (27 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 13)) << (27 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 8)) << (27 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 3)) << (27 - 3); + out++; + *out = ((*in) >> 3) % (1U << 27); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 25)) << (27 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 20)) << (27 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 15)) << (27 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 10)) << (27 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 5)) << (27 - 5); + out++; + *out = ((*in) >> 5); + ++in; + out++; + + return in; +} + +inline const 
uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 
24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + out++; + *out = ((*in) >> 2) % (1U << 29); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 13)) << (29 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 10)) << (29 - 10); + out++; + 
*out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 7)) << (29 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 4)) << (29 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 1)) << (29 - 1); + out++; + *out = ((*in) >> 1) % (1U << 29); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 27)) << (29 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 24)) << (29 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 21)) << (29 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 18)) << (29 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 15)) << (29 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 12)) << (29 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 9)) << (29 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 6)) << (29 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 3)) << (29 - 3); + out++; + *out = ((*in) >> 3); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 
12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); 
+ ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 23)) << (31 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 15)) << (31 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 14)) << (31 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 13)) << (31 - 13); + out++; + *out = ((*in) >> 13); + ++in; + *out |= ((*in) % (1U << 12)) << (31 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 11)) << (31 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 10)) << (31 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 9)) << (31 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 8)) << (31 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 7)) << (31 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U 
<< 6)) << (31 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 5)) << (31 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 4)) << (31 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 3)) << (31 - 3); + out++; + *out = ((*in) >> 3); + ++in; + *out |= ((*in) % (1U << 2)) << (31 - 2); + out++; + *out = ((*in) >> 2); + ++in; + *out |= ((*in) % (1U << 1)) << (31 - 1); + out++; + *out = ((*in) >> 1); + ++in; + out++; + + return in; +} + +inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + + return in; +} + +inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { + for (int k = 0; k < 32; ++k) { + out[k] = 0; 
+ } + return in; +} + +inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { + batch_size = batch_size / 32 * 32; + int num_loops = batch_size / 32; + + switch (num_bits) { + case 0: + for (int i = 0; i < num_loops; ++i) in = nullunpacker32(in, out + i * 32); + break; + case 1: + for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32); + break; + case 2: + for (int i = 0; i < num_loops; ++i) in = unpack2_32(in, out + i * 32); + break; + case 3: + for (int i = 0; i < num_loops; ++i) in = unpack3_32(in, out + i * 32); + break; + case 4: + for (int i = 0; i < num_loops; ++i) in = unpack4_32(in, out + i * 32); + break; + case 5: + for (int i = 0; i < num_loops; ++i) in = unpack5_32(in, out + i * 32); + break; + case 6: + for (int i = 0; i < num_loops; ++i) in = unpack6_32(in, out + i * 32); + break; + case 7: + for (int i = 0; i < num_loops; ++i) in = unpack7_32(in, out + i * 32); + break; + case 8: + for (int i = 0; i < num_loops; ++i) in = unpack8_32(in, out + i * 32); + break; + case 9: + for (int i = 0; i < num_loops; ++i) in = unpack9_32(in, out + i * 32); + break; + case 10: + for (int i = 0; i < num_loops; ++i) in = unpack10_32(in, out + i * 32); + break; + case 11: + for (int i = 0; i < num_loops; ++i) in = unpack11_32(in, out + i * 32); + break; + case 12: + for (int i = 0; i < num_loops; ++i) in = unpack12_32(in, out + i * 32); + break; + case 13: + for (int i = 0; i < num_loops; ++i) in = unpack13_32(in, out + i * 32); + break; + case 14: + for (int i = 0; i < num_loops; ++i) in = unpack14_32(in, out + i * 32); + break; + case 15: + for (int i = 0; i < num_loops; ++i) in = unpack15_32(in, out + i * 32); + break; + case 16: + for (int i = 0; i < num_loops; ++i) in = unpack16_32(in, out + i * 32); + break; + case 17: + for (int i = 0; i < num_loops; ++i) in = unpack17_32(in, out + i * 32); + break; + case 18: + for (int i = 0; i < num_loops; ++i) in = unpack18_32(in, out + i * 32); + break; + case 19: + for (int i 
= 0; i < num_loops; ++i) in = unpack19_32(in, out + i * 32); + break; + case 20: + for (int i = 0; i < num_loops; ++i) in = unpack20_32(in, out + i * 32); + break; + case 21: + for (int i = 0; i < num_loops; ++i) in = unpack21_32(in, out + i * 32); + break; + case 22: + for (int i = 0; i < num_loops; ++i) in = unpack22_32(in, out + i * 32); + break; + case 23: + for (int i = 0; i < num_loops; ++i) in = unpack23_32(in, out + i * 32); + break; + case 24: + for (int i = 0; i < num_loops; ++i) in = unpack24_32(in, out + i * 32); + break; + case 25: + for (int i = 0; i < num_loops; ++i) in = unpack25_32(in, out + i * 32); + break; + case 26: + for (int i = 0; i < num_loops; ++i) in = unpack26_32(in, out + i * 32); + break; + case 27: + for (int i = 0; i < num_loops; ++i) in = unpack27_32(in, out + i * 32); + break; + case 28: + for (int i = 0; i < num_loops; ++i) in = unpack28_32(in, out + i * 32); + break; + case 29: + for (int i = 0; i < num_loops; ++i) in = unpack29_32(in, out + i * 32); + break; + case 30: + for (int i = 0; i < num_loops; ++i) in = unpack30_32(in, out + i * 32); + break; + case 31: + for (int i = 0; i < num_loops; ++i) in = unpack31_32(in, out + i * 32); + break; + case 32: + for (int i = 0; i < num_loops; ++i) in = unpack32_32(in, out + i * 32); + break; + default: + DCHECK(false) << "Unsupported num_bits"; + } + + return batch_size; +} + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_BPACKING_H diff --git a/r/R/inst/include/arrow/util/checked_cast.h b/r/R/inst/include/arrow/util/checked_cast.h new file mode 100644 index 00000000000..718f1057343 --- /dev/null +++ b/r/R/inst/include/arrow/util/checked_cast.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_CAST_H +#define ARROW_CAST_H + +#include +#include + +namespace arrow { +namespace internal { + +template +inline OutputType checked_cast(InputType&& value) { + static_assert(std::is_class::type>::type>::value, + "checked_cast input type must be a class"); + static_assert(std::is_class::type>::type>::value, + "checked_cast output type must be a class"); +#ifdef NDEBUG + return static_cast(value); +#else + return dynamic_cast(value); +#endif +} + +template +std::shared_ptr checked_pointer_cast(const std::shared_ptr& r) noexcept { +#ifndef NDEBUG + return std::static_pointer_cast(r); +#else + return std::dynamic_pointer_cast(r); +#endif +} + +} // namespace internal +} // namespace arrow + +#endif // ARROW_CAST_H diff --git a/r/R/inst/include/arrow/util/compiler-util.h b/r/R/inst/include/arrow/util/compiler-util.h new file mode 100644 index 00000000000..820a9b0c11b --- /dev/null +++ b/r/R/inst/include/arrow/util/compiler-util.h @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Deprecated header, here for backwards compatibility in parquet-cpp + +#ifndef ARROW_UTIL_COMPILER_UTIL_H +#define ARROW_UTIL_COMPILER_UTIL_H + +#include "arrow/util/macros.h" + +#endif // ARROW_UTIL_COMPILER_UTIL_H diff --git a/r/R/inst/include/arrow/util/compression.h b/r/R/inst/include/arrow/util/compression.h new file mode 100644 index 00000000000..43174f4dba4 --- /dev/null +++ b/r/R/inst/include/arrow/util/compression.h @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_COMPRESSION_H +#define ARROW_UTIL_COMPRESSION_H + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Status; + +struct Compression { + enum type { UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO, BZ2 }; +}; + +namespace util { + +/// \brief Streaming compressor interface +/// +class ARROW_EXPORT Compressor { + public: + virtual ~Compressor(); + + /// \brief Compress some input. + /// + /// If bytes_read is 0 on return, then a larger output buffer should be supplied. + virtual Status Compress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output, int64_t* bytes_read, + int64_t* bytes_written) = 0; + + /// \brief Flush part of the compressed output. + /// + /// If should_retry is true on return, Flush() should be called again + /// with a larger buffer. + virtual Status Flush(int64_t output_len, uint8_t* output, int64_t* bytes_written, + bool* should_retry) = 0; + + /// \brief End compressing, doing whatever is necessary to end the stream. + /// + /// If should_retry is true on return, End() should be called again + /// with a larger buffer. Otherwise, the Compressor should not be used anymore. + /// + /// End() implies Flush(). + virtual Status End(int64_t output_len, uint8_t* output, int64_t* bytes_written, + bool* should_retry) = 0; + + // XXX add methods for buffer size heuristics? +}; + +/// \brief Streaming decompressor interface +/// +class ARROW_EXPORT Decompressor { + public: + virtual ~Decompressor(); + + /// \brief Decompress some input. + /// + /// If need_more_output is true on return, a larger output buffer needs + /// to be supplied. + /// XXX is need_more_output necessary? (Brotli?) + virtual Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output, int64_t* bytes_read, int64_t* bytes_written, + bool* need_more_output) = 0; + + /// \brief Return whether the compressed stream is finished. + /// + /// This is a heuristic. 
If true is returned, then it is guaranteed + /// that the stream is finished. If false is returned, however, it may + /// simply be that the underlying library isn't able to provide the information. + virtual bool IsFinished() = 0; + + // XXX add methods for buffer size heuristics? +}; + +class ARROW_EXPORT Codec { + public: + virtual ~Codec(); + + static Status Create(Compression::type codec, std::unique_ptr* out); + + /// \brief One-shot decompression function + /// + /// output_buffer_len must be correct and therefore be obtained in advance. + /// + /// \note One-shot decompression is not always compatible with streaming + /// compression. Depending on the codec (e.g. LZ4), different formats may + /// be used. + virtual Status Decompress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer) = 0; + + /// \brief One-shot decompression function that also returns the + /// actual decompressed size. + /// + /// \param[in] input_len the number of bytes of compressed data. + /// \param[in] input the compressed data. + /// \param[in] output_buffer_len the number of bytes of buffer for + /// decompressed data. + /// \param[in] output_buffer the buffer for decompressed data. + /// \param[out] output_len the actual decompressed size. + /// + /// \note One-shot decompression is not always compatible with streaming + /// compression. Depending on the codec (e.g. LZ4), different formats may + /// be used. + virtual Status Decompress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_len) = 0; + + /// \brief One-shot compression function + /// + /// output_buffer_len must first have been computed using MaxCompressedLen(). + /// + /// \note One-shot compression is not always compatible with streaming + /// decompression. Depending on the codec (e.g. LZ4), different formats may + /// be used. 
+ virtual Status Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer, + int64_t* output_len) = 0; + + virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0; + + // XXX Should be able to choose compression level, or presets? ("fast", etc.) + + /// \brief Create a streaming compressor instance + virtual Status MakeCompressor(std::shared_ptr* out) = 0; + + /// \brief Create a streaming decompressor instance + virtual Status MakeDecompressor(std::shared_ptr* out) = 0; + + virtual const char* name() const = 0; +}; + +} // namespace util +} // namespace arrow + +#endif diff --git a/r/R/inst/include/arrow/util/compression_brotli.h b/r/R/inst/include/arrow/util/compression_brotli.h new file mode 100644 index 00000000000..59f97cda6b9 --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_brotli.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_COMPRESSION_BROTLI_H +#define ARROW_UTIL_COMPRESSION_BROTLI_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +// Brotli codec. 
+class ARROW_EXPORT BrotliCodec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override { return "brotli"; } +}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_BROTLI_H diff --git a/r/R/inst/include/arrow/util/compression_bz2.h b/r/R/inst/include/arrow/util/compression_bz2.h new file mode 100644 index 00000000000..21461588255 --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_bz2.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_COMPRESSION_BZ2_H +#define ARROW_UTIL_COMPRESSION_BZ2_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +// BZ2 codec. +class ARROW_EXPORT BZ2Codec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override { return "bz2"; } +}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_BZ2_H diff --git a/r/R/inst/include/arrow/util/compression_lz4.h b/r/R/inst/include/arrow/util/compression_lz4.h new file mode 100644 index 00000000000..4d06f03c2c4 --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_lz4.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_COMPRESSION_LZ4_H +#define ARROW_UTIL_COMPRESSION_LZ4_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +// Lz4 codec. +class ARROW_EXPORT Lz4Codec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override { return "lz4"; } +}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_LZ4_H diff --git a/r/R/inst/include/arrow/util/compression_snappy.h b/r/R/inst/include/arrow/util/compression_snappy.h new file mode 100644 index 00000000000..7029400ab2e --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_snappy.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_COMPRESSION_SNAPPY_H +#define ARROW_UTIL_COMPRESSION_SNAPPY_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +class ARROW_EXPORT SnappyCodec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override { return "snappy"; } +}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_SNAPPY_H diff --git a/r/R/inst/include/arrow/util/compression_zlib.h b/r/R/inst/include/arrow/util/compression_zlib.h new file mode 100644 index 00000000000..9a5feaa290c --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_zlib.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_COMPRESSION_ZLIB_H +#define ARROW_UTIL_COMPRESSION_ZLIB_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +// GZip codec. +class ARROW_EXPORT GZipCodec : public Codec { + public: + /// Compression formats supported by the zlib library + enum Format { + ZLIB, + DEFLATE, + GZIP, + }; + + explicit GZipCodec(Format format = GZIP); + ~GZipCodec() override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override; + + private: + // The gzip compressor is stateful + class GZipCodecImpl; + std::unique_ptr impl_; +}; + +} // namespace util +} 
// namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_ZLIB_H diff --git a/r/R/inst/include/arrow/util/compression_zstd.h b/r/R/inst/include/arrow/util/compression_zstd.h new file mode 100644 index 00000000000..8b05d8c80a9 --- /dev/null +++ b/r/R/inst/include/arrow/util/compression_zstd.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_COMPRESSION_ZSTD_H +#define ARROW_UTIL_COMPRESSION_ZSTD_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/compression.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +// ZSTD codec. 
+class ARROW_EXPORT ZSTDCodec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer) override; + + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_len) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + Status MakeCompressor(std::shared_ptr* out) override; + + Status MakeDecompressor(std::shared_ptr* out) override; + + const char* name() const override { return "zstd"; } +}; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_COMPRESSION_ZSTD_H diff --git a/r/R/inst/include/arrow/util/cpu-info.h b/r/R/inst/include/arrow/util/cpu-info.h new file mode 100644 index 00000000000..714d7ac5bc5 --- /dev/null +++ b/r/R/inst/include/arrow/util/cpu-info.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala (incubating) as of 2016-01-29. 
Pared down to a minimal +// set of functions needed for Apache Arrow / Apache parquet-cpp + +#ifndef ARROW_UTIL_CPU_INFO_H +#define ARROW_UTIL_CPU_INFO_H + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// CpuInfo is an interface to query for cpu information at runtime. The caller can +/// ask for the sizes of the caches and what hardware features are supported. +/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and +/// /sys/devices) +class ARROW_EXPORT CpuInfo { + public: + static constexpr int64_t SSSE3 = (1 << 1); + static constexpr int64_t SSE4_1 = (1 << 2); + static constexpr int64_t SSE4_2 = (1 << 3); + static constexpr int64_t POPCNT = (1 << 4); + + /// Cache enums for L1 (data), L2 and L3 + enum CacheLevel { + L1_CACHE = 0, + L2_CACHE = 1, + L3_CACHE = 2, + }; + + static CpuInfo* GetInstance(); + + /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error + /// and terminate. + void VerifyCpuRequirements(); + + /// Returns all the flags for this cpu + int64_t hardware_flags(); + + /// Returns whether or not the cpu supports this flag + bool IsSupported(int64_t flag) const { return (hardware_flags_ & flag) != 0; } + + /// \brief The processor supports SSE4.2 and the Arrow libraries are built + /// with support for it + bool CanUseSSE4_2() const; + + /// Toggle a hardware feature on and off. It is not valid to turn on a feature + /// that the underlying hardware cannot support. This is useful for testing. + void EnableFeature(int64_t flag, bool enable); + + /// Returns the size of the cache in KB at this cache level + int64_t CacheSize(CacheLevel level); + + /// Returns the number of cpu cycles per millisecond + int64_t cycles_per_ms(); + + /// Returns the number of cores (including hyper-threaded) on this machine. + int num_cores(); + + /// Returns the model name of the cpu (e.g.
Intel i7-2600) + std::string model_name(); + + private: + CpuInfo(); + + void Init(); + + /// Inits CPU cache size variables with default values + void SetDefaultCacheSize(); + + int64_t hardware_flags_; + int64_t original_hardware_flags_; + int64_t cache_sizes_[L3_CACHE + 1]; + int64_t cycles_per_ms_; + int num_cores_; + std::string model_name_; +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_CPU_INFO_H diff --git a/r/R/inst/include/arrow/util/decimal.h b/r/R/inst/include/arrow/util/decimal.h new file mode 100644 index 00000000000..3a576d085aa --- /dev/null +++ b/r/R/inst/include/arrow/util/decimal.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/basic_decimal.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +/// Represents a signed 128-bit integer in two's complement. +/// Calculations wrap around and overflow is ignored. +/// +/// For a discussion of the algorithms, look at Knuth's volume 2, +/// Semi-numerical Algorithms section 4.3.1. 
+/// +/// Adapted from the Apache ORC C++ implementation +/// +/// The implementation is split into two parts : +/// +/// 1. BasicDecimal128 +/// - can be safely compiled to IR without references to libstdc++. +/// 2. Decimal128 +/// - has additional functionality on top of BasicDecimal128 to deal with +/// strings and streams. +class ARROW_EXPORT Decimal128 : public BasicDecimal128 { + public: + /// \cond FALSE + // (need to avoid a duplicate definition in Sphinx) + using BasicDecimal128::BasicDecimal128; + /// \endcond + + /// \brief constructor creates a Decimal128 from a BasicDecimal128. + constexpr Decimal128(const BasicDecimal128& value) noexcept : BasicDecimal128(value) {} + + /// \brief Parse the number from a base 10 string representation. + explicit Decimal128(const std::string& value); + + /// \brief Empty constructor creates a Decimal128 with a value of 0. + // This is required on some older compilers. + constexpr Decimal128() noexcept : BasicDecimal128() {} + + /// Divide this number by right and return the result. + /// + /// This operation is not destructive. + /// The answer rounds to zero. Signs work like: + /// 21 / 5 -> 4, 1 + /// -21 / 5 -> -4, -1 + /// 21 / -5 -> -4, 1 + /// -21 / -5 -> 4, -1 + /// \param[in] divisor the number to divide by + /// \param[out] result the quotient + /// \param[out] remainder the remainder after the division + Status Divide(const Decimal128& divisor, Decimal128* result, + Decimal128* remainder) const { + auto dstatus = BasicDecimal128::Divide(divisor, result, remainder); + return ToArrowStatus(dstatus); + } + + /// \brief Convert the Decimal128 value to a base 10 decimal string with the given + /// scale. + std::string ToString(int32_t scale) const; + + /// \brief Convert the value to an integer string + std::string ToIntegerString() const; + + /// \brief Cast this value to an int64_t. 
+ explicit operator int64_t() const; + + /// \brief Convert a decimal string to a Decimal128 value, optionally including + /// precision and scale if they're passed in and not null. + static Status FromString(const util::string_view& s, Decimal128* out, + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const std::string& s, Decimal128* out, + int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); + static Status FromString(const char* s, Decimal128* out, int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); + + /// \brief Convert from a big-endian byte representation. The length must be + /// between 1 and 16. + /// \return error status if the length is an invalid value + static Status FromBigEndian(const uint8_t* data, int32_t length, Decimal128* out); + + /// \brief Convert Decimal128 from one scale to another + Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const { + auto dstatus = BasicDecimal128::Rescale(original_scale, new_scale, out); + return ToArrowStatus(dstatus); + } + + /// \brief Convert to a signed integer + template > + Status ToInteger(T* out) const { + constexpr auto min_value = std::numeric_limits::min(); + constexpr auto max_value = std::numeric_limits::max(); + const auto& self = *this; + if (self < min_value || self > max_value) { + return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T), + " byte integer"); + } + *out = static_cast(low_bits()); + return Status::OK(); + } + + friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os, + const Decimal128& decimal); + + private: + /// Converts internal error code to Status + Status ToArrowStatus(DecimalStatus dstatus) const; +}; + +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/hash-util.h b/r/R/inst/include/arrow/util/hash-util.h new file mode 100644 index 00000000000..7aed3c171dc --- /dev/null +++ b/r/R/inst/include/arrow/util/hash-util.h @@ -0,0 +1,310 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala (incubating) as of 2016-02-22 + +#ifndef ARROW_UTIL_HASH_UTIL_H +#define ARROW_UTIL_HASH_UTIL_H + +#include +#include + +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/neon-util.h" +#include "arrow/util/sse-util.h" + +static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { + DCHECK(false) << "Hardware CRC support is not enabled"; + return 0; +} + +static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { + DCHECK(false) << "Hardware CRC support is not enabled"; + return 0; +} + +static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { + DCHECK(false) << "Hardware CRC support is not enabled"; + return 0; +} + +static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { + DCHECK(false) << "Hardware CRC support is not enabled"; + return 0; +} + +#ifdef ARROW_HAVE_SSE4_2 +#define HW_crc32_u8 SSE4_crc32_u8 +#define HW_crc32_u16 SSE4_crc32_u16 +#define HW_crc32_u32 SSE4_crc32_u32 +#define HW_crc32_u64 SSE4_crc32_u64 +#elif defined(ARROW_HAVE_ARM_CRC) +#define HW_crc32_u8 ARMCE_crc32_u8 +#define HW_crc32_u16 ARMCE_crc32_u16 +#define HW_crc32_u32 ARMCE_crc32_u32 +#define HW_crc32_u64 ARMCE_crc32_u64 +#endif + +namespace arrow { 
+ +/// Utility class to compute hash values. +class HashUtil { + public: +#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC) + static constexpr bool have_hardware_crc32 = true; +#else + static constexpr bool have_hardware_crc32 = false; +#endif + + /// Compute the Crc32 hash for data using SSE4/ArmCRC instructions. The input hash + /// parameter is the current hash/seed value. + /// This should only be called if SSE/ArmCRC is supported. + /// This is ~4x faster than Fnv/Boost Hash. + /// TODO: crc32 hashes with different seeds do not result in different hash functions. + /// The resulting hashes are correlated. + static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) { + const uint8_t* p = reinterpret_cast(data); + const uint8_t* end = p + nbytes; + +#if ARROW_BITNESS >= 64 + while (p <= end - 8) { + hash = HW_crc32_u64(hash, *reinterpret_cast(p)); + p += 8; + } +#endif + + while (p <= end - 4) { + hash = HW_crc32_u32(hash, *reinterpret_cast(p)); + p += 4; + } + while (p < end) { + hash = HW_crc32_u8(hash, *p); + ++p; + } + + // The lower half of the CRC hash has poor uniformity, so swap the halves + // for anyone who only uses the first several bits of the hash. + hash = (hash << 16) | (hash >> 16); + return hash; + } + + /// A variant of CRC32 hashing that computes two independent running CRCs + /// over interleaved halves of the input, giving out a 64-bit integer. + /// The result's quality should be improved by a finalization step. + /// + /// In addition to producing more bits of output, this should be twice + /// as fast as CrcHash on CPUs that can overlap several independent + /// CRC computations.
+ static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) { + const uint8_t* p = reinterpret_cast(data); + + uint32_t h1 = static_cast(hash >> 32); + uint32_t h2 = static_cast(hash); + +#if ARROW_BITNESS >= 64 + while (nbytes >= 16) { + h1 = HW_crc32_u64(h1, *reinterpret_cast(p)); + h2 = HW_crc32_u64(h2, *reinterpret_cast(p + 8)); + nbytes -= 16; + p += 16; + } + if (nbytes >= 8) { + h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); + h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); + nbytes -= 8; + p += 8; + } +#else + while (nbytes >= 8) { + h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); + h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); + nbytes -= 8; + p += 8; + } +#endif + + if (nbytes >= 4) { + h1 = HW_crc32_u16(h1, *reinterpret_cast(p)); + h2 = HW_crc32_u16(h2, *reinterpret_cast(p + 2)); + nbytes -= 4; + p += 4; + } + switch (nbytes) { + case 3: + h1 = HW_crc32_u8(h1, p[2]); + // fallthrough + case 2: + h2 = HW_crc32_u8(h2, p[1]); + // fallthrough + case 1: + h1 = HW_crc32_u8(h1, p[0]); + // fallthrough + case 0: + break; + default: + assert(0); + } + + // A finalization step is recommended to mix up the result's bits + return (static_cast(h1) << 32) + h2; + } + + static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995; + static const int MURMUR_R = 47; + + /// Murmur2 hash implementation returning 64-bit hashes. 
+ static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) { + uint64_t h = seed ^ (len * MURMUR_PRIME); + + const uint64_t* data = reinterpret_cast(input); + const uint64_t* end = data + (len / sizeof(uint64_t)); + + while (data != end) { + uint64_t k = *data++; + k *= MURMUR_PRIME; + k ^= k >> MURMUR_R; + k *= MURMUR_PRIME; + h ^= k; + h *= MURMUR_PRIME; + } + + const uint8_t* data2 = reinterpret_cast(data); + switch (len & 7) { + case 7: + h ^= uint64_t(data2[6]) << 48; + case 6: + h ^= uint64_t(data2[5]) << 40; + case 5: + h ^= uint64_t(data2[4]) << 32; + case 4: + h ^= uint64_t(data2[3]) << 24; + case 3: + h ^= uint64_t(data2[2]) << 16; + case 2: + h ^= uint64_t(data2[1]) << 8; + case 1: + h ^= uint64_t(data2[0]); + h *= MURMUR_PRIME; + } + + h ^= h >> MURMUR_R; + h *= MURMUR_PRIME; + h ^= h >> MURMUR_R; + return h; + } + + /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/ + static const uint32_t FNV_PRIME = 0x01000193; // 16777619 + static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 + static const uint64_t FNV64_PRIME = 1099511628211UL; + static const uint64_t FNV64_SEED = 14695981039346656037UL; + + /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant + /// as boost's hash on int types (2x slower) but has bit entropy. + /// For ints, boost just returns the value of the int which can be pathological. + /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 + /// is taken on the hash, all values will collide to the same bucket. + /// For string values, Fnv is slightly faster than boost. + /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit, + /// which can lead to poor results when input bytes are duplicated. + /// See FnvHash64to32() for how this can be mitigated. 
+ static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) { + const uint8_t* ptr = reinterpret_cast(data); + while (bytes--) { + hash = (*ptr ^ hash) * FNV64_PRIME; + ++ptr; + } + return hash; + } + + /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits. + /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the + /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs. + /// The input seed 'hash' is duplicated so the top half of the seed is not all zero. + /// Data length must be at least 1 byte: zero-length data should be handled separately, + /// for example using CombineHash with a unique constant value to avoid returning the + /// hash argument. Zero-length data gives terrible results: the initial hash value is + /// xored with itself cancelling all bits. + static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) { + // IMPALA-2270: this function should never be used for zero-byte inputs. + DCHECK_GT(bytes, 0); + uint64_t hash_u64 = hash | (static_cast(hash) << 32); + hash_u64 = FnvHash64(data, bytes, hash_u64); + return static_cast((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF)); + } + + // Hash template + template + static inline int Hash(const void* data, int32_t bytes, uint32_t seed); + + /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio). + static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9; + + /// Combine hashes 'value' and 'seed' to get a new hash value. Similar to + /// boost::hash_combine(), but for uint32_t. This function should be used with a + /// constant first argument to update the hash value for zero-length values such as + /// NULL, boolean, and empty strings. 
+ static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) { + return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2)); + } + + // Get 32 more bits of randomness from a 32-bit hash: + static inline uint32_t Rehash32to32(const uint32_t hash) { + // Constants generated by uuidgen(1) with the -r flag + static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull; + // This is strongly universal hashing following Dietzfelbinger's "Universal hashing + // and k-wise independent random variables via integer arithmetic without primes". As + // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the + // randomness of the constants) that any subset of bit positions of + // Rehash32to32(hash1) is equal to the same subset of bit positions + // Rehash32to32(hash2) is minimal. + return static_cast((static_cast(hash) * m + a) >> 32); + } + + static inline uint64_t Rehash32to64(const uint32_t hash) { + static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull, + a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull; + const uint64_t hash1 = (static_cast(hash) * m1 + a1) >> 32; + const uint64_t hash2 = (static_cast(hash) * m2 + a2) >> 32; + return hash1 | (hash2 << 32); + } +}; + +// HW Hash +template <> +inline int HashUtil::Hash(const void* data, int32_t bytes, uint32_t seed) { +#ifdef ARROW_HAVE_ARM_CRC + // Need run time check for Arm + // if not support, fall back to Murmur + if (!crc32c_runtime_check()) + return static_cast(HashUtil::MurmurHash2_64(data, bytes, seed)); + else +#endif + // Double CRC + return static_cast(HashUtil::DoubleCrcHash(data, bytes, seed)); +} + +// Murmur Hash +template <> +inline int HashUtil::Hash(const void* data, int32_t bytes, uint32_t seed) { + return static_cast(HashUtil::MurmurHash2_64(data, bytes, seed)); +} + +} // namespace arrow + +#endif // ARROW_UTIL_HASH_UTIL_H diff --git a/r/R/inst/include/arrow/util/hashing.h b/r/R/inst/include/arrow/util/hashing.h 
new file mode 100644 index 00000000000..27301585fc6 --- /dev/null +++ b/r/R/inst/include/arrow/util/hashing.h @@ -0,0 +1,807 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Private header, not to be exported + +#ifndef ARROW_UTIL_HASHING_H +#define ARROW_UTIL_HASHING_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/hash-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace internal { + +// XXX would it help to have a 32-bit hash value on large datasets? +typedef uint64_t hash_t; + +// Notes about the choice of a hash function. +// - xxHash64 is extremely fast on large enough data +// - for small- to medium-sized data, there are better choices +// (see comprehensive benchmarks results at +// https://aras-p.info/blog/2016/08/09/More-Hash-Function-Tests/) +// - for very small fixed-size data (<= 16 bytes, e.g. 
Decimal128), it is +// beneficial to define specialized hash functions +// - while xxHash and others have good statistical properties, we can relax those +// a bit if it helps performance (especially if the hash table implementation +// has a good collision resolution strategy) + +template +inline hash_t ComputeStringHash(const void* data, int64_t length); + +template +struct ScalarHelperBase { + static bool CompareScalars(Scalar u, Scalar v) { return u == v; } + + static hash_t ComputeHash(const Scalar& value) { + // Generic hash computation for scalars. Simply apply the string hash + // to the bit representation of the value. + + // XXX in the case of FP values, we'd like equal values to have the same hash, + // even if they have different bit representations... + return ComputeStringHash(&value, sizeof(value)); + } +}; + +template +struct ScalarHelper : public ScalarHelperBase {}; + +template +struct ScalarHelper::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for integers + + static hash_t ComputeHash(const Scalar& value) { + // Faster hash computation for integers. + + // Two of xxhash's prime multipliers (which are chosen for their + // bit dispersion properties) + static constexpr uint64_t multipliers[] = {11400714785074694791ULL, + 14029467366897019727ULL}; + + // Multiplying by the prime number mixes the low bits into the high bits, + // then byte-swapping (which is a single CPU instruction) allows the + // combined high and low bits to participate in the initial hash table index. 
+ auto h = static_cast(value); + return BitUtil::ByteSwap(multipliers[AlgNum] * h); + } +}; + +template +struct ScalarHelper< + Scalar, AlgNum, + typename std::enable_if::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for util::string_view + + static hash_t ComputeHash(const util::string_view& value) { + return ComputeStringHash(value.data(), static_cast(value.size())); + } +}; + +template +struct ScalarHelper::value>::type> + : public ScalarHelperBase { + // ScalarHelper specialization for reals + + static bool CompareScalars(Scalar u, Scalar v) { + if (std::isnan(u)) { + // XXX should we do a bit-precise comparison? + return std::isnan(v); + } + return u == v; + } +}; + +template +hash_t ComputeStringHash(const void* data, int64_t length) { + if (ARROW_PREDICT_TRUE(length <= 16)) { + // Specialize for small hash strings, as they are quite common as + // hash table keys. + auto p = reinterpret_cast(data); + auto n = static_cast(length); + if (n <= 8) { + if (n <= 3) { + if (n == 0) { + return 1U; + } + uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1]; + return ScalarHelper::ComputeHash(x); + } + // 4 <= length <= 8 + // We can read the string as two overlapping 32-bit ints, apply + // different hash functions to each of them in parallel, then XOR + // the results + uint32_t x, y; + hash_t hx, hy; + // XXX those are unaligned accesses. Should we have a facility for that? + x = *reinterpret_cast(p + n - 4); + y = *reinterpret_cast(p); + hx = ScalarHelper::ComputeHash(x); + hy = ScalarHelper::ComputeHash(y); + return n ^ hx ^ hy; + } + // 8 <= length <= 16 + // Apply the same principle as above + uint64_t x, y; + hash_t hx, hy; + x = *reinterpret_cast(p + n - 8); + y = *reinterpret_cast(p); + hx = ScalarHelper::ComputeHash(x); + hy = ScalarHelper::ComputeHash(y); + return n ^ hx ^ hy; + } + + if (HashUtil::have_hardware_crc32) { + // DoubleCrcHash is faster that Murmur2. 
+ auto h = HashUtil::DoubleCrcHash(data, static_cast(length), AlgNum); + return ScalarHelper::ComputeHash(h); + } else { + // Fall back on 64-bit Murmur2 for longer strings. + // It has decent speed for medium-sized strings. There may be faster + // hashes on long strings such as xxHash, but that may not matter much + // for the typical length distribution of hash keys. + return HashUtil::MurmurHash2_64(data, static_cast(length), AlgNum); + } +} + +// XXX add a HashEq struct with both hash and compare functions? + +// ---------------------------------------------------------------------- +// An open-addressing insert-only hash table (no deletes) + +template +class HashTable { + public: + struct Entry { + hash_t h; + Payload payload; + }; + + explicit HashTable(uint64_t capacity) { + // Presize for at least 8 elements + capacity = std::max(capacity, static_cast(8U)); + size_ = BitUtil::NextPower2(capacity * 4U); + size_mask_ = size_ - 1; + n_filled_ = 0; + // This will zero out hash entries, marking them empty + entries_.resize(size_); + } + + // Lookup with non-linear probing + // cmp_func should have signature bool(const Payload*). + // Return a (Entry*, found) pair. 
+ template + std::pair Lookup(hash_t h, CmpFunc&& cmp_func) { + auto p = Lookup(h, entries_.data(), size_mask_, + std::forward(cmp_func)); + return {&entries_[p.first], p.second}; + } + + template + std::pair Lookup(hash_t h, CmpFunc&& cmp_func) const { + auto p = Lookup(h, entries_.data(), size_mask_, + std::forward(cmp_func)); + return {&entries_[p.first], p.second}; + } + + void Insert(Entry* entry, hash_t h, const Payload& payload) { + assert(entry->h == 0); + entry->h = FixHash(h); + entry->payload = payload; + ++n_filled_; + if (NeedUpsizing()) { + // Resizing is expensive, avoid doing it too often + Upsize(size_ * 4); + } + } + + uint64_t size() const { return n_filled_; } + + // Visit all non-empty entries in the table + // The visit_func should have signature void(const Entry*) + template + void VisitEntries(VisitFunc&& visit_func) const { + for (const auto& entry : entries_) { + if (entry.h != 0U) { + visit_func(&entry); + } + } + } + + protected: + // NoCompare is for when the value is known not to exist in the table + enum CompareKind { DoCompare, NoCompare }; + + // The workhorse lookup function + template + std::pair Lookup(hash_t h, const Entry* entries, uint64_t size_mask, + CmpFunc&& cmp_func) const { + static constexpr uint8_t perturb_shift = 5; + + uint64_t index, perturb; + const Entry* entry; + + h = FixHash(h); + index = h & size_mask; + perturb = (h >> perturb_shift) + 1U; + + while (true) { + entry = &entries[index]; + if (CompareEntry(h, entry, std::forward(cmp_func))) { + // Found + return {index, true}; + } + if (entry->h == 0U) { + // Empty slot + return {index, false}; + } + + // Perturbation logic inspired from CPython's set / dict object. + // The goal is that all 64 bits of the unmasked hash value eventually + // participate in the probing sequence, to minimize clustering. 
+ index = (index + perturb) & size_mask; + perturb = (perturb >> perturb_shift) + 1U; + } + } + + template + bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const { + if (CKind == NoCompare) { + return false; + } else { + return entry->h == h && cmp_func(&entry->payload); + } + } + + bool NeedUpsizing() const { + // Keep the load factor <= 1/2 + return n_filled_ * 2U >= size_; + } + + void Upsize(uint64_t new_size) { + assert(new_size > size_); + uint64_t new_mask = new_size - 1; + assert((new_size & new_mask) == 0); // it's a power of two + + std::vector new_entries(new_size); + for (auto& entry : entries_) { + hash_t h = entry.h; + if (h != 0) { + // Dummy compare function (will not be called) + auto cmp_func = [](const Payload*) { return false; }; + // Non-empty slot, move into new + auto p = Lookup(h, new_entries.data(), new_mask, cmp_func); + assert(!p.second); // shouldn't have found a matching entry + Entry* new_entry = &new_entries[p.first]; + new_entry->h = h; + new_entry->payload = entry.payload; + } + } + std::swap(entries_, new_entries); + size_ = new_size; + size_mask_ = new_mask; + } + + hash_t FixHash(hash_t h) const { + // 0 is used to indicate empty entries + return (h == 0U) ? 42U : h; + } + + uint64_t size_; + uint64_t size_mask_; + uint64_t n_filled_; + std::vector entries_; +}; + +// XXX typedef memo_index_t int32_t ? + +// ---------------------------------------------------------------------- +// A base class for memoization table. + +class MemoTable { + public: + virtual ~MemoTable() = default; + + virtual int32_t size() const = 0; +}; + +// ---------------------------------------------------------------------- +// A memoization table for memory-cheap scalar values. + +// The memoization table remembers and allows to look up the insertion +// index for each key. 
+ +template class HashTableTemplateType = HashTable> +class ScalarMemoTable : public MemoTable { + public: + explicit ScalarMemoTable(int64_t entries = 0) + : hash_table_(static_cast(entries)) {} + + int32_t Get(const Scalar& value) const { + auto cmp_func = [value](const Payload* payload) -> bool { + return ScalarHelper::CompareScalars(payload->value, value); + }; + hash_t h = ComputeHash(value); + auto p = hash_table_.Lookup(h, cmp_func); + if (p.second) { + return p.first->payload.memo_index; + } else { + return -1; + } + } + + template + int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) { + auto cmp_func = [value](const Payload* payload) -> bool { + return ScalarHelper::CompareScalars(value, payload->value); + }; + hash_t h = ComputeHash(value); + auto p = hash_table_.Lookup(h, cmp_func); + int32_t memo_index; + if (p.second) { + memo_index = p.first->payload.memo_index; + on_found(memo_index); + } else { + memo_index = size(); + hash_table_.Insert(p.first, h, {value, memo_index}); + on_not_found(memo_index); + } + return memo_index; + } + + int32_t GetOrInsert(const Scalar& value) { + return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); + } + + // The number of entries in the memo table + // (which is also 1 + the largest memo index) + int32_t size() const override { return static_cast(hash_table_.size()); } + + // Copy values starting from index `start` into `out_data` + void CopyValues(int32_t start, Scalar* out_data) const { + hash_table_.VisitEntries([=](const HashTableEntry* entry) { + int32_t index = entry->payload.memo_index - start; + if (index >= 0) { + out_data[index] = entry->payload.value; + } + }); + } + + void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } + + protected: + struct Payload { + Scalar value; + int32_t memo_index; + }; + + using HashTableType = HashTableTemplateType; + using HashTableEntry = typename HashTableType::Entry; + HashTableType hash_table_; + + hash_t 
ComputeHash(const Scalar& value) const { + return ScalarHelper::ComputeHash(value); + } +}; + +// ---------------------------------------------------------------------- +// A memoization table for small scalar values, using direct indexing + +template +struct SmallScalarTraits {}; + +template <> +struct SmallScalarTraits { + static constexpr int32_t cardinality = 2; + + static uint32_t AsIndex(bool value) { return value ? 1 : 0; } +}; + +template +struct SmallScalarTraits::value>::type> { + using Unsigned = typename std::make_unsigned::type; + + static constexpr int32_t cardinality = 1U + std::numeric_limits::max(); + + static uint32_t AsIndex(Scalar value) { return static_cast(value); } +}; + +template class HashTableTemplateType = HashTable> +class SmallScalarMemoTable : public MemoTable { + public: + explicit SmallScalarMemoTable(int64_t entries = 0) { + std::fill(value_to_index_, value_to_index_ + cardinality, -1); + index_to_value_.reserve(cardinality); + } + + int32_t Get(const Scalar value) const { + auto value_index = AsIndex(value); + return value_to_index_[value_index]; + } + + template + int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { + auto value_index = AsIndex(value); + auto memo_index = value_to_index_[value_index]; + if (memo_index < 0) { + memo_index = static_cast(index_to_value_.size()); + index_to_value_.push_back(value); + value_to_index_[value_index] = memo_index; + assert(memo_index < cardinality); + on_not_found(memo_index); + } else { + on_found(memo_index); + } + return memo_index; + } + + int32_t GetOrInsert(const Scalar value) { + return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); + } + + // The number of entries in the memo table + // (which is also 1 + the largest memo index) + int32_t size() const override { return static_cast(index_to_value_.size()); } + + // Copy values starting from index `start` into `out_data` + void CopyValues(int32_t start, Scalar* out_data) const { + 
DCHECK_GE(start, 0); + DCHECK_LE(static_cast(start), index_to_value_.size()); + int64_t offset = start * static_cast(sizeof(Scalar)); + memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar)); + } + + void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } + + const std::vector& values() const { return index_to_value_; } + + protected: + static constexpr auto cardinality = SmallScalarTraits::cardinality; + static_assert(cardinality <= 256, "cardinality too large for direct-addressed table"); + + uint32_t AsIndex(Scalar value) const { + return SmallScalarTraits::AsIndex(value); + } + + int32_t value_to_index_[cardinality]; + std::vector index_to_value_; +}; + +// ---------------------------------------------------------------------- +// A memoization table for variable-sized binary data. + +class BinaryMemoTable : public MemoTable { + public: + explicit BinaryMemoTable(int64_t entries = 0, int64_t values_size = -1) + : hash_table_(static_cast(entries)) { + offsets_.reserve(entries + 1); + offsets_.push_back(0); + if (values_size == -1) { + values_.reserve(entries * 4); // A conservative heuristic + } else { + values_.reserve(values_size); + } + } + + int32_t Get(const void* data, int32_t length) const { + hash_t h = ComputeStringHash<0>(data, length); + auto p = Lookup(h, data, length); + if (p.second) { + return p.first->payload.memo_index; + } else { + return -1; + } + } + + int32_t Get(const std::string& value) const { + return Get(value.data(), static_cast(value.length())); + } + + int32_t Get(const util::string_view& value) const { + return Get(value.data(), static_cast(value.length())); + } + + template + int32_t GetOrInsert(const void* data, int32_t length, Func1&& on_found, + Func2&& on_not_found) { + hash_t h = ComputeStringHash<0>(data, length); + auto p = Lookup(h, data, length); + int32_t memo_index; + if (p.second) { + memo_index = p.first->payload.memo_index; + on_found(memo_index); + } else { + memo_index = 
size(); + // Insert offset + auto offset = static_cast(values_.size()); + assert(offsets_.size() == static_cast(memo_index + 1)); + assert(offsets_[memo_index] == offset); + offsets_.push_back(offset + length); + // Insert string value + values_.append(static_cast(data), length); + // Insert hash entry + hash_table_.Insert(const_cast(p.first), h, {memo_index}); + + on_not_found(memo_index); + } + return memo_index; + } + + template + int32_t GetOrInsert(const util::string_view& value, Func1&& on_found, + Func2&& on_not_found) { + return GetOrInsert(value.data(), static_cast(value.length()), + std::forward(on_found), std::forward(on_not_found)); + } + + int32_t GetOrInsert(const void* data, int32_t length) { + return GetOrInsert(data, length, [](int32_t i) {}, [](int32_t i) {}); + } + + int32_t GetOrInsert(const util::string_view& value) { + return GetOrInsert(value.data(), static_cast(value.length())); + } + + int32_t GetOrInsert(const std::string& value) { + return GetOrInsert(value.data(), static_cast(value.length())); + } + + // The number of entries in the memo table + // (which is also 1 + the largest memo index) + int32_t size() const override { return static_cast(hash_table_.size()); } + + int32_t values_size() const { return static_cast(values_.size()); } + + const uint8_t* values_data() const { + return reinterpret_cast(values_.data()); + } + + // Copy (n + 1) offsets starting from index `start` into `out_data` + template + void CopyOffsets(int32_t start, Offset* out_data) const { + auto delta = offsets_[start]; + for (uint32_t i = start; i < offsets_.size(); ++i) { + auto adjusted_offset = offsets_[i] - delta; + auto cast_offset = static_cast(adjusted_offset); + assert(static_cast(cast_offset) == adjusted_offset); // avoid truncation + *out_data++ = cast_offset; + } + } + + template + void CopyOffsets(Offset* out_data) const { + CopyOffsets(0, out_data); + } + + // Copy values starting from index `start` into `out_data` + void CopyValues(int32_t start, 
uint8_t* out_data) const { + CopyValues(start, -1, out_data); + } + + // Same as above, but check output size in debug mode + void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const { + int32_t offset = offsets_[start]; + auto length = values_.size() - static_cast(offset); + if (out_size != -1) { + assert(static_cast(length) == out_size); + } + memcpy(out_data, values_.data() + offset, length); + } + + void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); } + + void CopyValues(int64_t out_size, uint8_t* out_data) const { + CopyValues(0, out_size, out_data); + } + + // Visit the stored values in insertion order. + // The visitor function should have the signature `void(util::string_view)` + // or `void(const util::string_view&)`. + template + void VisitValues(int32_t start, VisitFunc&& visit) const { + for (uint32_t i = start; i < offsets_.size() - 1; ++i) { + visit( + util::string_view(values_.data() + offsets_[i], offsets_[i + 1] - offsets_[i])); + } + } + + protected: + struct Payload { + int32_t memo_index; + }; + + using HashTableType = HashTable; + using HashTableEntry = typename HashTable::Entry; + HashTableType hash_table_; + + std::vector offsets_; + std::string values_; + + std::pair Lookup(hash_t h, const void* data, + int32_t length) const { + auto cmp_func = [=](const Payload* payload) { + int32_t start, stop; + start = offsets_[payload->memo_index]; + stop = offsets_[payload->memo_index + 1]; + return length == stop - start && memcmp(data, values_.data() + start, length) == 0; + }; + return hash_table_.Lookup(h, cmp_func); + } +}; + +template +struct HashTraits {}; + +template <> +struct HashTraits { + using MemoTableType = SmallScalarMemoTable; +}; + +template +struct HashTraits> { + using c_type = typename T::c_type; + using MemoTableType = SmallScalarMemoTable; +}; + +template +struct HashTraits< + T, typename std::enable_if::value && !is_8bit_int::value>::type> { + using c_type = typename T::c_type; + using 
MemoTableType = ScalarMemoTable; +}; + +template +struct HashTraits> { + using MemoTableType = BinaryMemoTable; +}; + +template +struct HashTraits> { + using MemoTableType = BinaryMemoTable; +}; + +template +struct DictionaryTraits {}; + +template <> +struct DictionaryTraits { + using T = BooleanType; + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + BooleanBuilder builder(pool); + const auto& bool_values = memo_table.values(); + auto it = bool_values.begin() + start_offset; + for (; it != bool_values.end(); ++it) { + RETURN_NOT_OK(builder.Append(*it)); + } + return builder.FinishInternal(out); + } +}; + +template +struct DictionaryTraits> { + using c_type = typename T::c_type; + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_buffer; + auto dict_length = static_cast(memo_table.size()) - start_offset; + // This makes a copy, but we assume a dictionary array is usually small + // compared to the size of the dictionary-using array. 
+ // (also, copying the dictionary values is cheap compared to the cost + // of building the memo table) + RETURN_NOT_OK( + AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); + memo_table.CopyValues(static_cast(start_offset), + reinterpret_cast(dict_buffer->mutable_data())); + *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_offsets; + std::shared_ptr dict_data; + + // Create the offsets buffer + auto dict_length = static_cast(memo_table.size() - start_offset); + RETURN_NOT_OK(AllocateBuffer( + pool, TypeTraits::bytes_required(dict_length + 1), &dict_offsets)); + auto raw_offsets = reinterpret_cast(dict_offsets->mutable_data()); + memo_table.CopyOffsets(static_cast(start_offset), raw_offsets); + + // Create the data buffer + DCHECK_EQ(raw_offsets[0], 0); + RETURN_NOT_OK(AllocateBuffer(pool, raw_offsets[dict_length], &dict_data)); + memo_table.CopyValues(static_cast(start_offset), dict_data->size(), + dict_data->mutable_data()); + + *out = ArrayData::Make(type, dict_length, {nullptr, dict_offsets, dict_data}, + 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + const T& concrete_type = internal::checked_cast(*type); + std::shared_ptr dict_data; + + // Create the data buffer + auto dict_length = static_cast(memo_table.size() - start_offset); + auto data_length = dict_length * concrete_type.byte_width(); + 
RETURN_NOT_OK(AllocateBuffer(pool, data_length, &dict_data)); + memo_table.CopyValues(static_cast(start_offset), data_length, + dict_data->mutable_data()); + + *out = ArrayData::Make(type, dict_length, {nullptr, dict_data}, 0 /* null_count */); + return Status::OK(); + } +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_HASHING_H diff --git a/r/R/inst/include/arrow/util/int-util.h b/r/R/inst/include/arrow/util/int-util.h new file mode 100644 index 00000000000..d3ae09f75cf --- /dev/null +++ b/r/R/inst/include/arrow/util/int-util.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_INT_UTIL_H +#define ARROW_UTIL_INT_UTIL_H + +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +ARROW_EXPORT +uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width = 1); + +ARROW_EXPORT +uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes, + int64_t length, uint8_t min_width = 1); + +ARROW_EXPORT +uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width = 1); + +ARROW_EXPORT +uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length, + uint8_t min_width = 1); + +ARROW_EXPORT +void DowncastInts(const int64_t* source, int8_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastInts(const int64_t* source, int16_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastInts(const int64_t* source, int32_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastInts(const int64_t* source, int64_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); + +ARROW_EXPORT +void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); + +template +ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, + const int32_t* transpose_map); + +/// Signed addition with well-defined behaviour on overflow (as unsigned) +template +SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { + using UnsignedInt = typename std::make_unsigned::type; + return static_cast(static_cast(u) + + static_cast(v)); +} + +/// Signed left shift with well-defined behaviour on negative numbers or overflow +template +SignedInt SafeLeftShift(SignedInt u, Shift shift) { + using UnsignedInt = typename std::make_unsigned::type; + return 
static_cast(static_cast(u) << shift); +} + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_INT_UTIL_H diff --git a/r/R/inst/include/arrow/util/io-util.h b/r/R/inst/include/arrow/util/io-util.h new file mode 100644 index 00000000000..2b48a5c4833 --- /dev/null +++ b/r/R/inst/include/arrow/util/io-util.h @@ -0,0 +1,263 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_IO_UTIL_H +#define ARROW_UTIL_IO_UTIL_H + +#ifndef _WIN32 +#define ARROW_HAVE_SIGACTION 1 +#endif + +#include +#include + +#if ARROW_HAVE_SIGACTION +#include // Needed for struct sigaction +#endif + +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" + +// The Windows API defines DeleteFile as a macro resolving to either +// DeleteFileA or DeleteFileW. Need to undo it. +#if defined(_WIN32) && defined(DeleteFile) +#undef DeleteFile +#endif + +namespace arrow { + +class Buffer; + +namespace io { + +// Output stream that just writes to stdout. 
+class ARROW_EXPORT StdoutStream : public OutputStream { + public: + StdoutStream(); + ~StdoutStream() override {} + + Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Write(const void* data, int64_t nbytes) override; + + private: + int64_t pos_; +}; + +// Output stream that just writes to stderr. +class ARROW_EXPORT StderrStream : public OutputStream { + public: + StderrStream(); + ~StderrStream() override {} + + Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Write(const void* data, int64_t nbytes) override; + + private: + int64_t pos_; +}; + +// Input stream that just reads from stdin. +class ARROW_EXPORT StdinStream : public InputStream { + public: + StdinStream(); + ~StdinStream() override {} + + Status Close() override; + bool closed() const override; + + Status Tell(int64_t* position) const override; + + Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + + Status Read(int64_t nbytes, std::shared_ptr* out) override; + + private: + int64_t pos_; +}; + +} // namespace io + +namespace internal { + +// NOTE: 8-bit path strings on Windows are encoded using UTF-8. +// Using MBCS would fail encoding some paths. + +#if defined(_WIN32) +using NativePathString = std::wstring; +#else +using NativePathString = std::string; +#endif + +class ARROW_EXPORT PlatformFilename { + public: + ~PlatformFilename(); + PlatformFilename(); + PlatformFilename(const PlatformFilename&); + PlatformFilename(PlatformFilename&&); + PlatformFilename& operator=(const PlatformFilename&); + PlatformFilename& operator=(PlatformFilename&&); + explicit PlatformFilename(const NativePathString& path); + + const NativePathString& ToNative() const; + std::string ToString() const; + + // These functions can fail for character encoding reasons. 
+ static Status FromString(const std::string& file_name, PlatformFilename* out); + Status Join(const std::string& child_name, PlatformFilename* out) const; + + private: + struct Impl; + std::unique_ptr impl_; + + explicit PlatformFilename(const Impl& impl); + explicit PlatformFilename(Impl&& impl); + + // Those functions need access to the embedded path object + friend ARROW_EXPORT Status CreateDir(const PlatformFilename&, bool*); + friend ARROW_EXPORT Status CreateDirTree(const PlatformFilename&, bool*); + friend ARROW_EXPORT Status DeleteDirTree(const PlatformFilename&, bool*); + friend ARROW_EXPORT Status DeleteFile(const PlatformFilename&, bool*); + friend ARROW_EXPORT Status FileExists(const PlatformFilename&, bool*); +}; + +ARROW_EXPORT +Status CreateDir(const PlatformFilename& dir_path, bool* created = NULLPTR); +ARROW_EXPORT +Status CreateDirTree(const PlatformFilename& dir_path, bool* created = NULLPTR); +ARROW_EXPORT +Status DeleteDirTree(const PlatformFilename& dir_path, bool* deleted = NULLPTR); +ARROW_EXPORT +Status DeleteFile(const PlatformFilename& file_path, bool* deleted = NULLPTR); +ARROW_EXPORT +Status FileExists(const PlatformFilename& path, bool* out); + +ARROW_EXPORT +Status FileNameFromString(const std::string& file_name, PlatformFilename* out); + +ARROW_EXPORT +Status FileOpenReadable(const PlatformFilename& file_name, int* fd); +ARROW_EXPORT +Status FileOpenWritable(const PlatformFilename& file_name, bool write_only, bool truncate, + bool append, int* fd); + +ARROW_EXPORT +Status FileRead(int fd, uint8_t* buffer, const int64_t nbytes, int64_t* bytes_read); +ARROW_EXPORT +Status FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes, + int64_t* bytes_read); +ARROW_EXPORT +Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes); +ARROW_EXPORT +Status FileTruncate(int fd, const int64_t size); + +ARROW_EXPORT +Status FileTell(int fd, int64_t* pos); +ARROW_EXPORT +Status FileSeek(int fd, int64_t pos); +ARROW_EXPORT 
+Status FileSeek(int fd, int64_t pos, int whence); +ARROW_EXPORT +Status FileGetSize(int fd, int64_t* size); + +ARROW_EXPORT +Status FileClose(int fd); + +ARROW_EXPORT +Status CreatePipe(int fd[2]); + +ARROW_EXPORT +Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, + void** new_addr); + +ARROW_EXPORT +Status GetEnvVar(const char* name, std::string* out); +ARROW_EXPORT +Status GetEnvVar(const std::string& name, std::string* out); +ARROW_EXPORT +Status SetEnvVar(const char* name, const char* value); +ARROW_EXPORT +Status SetEnvVar(const std::string& name, const std::string& value); +ARROW_EXPORT +Status DelEnvVar(const char* name); +ARROW_EXPORT +Status DelEnvVar(const std::string& name); + +ARROW_EXPORT +std::string ErrnoMessage(int errnum); +#if _WIN32 +ARROW_EXPORT +std::string WinErrorMessage(int errnum); +#endif + +class ARROW_EXPORT TemporaryDir { + public: + ~TemporaryDir(); + + const PlatformFilename& path() { return path_; } + + static Status Make(const std::string& prefix, std::unique_ptr* out); + + private: + PlatformFilename path_; + + explicit TemporaryDir(PlatformFilename&&); +}; + +class ARROW_EXPORT SignalHandler { + public: + typedef void (*Callback)(int); + + SignalHandler(); + explicit SignalHandler(Callback cb); +#if ARROW_HAVE_SIGACTION + explicit SignalHandler(const struct sigaction& sa); +#endif + + Callback callback() const; +#if ARROW_HAVE_SIGACTION + const struct sigaction& action() const; +#endif + + protected: +#if ARROW_HAVE_SIGACTION + // Storing the full sigaction allows to restore the entire signal handling + // configuration. 
+ struct sigaction sa_; +#else + Callback cb_; +#endif +}; + +ARROW_EXPORT +Status GetSignalHandler(int signum, SignalHandler* out); +ARROW_EXPORT +Status SetSignalHandler(int signum, SignalHandler handler, + SignalHandler* old_handler = NULLPTR); + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_IO_UTIL_H diff --git a/r/R/inst/include/arrow/util/key_value_metadata.h b/r/R/inst/include/arrow/util/key_value_metadata.h new file mode 100644 index 00000000000..2820c98200d --- /dev/null +++ b/r/R/inst/include/arrow/util/key_value_metadata.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_KEY_VALUE_METADATA_H +#define ARROW_UTIL_KEY_VALUE_METADATA_H + +#include +#include +#include +#include +#include + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief A container for key-value pair type metadata. 
Not thread-safe
+class ARROW_EXPORT KeyValueMetadata {
+ public:
+  KeyValueMetadata();
+  KeyValueMetadata(const std::vector<std::string>& keys,
+                   const std::vector<std::string>& values);
+  explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
+  virtual ~KeyValueMetadata() = default;
+
+  void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
+
+  void Append(const std::string& key, const std::string& value);
+
+  void reserve(int64_t n);
+  int64_t size() const;
+
+  const std::string& key(int64_t i) const;
+  const std::string& value(int64_t i) const;
+
+  /// \brief Perform linear search for key, returning -1 if not found
+  int FindKey(const std::string& key) const;
+
+  std::shared_ptr<KeyValueMetadata> Copy() const;
+
+  bool Equals(const KeyValueMetadata& other) const;
+  std::string ToString() const;
+
+ private:
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata);
+};
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param pairs key-value mapping
+std::shared_ptr<KeyValueMetadata> ARROW_EXPORT
+key_value_metadata(const std::unordered_map<std::string, std::string>& pairs);
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param keys sequence of metadata keys
+/// \param values sequence of corresponding metadata values
+std::shared_ptr<KeyValueMetadata> ARROW_EXPORT key_value_metadata(
+    const std::vector<std::string>& keys, const std::vector<std::string>& values);
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_KEY_VALUE_METADATA_H
diff --git a/r/R/inst/include/arrow/util/lazy.h b/r/R/inst/include/arrow/util/lazy.h
new file mode 100644
index 00000000000..de32b5f22af
--- /dev/null
+++ b/r/R/inst/include/arrow/util/lazy.h
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_LAZY_H
+#define ARROW_UTIL_LAZY_H
+
+#include <iterator>
+#include <utility>
+
+namespace arrow {
+namespace internal {
+
+/// Create a range from a callable which takes a single index parameter
+/// and returns the value of iterator on each call and a length.
+/// Only iterators obtained from the same range should be compared, the
+/// behaviour generally similar to other STL containers.
+template <typename Generator>
+class LazyRange {
+ private:
+  // callable which generates the values
+  // has to be defined at the beginning of the class for type deduction
+  const Generator gen_;
+  // the length of the range
+  int64_t length_;
+#ifdef _MSC_VER
+  // workaround to VS2010 not supporting decltype properly
+  // see https://stackoverflow.com/questions/21782846/decltype-for-class-member-function
+  static Generator gen_static_;
+#endif
+
+ public:
+#ifdef _MSC_VER
+  using return_type = decltype(gen_static_(0));
+#else
+  using return_type = decltype(gen_(0));
+#endif
+
+  /// Construct a new range from a callable and length
+  LazyRange(Generator gen, int64_t length) : gen_(gen), length_(length) {}
+
+  // Class of the dependent iterator, created implicitly by begin and end
+  class RangeIter {
+   public:
+    using difference_type = int64_t;
+    using value_type = return_type;
+    using reference = const value_type&;
+    using pointer = const value_type*;
+    using iterator_category = std::forward_iterator_tag;
+
+#ifdef _MSC_VER
+    // msvc complains about unchecked iterators,
+    // see https://stackoverflow.com/questions/21655496/error-c4996-checked-iterators
+    using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
+#endif
+
+    RangeIter(const LazyRange<Generator>& range, int64_t index)
+        : range_(range), index_(index) {}
+
+    const return_type operator*() const { return range_.gen_(index_); }
+
+    RangeIter operator+(difference_type length) const {
+      return RangeIter(range_, index_ + length);
+    }
+
+    // pre-increment
+    RangeIter& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // post-increment
+    RangeIter operator++(int) {
+      auto copy = RangeIter(*this);
+      ++index_;
+      return copy;
+    }
+
+    bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ == other.index_ && &this->range_ == &other.range_;
+    }
+
+    bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ != other.index_ || &this->range_ != &other.range_;
+    }
+
+    int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ - other.index_;
+    }
+
+   private:
+    // parent range reference
+    const LazyRange<Generator>& range_;
+    // current index
+    int64_t index_;
+  };
+
+  friend class RangeIter;
+
+  // Create a new begin const iterator
+  RangeIter begin() const { return RangeIter(*this, 0); }
+
+  // Create a new end const iterator
+  RangeIter end() const { return RangeIter(*this, length_); }
+};
+
+/// Helper function to create a lazy range from a callable (e.g. lambda) and length
+template <typename Generator>
+LazyRange<Generator> MakeLazyRange(Generator&& gen, int64_t length) {
+  return LazyRange<Generator>(std::forward<Generator>(gen), length);
+}
+
+}  // namespace internal
+}  // namespace arrow
+#endif
diff --git a/r/R/inst/include/arrow/util/logging.h b/r/R/inst/include/arrow/util/logging.h
new file mode 100644
index 00000000000..999aca6fd7c
--- /dev/null
+++ b/r/R/inst/include/arrow/util/logging.h
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_LOGGING_H
+#define ARROW_UTIL_LOGGING_H
+
+#ifdef GANDIVA_IR
+
+// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to
+// streams or stdc++. So, making the DCHECK calls void in that case.
+
+#define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#define DCHECK(condition) ARROW_IGNORE_EXPR(condition)
+#define DCHECK_OK(status) ARROW_IGNORE_EXPR(status)
+#define DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#define DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1)
+
+#else  // !GANDIVA_IR
+
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+enum class ArrowLogLevel : int {
+  ARROW_DEBUG = -1,
+  ARROW_INFO = 0,
+  ARROW_WARNING = 1,
+  ARROW_ERROR = 2,
+  ARROW_FATAL = 3
+};
+
+#define ARROW_LOG_INTERNAL(level) ::arrow::util::ArrowLog(__FILE__, __LINE__, level)
+#define ARROW_LOG(level) ARROW_LOG_INTERNAL(::arrow::util::ArrowLogLevel::ARROW_##level)
+
+#define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#define ARROW_CHECK(condition) \
+  (condition) ? ARROW_IGNORE_EXPR(0) \
+              : ::arrow::util::Voidify() & \
+                    ::arrow::util::ArrowLog(__FILE__, __LINE__, \
+                                            ::arrow::util::ArrowLogLevel::ARROW_FATAL) \
+                        << " Check failed: " #condition " "
+
+// If 'to_call' returns a bad status, CHECK immediately with a logged message
+// of 'msg' followed by the status.
+#define ARROW_CHECK_OK_PREPEND(to_call, msg) \
+  do { \
+    ::arrow::Status _s = (to_call); \
+    ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \
+  } while (false)
+
+// If the status is bad, CHECK immediately, appending the status to the
+// logged message.
+#define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status")
+
+#ifdef NDEBUG
+#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_WARNING
+
+// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
+// only do so in debug mode.
+
+#define DCHECK(condition) \
+  while (false) ARROW_IGNORE_EXPR(condition); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_OK(s) \
+  ARROW_IGNORE_EXPR(s); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_EQ(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_NE(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_LE(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_LT(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_GE(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+#define DCHECK_GT(val1, val2) \
+  while (false) ARROW_IGNORE_EXPR(val1); \
+  while (false) ARROW_IGNORE_EXPR(val2); \
+  while (false) ::arrow::util::detail::NullLog()
+
+#else
+#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL
+
+#define DCHECK(condition) ARROW_CHECK(condition)
+#define DCHECK_OK(status) ARROW_CHECK_OK(status)
+#define DCHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2))
+#define DCHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2))
+#define DCHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2))
+#define DCHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2))
+#define DCHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2))
+#define DCHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2))
+
+#endif  // NDEBUG
+
+// This code is adapted from
+// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.
+
+// To make the logging lib pluggable with other logging libs and keep
+// the implementation hidden from the user, ArrowLog is only a declaration
+// which hides the implementation in the logging.cc file.
+// In logging.cc, we can choose different log libs using different macros.
+
+// This is also a null log which does not output anything.
+class ARROW_EXPORT ArrowLogBase {
+ public:
+  virtual ~ArrowLogBase() {}
+
+  virtual bool IsEnabled() const { return false; }
+
+  template <typename T>
+  ArrowLogBase& operator<<(const T& t) {
+    if (IsEnabled()) {
+      Stream() << t;
+    }
+    return *this;
+  }
+
+ protected:
+  virtual std::ostream& Stream() = 0;
+};
+
+class ARROW_EXPORT ArrowLog : public ArrowLogBase {
+ public:
+  ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity);
+  ~ArrowLog() override;
+
+  /// Return whether or not current logging instance is enabled.
+  ///
+  /// \return True if logging is enabled and false otherwise.
+  bool IsEnabled() const override;
+
+  /// The init function of arrow log for a program which should be called only once.
+  ///
+  /// \param appName The app name which starts the log.
+  /// \param severity_threshold Logging threshold for the program.
+  /// \param logDir Logging output file name. If empty, the log won't output to file.
+  static void StartArrowLog(const std::string& appName,
+                            ArrowLogLevel severity_threshold = ArrowLogLevel::ARROW_INFO,
+                            const std::string& logDir = "");
+
+  /// The shutdown function of arrow log, it should be used with StartArrowLog as a pair.
+  static void ShutDownArrowLog();
+
+  /// Install the failure signal handler to output call stack when crash.
+  /// If glog is not installed, this function won't do anything.
+  static void InstallFailureSignalHandler();
+
+  /// Uninstall the signal actions installed by InstallFailureSignalHandler.
+  static void UninstallSignalAction();
+
+  /// Return whether or not the log level is enabled in current setting.
+  ///
+  /// \param log_level The input log level to test.
+  /// \return True if input log level is not lower than the threshold.
+  static bool IsLevelEnabled(ArrowLogLevel log_level);
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog);
+
+  // Hide the implementation of log provider by void *.
+  // Otherwise, lib user may define the same macro to use the correct header file.
+  void* logging_provider_;
+  /// True if log messages should be logged and false if they should be ignored.
+  bool is_enabled_;
+
+  static ArrowLogLevel severity_threshold_;
+
+ protected:
+  std::ostream& Stream() override;
+};
+
+// This class make ARROW_CHECK compilation pass to change the << operator to void.
+// This class is copied from glog.
+class ARROW_EXPORT Voidify {
+ public:
+  Voidify() {}
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(ArrowLogBase&) {}
+};
+
+namespace detail {
+
+/// @brief A helper for the nil log sink.
+///
+/// Using this helper is analogous to sending log messages to /dev/null:
+/// nothing gets logged.
+class NullLog {
+ public:
+  /// The no-op output operator.
+  ///
+  /// @param [in] t
+  ///   The object to send into the nil sink.
+  /// @return Reference to the updated object.
+  template <class T>
+  NullLog& operator<<(const T& t) {
+    return *this;
+  }
+};
+
+}  // namespace detail
+}  // namespace util
+}  // namespace arrow
+
+#endif  // GANDIVA_IR
+
+#endif  // ARROW_UTIL_LOGGING_H
diff --git a/r/R/inst/include/arrow/util/macros.h b/r/R/inst/include/arrow/util/macros.h
new file mode 100644
index 00000000000..4516985e300
--- /dev/null
+++ b/r/R/inst/include/arrow/util/macros.h
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_MACROS_H
+#define ARROW_UTIL_MACROS_H
+
+#include <cstdint>
+
+#define ARROW_STRINGIFY(x) #x
+#define ARROW_CONCAT(x, y) x##y
+
+// From Google gutil
+#ifndef ARROW_DISALLOW_COPY_AND_ASSIGN
+#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete; \
+  void operator=(const TypeName&) = delete
+#endif
+
+#define ARROW_UNUSED(x) (void)(x)
+#define ARROW_ARG_UNUSED(x)
+//
+// GCC can be told that a certain branch is not likely to be taken (for
+// instance, a CHECK failure), and use that information in static analysis.
+// Giving it this information can help it optimize for the common case in
+// the absence of better information (ie. -fprofile-arcs).
+//
+#if defined(__GNUC__)
+#define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
+#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#define ARROW_NORETURN __attribute__((noreturn))
+#define ARROW_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined(_MSC_VER)
+#define ARROW_NORETURN __declspec(noreturn)
+#define ARROW_PREDICT_FALSE(x) (x)
+#define ARROW_PREDICT_TRUE(x) (x)
+#define ARROW_PREFETCH(addr)
+#else
+#define ARROW_NORETURN
+#define ARROW_PREDICT_FALSE(x) (x)
+#define ARROW_PREDICT_TRUE(x) (x)
+#define ARROW_PREFETCH(addr)
+#endif
+
+#if (defined(__GNUC__) || defined(__APPLE__))
+#define ARROW_MUST_USE_RESULT __attribute__((warn_unused_result))
+#elif defined(_MSC_VER)
+#define ARROW_MUST_USE_RESULT
+#else
+#define ARROW_MUST_USE_RESULT
+#endif
+
+// ----------------------------------------------------------------------
+// C++/CLI support macros (see ARROW-1134)
+
+#ifndef NULLPTR
+
+#ifdef __cplusplus_cli
+#define NULLPTR __nullptr
+#else
+#define NULLPTR nullptr
+#endif
+
+#endif  // ifndef NULLPTR
+
+// ----------------------------------------------------------------------
+
+// clang-format off
+// [[deprecated]] is only available in C++14, use this for the time being
+// This macro takes an optional deprecation message
+#if __cplusplus <= 201103L
+# ifdef __GNUC__
+#  define ARROW_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
+# elif defined(_MSC_VER)
+#  define ARROW_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
+# else
+#  define ARROW_DEPRECATED(...)
+# endif
+#else
+#  define ARROW_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
+// ----------------------------------------------------------------------
+
+// macros to disable padding
+// these macros are portable across different compilers and platforms
+//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
+#if !defined(MANUALLY_ALIGNED_STRUCT)
+#if defined(_MSC_VER)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+  __pragma(pack(1)); \
+  struct __declspec(align(alignment))
+#define STRUCT_END(name, size) \
+  __pragma(pack()); \
+  static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#elif defined(__GNUC__) || defined(__clang__)
+#define MANUALLY_ALIGNED_STRUCT(alignment) \
+  _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
+#define STRUCT_END(name, size) \
+  _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#else
+#error Unknown compiler, please define structure alignment macros
+#endif
+#endif  // !defined(MANUALLY_ALIGNED_STRUCT)
+
+// ----------------------------------------------------------------------
+// Convenience macro disabling a particular UBSan check in a function
+
+#if defined(__clang__)
+#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature)))
+#else
+#define ARROW_DISABLE_UBSAN(feature)
+#endif
+
+// ----------------------------------------------------------------------
+// Machine information
+
+#if INTPTR_MAX == INT64_MAX
+#define ARROW_BITNESS 64
+#elif INTPTR_MAX == INT32_MAX
+#define ARROW_BITNESS 32
+#else
+#error Unexpected INTPTR_MAX
+#endif
+
+// ----------------------------------------------------------------------
+// From googletest
+// (also in parquet-cpp)
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.
For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name) \
+  friend class test_case_name##_##test_name##_Test
+
+#endif  // ARROW_UTIL_MACROS_H
diff --git a/r/R/inst/include/arrow/util/memory.h b/r/R/inst/include/arrow/util/memory.h
new file mode 100644
index 00000000000..2d2a1059214
--- /dev/null
+++ b/r/R/inst/include/arrow/util/memory.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_MEMORY_H
+#define ARROW_UTIL_MEMORY_H
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+// A helper function for doing memcpy with multiple threads. This is required
+// to saturate the memory bandwidth of modern cpus.
+void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes,
+                      uintptr_t block_size, int num_threads);
+
+// A helper function for checking if two wrapped objects implementing `Equals`
+// are equal.
+template <typename T>
+bool SharedPtrEquals(const std::shared_ptr<T>& left, const std::shared_ptr<T>& right) {
+  if (left == right) return true;
+  if (left == NULLPTR || right == NULLPTR) return false;
+  return left->Equals(*right);
+}
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_MEMORY_H
diff --git a/r/R/inst/include/arrow/util/neon-util.h b/r/R/inst/include/arrow/util/neon-util.h
new file mode 100644
index 00000000000..714d2324f05
--- /dev/null
+++ b/r/R/inst/include/arrow/util/neon-util.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+#if defined(__aarch64__) || defined(__AARCH64__)
+#ifdef __ARM_FEATURE_CRC32
+#define ARROW_HAVE_ARM_CRC
+#include <arm_acle.h>
+#endif
+#endif
+
+#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+static inline uint32_t crc32c_runtime_check(void) {
+  uint64_t auxv = getauxval(AT_HWCAP);
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) {
+  return __crc32cb(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t v) {
+  return __crc32ch(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t v) {
+  return __crc32cw(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) {
+  return __crc32cd(crc, v);
+}
+
+#endif  // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+
+}  // namespace arrow
diff --git a/r/R/inst/include/arrow/util/parallel.h b/r/R/inst/include/arrow/util/parallel.h
new file mode 100644
index 00000000000..8caba5f1f0d
--- /dev/null
+++ b/r/R/inst/include/arrow/util/parallel.h
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_PARALLEL_H
+#define ARROW_UTIL_PARALLEL_H
+
+#include <atomic>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/thread-pool.h"
+
+namespace arrow {
+namespace internal {
+
+// A parallelizer that takes a `Status(int)` function and calls it with
+// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.
+
+template <class FUNCTION>
+Status ParallelFor(int num_tasks, FUNCTION&& func) {
+  auto pool = internal::GetCpuThreadPool();
+  std::vector<std::future<Status>> futures(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    futures[i] = pool->Submit(func, i);
+  }
+  auto st = Status::OK();
+  for (auto& fut : futures) {
+    st &= fut.get();
+  }
+  return st;
+}
+
+// A variant of ParallelFor() with an explicit number of dedicated threads.
+// In most cases it's more appropriate to use the 2-argument ParallelFor (above),
+// or directly the global CPU thread pool (arrow/util/thread-pool.h).
+
+template <class FUNCTION>
+Status ParallelFor(int nthreads, int num_tasks, FUNCTION&& func) {
+  std::vector<std::thread> thread_pool;
+  thread_pool.reserve(nthreads);
+  std::atomic<int> task_counter(0);
+
+  std::mutex error_mtx;
+  std::atomic<bool> error_occurred(false);  // polled by workers without error_mtx
+  Status error;
+
+  for (int thread_id = 0; thread_id < nthreads; ++thread_id) {
+    thread_pool.emplace_back(
+        [&num_tasks, &task_counter, &error, &error_occurred, &error_mtx, &func]() {
+          int task_id;
+          while (!error_occurred) {
+            task_id = task_counter.fetch_add(1);
+            if (task_id >= num_tasks) {
+              break;
+            }
+            Status s = func(task_id);
+            if (!s.ok()) {
+              std::lock_guard<std::mutex> lock(error_mtx);
+              error_occurred = true;
+              error = s;
+              break;
+            }
+          }
+        });
+  }
+  for (auto&& thread : thread_pool) {
+    thread.join();
+  }
+  if (error_occurred) {
+    return error;
+  }
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif
diff --git a/r/R/inst/include/arrow/util/parsing.h b/r/R/inst/include/arrow/util/parsing.h
new file mode 100644
index 00000000000..20b749a4ecf
--- /dev/null
+++ b/r/R/inst/include/arrow/util/parsing.h
@@ -0,0 +1,512 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +// This is a private header for string-to-number parsing utilitiers + +#ifndef ARROW_UTIL_PARSING_H +#define ARROW_UTIL_PARSING_H + +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/config.h" +#include "arrow/vendored/datetime.h" + +namespace arrow { +namespace internal { + +/// \brief A class providing conversion from strings to some Arrow data types +/// +/// Conversion is triggered by calling operator(). It returns true on +/// success, false on failure. +/// +/// The class may have a non-trivial construction cost in some cases, +/// so it's recommended to use a single instance many times, if doing bulk +/// conversion. +/// +template +class StringConverter; + +template <> +class StringConverter { + public: + explicit StringConverter(const std::shared_ptr& = NULLPTR) {} + + using value_type = bool; + + bool operator()(const char* s, size_t length, value_type* out) { + if (length == 1) { + // "0" or "1"? + if (s[0] == '0') { + *out = false; + return true; + } + if (s[0] == '1') { + *out = true; + return true; + } + return false; + } + if (length == 4) { + // "true"? + *out = true; + return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') && + (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E')); + } + if (length == 5) { + // "false"? 
+ *out = false; + return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') && + (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') && + (s[4] == 'e' || s[4] == 'E')); + } + return false; + } +}; + +// Ideas for faster float parsing: +// - http://rapidjson.org/md_doc_internals.html#ParsingDouble +// - https://github.com/google/double-conversion [used here] +// - https://github.com/achan001/dtoa-fast + +template +class StringToFloatConverterMixin { + public: + using value_type = typename ARROW_TYPE::c_type; + + explicit StringToFloatConverterMixin(const std::shared_ptr& = NULLPTR) + : main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"), + fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf", + "nan") {} + + bool operator()(const char* s, size_t length, value_type* out) { + value_type v; + // double-conversion doesn't give us an error flag but signals parse + // errors with sentinel values. Since a sentinel value can appear as + // legitimate input, we fallback on a second converter with a different + // sentinel to eliminate false errors. 
+ TryConvert(main_converter_, s, length, &v); + if (ARROW_PREDICT_FALSE(v == static_cast(main_junk_value_))) { + TryConvert(fallback_converter_, s, length, &v); + if (ARROW_PREDICT_FALSE(v == static_cast(fallback_junk_value_))) { + return false; + } + } + *out = v; + return true; + } + + protected: +// This is only support in double-conversion 3.1+ +#ifdef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY + static const int flags_ = + double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY; +#else + static const int flags_ = double_conversion::StringToDoubleConverter::NO_FLAGS; +#endif + // Two unlikely values to signal a parsing error + static constexpr double main_junk_value_ = 0.7066424364107089; + static constexpr double fallback_junk_value_ = 0.40088499148279166; + + double_conversion::StringToDoubleConverter main_converter_; + double_conversion::StringToDoubleConverter fallback_converter_; + + inline void TryConvert(double_conversion::StringToDoubleConverter& converter, + const char* s, size_t length, float* out) { + int processed_length; + *out = converter.StringToFloat(s, static_cast(length), &processed_length); + } + + inline void TryConvert(double_conversion::StringToDoubleConverter& converter, + const char* s, size_t length, double* out) { + int processed_length; + *out = converter.StringToDouble(s, static_cast(length), &processed_length); + } +}; + +template <> +class StringConverter : public StringToFloatConverterMixin { + using StringToFloatConverterMixin::StringToFloatConverterMixin; +}; + +template <> +class StringConverter : public StringToFloatConverterMixin { + using StringToFloatConverterMixin::StringToFloatConverterMixin; +}; + +// NOTE: HalfFloatType would require a half<->float conversion library + +namespace detail { + +inline uint8_t ParseDecimalDigit(char c) { return static_cast(c - '0'); } + +#define PARSE_UNSIGNED_ITERATION(C_TYPE) \ + if (length > 0) { \ + uint8_t digit = ParseDecimalDigit(*s++); \ + result = static_cast(result * 
10U); \ + length--; \ + if (ARROW_PREDICT_FALSE(digit > 9U)) { \ + /* Non-digit */ \ + return false; \ + } \ + result = static_cast(result + digit); \ + } + +#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \ + if (length > 0) { \ + if (ARROW_PREDICT_FALSE(result > std::numeric_limits::max() / 10U)) { \ + /* Overflow */ \ + return false; \ + } \ + uint8_t digit = ParseDecimalDigit(*s++); \ + result = static_cast(result * 10U); \ + C_TYPE new_result = static_cast(result + digit); \ + if (ARROW_PREDICT_FALSE(--length > 0)) { \ + /* Too many digits */ \ + return false; \ + } \ + if (ARROW_PREDICT_FALSE(digit > 9U)) { \ + /* Non-digit */ \ + return false; \ + } \ + if (ARROW_PREDICT_FALSE(new_result < result)) { \ + /* Overflow */ \ + return false; \ + } \ + result = new_result; \ + } + +inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) { + uint8_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint8_t); + PARSE_UNSIGNED_ITERATION(uint8_t); + PARSE_UNSIGNED_ITERATION_LAST(uint8_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) { + uint16_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION(uint16_t); + PARSE_UNSIGNED_ITERATION_LAST(uint16_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) { + uint32_t result = 0; + + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + PARSE_UNSIGNED_ITERATION(uint32_t); + + PARSE_UNSIGNED_ITERATION_LAST(uint32_t); + *out = result; + return true; +} + +inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { + uint64_t result = 0; + + 
PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + PARSE_UNSIGNED_ITERATION(uint64_t); + + PARSE_UNSIGNED_ITERATION_LAST(uint64_t); + *out = result; + return true; +} + +#undef PARSE_UNSIGNED_ITERATION +#undef PARSE_UNSIGNED_ITERATION_LAST + +} // namespace detail + +template +class StringToUnsignedIntConverterMixin { + public: + using value_type = typename ARROW_TYPE::c_type; + + explicit StringToUnsignedIntConverterMixin(const std::shared_ptr& = NULLPTR) { + } + + bool operator()(const char* s, size_t length, value_type* out) { + if (ARROW_PREDICT_FALSE(length == 0)) { + return false; + } + // Skip leading zeros + while (length > 0 && *s == '0') { + length--; + s++; + } + return detail::ParseUnsigned(s, length, out); + } +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { + using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { + using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { + using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToUnsignedIntConverterMixin { + using 
StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; +}; + +template +class StringToSignedIntConverterMixin { + public: + using value_type = typename ARROW_TYPE::c_type; + using unsigned_type = typename std::make_unsigned::type; + + explicit StringToSignedIntConverterMixin(const std::shared_ptr& = NULLPTR) {} + + bool operator()(const char* s, size_t length, value_type* out) { + static constexpr unsigned_type max_positive = + static_cast(std::numeric_limits::max()); + // Assuming two's complement + static constexpr unsigned_type max_negative = max_positive + 1; + bool negative = false; + unsigned_type unsigned_value = 0; + + if (ARROW_PREDICT_FALSE(length == 0)) { + return false; + } + if (*s == '-') { + negative = true; + s++; + if (--length == 0) { + return false; + } + } + // Skip leading zeros + while (length > 0 && *s == '0') { + length--; + s++; + } + if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) { + return false; + } + if (negative) { + if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { + return false; + } + // To avoid both compiler warnings (with unsigned negation) + // and undefined behaviour (with signed negation overflow), + // use the expanded formula for 2's complement negation. 
+ *out = static_cast(~unsigned_value + 1); + } else { + if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { + return false; + } + *out = static_cast(unsigned_value); + } + return true; + } +}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin { + using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin { + using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin { + using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; +}; + +template <> +class StringConverter : public StringToSignedIntConverterMixin { + using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; +}; + +template <> +class StringConverter { + public: + using value_type = TimestampType::c_type; + + explicit StringConverter(const std::shared_ptr& type) + : unit_(checked_cast(type.get())->unit()) {} + + bool operator()(const char* s, size_t length, value_type* out) { + // We allow the following formats: + // - "YYYY-MM-DD" + // - "YYYY-MM-DD[ T]hh:mm:ss" + // - "YYYY-MM-DD[ T]hh:mm:ssZ" + // UTC is always assumed, and the DataType's timezone is ignored. 
+ arrow_vendored::date::year_month_day ymd; + if (ARROW_PREDICT_FALSE(length < 10)) { + return false; + } + if (length == 10) { + if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { + return false; + } + return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out); + } + if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) { + return false; + } + if (s[length - 1] == 'Z') { + --length; + } + if (length == 19) { + if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { + return false; + } + std::chrono::duration seconds; + if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) { + return false; + } + return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out); + } + return false; + } + + protected: + template + bool ConvertTimePoint(TimePoint tp, value_type* out) { + auto duration = tp.time_since_epoch(); + switch (unit_) { + case TimeUnit::SECOND: + *out = std::chrono::duration_cast(duration).count(); + return true; + case TimeUnit::MILLI: + *out = std::chrono::duration_cast(duration).count(); + return true; + case TimeUnit::MICRO: + *out = std::chrono::duration_cast(duration).count(); + return true; + case TimeUnit::NANO: + *out = std::chrono::duration_cast(duration).count(); + return true; + } + // Unreachable, but suppress compiler warning + assert(0); + *out = 0; + return true; + } + + bool ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) { + uint16_t year; + uint8_t month, day; + if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) { + return false; + } + *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month}, + arrow_vendored::date::day{day}}; + return out->ok(); + } + + bool 
ParseHH_MM_SS(const char* s, std::chrono::duration* out) { + uint8_t hours, minutes, seconds; + if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) { + return false; + } + if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 6, 2, &seconds))) { + return false; + } + if (ARROW_PREDICT_FALSE(hours >= 24)) { + return false; + } + if (ARROW_PREDICT_FALSE(minutes >= 60)) { + return false; + } + if (ARROW_PREDICT_FALSE(seconds >= 60)) { + return false; + } + *out = std::chrono::duration(3600U * hours + 60U * minutes + seconds); + return true; + } + + const TimeUnit::type unit_; +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_PARSING_H diff --git a/r/R/inst/include/arrow/util/rle-encoding.h b/r/R/inst/include/arrow/util/rle-encoding.h new file mode 100644 index 00000000000..739158a59a1 --- /dev/null +++ b/r/R/inst/include/arrow/util/rle-encoding.h @@ -0,0 +1,604 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use +// in parquet-cpp, Arrow + +#ifndef ARROW_UTIL_RLE_ENCODING_H +#define ARROW_UTIL_RLE_ENCODING_H + +#include +#include + +#include "arrow/util/bit-stream-utils.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace util { + +/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +/// (literal encoding). +/// For both types of runs, there is a byte-aligned indicator which encodes the length +/// of the run and the type of the run. +/// This encoding has the benefit that when there aren't any long enough runs, values +/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +/// the run length are byte aligned. This allows for very efficient decoding +/// implementations. +/// The encoding is: +/// encoded-block := run* +/// run := literal-run | repeated-run +/// literal-run := literal-indicator < literal bytes > +/// repeated-run := repeated-indicator < repeated value. padded to byte boundary > +/// literal-indicator := varint_encode( number_of_groups << 1 | 1) +/// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +// +/// Each run is preceded by a varint. The varint's least significant bit is +/// used to indicate whether the run is a literal run or a repeated run. The rest +/// of the varint is used to determine the length of the run (eg how many times the +/// value repeats). +// +/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +/// in groups of 8), so that no matter the bit-width of the value, the sequence will end +/// on a byte boundary without padding. +/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +/// the actual number of encoded ints. 
(This means that the total number of encoded values +/// can not be determined from the encoded data, since the number of values in the last +/// group may not be a multiple of 8). For the last group of literal runs, we pad +/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side +/// without the need for additional checks. +// +/// There is a break-even point when it is more storage efficient to do run length +/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +/// for both the repeated encoding or the literal encoding. This value can always +/// be computed based on the bit-width. +/// TODO: think about how to use this for strings. The bit packing isn't quite the same. +// +/// Examples with bit-width 1 (eg encoding booleans): +/// ---------------------------------------- +/// 100 1s followed by 100 0s: +/// <1, padded to 1 byte> <0, padded to 1 byte> +/// - (total 4 bytes) +// +/// alternating 1s and 0s (200 total): +/// 200 ints = 25 groups of 8 +/// <25 bytes of values, bitpacked> +/// (total 26 bytes, 1 byte overhead) +// + +/// Decoder class for RLE encoded data. +class RleDecoder { + public: + /// Create a decoder object. buffer/buffer_len is the decoded data. + /// bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len), + bit_width_(bit_width), + current_value_(0), + repeat_count_(0), + literal_count_(0) { + DCHECK_GE(bit_width_, 0); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() : bit_width_(-1) {} + + void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { + DCHECK_GE(bit_width, 0); + DCHECK_LE(bit_width, 64); + bit_reader_.Reset(buffer, buffer_len); + bit_width_ = bit_width; + current_value_ = 0; + repeat_count_ = 0; + literal_count_ = 0; + } + + /// Gets the next value. Returns false if there are no more. + template + bool Get(T* val); + + /// Gets a batch of values. 
Returns the number of decoded elements. + template + int GetBatch(T* values, int batch_size); + + /// Like GetBatch but the values are then decoded using the provided dictionary + template + int GetBatchWithDict(const T* dictionary, T* values, int batch_size); + + /// Like GetBatchWithDict but add spacing for null entries + template + int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size, + int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset); + + protected: + BitUtil::BitReader bit_reader_; + /// Number of bits needed to encode the value. Must be between 0 and 64. + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + + private: + /// Fills literal_count_ and repeat_count_ with next values. Returns false if there + /// are no more. + template + bool NextCounts(); +}; + +/// Class to incrementally build the rle data. This class does not allocate any memory. +/// The encoding has two modes: encoding repeated runs and literal runs. +/// If the run is sufficiently short, it is more efficient to encode as a literal run. +/// This class does so by buffering 8 values at a time. If they are not all the same +/// they are added to the literal run. If they are the same, they are added to the +/// repeated run. When we switch modes, the previous run is flushed out. +class RleEncoder { + public: + /// buffer/buffer_len: preallocated output buffer. + /// bit_width: max number of bits for value. + /// TODO: consider adding a min_repeated_run_length so the caller can control + /// when values should be encoded as repeated runs. Currently this is derived + /// based on the bit_width, which can determine a storage optimal choice. 
+ /// TODO: allow 0 bit_width (and have dict encoder use it) + RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) + : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { + DCHECK_GE(bit_width_, 0); + DCHECK_LE(bit_width_, 64); + max_run_byte_size_ = MinBufferSize(bit_width); + DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; + Clear(); + } + + /// Returns the minimum buffer size needed to use the encoder for 'bit_width' + /// This is the maximum length of a single run for 'bit_width'. + /// It is not valid to pass a buffer less than this length. + static int MinBufferSize(int bit_width) { + /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. + int max_literal_run_size = + 1 + + static_cast(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width)); + /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. + int max_repeated_run_size = BitUtil::BitReader::MAX_VLQ_BYTE_LEN + + static_cast(BitUtil::BytesForBits(bit_width)); + return std::max(max_literal_run_size, max_repeated_run_size); + } + + /// Returns the maximum byte size it could take to encode 'num_values'. + static int MaxBufferSize(int bit_width, int num_values) { + // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 + // and then a repeated run of length 8". + // 8 values per smallest run, 8 bits per byte + int bytes_per_run = bit_width; + int num_runs = static_cast(BitUtil::CeilDiv(num_values, 8)); + int literal_max_size = num_runs + num_runs * bytes_per_run; + + // In the very worst case scenario, the data is a concatenation of repeated + // runs of 8 values. Repeated run has a 1 byte varint followed by the + // bit-packed repeated value + int min_repeated_run_size = 1 + static_cast(BitUtil::BytesForBits(bit_width)); + int repeated_max_size = + static_cast(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size; + + return std::max(literal_max_size, repeated_max_size); + } + + /// Encode value. 
Returns true if the value fits in buffer, false otherwise. + /// This value must be representable with bit_width_ bits. + bool Put(uint64_t value); + + /// Flushes any pending values to the underlying buffer. + /// Returns the total number of bytes written + int Flush(); + + /// Resets all the state in the encoder. + void Clear(); + + /// Returns pointer to underlying buffer + uint8_t* buffer() { return bit_writer_.buffer(); } + int32_t len() { return bit_writer_.bytes_written(); } + + private: + /// Flushes any buffered values. If this is part of a repeated run, this is largely + /// a no-op. + /// If it is part of a literal run, this will call FlushLiteralRun, which writes + /// out the buffered literal values. + /// If 'done' is true, the current run would be written even if it would normally + /// have been buffered more. This should only be called at the end, when the + /// encoder has received all values even if it would normally continue to be + /// buffered. + void FlushBufferedValues(bool done); + + /// Flushes literal values to the underlying buffer. If update_indicator_byte, + /// then the current literal run is complete and the indicator byte is updated. + void FlushLiteralRun(bool update_indicator_byte); + + /// Flushes a repeated run to the underlying buffer. + void FlushRepeatedRun(); + + /// Checks and sets buffer_full_. This must be called after flushing a run to + /// make sure there are enough bytes remaining to encode the next run. + void CheckBufferFull(); + + /// The maximum number of values in a single literal run + /// (number of groups encodable by a 1-byte indicator * 8) + static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8; + + /// Number of bits needed to encode the value. Must be between 0 and 64. + const int bit_width_; + + /// Underlying buffer. + BitUtil::BitWriter bit_writer_; + + /// If true, the buffer is full and subsequent Put()'s will fail. + bool buffer_full_; + + /// The maximum byte size a single run can take. 
+ int max_run_byte_size_; + + /// We need to buffer at most 8 values for literals. This happens when the + /// bit_width is 1 (so 8 values fit in one byte). + /// TODO: generalize this to other bit widths + int64_t buffered_values_[8]; + + /// Number of values in buffered_values_ + int num_buffered_values_; + + /// The current (also last) value that was written and the count of how + /// many times in a row that value has been seen. This is maintained even + /// if we are in a literal run. If the repeat_count_ get high enough, we switch + /// to encoding repeated runs. + uint64_t current_value_; + int repeat_count_; + + /// Number of literals in the current run. This does not include the literals + /// that might be in buffered_values_. Only after we've got a group big enough + /// can we decide if they should part of the literal_count_ or repeat_count_ + int literal_count_; + + /// Pointer to a byte in the underlying buffer that stores the indicator byte. + /// This is reserved as soon as we need a literal run but the value is written + /// when the literal run is complete. 
+ uint8_t* literal_indicator_byte_; +}; + +template +inline bool RleDecoder::Get(T* val) { + return GetBatch(val, 1) == 1; +} + +template +inline int RleDecoder::GetBatch(T* values, int batch_size) { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = + std::min(batch_size - values_read, static_cast(repeat_count_)); + std::fill(values + values_read, values + values_read + repeat_batch, + static_cast(current_value_)); + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = + std::min(batch_size - values_read, static_cast(literal_count_)); + int actual_read = + bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch); + DCHECK_EQ(actual_read, literal_batch); + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { + if (!NextCounts()) return values_read; + } + } + + return values_read; +} + +template +inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batch_size) { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + + while (values_read < batch_size) { + if (repeat_count_ > 0) { + int repeat_batch = + std::min(batch_size - values_read, static_cast(repeat_count_)); + std::fill(values + values_read, values + values_read + repeat_batch, + dictionary[current_value_]); + repeat_count_ -= repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = + std::min(batch_size - values_read, static_cast(literal_count_)); + + const int buffer_size = 1024; + int indices[buffer_size]; + literal_batch = std::min(literal_batch, buffer_size); + int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); + DCHECK_EQ(actual_read, literal_batch); + for (int i = 0; i < literal_batch; ++i) { + values[values_read + i] = dictionary[indices[i]]; + } + literal_count_ -= literal_batch; + values_read += literal_batch; + } else { 
+ if (!NextCounts()) return values_read; + } + } + + return values_read; +} + +template +inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, + int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset) { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + int remaining_nulls = null_count; + + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); + + while (values_read < batch_size) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + if ((repeat_count_ == 0) && (literal_count_ == 0)) { + if (!NextCounts()) return values_read; + } + if (repeat_count_ > 0) { + T value = dictionary[current_value_]; + // The current index is already valid, we don't need to check that again + int repeat_batch = 1; + repeat_count_--; + + while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { + if (bit_reader.IsSet()) { + repeat_count_--; + } else { + remaining_nulls--; + } + repeat_batch++; + + bit_reader.Next(); + } + std::fill(values + values_read, values + values_read + repeat_batch, value); + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read - remaining_nulls, + static_cast(literal_count_)); + + // Decode the literals + constexpr int kBufferSize = 1024; + int indices[kBufferSize]; + literal_batch = std::min(literal_batch, kBufferSize); + int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); + DCHECK_EQ(actual_read, literal_batch); + + int skipped = 0; + int literals_read = 1; + values[values_read] = dictionary[indices[0]]; + + // Read the first bitset to the end + while (literals_read < literal_batch) { + if (bit_reader.IsSet()) { + values[values_read + literals_read + skipped] = + dictionary[indices[literals_read]]; + literals_read++; + } else { + skipped++; + } + + bit_reader.Next(); + } + literal_count_ -= literal_batch; + values_read += literal_batch 
+ skipped; + remaining_nulls -= skipped; + } + } else { + values_read++; + remaining_nulls--; + } + } + + return values_read; +} + +template +bool RleDecoder::NextCounts() { + // Read the next run's indicator int, it could be a literal or repeated run. + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (!result) return false; + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + } else { + repeat_count_ = indicator_value >> 1; + // XXX (ARROW-4018) this is not big-endian compatible + bool result = + bit_reader_.GetAligned(static_cast(BitUtil::CeilDiv(bit_width_, 8)), + reinterpret_cast(¤t_value_)); + DCHECK(result); + } + return true; +} + +/// This function buffers input values 8 at a time. After seeing all 8 values, +/// it decides whether they should be encoded as a literal or repeated run. +inline bool RleEncoder::Put(uint64_t value) { + DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_)); + if (ARROW_PREDICT_FALSE(buffer_full_)) return false; + + if (ARROW_PREDICT_TRUE(current_value_ == value)) { + ++repeat_count_; + if (repeat_count_ > 8) { + // This is just a continuation of the current run, no need to buffer the + // values. + // Note that this is the fast path for long repeated runs. + return true; + } + } else { + if (repeat_count_ >= 8) { + // We had a run that was long enough but it has ended. Flush the + // current repeated run. 
+ DCHECK_EQ(literal_count_, 0); + FlushRepeatedRun(); + } + repeat_count_ = 1; + current_value_ = value; + } + + buffered_values_[num_buffered_values_] = value; + if (++num_buffered_values_ == 8) { + DCHECK_EQ(literal_count_ % 8, 0); + FlushBufferedValues(false); + } + return true; +} + +inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { + if (literal_indicator_byte_ == NULL) { + // The literal indicator byte has not been reserved yet, get one now. + literal_indicator_byte_ = bit_writer_.GetNextBytePtr(); + DCHECK(literal_indicator_byte_ != NULL); + } + + // Write all the buffered values as bit packed literals + for (int i = 0; i < num_buffered_values_; ++i) { + bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_); + DCHECK(success) << "There is a bug in using CheckBufferFull()"; + } + num_buffered_values_ = 0; + + if (update_indicator_byte) { + // At this point we need to write the indicator byte for the literal run. + // We only reserve one byte, to allow for streaming writes of literal values. + // The logic makes sure we flush literal runs often enough to not overrun + // the 1 byte. + DCHECK_EQ(literal_count_ % 8, 0); + int num_groups = literal_count_ / 8; + int32_t indicator_value = (num_groups << 1) | 1; + DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); + *literal_indicator_byte_ = static_cast(indicator_value); + literal_indicator_byte_ = NULL; + literal_count_ = 0; + CheckBufferFull(); + } +} + +inline void RleEncoder::FlushRepeatedRun() { + DCHECK_GT(repeat_count_, 0); + bool result = true; + // The lsb of 0 indicates this is a repeated run + int32_t indicator_value = repeat_count_ << 1 | 0; + result &= bit_writer_.PutVlqInt(indicator_value); + result &= bit_writer_.PutAligned(current_value_, + static_cast(BitUtil::CeilDiv(bit_width_, 8))); + DCHECK(result); + num_buffered_values_ = 0; + repeat_count_ = 0; + CheckBufferFull(); +} + +/// Flush the values that have been buffered. 
At this point we decide whether +/// we need to switch between the run types or continue the current one. +inline void RleEncoder::FlushBufferedValues(bool done) { + if (repeat_count_ >= 8) { + // Clear the buffered values. They are part of the repeated run now and we + // don't want to flush them out as literals. + num_buffered_values_ = 0; + if (literal_count_ != 0) { + // There was a current literal run. All the values in it have been flushed + // but we still need to update the indicator byte. + DCHECK_EQ(literal_count_ % 8, 0); + DCHECK_EQ(repeat_count_, 8); + FlushLiteralRun(true); + } + DCHECK_EQ(literal_count_, 0); + return; + } + + literal_count_ += num_buffered_values_; + DCHECK_EQ(literal_count_ % 8, 0); + int num_groups = literal_count_ / 8; + if (num_groups + 1 >= (1 << 6)) { + // We need to start a new literal run because the indicator byte we've reserved + // cannot store more values. + DCHECK(literal_indicator_byte_ != NULL); + FlushLiteralRun(true); + } else { + FlushLiteralRun(done); + } + repeat_count_ = 0; +} + +inline int RleEncoder::Flush() { + if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { + bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || + num_buffered_values_ == 0); + // There is something pending, figure out if it's a repeated or literal run + if (repeat_count_ > 0 && all_repeat) { + FlushRepeatedRun(); + } else { + DCHECK_EQ(literal_count_ % 8, 0); + // Buffer the last group of literals to 8 by padding with 0s. 
+ for (; num_buffered_values_ != 0 && num_buffered_values_ < 8; + ++num_buffered_values_) { + buffered_values_[num_buffered_values_] = 0; + } + literal_count_ += num_buffered_values_; + FlushLiteralRun(true); + repeat_count_ = 0; + } + } + bit_writer_.Flush(); + DCHECK_EQ(num_buffered_values_, 0); + DCHECK_EQ(literal_count_, 0); + DCHECK_EQ(repeat_count_, 0); + + return bit_writer_.bytes_written(); +} + +inline void RleEncoder::CheckBufferFull() { + int bytes_written = bit_writer_.bytes_written(); + if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { + buffer_full_ = true; + } +} + +inline void RleEncoder::Clear() { + buffer_full_ = false; + current_value_ = 0; + repeat_count_ = 0; + num_buffered_values_ = 0; + literal_count_ = 0; + literal_indicator_byte_ = NULL; + bit_writer_.Clear(); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_RLE_ENCODING_H diff --git a/r/R/inst/include/arrow/util/sse-util.h b/r/R/inst/include/arrow/util/sse-util.h new file mode 100644 index 00000000000..6f451fd0efc --- /dev/null +++ b/r/R/inst/include/arrow/util/sse-util.h @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. 
Pared down to a minimal set of +// functions needed for parquet-cpp + +#pragma once + +#include "arrow/util/macros.h" + +#ifdef ARROW_USE_SIMD + +// MSVC x86-64 + +#if (defined(_M_AMD64) || defined(_M_X64)) +#define ARROW_HAVE_SSE2 1 +#define ARROW_HAVE_SSE4_2 1 +#include +#endif + +// gcc/clang (possibly others) + +#if defined(__SSE2__) +#define ARROW_HAVE_SSE2 1 +#include +#endif + +#if defined(__SSE4_2__) +#define ARROW_HAVE_SSE4_2 1 +#include +#endif + +#endif // ARROW_USE_SIMD + +// MSVC x86-64 + +namespace arrow { + +/// This class contains constants useful for text processing with SSE4.2 intrinsics. +namespace SSEUtil { +/// Number of characters that fit in 64/128 bit register. SSE provides instructions +/// for loading 64 or 128 bits into a register at a time. +static const int CHARS_PER_64_BIT_REGISTER = 8; +static const int CHARS_PER_128_BIT_REGISTER = 16; + +/// SSE4.2 adds instructions for text processing. The instructions have a control +/// byte that determines some of functionality of the instruction. (Equivalent to +/// GCC's _SIDD_CMP_EQUAL_ANY, etc). +static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr +static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp +static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) +static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. + +/// In this mode, SSE text processing functions will return a mask of all the +/// characters that matched. +static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; + +/// In this mode, SSE text processing functions will return the number of +/// bytes that match consecutively from the beginning. +static const int STRCMP_MODE = + PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY; + +/// Precomputed mask values up to 16 bits. 
+static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { + 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, + 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, +}; +} // namespace SSEUtil + +#ifdef ARROW_HAVE_SSE4_2 + +/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen +/// IR load time) that the processor supports SSE 4.2 before calling these. These are +/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. + +template +static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { + return _mm_cmpestrm(str1, len1, str2, len2, MODE); +} + +template +static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { + return _mm_cmpestri(str1, len1, str2, len2, MODE); +} + +static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { + return _mm_crc32_u8(crc, v); +} + +static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { + return _mm_crc32_u16(crc, v); +} + +static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { + return _mm_crc32_u32(crc, v); +} + +static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { +#if ARROW_BITNESS == 32 + return 0; +#else + return static_cast(_mm_crc32_u64(crc, v)); +#endif +} + +#endif // ARROW_HAVE_SSE4_2 + +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/stl.h b/r/R/inst/include/arrow/util/stl.h new file mode 100644 index 00000000000..48898140bf1 --- /dev/null +++ b/r/R/inst/include/arrow/util/stl.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_STL_H +#define ARROW_UTIL_STL_H + +#include +#include +#include +#include + +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +template +typename std::enable_if::value, std::unique_ptr>::type make_unique( + A&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +typename std::enable_if::value && std::extent::value == 0, + std::unique_ptr>::type +make_unique(std::size_t n) { + using value_type = typename std::remove_extent::type; + return std::unique_ptr(new value_type[n]); +} + +template +inline std::vector DeleteVectorElement(const std::vector& values, size_t index) { + DCHECK(!values.empty()); + DCHECK_LT(index, values.size()); + std::vector out; + out.reserve(values.size() - 1); + for (size_t i = 0; i < index; ++i) { + out.push_back(values[i]); + } + for (size_t i = index + 1; i < values.size(); ++i) { + out.push_back(values[i]); + } + return out; +} + +template +inline std::vector AddVectorElement(const std::vector& values, size_t index, + const T& new_element) { + DCHECK_LE(index, values.size()); + std::vector out; + out.reserve(values.size() + 1); + for (size_t i = 0; i < index; ++i) { + out.push_back(values[i]); + } + out.push_back(new_element); + for (size_t i = index; i < values.size(); ++i) { + out.push_back(values[i]); + } + return out; +} + +template +inline std::vector ReplaceVectorElement(const std::vector& values, size_t index, + const T& new_element) { + DCHECK_LE(index, values.size()); + std::vector out; + out.reserve(values.size()); + for (size_t i = 
0; i < index; ++i) { + out.push_back(values[i]); + } + out.push_back(new_element); + for (size_t i = index + 1; i < values.size(); ++i) { + out.push_back(values[i]); + } + return out; +} + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_STL_H diff --git a/r/R/inst/include/arrow/util/stopwatch.h b/r/R/inst/include/arrow/util/stopwatch.h new file mode 100644 index 00000000000..db4e67f59ed --- /dev/null +++ b/r/R/inst/include/arrow/util/stopwatch.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace arrow { +namespace internal { + +class StopWatch { + // This clock should give us wall clock time + using ClockType = std::chrono::steady_clock; + + public: + StopWatch() {} + + void Start() { start_ = ClockType::now(); } + + // Returns time in nanoseconds. 
+ uint64_t Stop() { + auto stop = ClockType::now(); + std::chrono::nanoseconds d = stop - start_; + assert(d.count() >= 0); + return static_cast(d.count()); + } + + private: + std::chrono::time_point start_; +}; + +} // namespace internal +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/string.h b/r/R/inst/include/arrow/util/string.h new file mode 100644 index 00000000000..1d716c5a156 --- /dev/null +++ b/r/R/inst/include/arrow/util/string.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_STRING_UTIL_H +#define ARROW_UTIL_STRING_UTIL_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +static const char* kAsciiTable = "0123456789ABCDEF"; + +static inline std::string HexEncode(const uint8_t* data, size_t length) { + std::string hex_string; + hex_string.reserve(length * 2); + for (size_t j = 0; j < length; ++j) { + // Convert to 2 base16 digits + hex_string.push_back(kAsciiTable[data[j] >> 4]); + hex_string.push_back(kAsciiTable[data[j] & 15]); + } + return hex_string; +} + +static inline std::string HexEncode(const char* data, size_t length) { + return HexEncode(reinterpret_cast(data), length); +} + +static inline std::string HexEncode(util::string_view str) { + return HexEncode(str.data(), str.size()); +} + +static inline Status ParseHexValue(const char* data, uint8_t* out) { + char c1 = data[0]; + char c2 = data[1]; + + const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c1); + const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2); + + // Error checking + if (*pos1 != c1 || *pos2 != c2) { + return Status::Invalid("Encountered non-hex digit"); + } + + *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); + return Status::OK(); +} + +} // namespace arrow + +#endif // ARROW_UTIL_STRING_UTIL_H diff --git a/r/R/inst/include/arrow/util/string_builder.h b/r/R/inst/include/arrow/util/string_builder.h new file mode 100644 index 00000000000..9129f12c681 --- /dev/null +++ b/r/R/inst/include/arrow/util/string_builder.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. template + +#ifndef ARROW_UTIL_STRING_BUILDER_H +#define ARROW_UTIL_STRING_BUILDER_H + +#include +#include +#include +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +namespace detail { + +class ARROW_EXPORT StringStreamWrapper { + public: + StringStreamWrapper(); + ~StringStreamWrapper(); + + std::ostream& stream() { return ostream_; } + std::string str(); + + protected: + std::unique_ptr sstream_; + std::ostream& ostream_; +}; + +} // namespace detail + +template +void StringBuilderRecursive(std::ostream& stream, Head&& head) { + stream << head; +} + +template +void StringBuilderRecursive(std::ostream& stream, Head&& head, Tail&&... tail) { + StringBuilderRecursive(stream, std::forward(head)); + StringBuilderRecursive(stream, std::forward(tail)...); +} + +template +std::string StringBuilder(Args&&... args) { + detail::StringStreamWrapper ss; + StringBuilderRecursive(ss.stream(), std::forward(args)...); + return ss.str(); +} + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_STRING_BUILDER_H diff --git a/r/R/inst/include/arrow/util/string_view.h b/r/R/inst/include/arrow/util/string_view.h new file mode 100644 index 00000000000..88748429b7e --- /dev/null +++ b/r/R/inst/include/arrow/util/string_view.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_STRING_VIEW_H +#define ARROW_UTIL_STRING_VIEW_H + +#define nssv_CONFIG_SELECT_STRING_VIEW nssv_STRING_VIEW_NONSTD + +#include "arrow/vendored/string_view.hpp" // IWYU pragma: export + +namespace arrow { +namespace util { + +using nonstd::string_view; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_STRING_VIEW_H diff --git a/r/R/inst/include/arrow/util/task-group.h b/r/R/inst/include/arrow/util/task-group.h new file mode 100644 index 00000000000..390d9476e59 --- /dev/null +++ b/r/R/inst/include/arrow/util/task-group.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_TASK_GROUP_H +#define ARROW_UTIL_TASK_GROUP_H + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +class ThreadPool; + +// TODO Simplify this. Subgroups don't seem necessary. + +/// \brief A group of related tasks +/// +/// A TaskGroup executes tasks with the signature `Status()`. +/// Execution can be serial or parallel, depending on the TaskGroup +/// implementation. When Finish() returns, it is guaranteed that all +/// tasks have finished, or at least one has errored. +/// +class ARROW_EXPORT TaskGroup { + public: + /// Add a Status-returning function to execute. Execution order is + /// undefined. The function may be executed immediately or later. + template + void Append(Function&& func) { + return AppendReal(std::forward(func)); + } + + /// Wait for execution of all tasks (and subgroups) to be finished, + /// or for at least one task (or subgroup) to error out. + /// The returned Status propagates the error status of the first failing + /// task (or subgroup). + virtual Status Finish() = 0; + + /// The current agregate error Status. Non-blocking, useful for stopping early. + virtual Status current_status() = 0; + + /// Whether some tasks have already failed. Non-blocking , useful for stopping early. + virtual bool ok() = 0; + + /// How many tasks can typically be executed in parallel. + /// This is only a hint, useful for testing or debugging. + virtual int parallelism() = 0; + + /// Create a subgroup of this group. This group can only finish + /// when all subgroups have finished (this means you must be + /// be careful to call Finish() on subgroups before calling it + /// on the main group). + // XXX if a subgroup errors out, should it propagate immediately to the parent + // and to children? 
+ virtual std::shared_ptr MakeSubGroup() = 0; + + static std::shared_ptr MakeSerial(); + static std::shared_ptr MakeThreaded(internal::ThreadPool*); + + virtual ~TaskGroup() = default; + + protected: + TaskGroup() = default; + ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup); + + virtual void AppendReal(std::function task) = 0; +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_TASK_GROUP_H diff --git a/r/R/inst/include/arrow/util/thread-pool.h b/r/R/inst/include/arrow/util/thread-pool.h new file mode 100644 index 00000000000..2de212e64c5 --- /dev/null +++ b/r/R/inst/include/arrow/util/thread-pool.h @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_THREAD_POOL_H +#define ARROW_UTIL_THREAD_POOL_H + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Get the capacity of the global thread pool +/// +/// Return the number of worker threads in the thread pool to which +/// Arrow dispatches various CPU-bound tasks. 
This is an ideal number, +/// not necessarily the exact number of threads at a given point in time. +/// +/// You can change this number using SetCpuThreadPoolCapacity(). +ARROW_EXPORT int GetCpuThreadPoolCapacity(); + +/// \brief Set the capacity of the global thread pool +/// +/// Set the number of worker threads int the thread pool to which +/// Arrow dispatches various CPU-bound tasks. +/// +/// The current number is returned by GetCpuThreadPoolCapacity(). +ARROW_EXPORT Status SetCpuThreadPoolCapacity(int threads); + +namespace internal { + +namespace detail { + +// Needed because std::packaged_task is not copyable and hence not convertible +// to std::function. +template +struct packaged_task_wrapper { + using PackagedTask = std::packaged_task; + + explicit packaged_task_wrapper(PackagedTask&& task) + : task_(std::make_shared(std::forward(task))) {} + + void operator()(Args&&... args) { return (*task_)(std::forward(args)...); } + std::shared_ptr task_; +}; + +} // namespace detail + +class ARROW_EXPORT ThreadPool { + public: + // Construct a thread pool with the given number of worker threads + static Status Make(int threads, std::shared_ptr* out); + + // Destroy thread pool; the pool will first be shut down + ~ThreadPool(); + + // Return the desired number of worker threads. + // The actual number of workers may lag a bit before being adjusted to + // match this value. + int GetCapacity(); + + // Dynamically change the number of worker threads. + // This function returns quickly, but it may take more time before the + // thread count is fully adjusted. + Status SetCapacity(int threads); + + // Heuristic for the default capacity of a thread pool for CPU-bound tasks. + // This is exposed as a static method to help with testing. + static int DefaultCapacity(); + + // Shutdown the pool. Once the pool starts shutting down, new tasks + // cannot be submitted anymore. + // If "wait" is true, shutdown waits for all pending tasks to be finished. 
+ // If "wait" is false, workers are stopped as soon as currently executing + // tasks are finished. + Status Shutdown(bool wait = true); + + // Spawn a fire-and-forget task on one of the workers. + template + Status Spawn(Function&& func) { + return SpawnReal(std::forward(func)); + } + + // Submit a callable and arguments for execution. Return a future that + // will return the callable's result value once. + // The callable's arguments are copied before execution. + // Since the function is variadic and needs to return a result (the future), + // an exception is raised if the task fails spawning (which currently + // only occurs if the ThreadPool is shutting down). + template ::type> + std::future Submit(Function&& func, Args&&... args) { + // Trying to templatize std::packaged_task with Function doesn't seem + // to work, so go through std::bind to simplify the packaged signature + using PackagedTask = std::packaged_task; + auto task = PackagedTask(std::bind(std::forward(func), args...)); + auto fut = task.get_future(); + + Status st = SpawnReal(detail::packaged_task_wrapper(std::move(task))); + if (!st.ok()) { + st.Abort("ThreadPool::Submit() was probably called after Shutdown()"); + } + return fut; + } + + struct State; + + protected: + FRIEND_TEST(TestThreadPool, SetCapacity); + FRIEND_TEST(TestGlobalThreadPool, Capacity); + friend ARROW_EXPORT ThreadPool* GetCpuThreadPool(); + + ThreadPool(); + + ARROW_DISALLOW_COPY_AND_ASSIGN(ThreadPool); + + Status SpawnReal(std::function task); + // Collect finished worker threads, making sure the OS threads have exited + void CollectFinishedWorkersUnlocked(); + // Launch a given number of additional workers + void LaunchWorkersUnlocked(int threads); + // Get the current actual capacity + int GetActualCapacity(); + // Reinitialize the thread pool if the pid changed + void ProtectAgainstFork(); + + static std::shared_ptr MakeCpuThreadPool(); + + std::shared_ptr sp_state_; + State* state_; + bool shutdown_on_destroy_; 
+#ifndef _WIN32 + pid_t pid_; +#endif +}; + +// Return the process-global thread pool for CPU-bound tasks. +ARROW_EXPORT ThreadPool* GetCpuThreadPool(); + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_THREAD_POOL_H diff --git a/r/R/inst/include/arrow/util/trie.h b/r/R/inst/include/arrow/util/trie.h new file mode 100644 index 00000000000..3e82bfd8ee2 --- /dev/null +++ b/r/R/inst/include/arrow/util/trie.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_TRIE_H +#define ARROW_UTIL_TRIE_H + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +// A non-zero-terminated small string class. +// std::string usually has a small string optimization +// (see review at https://shaharmike.com/cpp/std-string/) +// but this one allows tight control and optimization of memory layout. 
+template +class SmallString { + public: + SmallString() : length_(0) {} + + template + SmallString(const T& v) { // NOLINT implicit constructor + *this = util::string_view(v); + } + + SmallString& operator=(const util::string_view s) { +#ifndef NDEBUG + CheckSize(s.size()); +#endif + length_ = static_cast(s.size()); + std::memcpy(data_, s.data(), length_); + return *this; + } + + SmallString& operator=(const std::string& s) { + *this = util::string_view(s); + return *this; + } + + SmallString& operator=(const char* s) { + *this = util::string_view(s); + return *this; + } + + explicit operator util::string_view() const { + return util::string_view(data_, length_); + } + + const char* data() const { return data_; } + size_t length() const { return length_; } + bool empty() const { return length_ == 0; } + char operator[](size_t pos) const { +#ifdef NDEBUG + assert(pos <= length_); +#endif + return data_[pos]; + } + + SmallString substr(size_t pos) const { + return SmallString(util::string_view(*this).substr(pos)); + } + + SmallString substr(size_t pos, size_t count) const { + return SmallString(util::string_view(*this).substr(pos, count)); + } + + template + bool operator==(T&& other) const { + return util::string_view(*this) == util::string_view(std::forward(other)); + } + + template + bool operator!=(T&& other) const { + return util::string_view(*this) != util::string_view(std::forward(other)); + } + + protected: + uint8_t length_; + char data_[N]; + +#ifndef NDEBUG + void CheckSize(size_t n) { assert(n <= N); } +#endif +}; + +template +std::ostream& operator<<(std::ostream& os, const SmallString& str) { + return os << util::string_view(str); +} + +// A trie class for byte strings, optimized for small sets of short strings. +// This class is immutable by design, use a TrieBuilder to construct it. 
+class ARROW_EXPORT Trie { + using index_type = int16_t; + using fast_index_type = int_fast16_t; + + public: + Trie() : size_(0) {} + Trie(Trie&&) = default; + Trie& operator=(Trie&&) = default; + + int32_t Find(util::string_view s) const { + const Node* node = &nodes_[0]; + fast_index_type pos = 0; + fast_index_type remaining = static_cast(s.length()); + + while (remaining > 0) { + auto substring_length = node->substring_length(); + if (substring_length > 0) { + auto substring_data = node->substring_data(); + if (remaining < substring_length) { + // Input too short + return -1; + } + for (fast_index_type i = 0; i < substring_length; ++i) { + if (s[pos++] != substring_data[i]) { + // Mismatching substring + return -1; + } + --remaining; + } + if (remaining == 0) { + // Matched node exactly + return node->found_index_; + } + } + // Lookup child using next input character + if (node->child_lookup_ == -1) { + // Input too long + return -1; + } + auto c = static_cast(s[pos++]); + --remaining; + auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; + if (child_index == -1) { + // Child not found + return -1; + } + node = &nodes_[child_index]; + } + + // Input exhausted + if (node->substring_.empty()) { + // Matched node exactly + return node->found_index_; + } else { + return -1; + } + } + + Status Validate() const; + + void Dump() const; + + protected: + static constexpr size_t kNodeSize = 16; + static constexpr auto kMaxSubstringLength = + kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); + + struct Node { + // If this node is a valid end of string, index of found string, otherwise -1 + index_type found_index_; + // Base index for child lookup in lookup_table_ (-1 if no child nodes) + index_type child_lookup_; + // The substring for this node. 
+ SmallString substring_; + + fast_index_type substring_length() const { + return static_cast(substring_.length()); + } + const char* substring_data() const { return substring_.data(); } + }; + + static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); + + ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); + + void Dump(const Node* node, const std::string& indent) const; + + // Node table: entry 0 is the root node + std::vector nodes_; + + // Indexed lookup structure: gives index in node table, or -1 if not found + std::vector lookup_table_; + + // Number of entries + index_type size_; + + friend class TrieBuilder; +}; + +class ARROW_EXPORT TrieBuilder { + using index_type = Trie::index_type; + using fast_index_type = Trie::fast_index_type; + + public: + TrieBuilder(); + Status Append(util::string_view s, bool allow_duplicate = false); + Trie Finish(); + + protected: + // Extend the lookup table by 256 entries, return the index of the new span + Status ExtendLookupTable(index_type* out_lookup_index); + // Split the node given by the index at the substring index `split_at` + Status SplitNode(fast_index_type node_index, fast_index_type split_at); + // Append an already constructed child node to the parent + Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); + // Create a matching child node from this parent + Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); + Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); + + Trie trie_; + + static constexpr auto kMaxIndex = std::numeric_limits::max(); +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_TRIE_H diff --git a/r/R/inst/include/arrow/util/type_traits.h b/r/R/inst/include/arrow/util/type_traits.h new file mode 100644 index 00000000000..570f6486789 --- /dev/null +++ b/r/R/inst/include/arrow/util/type_traits.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_TYPE_TRAITS_H +#define ARROW_UTIL_TYPE_TRAITS_H + +#include + +namespace arrow { +namespace internal { + +/// \brief Metafunction to allow checking if a type matches any of another set of types +template +struct IsOneOf : std::false_type {}; /// Base case: nothing has matched + +template +struct IsOneOf { + /// Recursive case: T == U or T matches any other types provided (not including U). + static constexpr bool value = std::is_same::value || IsOneOf::value; +}; + +/// \brief Shorthand for using IsOneOf + std::enable_if +template +using EnableIfIsOneOf = typename std::enable_if::value, T>::type; + +/// \brief is_null_pointer from C++17 +template +struct is_null_pointer : std::is_same::type> { +}; + +} // namespace internal +} // namespace arrow + +#endif // ARROW_UTIL_TYPE_TRAITS_H diff --git a/r/R/inst/include/arrow/util/ubsan.h b/r/R/inst/include/arrow/util/ubsan.h new file mode 100644 index 00000000000..f9fcfb54022 --- /dev/null +++ b/r/R/inst/include/arrow/util/ubsan.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Contains utilities for making UBSan happy. + +#pragma once + +#include + +#include "arrow/util/macros.h" + +namespace arrow { +namespace util { + +namespace internal { + +static uint8_t non_null_filler; + +} // namespace internal + +/// \brief Returns maybe_null if not null or a non-null pointer to an arbitrary memory +/// that shouldn't be dereferenced. +/// +/// Memset/Memcpy are undefinfed when a nullptr is passed as an argument use this utility +/// method to wrap locations where this could happen. +/// +/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed. +/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve them. +template +inline T* MakeNonNull(T* maybe_null) { + if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) { + return maybe_null; + } + + return reinterpret_cast(&internal::non_null_filler); +} + +} // namespace util +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/uri.h b/r/R/inst/include/arrow/util/uri.h new file mode 100644 index 00000000000..ce082ccc8e6 --- /dev/null +++ b/r/R/inst/include/arrow/util/uri.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief A parsed URI +class ARROW_EXPORT Uri { + public: + Uri(); + ~Uri(); + + // XXX Should we use util::string_view instead? These functions are + // not performance-critical. + + /// The URI scheme, such as "http", or the empty string if the URI has no + /// explicit scheme. + std::string scheme() const; + /// Whether the URI has an explicit host name. This may return true if + /// the URI has an empty host (e.g. "file:///tmp/foo"), while it returns + /// false if the URI has no host component at all (e.g. "file:/tmp/foo"). + bool has_host() const; + /// The URI host name, such as "localhost", "127.0.0.1" or "::1", or the empty + /// string if the URI does not have a host component. + std::string host() const; + /// The URI port number, as a string such as "80", or the empty string if the URI + /// does not have a port number component. + std::string port_text() const; + /// The URI port parsed as an integer, or -1 if the URI does not have a port + /// number component. + int32_t port() const; + /// The URI path component. + std::string path() const; + + /// Get the string representation of this URI. + const std::string& ToString() const; + + /// Factory function to parse a URI from its string representation.
+ Status Parse(const std::string& uri_string); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace internal +} // namespace arrow diff --git a/r/R/inst/include/arrow/util/utf8.h b/r/R/inst/include/arrow/util/utf8.h new file mode 100644 index 00000000000..739c7566c05 --- /dev/null +++ b/r/R/inst/include/arrow/util/utf8.h @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_UTF8_H +#define ARROW_UTIL_UTF8_H + +#include +#include +#include +#include + +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +namespace internal { + +// Copyright (c) 2008-2010 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +// A compact state table allowing UTF8 decoding using two dependent +// lookups per byte. The first lookup determines the character class +// and the second lookup reads the next state. +// In this table states are multiples of 12.
+ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12]; + +// Success / reject states when looked up in the small table +static constexpr uint8_t kUTF8DecodeAccept = 0; +static constexpr uint8_t kUTF8DecodeReject = 12; + +// An expanded state table allowing transitions using a single lookup +// at the expense of a larger memory footprint (but on non-random data, +// not all the table will end up accessed and cached). +// In this table states are multiples of 256. +ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256]; + +// Success / reject states when looked up in the large table +static constexpr uint16_t kUTF8ValidateAccept = 0; +static constexpr uint16_t kUTF8ValidateReject = 256; + +static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) { + uint8_t type = utf8_small_table[byte]; + + *codep = (state != kUTF8DecodeAccept) ? (byte & 0x3fu) | (*codep << 6) + : (0xff >> type) & (byte); + + state = utf8_small_table[256 + state + type]; + return state; +} + +static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) { + return utf8_large_table[state + byte]; +} + +#ifndef NDEBUG +ARROW_EXPORT void CheckUTF8Initialized(); +#endif + +} // namespace internal + +// This function needs to be called before doing UTF8 validation. +ARROW_EXPORT void InitializeUTF8(); + +inline bool ValidateUTF8(const uint8_t* data, int64_t size) { + static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL; + // For some reason, defining this variable outside the loop helps clang + uint64_t mask; + +#ifndef NDEBUG + internal::CheckUTF8Initialized(); +#endif + + while (size >= 8) { + // XXX This is doing an unaligned access. Contemporary architectures + // (x86-64, AArch64, PPC64) support it natively and often have good + // performance nevertheless. 
+ memcpy(&mask, data, 8); + if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) { + // 8 bytes of pure ASCII, move forward + size -= 8; + data += 8; + continue; + } + // Non-ASCII run detected. + // We process at least 4 bytes, to avoid too many spurious 64-bit reads + // in case the non-ASCII bytes are at the end of the tested 64-bit word. + // We also only check for rejection at the end since that state is stable + // (once in reject state, we always remain in reject state). + // It is guaranteed that size >= 8 when arriving here, which allows + // us to avoid size checks. + uint16_t state = internal::kUTF8ValidateAccept; + // Byte 0 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + // Byte 1 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + // Byte 2 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + // Byte 3 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + // Byte 4 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + if (state == internal::kUTF8ValidateAccept) { + continue; // Got full char, switch back to ASCII detection + } + // Byte 5 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + if (state == internal::kUTF8ValidateAccept) { + continue; // Got full char, switch back to ASCII detection + } + // Byte 6 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + if (state == internal::kUTF8ValidateAccept) { + continue; // Got full char, switch back to ASCII detection + } + // Byte 7 + state = internal::ValidateOneUTF8Byte(*data++, state); + --size; + if (state == internal::kUTF8ValidateAccept) { + continue; // Got full char, switch back to ASCII detection + } + // kUTF8ValidateAccept not reached along 4 transitions has to mean a rejection + assert(state == internal::kUTF8ValidateReject); + return false; + } + + // Validate string tail one byte at a time + // Note the state table is designed so that, once in the reject state, + // we 
remain in that state until the end. So we needn't check for + // rejection at each char (we don't gain much by short-circuiting here). + uint16_t state = internal::kUTF8ValidateAccept; + while (size-- > 0) { + state = internal::ValidateOneUTF8Byte(*data++, state); + } + return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); +} + +inline bool ValidateUTF8(const util::string_view& str) { + const uint8_t* data = reinterpret_cast(str.data()); + const size_t length = str.size(); + + return ValidateUTF8(data, length); +} + +// Skip UTF8 byte order mark, if any. +ARROW_EXPORT +Status SkipUTF8BOM(const uint8_t* data, int64_t size, const uint8_t** out); + +} // namespace util +} // namespace arrow + +#endif diff --git a/r/R/inst/include/arrow/util/variant.h b/r/R/inst/include/arrow/util/variant.h new file mode 100644 index 00000000000..0097c5afb2a --- /dev/null +++ b/r/R/inst/include/arrow/util/variant.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_VARIANT_H +#define ARROW_UTIL_VARIANT_H + +#include "arrow/vendored/variant.hpp" // IWYU pragma: export + +namespace arrow { +namespace util { + +using ::mpark::bad_variant_access; +using ::mpark::get; +using ::mpark::get_if; +using ::mpark::holds_alternative; +using ::mpark::variant; +using ::mpark::visit; + +} // namespace util +} // namespace arrow + +#endif // ARROW_UTIL_VARIANT_H diff --git a/r/R/inst/include/arrow/util/visibility.h b/r/R/inst/include/arrow/util/visibility.h new file mode 100644 index 00000000000..b224717a62d --- /dev/null +++ b/r/R/inst/include/arrow/util/visibility.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_VISIBILITY_H +#define ARROW_UTIL_VISIBILITY_H + +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_STATIC +#define ARROW_EXPORT +#elif defined(ARROW_EXPORTING) +#define ARROW_EXPORT __declspec(dllexport) +#else +#define ARROW_EXPORT __declspec(dllimport) +#endif + +#define ARROW_NO_EXPORT +#else // Not Windows +#ifndef ARROW_EXPORT +#define ARROW_EXPORT __attribute__((visibility("default"))) +#endif +#ifndef ARROW_NO_EXPORT +#define ARROW_NO_EXPORT __attribute__((visibility("hidden"))) +#endif +#endif // Non-Windows + +// This is a complicated topic, some reading on it: +// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ +#if defined(_MSC_VER) || defined(__clang__) +#define ARROW_TEMPLATE_CLASS_EXPORT +#define ARROW_TEMPLATE_EXPORT ARROW_EXPORT +#else +#define ARROW_TEMPLATE_CLASS_EXPORT ARROW_EXPORT +#define ARROW_TEMPLATE_EXPORT +#endif + +#endif // ARROW_UTIL_VISIBILITY_H diff --git a/r/R/inst/include/arrow/util/windows_compatibility.h b/r/R/inst/include/arrow/util/windows_compatibility.h new file mode 100644 index 00000000000..70c4313a542 --- /dev/null +++ b/r/R/inst/include/arrow/util/windows_compatibility.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#ifdef _WIN32 + +// Windows defines min and max macros that mess up std::min/max +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#define WIN32_LEAN_AND_MEAN + +// Set Windows 7 as a conservative minimum for Apache Arrow +#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x601 +#undef _WIN32_WINNT +#endif +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x601 +#endif + +#include +#include + +#endif // _WIN32 diff --git a/r/R/inst/include/arrow/vendored/datetime.h b/r/R/inst/include/arrow/vendored/datetime.h new file mode 100644 index 00000000000..424313a5f5d --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/vendored/datetime/date.h" +#include "arrow/vendored/datetime/tz.h" diff --git a/r/R/inst/include/arrow/vendored/datetime/date.h b/r/R/inst/include/arrow/vendored/datetime/date.h new file mode 100644 index 00000000000..c8e14e53704 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime/date.h @@ -0,0 +1,8025 @@ +#ifndef DATE_H +#define DATE_H + +// The MIT License (MIT) +// +// Copyright (c) 2015, 2016, 2017 Howard Hinnant +// Copyright (c) 2016 Adrian Colomitchi +// Copyright (c) 2017 Florian Dang +// Copyright (c) 2017 Paul Thompson +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Our apologies. When the previous paragraph was written, lowercase had not yet +// been invented (that would involve another several millennia of evolution). +// We did not mean to shout. 
+ +#ifndef HAS_STRING_VIEW +# if __cplusplus >= 201703 +# define HAS_STRING_VIEW 1 +# else +# define HAS_STRING_VIEW 0 +# endif +#endif // HAS_STRING_VIEW + +#include +#include +#include +#include +#include +#if !(__cplusplus >= 201402) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HAS_STRING_VIEW +# include +#endif +#include +#include + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wpedantic" +# if __GNUC__ < 5 + // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers +# pragma GCC diagnostic ignored "-Wmissing-field-initializers" +# endif +#endif + +namespace arrow_vendored +{ +namespace date +{ + +//---------------+ +// Configuration | +//---------------+ + +#ifndef ONLY_C_LOCALE +# define ONLY_C_LOCALE 0 +#endif + +#if defined(_MSC_VER) && (!defined(__clang__) || (_MSC_VER < 1910)) +// MSVC +# if _MSC_VER < 1910 +// before VS2017 +# define CONSTDATA const +# define CONSTCD11 +# define CONSTCD14 +# define NOEXCEPT _NOEXCEPT +# else +// VS2017 and later +# define CONSTDATA constexpr const +# define CONSTCD11 constexpr +# define CONSTCD14 constexpr +# define NOEXCEPT noexcept +# endif + +#elif defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5150 +// Oracle Developer Studio 12.6 and earlier +# define CONSTDATA constexpr const +# define CONSTCD11 constexpr +# define CONSTCD14 +# define NOEXCEPT noexcept + +#elif __cplusplus >= 201402 +// C++14 +# define CONSTDATA constexpr const +# define CONSTCD11 constexpr +# define CONSTCD14 constexpr +# define NOEXCEPT noexcept +#else +// C++11 +# define CONSTDATA constexpr const +# define CONSTCD11 constexpr +# define CONSTCD14 +# define NOEXCEPT noexcept +#endif + +#ifndef HAS_VOID_T +# if __cplusplus >= 201703 +# define HAS_VOID_T 1 +# else +# define HAS_VOID_T 0 +# endif +#endif // HAS_VOID_T + +// Protect from Oracle sun macro +#ifdef sun +# undef sun 
+#endif + +//-----------+ +// Interface | +//-----------+ + +// durations + +using days = std::chrono::duration + , std::chrono::hours::period>>; + +using weeks = std::chrono::duration + , days::period>>; + +using years = std::chrono::duration + , days::period>>; + +using months = std::chrono::duration + >>; + +// time_point + +template + using sys_time = std::chrono::time_point; + +using sys_days = sys_time; +using sys_seconds = sys_time; + +struct local_t {}; + +template + using local_time = std::chrono::time_point; + +using local_seconds = local_time; +using local_days = local_time; + +// types + +struct last_spec +{ + explicit last_spec() = default; +}; + +class day; +class month; +class year; + +class weekday; +class weekday_indexed; +class weekday_last; + +class month_day; +class month_day_last; +class month_weekday; +class month_weekday_last; + +class year_month; + +class year_month_day; +class year_month_day_last; +class year_month_weekday; +class year_month_weekday_last; + +// date composition operators + +CONSTCD11 year_month operator/(const year& y, const month& m) NOEXCEPT; +CONSTCD11 year_month operator/(const year& y, int m) NOEXCEPT; + +CONSTCD11 month_day operator/(const day& d, const month& m) NOEXCEPT; +CONSTCD11 month_day operator/(const day& d, int m) NOEXCEPT; +CONSTCD11 month_day operator/(const month& m, const day& d) NOEXCEPT; +CONSTCD11 month_day operator/(const month& m, int d) NOEXCEPT; +CONSTCD11 month_day operator/(int m, const day& d) NOEXCEPT; + +CONSTCD11 month_day_last operator/(const month& m, last_spec) NOEXCEPT; +CONSTCD11 month_day_last operator/(int m, last_spec) NOEXCEPT; +CONSTCD11 month_day_last operator/(last_spec, const month& m) NOEXCEPT; +CONSTCD11 month_day_last operator/(last_spec, int m) NOEXCEPT; + +CONSTCD11 month_weekday operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT; +CONSTCD11 month_weekday operator/(int m, const weekday_indexed& wdi) NOEXCEPT; +CONSTCD11 month_weekday operator/(const 
weekday_indexed& wdi, const month& m) NOEXCEPT; +CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, int m) NOEXCEPT; + +CONSTCD11 month_weekday_last operator/(const month& m, const weekday_last& wdl) NOEXCEPT; +CONSTCD11 month_weekday_last operator/(int m, const weekday_last& wdl) NOEXCEPT; +CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, const month& m) NOEXCEPT; +CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, int m) NOEXCEPT; + +CONSTCD11 year_month_day operator/(const year_month& ym, const day& d) NOEXCEPT; +CONSTCD11 year_month_day operator/(const year_month& ym, int d) NOEXCEPT; +CONSTCD11 year_month_day operator/(const year& y, const month_day& md) NOEXCEPT; +CONSTCD11 year_month_day operator/(int y, const month_day& md) NOEXCEPT; +CONSTCD11 year_month_day operator/(const month_day& md, const year& y) NOEXCEPT; +CONSTCD11 year_month_day operator/(const month_day& md, int y) NOEXCEPT; + +CONSTCD11 + year_month_day_last operator/(const year_month& ym, last_spec) NOEXCEPT; +CONSTCD11 + year_month_day_last operator/(const year& y, const month_day_last& mdl) NOEXCEPT; +CONSTCD11 + year_month_day_last operator/(int y, const month_day_last& mdl) NOEXCEPT; +CONSTCD11 + year_month_day_last operator/(const month_day_last& mdl, const year& y) NOEXCEPT; +CONSTCD11 + year_month_day_last operator/(const month_day_last& mdl, int y) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator/(const year& y, const month_weekday& mwd) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator/(int y, const month_weekday& mwd) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator/(const month_weekday& mwd, const year& y) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator/(const month_weekday& mwd, int y) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT; + +CONSTCD11 
+year_month_weekday_last +operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator/(int y, const month_weekday_last& mwdl) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator/(const month_weekday_last& mwdl, int y) NOEXCEPT; + +// Detailed interface + +// day + +class day +{ + unsigned char d_; + +public: + day() = default; + explicit CONSTCD11 day(unsigned d) NOEXCEPT; + + CONSTCD14 day& operator++() NOEXCEPT; + CONSTCD14 day operator++(int) NOEXCEPT; + CONSTCD14 day& operator--() NOEXCEPT; + CONSTCD14 day operator--(int) NOEXCEPT; + + CONSTCD14 day& operator+=(const days& d) NOEXCEPT; + CONSTCD14 day& operator-=(const days& d) NOEXCEPT; + + CONSTCD11 explicit operator unsigned() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const day& x, const day& y) NOEXCEPT; +CONSTCD11 bool operator!=(const day& x, const day& y) NOEXCEPT; +CONSTCD11 bool operator< (const day& x, const day& y) NOEXCEPT; +CONSTCD11 bool operator> (const day& x, const day& y) NOEXCEPT; +CONSTCD11 bool operator<=(const day& x, const day& y) NOEXCEPT; +CONSTCD11 bool operator>=(const day& x, const day& y) NOEXCEPT; + +CONSTCD11 day operator+(const day& x, const days& y) NOEXCEPT; +CONSTCD11 day operator+(const days& x, const day& y) NOEXCEPT; +CONSTCD11 day operator-(const day& x, const days& y) NOEXCEPT; +CONSTCD11 days operator-(const day& x, const day& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const day& d); + +// month + +class month +{ + unsigned char m_; + +public: + month() = default; + explicit CONSTCD11 month(unsigned m) NOEXCEPT; + + CONSTCD14 month& operator++() NOEXCEPT; + CONSTCD14 month operator++(int) NOEXCEPT; + CONSTCD14 month& operator--() NOEXCEPT; + CONSTCD14 month operator--(int) NOEXCEPT; + + CONSTCD14 month& operator+=(const 
months& m) NOEXCEPT; + CONSTCD14 month& operator-=(const months& m) NOEXCEPT; + + CONSTCD11 explicit operator unsigned() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const month& x, const month& y) NOEXCEPT; +CONSTCD11 bool operator!=(const month& x, const month& y) NOEXCEPT; +CONSTCD11 bool operator< (const month& x, const month& y) NOEXCEPT; +CONSTCD11 bool operator> (const month& x, const month& y) NOEXCEPT; +CONSTCD11 bool operator<=(const month& x, const month& y) NOEXCEPT; +CONSTCD11 bool operator>=(const month& x, const month& y) NOEXCEPT; + +CONSTCD14 month operator+(const month& x, const months& y) NOEXCEPT; +CONSTCD14 month operator+(const months& x, const month& y) NOEXCEPT; +CONSTCD14 month operator-(const month& x, const months& y) NOEXCEPT; +CONSTCD14 months operator-(const month& x, const month& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const month& m); + +// year + +class year +{ + short y_; + +public: + year() = default; + explicit CONSTCD11 year(int y) NOEXCEPT; + + CONSTCD14 year& operator++() NOEXCEPT; + CONSTCD14 year operator++(int) NOEXCEPT; + CONSTCD14 year& operator--() NOEXCEPT; + CONSTCD14 year operator--(int) NOEXCEPT; + + CONSTCD14 year& operator+=(const years& y) NOEXCEPT; + CONSTCD14 year& operator-=(const years& y) NOEXCEPT; + + CONSTCD11 year operator-() const NOEXCEPT; + CONSTCD11 year operator+() const NOEXCEPT; + + CONSTCD11 bool is_leap() const NOEXCEPT; + + CONSTCD11 explicit operator int() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; + + static CONSTCD11 year min() NOEXCEPT; + static CONSTCD11 year max() NOEXCEPT; +}; + +CONSTCD11 bool operator==(const year& x, const year& y) NOEXCEPT; +CONSTCD11 bool operator!=(const year& x, const year& y) NOEXCEPT; +CONSTCD11 bool operator< (const year& x, const year& y) NOEXCEPT; +CONSTCD11 bool operator> (const year& x, const year& y) NOEXCEPT; +CONSTCD11 bool operator<=(const year& x, const year& 
y) NOEXCEPT; +CONSTCD11 bool operator>=(const year& x, const year& y) NOEXCEPT; + +CONSTCD11 year operator+(const year& x, const years& y) NOEXCEPT; +CONSTCD11 year operator+(const years& x, const year& y) NOEXCEPT; +CONSTCD11 year operator-(const year& x, const years& y) NOEXCEPT; +CONSTCD11 years operator-(const year& x, const year& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const year& y); + +// weekday + +class weekday +{ + unsigned char wd_; +public: + weekday() = default; + explicit CONSTCD11 weekday(unsigned wd) NOEXCEPT; + CONSTCD11 weekday(const sys_days& dp) NOEXCEPT; + CONSTCD11 explicit weekday(const local_days& dp) NOEXCEPT; + + CONSTCD14 weekday& operator++() NOEXCEPT; + CONSTCD14 weekday operator++(int) NOEXCEPT; + CONSTCD14 weekday& operator--() NOEXCEPT; + CONSTCD14 weekday operator--(int) NOEXCEPT; + + CONSTCD14 weekday& operator+=(const days& d) NOEXCEPT; + CONSTCD14 weekday& operator-=(const days& d) NOEXCEPT; + + CONSTCD11 explicit operator unsigned() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; + + CONSTCD11 weekday_indexed operator[](unsigned index) const NOEXCEPT; + CONSTCD11 weekday_last operator[](last_spec) const NOEXCEPT; + +private: + static CONSTCD11 unsigned char weekday_from_days(int z) NOEXCEPT; +}; + +CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT; +CONSTCD11 bool operator!=(const weekday& x, const weekday& y) NOEXCEPT; + +CONSTCD14 weekday operator+(const weekday& x, const days& y) NOEXCEPT; +CONSTCD14 weekday operator+(const days& x, const weekday& y) NOEXCEPT; +CONSTCD14 weekday operator-(const weekday& x, const days& y) NOEXCEPT; +CONSTCD14 days operator-(const weekday& x, const weekday& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday& wd); + +// weekday_indexed + +class weekday_indexed +{ + unsigned char wd_ : 4; + unsigned char index_ : 4; + +public: + weekday_indexed() = default; + CONSTCD11 
weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT; + + CONSTCD11 date::weekday weekday() const NOEXCEPT; + CONSTCD11 unsigned index() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT; +CONSTCD11 bool operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday_indexed& wdi); + +// weekday_last + +class weekday_last +{ + date::weekday wd_; + +public: + explicit CONSTCD11 weekday_last(const date::weekday& wd) NOEXCEPT; + + CONSTCD11 date::weekday weekday() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT; +CONSTCD11 bool operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday_last& wdl); + +// year_month + +class year_month +{ + date::year y_; + date::month m_; + +public: + year_month() = default; + CONSTCD11 year_month(const date::year& y, const date::month& m) NOEXCEPT; + + CONSTCD11 date::year year() const NOEXCEPT; + CONSTCD11 date::month month() const NOEXCEPT; + + CONSTCD14 year_month& operator+=(const months& dm) NOEXCEPT; + CONSTCD14 year_month& operator-=(const months& dm) NOEXCEPT; + CONSTCD14 year_month& operator+=(const years& dy) NOEXCEPT; + CONSTCD14 year_month& operator-=(const years& dy) NOEXCEPT; + + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 bool operator!=(const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 bool operator< (const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 bool operator> (const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 bool operator<=(const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 bool operator>=(const 
year_month& x, const year_month& y) NOEXCEPT; + +CONSTCD14 year_month operator+(const year_month& ym, const months& dm) NOEXCEPT; +CONSTCD14 year_month operator+(const months& dm, const year_month& ym) NOEXCEPT; +CONSTCD14 year_month operator-(const year_month& ym, const months& dm) NOEXCEPT; + +CONSTCD11 months operator-(const year_month& x, const year_month& y) NOEXCEPT; +CONSTCD11 year_month operator+(const year_month& ym, const years& dy) NOEXCEPT; +CONSTCD11 year_month operator+(const years& dy, const year_month& ym) NOEXCEPT; +CONSTCD11 year_month operator-(const year_month& ym, const years& dy) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month& ym); + +// month_day + +class month_day +{ + date::month m_; + date::day d_; + +public: + month_day() = default; + CONSTCD11 month_day(const date::month& m, const date::day& d) NOEXCEPT; + + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::day day() const NOEXCEPT; + + CONSTCD14 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const month_day& x, const month_day& y) NOEXCEPT; +CONSTCD11 bool operator!=(const month_day& x, const month_day& y) NOEXCEPT; +CONSTCD11 bool operator< (const month_day& x, const month_day& y) NOEXCEPT; +CONSTCD11 bool operator> (const month_day& x, const month_day& y) NOEXCEPT; +CONSTCD11 bool operator<=(const month_day& x, const month_day& y) NOEXCEPT; +CONSTCD11 bool operator>=(const month_day& x, const month_day& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_day& md); + +// month_day_last + +class month_day_last +{ + date::month m_; + +public: + CONSTCD11 explicit month_day_last(const date::month& m) NOEXCEPT; + + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT; +CONSTCD11 bool operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT; 
+CONSTCD11 bool operator< (const month_day_last& x, const month_day_last& y) NOEXCEPT; +CONSTCD11 bool operator> (const month_day_last& x, const month_day_last& y) NOEXCEPT; +CONSTCD11 bool operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT; +CONSTCD11 bool operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_day_last& mdl); + +// month_weekday + +class month_weekday +{ + date::month m_; + date::weekday_indexed wdi_; +public: + CONSTCD11 month_weekday(const date::month& m, + const date::weekday_indexed& wdi) NOEXCEPT; + + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT; + + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT; +CONSTCD11 bool operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_weekday& mwd); + +// month_weekday_last + +class month_weekday_last +{ + date::month m_; + date::weekday_last wdl_; + +public: + CONSTCD11 month_weekday_last(const date::month& m, + const date::weekday_last& wd) NOEXCEPT; + + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT; + + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 + bool operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT; +CONSTCD11 + bool operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_weekday_last& mwdl); + +// class year_month_day + +class year_month_day +{ + date::year y_; + date::month m_; + date::day d_; + +public: + year_month_day() = default; + CONSTCD11 year_month_day(const date::year& y, const date::month& m, + const date::day& d) NOEXCEPT; + CONSTCD14 
year_month_day(const year_month_day_last& ymdl) NOEXCEPT; + + CONSTCD14 year_month_day(sys_days dp) NOEXCEPT; + CONSTCD14 explicit year_month_day(local_days dp) NOEXCEPT; + + CONSTCD14 year_month_day& operator+=(const months& m) NOEXCEPT; + CONSTCD14 year_month_day& operator-=(const months& m) NOEXCEPT; + CONSTCD14 year_month_day& operator+=(const years& y) NOEXCEPT; + CONSTCD14 year_month_day& operator-=(const years& y) NOEXCEPT; + + CONSTCD11 date::year year() const NOEXCEPT; + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::day day() const NOEXCEPT; + + CONSTCD14 operator sys_days() const NOEXCEPT; + CONSTCD14 explicit operator local_days() const NOEXCEPT; + CONSTCD14 bool ok() const NOEXCEPT; + +private: + static CONSTCD14 year_month_day from_days(days dp) NOEXCEPT; + CONSTCD14 days to_days() const NOEXCEPT; +}; + +CONSTCD11 bool operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT; +CONSTCD11 bool operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT; +CONSTCD11 bool operator< (const year_month_day& x, const year_month_day& y) NOEXCEPT; +CONSTCD11 bool operator> (const year_month_day& x, const year_month_day& y) NOEXCEPT; +CONSTCD11 bool operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT; +CONSTCD11 bool operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT; + +CONSTCD14 year_month_day operator+(const year_month_day& ymd, const months& dm) NOEXCEPT; +CONSTCD14 year_month_day operator+(const months& dm, const year_month_day& ymd) NOEXCEPT; +CONSTCD14 year_month_day operator-(const year_month_day& ymd, const months& dm) NOEXCEPT; +CONSTCD11 year_month_day operator+(const year_month_day& ymd, const years& dy) NOEXCEPT; +CONSTCD11 year_month_day operator+(const years& dy, const year_month_day& ymd) NOEXCEPT; +CONSTCD11 year_month_day operator-(const year_month_day& ymd, const years& dy) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const 
year_month_day& ymd); + +// year_month_day_last + +class year_month_day_last +{ + date::year y_; + date::month_day_last mdl_; + +public: + CONSTCD11 year_month_day_last(const date::year& y, + const date::month_day_last& mdl) NOEXCEPT; + + CONSTCD14 year_month_day_last& operator+=(const months& m) NOEXCEPT; + CONSTCD14 year_month_day_last& operator-=(const months& m) NOEXCEPT; + CONSTCD14 year_month_day_last& operator+=(const years& y) NOEXCEPT; + CONSTCD14 year_month_day_last& operator-=(const years& y) NOEXCEPT; + + CONSTCD11 date::year year() const NOEXCEPT; + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::month_day_last month_day_last() const NOEXCEPT; + CONSTCD14 date::day day() const NOEXCEPT; + + CONSTCD14 operator sys_days() const NOEXCEPT; + CONSTCD14 explicit operator local_days() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; +}; + +CONSTCD11 + bool operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; +CONSTCD11 + bool operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; +CONSTCD11 + bool operator< (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; +CONSTCD11 + bool operator> (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; +CONSTCD11 + bool operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; +CONSTCD11 + bool operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; + +CONSTCD14 +year_month_day_last +operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT; + +CONSTCD14 +year_month_day_last +operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT; + +CONSTCD11 +year_month_day_last +operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT; + +CONSTCD11 +year_month_day_last +operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT; + +CONSTCD14 +year_month_day_last +operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT; 
+ +CONSTCD11 +year_month_day_last +operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_day_last& ymdl); + +// year_month_weekday + +class year_month_weekday +{ + date::year y_; + date::month m_; + date::weekday_indexed wdi_; + +public: + year_month_weekday() = default; + CONSTCD11 year_month_weekday(const date::year& y, const date::month& m, + const date::weekday_indexed& wdi) NOEXCEPT; + CONSTCD14 year_month_weekday(const sys_days& dp) NOEXCEPT; + CONSTCD14 explicit year_month_weekday(const local_days& dp) NOEXCEPT; + + CONSTCD14 year_month_weekday& operator+=(const months& m) NOEXCEPT; + CONSTCD14 year_month_weekday& operator-=(const months& m) NOEXCEPT; + CONSTCD14 year_month_weekday& operator+=(const years& y) NOEXCEPT; + CONSTCD14 year_month_weekday& operator-=(const years& y) NOEXCEPT; + + CONSTCD11 date::year year() const NOEXCEPT; + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::weekday weekday() const NOEXCEPT; + CONSTCD11 unsigned index() const NOEXCEPT; + CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT; + + CONSTCD14 operator sys_days() const NOEXCEPT; + CONSTCD14 explicit operator local_days() const NOEXCEPT; + CONSTCD14 bool ok() const NOEXCEPT; + +private: + static CONSTCD14 year_month_weekday from_days(days dp) NOEXCEPT; + CONSTCD14 days to_days() const NOEXCEPT; +}; + +CONSTCD11 + bool operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT; +CONSTCD11 + bool operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT; + +CONSTCD14 +year_month_weekday +operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT; + +CONSTCD14 +year_month_weekday +operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator+(const 
years& dy, const year_month_weekday& ymwd) NOEXCEPT; + +CONSTCD14 +year_month_weekday +operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT; + +CONSTCD11 +year_month_weekday +operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_weekday& ymwdi); + +// year_month_weekday_last + +class year_month_weekday_last +{ + date::year y_; + date::month m_; + date::weekday_last wdl_; + +public: + CONSTCD11 year_month_weekday_last(const date::year& y, const date::month& m, + const date::weekday_last& wdl) NOEXCEPT; + + CONSTCD14 year_month_weekday_last& operator+=(const months& m) NOEXCEPT; + CONSTCD14 year_month_weekday_last& operator-=(const months& m) NOEXCEPT; + CONSTCD14 year_month_weekday_last& operator+=(const years& y) NOEXCEPT; + CONSTCD14 year_month_weekday_last& operator-=(const years& y) NOEXCEPT; + + CONSTCD11 date::year year() const NOEXCEPT; + CONSTCD11 date::month month() const NOEXCEPT; + CONSTCD11 date::weekday weekday() const NOEXCEPT; + CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT; + + CONSTCD14 operator sys_days() const NOEXCEPT; + CONSTCD14 explicit operator local_days() const NOEXCEPT; + CONSTCD11 bool ok() const NOEXCEPT; + +private: + CONSTCD14 days to_days() const NOEXCEPT; +}; + +CONSTCD11 +bool +operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT; + +CONSTCD11 +bool +operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT; + +CONSTCD14 +year_month_weekday_last +operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT; + +CONSTCD14 +year_month_weekday_last +operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator+(const years& dy, const year_month_weekday_last& 
ymwdl) NOEXCEPT; + +CONSTCD14 +year_month_weekday_last +operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT; + +CONSTCD11 +year_month_weekday_last +operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_weekday_last& ymwdl); + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +inline namespace literals +{ + +CONSTCD11 date::day operator "" _d(unsigned long long d) NOEXCEPT; +CONSTCD11 date::year operator "" _y(unsigned long long y) NOEXCEPT; + +// CONSTDATA date::month jan{1}; +// CONSTDATA date::month feb{2}; +// CONSTDATA date::month mar{3}; +// CONSTDATA date::month apr{4}; +// CONSTDATA date::month may{5}; +// CONSTDATA date::month jun{6}; +// CONSTDATA date::month jul{7}; +// CONSTDATA date::month aug{8}; +// CONSTDATA date::month sep{9}; +// CONSTDATA date::month oct{10}; +// CONSTDATA date::month nov{11}; +// CONSTDATA date::month dec{12}; +// +// CONSTDATA date::weekday sun{0u}; +// CONSTDATA date::weekday mon{1u}; +// CONSTDATA date::weekday tue{2u}; +// CONSTDATA date::weekday wed{3u}; +// CONSTDATA date::weekday thu{4u}; +// CONSTDATA date::weekday fri{5u}; +// CONSTDATA date::weekday sat{6u}; + +} // inline namespace literals +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) + +#if HAS_VOID_T + +template > +struct is_clock + : std::false_type +{}; + +template +struct is_clock> + : std::true_type +{}; + +#endif // HAS_VOID_T + +//----------------+ +// Implementation | +//----------------+ + +// utilities +namespace detail { + +template> +class save_stream +{ + std::basic_ostream& os_; + CharT fill_; + std::ios::fmtflags flags_; + std::locale loc_; + +public: + ~save_stream() + { + os_.fill(fill_); + os_.flags(flags_); + os_.imbue(loc_); + } + + save_stream(const save_stream&) = delete; + save_stream& operator=(const save_stream&) = delete; + + explicit save_stream(std::basic_ostream& os) + : os_(os) + , fill_(os.fill()) + , 
flags_(os.flags()) + , loc_(os.getloc()) + {} +}; + +template +struct choose_trunc_type +{ + static const int digits = std::numeric_limits::digits; + using type = typename std::conditional + < + digits < 32, + std::int32_t, + typename std::conditional + < + digits < 64, + std::int64_t, +#ifdef __SIZEOF_INT128__ + __int128 +#else + std::int64_t +#endif + >::type + >::type; +}; + +template +CONSTCD11 +inline +typename std::enable_if +< + !std::chrono::treat_as_floating_point::value, + T +>::type +trunc(T t) NOEXCEPT +{ + return t; +} + +template +CONSTCD14 +inline +typename std::enable_if +< + std::chrono::treat_as_floating_point::value, + T +>::type +trunc(T t) NOEXCEPT +{ + using namespace std; + using I = typename choose_trunc_type::type; + CONSTDATA auto digits = numeric_limits::digits; + static_assert(digits < numeric_limits::digits, ""); + CONSTDATA auto max = I{1} << (digits-1); + CONSTDATA auto min = -max; + const auto negative = t < T{0}; + if (min <= t && t <= max && t != 0 && t == t) + { + t = static_cast(static_cast(t)); + if (t == 0 && negative) + t = -t; + } + return t; +} + +template +struct static_gcd +{ + static const std::intmax_t value = static_gcd::value; +}; + +template +struct static_gcd +{ + static const std::intmax_t value = Xp; +}; + +template <> +struct static_gcd<0, 0> +{ + static const std::intmax_t value = 1; +}; + +template +struct no_overflow +{ +private: + static const std::intmax_t gcd_n1_n2 = static_gcd::value; + static const std::intmax_t gcd_d1_d2 = static_gcd::value; + static const std::intmax_t n1 = R1::num / gcd_n1_n2; + static const std::intmax_t d1 = R1::den / gcd_d1_d2; + static const std::intmax_t n2 = R2::num / gcd_n1_n2; + static const std::intmax_t d2 = R2::den / gcd_d1_d2; + static const std::intmax_t max = -((std::intmax_t(1) << + (sizeof(std::intmax_t) * CHAR_BIT - 1)) + 1); + + template + struct mul // overflow == false + { + static const std::intmax_t value = Xp * Yp; + }; + + template + struct mul + { + static const 
std::intmax_t value = 1; + }; + +public: + static const bool value = (n1 <= max / d2) && (n2 <= max / d1); + typedef std::ratio::value, + mul::value> type; +}; + +} // detail + +// trunc towards zero +template +CONSTCD11 +inline +typename std::enable_if +< + detail::no_overflow::value, + To +>::type +trunc(const std::chrono::duration& d) +{ + return To{detail::trunc(std::chrono::duration_cast(d).count())}; +} + +template +CONSTCD11 +inline +typename std::enable_if +< + !detail::no_overflow::value, + To +>::type +trunc(const std::chrono::duration& d) +{ + using namespace std::chrono; + using rep = typename std::common_type::type; + return To{detail::trunc(duration_cast(duration_cast>(d)).count())}; +} + +#ifndef HAS_CHRONO_ROUNDING +# if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190023918 || (_MSC_FULL_VER >= 190000000 && defined (__clang__))) +# define HAS_CHRONO_ROUNDING 1 +# elif defined(__cpp_lib_chrono) && __cplusplus > 201402 && __cpp_lib_chrono >= 201510 +# define HAS_CHRONO_ROUNDING 1 +# elif defined(_LIBCPP_VERSION) && __cplusplus > 201402 && _LIBCPP_VERSION >= 3800 +# define HAS_CHRONO_ROUNDING 1 +# else +# define HAS_CHRONO_ROUNDING 0 +# endif +#endif // HAS_CHRONO_ROUNDING + +#if HAS_CHRONO_ROUNDING == 0 + +// round down +template +CONSTCD14 +inline +typename std::enable_if +< + detail::no_overflow::value, + To +>::type +floor(const std::chrono::duration& d) +{ + auto t = trunc(d); + if (t > d) + return t - To{1}; + return t; +} + +template +CONSTCD14 +inline +typename std::enable_if +< + !detail::no_overflow::value, + To +>::type +floor(const std::chrono::duration& d) +{ + using namespace std::chrono; + using rep = typename std::common_type::type; + return floor(floor>(d)); +} + +// round to nearest, to even on tie +template +CONSTCD14 +inline +To +round(const std::chrono::duration& d) +{ + auto t0 = floor(d); + auto t1 = t0 + To{1}; + if (t1 == To{0} && t0 < To{0}) + t1 = -t1; + auto diff0 = d - t0; + auto diff1 = t1 - d; + if (diff0 == diff1) + { + 
if (t0 - trunc(t0/2)*2 == To{0}) + return t0; + return t1; + } + if (diff0 < diff1) + return t0; + return t1; +} + +// round up +template +CONSTCD14 +inline +To +ceil(const std::chrono::duration& d) +{ + auto t = trunc(d); + if (t < d) + return t + To{1}; + return t; +} + +template ::is_signed + >::type> +CONSTCD11 +std::chrono::duration +abs(std::chrono::duration d) +{ + return d >= d.zero() ? d : -d; +} + +// round down +template +CONSTCD11 +inline +std::chrono::time_point +floor(const std::chrono::time_point& tp) +{ + using std::chrono::time_point; + return time_point{date::floor(tp.time_since_epoch())}; +} + +// round to nearest, to even on tie +template +CONSTCD11 +inline +std::chrono::time_point +round(const std::chrono::time_point& tp) +{ + using std::chrono::time_point; + return time_point{round(tp.time_since_epoch())}; +} + +// round up +template +CONSTCD11 +inline +std::chrono::time_point +ceil(const std::chrono::time_point& tp) +{ + using std::chrono::time_point; + return time_point{ceil(tp.time_since_epoch())}; +} + +#else // HAS_CHRONO_ROUNDING == 1 + +using std::chrono::floor; +using std::chrono::ceil; +using std::chrono::round; +using std::chrono::abs; + +#endif // HAS_CHRONO_ROUNDING + +// trunc towards zero +template +CONSTCD11 +inline +std::chrono::time_point +trunc(const std::chrono::time_point& tp) +{ + using std::chrono::time_point; + return time_point{trunc(tp.time_since_epoch())}; +} + +// day + +CONSTCD11 inline day::day(unsigned d) NOEXCEPT : d_(static_cast(d)) {} +CONSTCD14 inline day& day::operator++() NOEXCEPT {++d_; return *this;} +CONSTCD14 inline day day::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} +CONSTCD14 inline day& day::operator--() NOEXCEPT {--d_; return *this;} +CONSTCD14 inline day day::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} +CONSTCD14 inline day& day::operator+=(const days& d) NOEXCEPT {*this = *this + d; return *this;} +CONSTCD14 inline day& day::operator-=(const days& d) 
NOEXCEPT {*this = *this - d; return *this;} +CONSTCD11 inline day::operator unsigned() const NOEXCEPT {return d_;} +CONSTCD11 inline bool day::ok() const NOEXCEPT {return 1 <= d_ && d_ <= 31;} + +CONSTCD11 +inline +bool +operator==(const day& x, const day& y) NOEXCEPT +{ + return static_cast(x) == static_cast(y); +} + +CONSTCD11 +inline +bool +operator!=(const day& x, const day& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const day& x, const day& y) NOEXCEPT +{ + return static_cast(x) < static_cast(y); +} + +CONSTCD11 +inline +bool +operator>(const day& x, const day& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const day& x, const day& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const day& x, const day& y) NOEXCEPT +{ + return !(x < y); +} + +CONSTCD11 +inline +days +operator-(const day& x, const day& y) NOEXCEPT +{ + return days{static_cast(static_cast(x) + - static_cast(y))}; +} + +CONSTCD11 +inline +day +operator+(const day& x, const days& y) NOEXCEPT +{ + return day{static_cast(x) + static_cast(y.count())}; +} + +CONSTCD11 +inline +day +operator+(const days& x, const day& y) NOEXCEPT +{ + return y + x; +} + +CONSTCD11 +inline +day +operator-(const day& x, const days& y) NOEXCEPT +{ + return x + -y; +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const day& d) +{ + detail::save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << static_cast(d); + if (!d.ok()) + os << " is not a valid day"; + return os; +} + +// month + +CONSTCD11 inline month::month(unsigned m) NOEXCEPT : m_(static_cast(m)) {} +CONSTCD14 inline month& month::operator++() NOEXCEPT {*this += months{1}; return *this;} +CONSTCD14 inline month month::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} +CONSTCD14 inline month& month::operator--() NOEXCEPT {*this -= months{1}; return *this;} +CONSTCD14 inline month 
month::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} + +CONSTCD14 +inline +month& +month::operator+=(const months& m) NOEXCEPT +{ + *this = *this + m; + return *this; +} + +CONSTCD14 +inline +month& +month::operator-=(const months& m) NOEXCEPT +{ + *this = *this - m; + return *this; +} + +CONSTCD11 inline month::operator unsigned() const NOEXCEPT {return m_;} +CONSTCD11 inline bool month::ok() const NOEXCEPT {return 1 <= m_ && m_ <= 12;} + +CONSTCD11 +inline +bool +operator==(const month& x, const month& y) NOEXCEPT +{ + return static_cast(x) == static_cast(y); +} + +CONSTCD11 +inline +bool +operator!=(const month& x, const month& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const month& x, const month& y) NOEXCEPT +{ + return static_cast(x) < static_cast(y); +} + +CONSTCD11 +inline +bool +operator>(const month& x, const month& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const month& x, const month& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const month& x, const month& y) NOEXCEPT +{ + return !(x < y); +} + +CONSTCD14 +inline +months +operator-(const month& x, const month& y) NOEXCEPT +{ + auto const d = static_cast(x) - static_cast(y); + return months(d <= 11 ? d : d + 12); +} + +CONSTCD14 +inline +month +operator+(const month& x, const months& y) NOEXCEPT +{ + auto const mu = static_cast(static_cast(x)) + (y.count() - 1); + auto const yr = (mu >= 0 ? 
mu : mu-11) / 12; + return month{static_cast(mu - yr * 12 + 1)}; +} + +CONSTCD14 +inline +month +operator+(const months& x, const month& y) NOEXCEPT +{ + return y + x; +} + +CONSTCD14 +inline +month +operator-(const month& x, const months& y) NOEXCEPT +{ + return x + -y; +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const month& m) +{ + if (m.ok()) + { + CharT fmt[] = {'%', 'b', 0}; + os << format(os.getloc(), fmt, m); + } + else + os << static_cast(m) << " is not a valid month"; + return os; +} + +// year + +CONSTCD11 inline year::year(int y) NOEXCEPT : y_(static_cast(y)) {} +CONSTCD14 inline year& year::operator++() NOEXCEPT {++y_; return *this;} +CONSTCD14 inline year year::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} +CONSTCD14 inline year& year::operator--() NOEXCEPT {--y_; return *this;} +CONSTCD14 inline year year::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} +CONSTCD14 inline year& year::operator+=(const years& y) NOEXCEPT {*this = *this + y; return *this;} +CONSTCD14 inline year& year::operator-=(const years& y) NOEXCEPT {*this = *this - y; return *this;} +CONSTCD11 inline year year::operator-() const NOEXCEPT {return year{-y_};} +CONSTCD11 inline year year::operator+() const NOEXCEPT {return *this;} + +CONSTCD11 +inline +bool +year::is_leap() const NOEXCEPT +{ + return y_ % 4 == 0 && (y_ % 100 != 0 || y_ % 400 == 0); +} + +CONSTCD11 inline year::operator int() const NOEXCEPT {return y_;} + +CONSTCD11 +inline +bool +year::ok() const NOEXCEPT +{ + return y_ != std::numeric_limits::min(); +} + +CONSTCD11 +inline +year +year::min() NOEXCEPT +{ + return year{-32767}; +} + +CONSTCD11 +inline +year +year::max() NOEXCEPT +{ + return year{32767}; +} + +CONSTCD11 +inline +bool +operator==(const year& x, const year& y) NOEXCEPT +{ + return static_cast(x) == static_cast(y); +} + +CONSTCD11 +inline +bool +operator!=(const year& x, const year& y) NOEXCEPT +{ + return !(x == y); +} + 
+CONSTCD11 +inline +bool +operator<(const year& x, const year& y) NOEXCEPT +{ + return static_cast(x) < static_cast(y); +} + +CONSTCD11 +inline +bool +operator>(const year& x, const year& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const year& x, const year& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const year& x, const year& y) NOEXCEPT +{ + return !(x < y); +} + +CONSTCD11 +inline +years +operator-(const year& x, const year& y) NOEXCEPT +{ + return years{static_cast(x) - static_cast(y)}; +} + +CONSTCD11 +inline +year +operator+(const year& x, const years& y) NOEXCEPT +{ + return year{static_cast(x) + y.count()}; +} + +CONSTCD11 +inline +year +operator+(const years& x, const year& y) NOEXCEPT +{ + return y + x; +} + +CONSTCD11 +inline +year +operator-(const year& x, const years& y) NOEXCEPT +{ + return year{static_cast(x) - y.count()}; +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const year& y) +{ + detail::save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::internal); + os.width(4 + (y < year{0})); + os << static_cast(y); + if (!y.ok()) + os << " is not a valid year"; + return os; +} + +// weekday + +CONSTCD11 +inline +unsigned char +weekday::weekday_from_days(int z) NOEXCEPT +{ + return static_cast(static_cast( + z >= -4 ? 
(z+4) % 7 : (z+5) % 7 + 6)); +} + +CONSTCD11 +inline +weekday::weekday(unsigned wd) NOEXCEPT + : wd_(static_cast(wd)) + {} + +CONSTCD11 +inline +weekday::weekday(const sys_days& dp) NOEXCEPT + : wd_(weekday_from_days(dp.time_since_epoch().count())) + {} + +CONSTCD11 +inline +weekday::weekday(const local_days& dp) NOEXCEPT + : wd_(weekday_from_days(dp.time_since_epoch().count())) + {} + +CONSTCD14 inline weekday& weekday::operator++() NOEXCEPT {*this += days{1}; return *this;} +CONSTCD14 inline weekday weekday::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} +CONSTCD14 inline weekday& weekday::operator--() NOEXCEPT {*this -= days{1}; return *this;} +CONSTCD14 inline weekday weekday::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} + +CONSTCD14 +inline +weekday& +weekday::operator+=(const days& d) NOEXCEPT +{ + *this = *this + d; + return *this; +} + +CONSTCD14 +inline +weekday& +weekday::operator-=(const days& d) NOEXCEPT +{ + *this = *this - d; + return *this; +} + +CONSTCD11 +inline +weekday::operator unsigned() const NOEXCEPT +{ + return static_cast(wd_); +} + +CONSTCD11 inline bool weekday::ok() const NOEXCEPT {return wd_ <= 6;} + +CONSTCD11 +inline +bool +operator==(const weekday& x, const weekday& y) NOEXCEPT +{ + return static_cast(x) == static_cast(y); +} + +CONSTCD11 +inline +bool +operator!=(const weekday& x, const weekday& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD14 +inline +days +operator-(const weekday& x, const weekday& y) NOEXCEPT +{ + auto const diff = static_cast(x) - static_cast(y); + return days{diff <= 6 ? diff : diff + 7}; +} + +CONSTCD14 +inline +weekday +operator+(const weekday& x, const days& y) NOEXCEPT +{ + auto const wdu = static_cast(static_cast(x)) + y.count(); + auto const wk = (wdu >= 0 ? 
wdu : wdu-6) / 7; + return weekday{static_cast(wdu - wk * 7)}; +} + +CONSTCD14 +inline +weekday +operator+(const days& x, const weekday& y) NOEXCEPT +{ + return y + x; +} + +CONSTCD14 +inline +weekday +operator-(const weekday& x, const days& y) NOEXCEPT +{ + return x + -y; +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday& wd) +{ + if (wd.ok()) + { + CharT fmt[] = {'%', 'a', 0}; + os << format(fmt, wd); + } + else + os << static_cast(wd) << " is not a valid weekday"; + return os; +} + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +inline namespace literals +{ + +CONSTCD11 +inline +date::day +operator "" _d(unsigned long long d) NOEXCEPT +{ + return date::day{static_cast(d)}; +} + +CONSTCD11 +inline +date::year +operator "" _y(unsigned long long y) NOEXCEPT +{ + return date::year(static_cast(y)); +} +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) + +CONSTDATA date::last_spec last{}; + +CONSTDATA date::month jan{1}; +CONSTDATA date::month feb{2}; +CONSTDATA date::month mar{3}; +CONSTDATA date::month apr{4}; +CONSTDATA date::month may{5}; +CONSTDATA date::month jun{6}; +CONSTDATA date::month jul{7}; +CONSTDATA date::month aug{8}; +CONSTDATA date::month sep{9}; +CONSTDATA date::month oct{10}; +CONSTDATA date::month nov{11}; +CONSTDATA date::month dec{12}; + +CONSTDATA date::weekday sun{0u}; +CONSTDATA date::weekday mon{1u}; +CONSTDATA date::weekday tue{2u}; +CONSTDATA date::weekday wed{3u}; +CONSTDATA date::weekday thu{4u}; +CONSTDATA date::weekday fri{5u}; +CONSTDATA date::weekday sat{6u}; + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +} // inline namespace literals +#endif + +CONSTDATA date::month January{1}; +CONSTDATA date::month February{2}; +CONSTDATA date::month March{3}; +CONSTDATA date::month April{4}; +CONSTDATA date::month May{5}; +CONSTDATA date::month June{6}; +CONSTDATA date::month July{7}; +CONSTDATA date::month August{8}; +CONSTDATA date::month September{9}; +CONSTDATA date::month October{10}; +CONSTDATA 
date::month November{11}; +CONSTDATA date::month December{12}; + +CONSTDATA date::weekday Sunday{0u}; +CONSTDATA date::weekday Monday{1u}; +CONSTDATA date::weekday Tuesday{2u}; +CONSTDATA date::weekday Wednesday{3u}; +CONSTDATA date::weekday Thursday{4u}; +CONSTDATA date::weekday Friday{5u}; +CONSTDATA date::weekday Saturday{6u}; + +// weekday_indexed + +CONSTCD11 +inline +weekday +weekday_indexed::weekday() const NOEXCEPT +{ + return date::weekday{static_cast(wd_)}; +} + +CONSTCD11 inline unsigned weekday_indexed::index() const NOEXCEPT {return index_;} + +CONSTCD11 +inline +bool +weekday_indexed::ok() const NOEXCEPT +{ + return weekday().ok() && 1 <= index_ && index_ <= 5; +} + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" +#endif // __GNUC__ + +CONSTCD11 +inline +weekday_indexed::weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT + : wd_(static_cast(static_cast(wd))) + , index_(static_cast(index)) + {} + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif // __GNUC__ + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday_indexed& wdi) +{ + os << wdi.weekday() << '[' << wdi.index(); + if (!(1 <= wdi.index() && wdi.index() <= 5)) + os << " is not a valid index"; + os << ']'; + return os; +} + +CONSTCD11 +inline +weekday_indexed +weekday::operator[](unsigned index) const NOEXCEPT +{ + return {*this, index}; +} + +CONSTCD11 +inline +bool +operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT +{ + return x.weekday() == y.weekday() && x.index() == y.index(); +} + +CONSTCD11 +inline +bool +operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT +{ + return !(x == y); +} + +// weekday_last + +CONSTCD11 inline date::weekday weekday_last::weekday() const NOEXCEPT {return wd_;} +CONSTCD11 inline bool weekday_last::ok() const NOEXCEPT {return wd_.ok();} +CONSTCD11 inline weekday_last::weekday_last(const date::weekday& wd) NOEXCEPT : wd_(wd) 
{} + +CONSTCD11 +inline +bool +operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT +{ + return x.weekday() == y.weekday(); +} + +CONSTCD11 +inline +bool +operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT +{ + return !(x == y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const weekday_last& wdl) +{ + return os << wdl.weekday() << "[last]"; +} + +CONSTCD11 +inline +weekday_last +weekday::operator[](last_spec) const NOEXCEPT +{ + return weekday_last{*this}; +} + +// year_month + +CONSTCD11 +inline +year_month::year_month(const date::year& y, const date::month& m) NOEXCEPT + : y_(y) + , m_(m) + {} + +CONSTCD11 inline year year_month::year() const NOEXCEPT {return y_;} +CONSTCD11 inline month year_month::month() const NOEXCEPT {return m_;} +CONSTCD11 inline bool year_month::ok() const NOEXCEPT {return y_.ok() && m_.ok();} + +CONSTCD14 +inline +year_month& +year_month::operator+=(const months& dm) NOEXCEPT +{ + *this = *this + dm; + return *this; +} + +CONSTCD14 +inline +year_month& +year_month::operator-=(const months& dm) NOEXCEPT +{ + *this = *this - dm; + return *this; +} + +CONSTCD14 +inline +year_month& +year_month::operator+=(const years& dy) NOEXCEPT +{ + *this = *this + dy; + return *this; +} + +CONSTCD14 +inline +year_month& +year_month::operator-=(const years& dy) NOEXCEPT +{ + *this = *this - dy; + return *this; +} + +CONSTCD11 +inline +bool +operator==(const year_month& x, const year_month& y) NOEXCEPT +{ + return x.year() == y.year() && x.month() == y.month(); +} + +CONSTCD11 +inline +bool +operator!=(const year_month& x, const year_month& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const year_month& x, const year_month& y) NOEXCEPT +{ + return x.year() < y.year() ? true + : (x.year() > y.year() ? 
false + : (x.month() < y.month())); +} + +CONSTCD11 +inline +bool +operator>(const year_month& x, const year_month& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const year_month& x, const year_month& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const year_month& x, const year_month& y) NOEXCEPT +{ + return !(x < y); +} + +CONSTCD14 +inline +year_month +operator+(const year_month& ym, const months& dm) NOEXCEPT +{ + auto dmi = static_cast(static_cast(ym.month())) - 1 + dm.count(); + auto dy = (dmi >= 0 ? dmi : dmi-11) / 12; + dmi = dmi - dy * 12 + 1; + return (ym.year() + years(dy)) / month(static_cast(dmi)); +} + +CONSTCD14 +inline +year_month +operator+(const months& dm, const year_month& ym) NOEXCEPT +{ + return ym + dm; +} + +CONSTCD14 +inline +year_month +operator-(const year_month& ym, const months& dm) NOEXCEPT +{ + return ym + -dm; +} + +CONSTCD11 +inline +months +operator-(const year_month& x, const year_month& y) NOEXCEPT +{ + return (x.year() - y.year()) + + months(static_cast(x.month()) - static_cast(y.month())); +} + +CONSTCD11 +inline +year_month +operator+(const year_month& ym, const years& dy) NOEXCEPT +{ + return (ym.year() + dy) / ym.month(); +} + +CONSTCD11 +inline +year_month +operator+(const years& dy, const year_month& ym) NOEXCEPT +{ + return ym + dy; +} + +CONSTCD11 +inline +year_month +operator-(const year_month& ym, const years& dy) NOEXCEPT +{ + return ym + -dy; +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month& ym) +{ + return os << ym.year() << '/' << ym.month(); +} + +// month_day + +CONSTCD11 +inline +month_day::month_day(const date::month& m, const date::day& d) NOEXCEPT + : m_(m) + , d_(d) + {} + +CONSTCD11 inline date::month month_day::month() const NOEXCEPT {return m_;} +CONSTCD11 inline date::day month_day::day() const NOEXCEPT {return d_;} + +CONSTCD14 +inline +bool +month_day::ok() const NOEXCEPT +{ + CONSTDATA date::day d[] 
= + { + date::day(31), date::day(29), date::day(31), + date::day(30), date::day(31), date::day(30), + date::day(31), date::day(31), date::day(30), + date::day(31), date::day(30), date::day(31) + }; + return m_.ok() && date::day{1} <= d_ && d_ <= d[static_cast(m_)-1]; +} + +CONSTCD11 +inline +bool +operator==(const month_day& x, const month_day& y) NOEXCEPT +{ + return x.month() == y.month() && x.day() == y.day(); +} + +CONSTCD11 +inline +bool +operator!=(const month_day& x, const month_day& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const month_day& x, const month_day& y) NOEXCEPT +{ + return x.month() < y.month() ? true + : (x.month() > y.month() ? false + : (x.day() < y.day())); +} + +CONSTCD11 +inline +bool +operator>(const month_day& x, const month_day& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const month_day& x, const month_day& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const month_day& x, const month_day& y) NOEXCEPT +{ + return !(x < y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_day& md) +{ + return os << md.month() << '/' << md.day(); +} + +// month_day_last + +CONSTCD11 inline month month_day_last::month() const NOEXCEPT {return m_;} +CONSTCD11 inline bool month_day_last::ok() const NOEXCEPT {return m_.ok();} +CONSTCD11 inline month_day_last::month_day_last(const date::month& m) NOEXCEPT : m_(m) {} + +CONSTCD11 +inline +bool +operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return x.month() == y.month(); +} + +CONSTCD11 +inline +bool +operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return x.month() < y.month(); +} + +CONSTCD11 +inline +bool +operator>(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return y < x; +} + 
+CONSTCD11 +inline +bool +operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT +{ + return !(x < y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_day_last& mdl) +{ + return os << mdl.month() << "/last"; +} + +// month_weekday + +CONSTCD11 +inline +month_weekday::month_weekday(const date::month& m, + const date::weekday_indexed& wdi) NOEXCEPT + : m_(m) + , wdi_(wdi) + {} + +CONSTCD11 inline month month_weekday::month() const NOEXCEPT {return m_;} + +CONSTCD11 +inline +weekday_indexed +month_weekday::weekday_indexed() const NOEXCEPT +{ + return wdi_; +} + +CONSTCD11 +inline +bool +month_weekday::ok() const NOEXCEPT +{ + return m_.ok() && wdi_.ok(); +} + +CONSTCD11 +inline +bool +operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT +{ + return x.month() == y.month() && x.weekday_indexed() == y.weekday_indexed(); +} + +CONSTCD11 +inline +bool +operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT +{ + return !(x == y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_weekday& mwd) +{ + return os << mwd.month() << '/' << mwd.weekday_indexed(); +} + +// month_weekday_last + +CONSTCD11 +inline +month_weekday_last::month_weekday_last(const date::month& m, + const date::weekday_last& wdl) NOEXCEPT + : m_(m) + , wdl_(wdl) + {} + +CONSTCD11 inline month month_weekday_last::month() const NOEXCEPT {return m_;} + +CONSTCD11 +inline +weekday_last +month_weekday_last::weekday_last() const NOEXCEPT +{ + return wdl_; +} + +CONSTCD11 +inline +bool +month_weekday_last::ok() const NOEXCEPT +{ + return m_.ok() && wdl_.ok(); +} + +CONSTCD11 +inline +bool +operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT +{ + return x.month() == y.month() && x.weekday_last() == y.weekday_last(); +} + +CONSTCD11 +inline 
+bool +operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT +{ + return !(x == y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const month_weekday_last& mwdl) +{ + return os << mwdl.month() << '/' << mwdl.weekday_last(); +} + +// year_month_day_last + +CONSTCD11 +inline +year_month_day_last::year_month_day_last(const date::year& y, + const date::month_day_last& mdl) NOEXCEPT + : y_(y) + , mdl_(mdl) + {} + +CONSTCD14 +inline +year_month_day_last& +year_month_day_last::operator+=(const months& m) NOEXCEPT +{ + *this = *this + m; + return *this; +} + +CONSTCD14 +inline +year_month_day_last& +year_month_day_last::operator-=(const months& m) NOEXCEPT +{ + *this = *this - m; + return *this; +} + +CONSTCD14 +inline +year_month_day_last& +year_month_day_last::operator+=(const years& y) NOEXCEPT +{ + *this = *this + y; + return *this; +} + +CONSTCD14 +inline +year_month_day_last& +year_month_day_last::operator-=(const years& y) NOEXCEPT +{ + *this = *this - y; + return *this; +} + +CONSTCD11 inline year year_month_day_last::year() const NOEXCEPT {return y_;} +CONSTCD11 inline month year_month_day_last::month() const NOEXCEPT {return mdl_.month();} + +CONSTCD11 +inline +month_day_last +year_month_day_last::month_day_last() const NOEXCEPT +{ + return mdl_; +} + +CONSTCD14 +inline +day +year_month_day_last::day() const NOEXCEPT +{ + CONSTDATA date::day d[] = + { + date::day(31), date::day(28), date::day(31), + date::day(30), date::day(31), date::day(30), + date::day(31), date::day(31), date::day(30), + date::day(31), date::day(30), date::day(31) + }; + return month() != feb || !y_.is_leap() ? 
+ d[static_cast(month()) - 1] : date::day{29}; +} + +CONSTCD14 +inline +year_month_day_last::operator sys_days() const NOEXCEPT +{ + return sys_days(year()/month()/day()); +} + +CONSTCD14 +inline +year_month_day_last::operator local_days() const NOEXCEPT +{ + return local_days(year()/month()/day()); +} + +CONSTCD11 +inline +bool +year_month_day_last::ok() const NOEXCEPT +{ + return y_.ok() && mdl_.ok(); +} + +CONSTCD11 +inline +bool +operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return x.year() == y.year() && x.month_day_last() == y.month_day_last(); +} + +CONSTCD11 +inline +bool +operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return x.year() < y.year() ? true + : (x.year() > y.year() ? false + : (x.month_day_last() < y.month_day_last())); +} + +CONSTCD11 +inline +bool +operator>(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT +{ + return !(x < y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_day_last& ymdl) +{ + return os << ymdl.year() << '/' << ymdl.month_day_last(); +} + +CONSTCD14 +inline +year_month_day_last +operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT +{ + return (ymdl.year() / ymdl.month() + dm) / last; +} + +CONSTCD14 +inline +year_month_day_last +operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT +{ + return ymdl + dm; +} + +CONSTCD14 +inline +year_month_day_last +operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT +{ + return ymdl + (-dm); +} + +CONSTCD11 
+inline +year_month_day_last +operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT +{ + return {ymdl.year()+dy, ymdl.month_day_last()}; +} + +CONSTCD11 +inline +year_month_day_last +operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT +{ + return ymdl + dy; +} + +CONSTCD11 +inline +year_month_day_last +operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT +{ + return ymdl + (-dy); +} + +// year_month_day + +CONSTCD11 +inline +year_month_day::year_month_day(const date::year& y, const date::month& m, + const date::day& d) NOEXCEPT + : y_(y) + , m_(m) + , d_(d) + {} + +CONSTCD14 +inline +year_month_day::year_month_day(const year_month_day_last& ymdl) NOEXCEPT + : y_(ymdl.year()) + , m_(ymdl.month()) + , d_(ymdl.day()) + {} + +CONSTCD14 +inline +year_month_day::year_month_day(sys_days dp) NOEXCEPT + : year_month_day(from_days(dp.time_since_epoch())) + {} + +CONSTCD14 +inline +year_month_day::year_month_day(local_days dp) NOEXCEPT + : year_month_day(from_days(dp.time_since_epoch())) + {} + +CONSTCD11 inline year year_month_day::year() const NOEXCEPT {return y_;} +CONSTCD11 inline month year_month_day::month() const NOEXCEPT {return m_;} +CONSTCD11 inline day year_month_day::day() const NOEXCEPT {return d_;} + +CONSTCD14 +inline +year_month_day& +year_month_day::operator+=(const months& m) NOEXCEPT +{ + *this = *this + m; + return *this; +} + +CONSTCD14 +inline +year_month_day& +year_month_day::operator-=(const months& m) NOEXCEPT +{ + *this = *this - m; + return *this; +} + +CONSTCD14 +inline +year_month_day& +year_month_day::operator+=(const years& y) NOEXCEPT +{ + *this = *this + y; + return *this; +} + +CONSTCD14 +inline +year_month_day& +year_month_day::operator-=(const years& y) NOEXCEPT +{ + *this = *this - y; + return *this; +} + +CONSTCD14 +inline +days +year_month_day::to_days() const NOEXCEPT +{ + static_assert(std::numeric_limits::digits >= 18, + "This algorithm has not been ported to a 16 bit unsigned integer"); 
+ static_assert(std::numeric_limits::digits >= 20, + "This algorithm has not been ported to a 16 bit signed integer"); + auto const y = static_cast(y_) - (m_ <= feb); + auto const m = static_cast(m_); + auto const d = static_cast(d_); + auto const era = (y >= 0 ? y : y-399) / 400; + auto const yoe = static_cast(y - era * 400); // [0, 399] + auto const doy = (153*(m > 2 ? m-3 : m+9) + 2)/5 + d-1; // [0, 365] + auto const doe = yoe * 365 + yoe/4 - yoe/100 + doy; // [0, 146096] + return days{era * 146097 + static_cast(doe) - 719468}; +} + +CONSTCD14 +inline +year_month_day::operator sys_days() const NOEXCEPT +{ + return sys_days{to_days()}; +} + +CONSTCD14 +inline +year_month_day::operator local_days() const NOEXCEPT +{ + return local_days{to_days()}; +} + +CONSTCD14 +inline +bool +year_month_day::ok() const NOEXCEPT +{ + if (!(y_.ok() && m_.ok())) + return false; + return date::day{1} <= d_ && d_ <= (y_ / m_ / last).day(); +} + +CONSTCD11 +inline +bool +operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return x.year() == y.year() && x.month() == y.month() && x.day() == y.day(); +} + +CONSTCD11 +inline +bool +operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return !(x == y); +} + +CONSTCD11 +inline +bool +operator<(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return x.year() < y.year() ? true + : (x.year() > y.year() ? false + : (x.month() < y.month() ? true + : (x.month() > y.month() ? 
false + : (x.day() < y.day())))); +} + +CONSTCD11 +inline +bool +operator>(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return y < x; +} + +CONSTCD11 +inline +bool +operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return !(y < x); +} + +CONSTCD11 +inline +bool +operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT +{ + return !(x < y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_day& ymd) +{ + detail::save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os << ymd.year() << '-'; + os.width(2); + os << static_cast(ymd.month()) << '-'; + os << ymd.day(); + if (!ymd.ok()) + os << " is not a valid date"; + return os; +} + +CONSTCD14 +inline +year_month_day +year_month_day::from_days(days dp) NOEXCEPT +{ + static_assert(std::numeric_limits::digits >= 18, + "This algorithm has not been ported to a 16 bit unsigned integer"); + static_assert(std::numeric_limits::digits >= 20, + "This algorithm has not been ported to a 16 bit signed integer"); + auto const z = dp.count() + 719468; + auto const era = (z >= 0 ? z : z - 146096) / 146097; + auto const doe = static_cast(z - era * 146097); // [0, 146096] + auto const yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365; // [0, 399] + auto const y = static_cast(yoe) + era * 400; + auto const doy = doe - (365*yoe + yoe/4 - yoe/100); // [0, 365] + auto const mp = (5*doy + 2)/153; // [0, 11] + auto const d = doy - (153*mp+2)/5 + 1; // [1, 31] + auto const m = mp < 10 ? 
mp+3 : mp-9; // [1, 12] + return year_month_day{date::year{y + (m <= 2)}, date::month(m), date::day(d)}; +} + +CONSTCD14 +inline +year_month_day +operator+(const year_month_day& ymd, const months& dm) NOEXCEPT +{ + return (ymd.year() / ymd.month() + dm) / ymd.day(); +} + +CONSTCD14 +inline +year_month_day +operator+(const months& dm, const year_month_day& ymd) NOEXCEPT +{ + return ymd + dm; +} + +CONSTCD14 +inline +year_month_day +operator-(const year_month_day& ymd, const months& dm) NOEXCEPT +{ + return ymd + (-dm); +} + +CONSTCD11 +inline +year_month_day +operator+(const year_month_day& ymd, const years& dy) NOEXCEPT +{ + return (ymd.year() + dy) / ymd.month() / ymd.day(); +} + +CONSTCD11 +inline +year_month_day +operator+(const years& dy, const year_month_day& ymd) NOEXCEPT +{ + return ymd + dy; +} + +CONSTCD11 +inline +year_month_day +operator-(const year_month_day& ymd, const years& dy) NOEXCEPT +{ + return ymd + (-dy); +} + +// year_month_weekday + +CONSTCD11 +inline +year_month_weekday::year_month_weekday(const date::year& y, const date::month& m, + const date::weekday_indexed& wdi) + NOEXCEPT + : y_(y) + , m_(m) + , wdi_(wdi) + {} + +CONSTCD14 +inline +year_month_weekday::year_month_weekday(const sys_days& dp) NOEXCEPT + : year_month_weekday(from_days(dp.time_since_epoch())) + {} + +CONSTCD14 +inline +year_month_weekday::year_month_weekday(const local_days& dp) NOEXCEPT + : year_month_weekday(from_days(dp.time_since_epoch())) + {} + +CONSTCD14 +inline +year_month_weekday& +year_month_weekday::operator+=(const months& m) NOEXCEPT +{ + *this = *this + m; + return *this; +} + +CONSTCD14 +inline +year_month_weekday& +year_month_weekday::operator-=(const months& m) NOEXCEPT +{ + *this = *this - m; + return *this; +} + +CONSTCD14 +inline +year_month_weekday& +year_month_weekday::operator+=(const years& y) NOEXCEPT +{ + *this = *this + y; + return *this; +} + +CONSTCD14 +inline +year_month_weekday& +year_month_weekday::operator-=(const years& y) NOEXCEPT +{ + 
*this = *this - y; + return *this; +} + +CONSTCD11 inline year year_month_weekday::year() const NOEXCEPT {return y_;} +CONSTCD11 inline month year_month_weekday::month() const NOEXCEPT {return m_;} + +CONSTCD11 +inline +weekday +year_month_weekday::weekday() const NOEXCEPT +{ + return wdi_.weekday(); +} + +CONSTCD11 +inline +unsigned +year_month_weekday::index() const NOEXCEPT +{ + return wdi_.index(); +} + +CONSTCD11 +inline +weekday_indexed +year_month_weekday::weekday_indexed() const NOEXCEPT +{ + return wdi_; +} + +CONSTCD14 +inline +year_month_weekday::operator sys_days() const NOEXCEPT +{ + return sys_days{to_days()}; +} + +CONSTCD14 +inline +year_month_weekday::operator local_days() const NOEXCEPT +{ + return local_days{to_days()}; +} + +CONSTCD14 +inline +bool +year_month_weekday::ok() const NOEXCEPT +{ + if (!y_.ok() || !m_.ok() || !wdi_.weekday().ok() || wdi_.index() < 1) + return false; + if (wdi_.index() <= 4) + return true; + auto d2 = wdi_.weekday() - date::weekday(static_cast(y_/m_/1)) + days((wdi_.index()-1)*7 + 1); + return static_cast(d2.count()) <= static_cast((y_/m_/last).day()); +} + +CONSTCD14 +inline +year_month_weekday +year_month_weekday::from_days(days d) NOEXCEPT +{ + sys_days dp{d}; + auto const wd = date::weekday(dp); + auto const ymd = year_month_day(dp); + return {ymd.year(), ymd.month(), wd[(static_cast(ymd.day())-1)/7+1]}; +} + +CONSTCD14 +inline +days +year_month_weekday::to_days() const NOEXCEPT +{ + auto d = sys_days(y_/m_/1); + return (d + (wdi_.weekday() - date::weekday(d) + days{(wdi_.index()-1)*7}) + ).time_since_epoch(); +} + +CONSTCD11 +inline +bool +operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT +{ + return x.year() == y.year() && x.month() == y.month() && + x.weekday_indexed() == y.weekday_indexed(); +} + +CONSTCD11 +inline +bool +operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT +{ + return !(x == y); +} + +template +inline +std::basic_ostream& 
+operator<<(std::basic_ostream& os, const year_month_weekday& ymwdi) +{ + return os << ymwdi.year() << '/' << ymwdi.month() + << '/' << ymwdi.weekday_indexed(); +} + +CONSTCD14 +inline +year_month_weekday +operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT +{ + return (ymwd.year() / ymwd.month() + dm) / ymwd.weekday_indexed(); +} + +CONSTCD14 +inline +year_month_weekday +operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT +{ + return ymwd + dm; +} + +CONSTCD14 +inline +year_month_weekday +operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT +{ + return ymwd + (-dm); +} + +CONSTCD11 +inline +year_month_weekday +operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT +{ + return {ymwd.year()+dy, ymwd.month(), ymwd.weekday_indexed()}; +} + +CONSTCD11 +inline +year_month_weekday +operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT +{ + return ymwd + dy; +} + +CONSTCD11 +inline +year_month_weekday +operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT +{ + return ymwd + (-dy); +} + +// year_month_weekday_last + +CONSTCD11 +inline +year_month_weekday_last::year_month_weekday_last(const date::year& y, + const date::month& m, + const date::weekday_last& wdl) NOEXCEPT + : y_(y) + , m_(m) + , wdl_(wdl) + {} + +CONSTCD14 +inline +year_month_weekday_last& +year_month_weekday_last::operator+=(const months& m) NOEXCEPT +{ + *this = *this + m; + return *this; +} + +CONSTCD14 +inline +year_month_weekday_last& +year_month_weekday_last::operator-=(const months& m) NOEXCEPT +{ + *this = *this - m; + return *this; +} + +CONSTCD14 +inline +year_month_weekday_last& +year_month_weekday_last::operator+=(const years& y) NOEXCEPT +{ + *this = *this + y; + return *this; +} + +CONSTCD14 +inline +year_month_weekday_last& +year_month_weekday_last::operator-=(const years& y) NOEXCEPT +{ + *this = *this - y; + return *this; +} + +CONSTCD11 inline year year_month_weekday_last::year() const NOEXCEPT {return 
y_;} +CONSTCD11 inline month year_month_weekday_last::month() const NOEXCEPT {return m_;} + +CONSTCD11 +inline +weekday +year_month_weekday_last::weekday() const NOEXCEPT +{ + return wdl_.weekday(); +} + +CONSTCD11 +inline +weekday_last +year_month_weekday_last::weekday_last() const NOEXCEPT +{ + return wdl_; +} + +CONSTCD14 +inline +year_month_weekday_last::operator sys_days() const NOEXCEPT +{ + return sys_days{to_days()}; +} + +CONSTCD14 +inline +year_month_weekday_last::operator local_days() const NOEXCEPT +{ + return local_days{to_days()}; +} + +CONSTCD11 +inline +bool +year_month_weekday_last::ok() const NOEXCEPT +{ + return y_.ok() && m_.ok() && wdl_.ok(); +} + +CONSTCD14 +inline +days +year_month_weekday_last::to_days() const NOEXCEPT +{ + auto const d = sys_days(y_/m_/last); + return (d - (date::weekday{d} - wdl_.weekday())).time_since_epoch(); +} + +CONSTCD11 +inline +bool +operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT +{ + return x.year() == y.year() && x.month() == y.month() && + x.weekday_last() == y.weekday_last(); +} + +CONSTCD11 +inline +bool +operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT +{ + return !(x == y); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const year_month_weekday_last& ymwdl) +{ + return os << ymwdl.year() << '/' << ymwdl.month() << '/' << ymwdl.weekday_last(); +} + +CONSTCD14 +inline +year_month_weekday_last +operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT +{ + return (ymwdl.year() / ymwdl.month() + dm) / ymwdl.weekday_last(); +} + +CONSTCD14 +inline +year_month_weekday_last +operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT +{ + return ymwdl + dm; +} + +CONSTCD14 +inline +year_month_weekday_last +operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT +{ + return ymwdl + (-dm); +} + +CONSTCD11 +inline +year_month_weekday_last +operator+(const 
year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT +{ + return {ymwdl.year()+dy, ymwdl.month(), ymwdl.weekday_last()}; +} + +CONSTCD11 +inline +year_month_weekday_last +operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT +{ + return ymwdl + dy; +} + +CONSTCD11 +inline +year_month_weekday_last +operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT +{ + return ymwdl + (-dy); +} + +// year_month from operator/() + +CONSTCD11 +inline +year_month +operator/(const year& y, const month& m) NOEXCEPT +{ + return {y, m}; +} + +CONSTCD11 +inline +year_month +operator/(const year& y, int m) NOEXCEPT +{ + return y / month(static_cast(m)); +} + +// month_day from operator/() + +CONSTCD11 +inline +month_day +operator/(const month& m, const day& d) NOEXCEPT +{ + return {m, d}; +} + +CONSTCD11 +inline +month_day +operator/(const day& d, const month& m) NOEXCEPT +{ + return m / d; +} + +CONSTCD11 +inline +month_day +operator/(const month& m, int d) NOEXCEPT +{ + return m / day(static_cast(d)); +} + +CONSTCD11 +inline +month_day +operator/(int m, const day& d) NOEXCEPT +{ + return month(static_cast(m)) / d; +} + +CONSTCD11 inline month_day operator/(const day& d, int m) NOEXCEPT {return m / d;} + +// month_day_last from operator/() + +CONSTCD11 +inline +month_day_last +operator/(const month& m, last_spec) NOEXCEPT +{ + return month_day_last{m}; +} + +CONSTCD11 +inline +month_day_last +operator/(last_spec, const month& m) NOEXCEPT +{ + return m/last; +} + +CONSTCD11 +inline +month_day_last +operator/(int m, last_spec) NOEXCEPT +{ + return month(static_cast(m))/last; +} + +CONSTCD11 +inline +month_day_last +operator/(last_spec, int m) NOEXCEPT +{ + return m/last; +} + +// month_weekday from operator/() + +CONSTCD11 +inline +month_weekday +operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT +{ + return {m, wdi}; +} + +CONSTCD11 +inline +month_weekday +operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT +{ + return m 
/ wdi; +} + +CONSTCD11 +inline +month_weekday +operator/(int m, const weekday_indexed& wdi) NOEXCEPT +{ + return month(static_cast(m)) / wdi; +} + +CONSTCD11 +inline +month_weekday +operator/(const weekday_indexed& wdi, int m) NOEXCEPT +{ + return m / wdi; +} + +// month_weekday_last from operator/() + +CONSTCD11 +inline +month_weekday_last +operator/(const month& m, const weekday_last& wdl) NOEXCEPT +{ + return {m, wdl}; +} + +CONSTCD11 +inline +month_weekday_last +operator/(const weekday_last& wdl, const month& m) NOEXCEPT +{ + return m / wdl; +} + +CONSTCD11 +inline +month_weekday_last +operator/(int m, const weekday_last& wdl) NOEXCEPT +{ + return month(static_cast(m)) / wdl; +} + +CONSTCD11 +inline +month_weekday_last +operator/(const weekday_last& wdl, int m) NOEXCEPT +{ + return m / wdl; +} + +// year_month_day from operator/() + +CONSTCD11 +inline +year_month_day +operator/(const year_month& ym, const day& d) NOEXCEPT +{ + return {ym.year(), ym.month(), d}; +} + +CONSTCD11 +inline +year_month_day +operator/(const year_month& ym, int d) NOEXCEPT +{ + return ym / day(static_cast(d)); +} + +CONSTCD11 +inline +year_month_day +operator/(const year& y, const month_day& md) NOEXCEPT +{ + return y / md.month() / md.day(); +} + +CONSTCD11 +inline +year_month_day +operator/(int y, const month_day& md) NOEXCEPT +{ + return year(y) / md; +} + +CONSTCD11 +inline +year_month_day +operator/(const month_day& md, const year& y) NOEXCEPT +{ + return y / md; +} + +CONSTCD11 +inline +year_month_day +operator/(const month_day& md, int y) NOEXCEPT +{ + return year(y) / md; +} + +// year_month_day_last from operator/() + +CONSTCD11 +inline +year_month_day_last +operator/(const year_month& ym, last_spec) NOEXCEPT +{ + return {ym.year(), month_day_last{ym.month()}}; +} + +CONSTCD11 +inline +year_month_day_last +operator/(const year& y, const month_day_last& mdl) NOEXCEPT +{ + return {y, mdl}; +} + +CONSTCD11 +inline +year_month_day_last +operator/(int y, const month_day_last& mdl) 
NOEXCEPT +{ + return year(y) / mdl; +} + +CONSTCD11 +inline +year_month_day_last +operator/(const month_day_last& mdl, const year& y) NOEXCEPT +{ + return y / mdl; +} + +CONSTCD11 +inline +year_month_day_last +operator/(const month_day_last& mdl, int y) NOEXCEPT +{ + return year(y) / mdl; +} + +// year_month_weekday from operator/() + +CONSTCD11 +inline +year_month_weekday +operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT +{ + return {ym.year(), ym.month(), wdi}; +} + +CONSTCD11 +inline +year_month_weekday +operator/(const year& y, const month_weekday& mwd) NOEXCEPT +{ + return {y, mwd.month(), mwd.weekday_indexed()}; +} + +CONSTCD11 +inline +year_month_weekday +operator/(int y, const month_weekday& mwd) NOEXCEPT +{ + return year(y) / mwd; +} + +CONSTCD11 +inline +year_month_weekday +operator/(const month_weekday& mwd, const year& y) NOEXCEPT +{ + return y / mwd; +} + +CONSTCD11 +inline +year_month_weekday +operator/(const month_weekday& mwd, int y) NOEXCEPT +{ + return year(y) / mwd; +} + +// year_month_weekday_last from operator/() + +CONSTCD11 +inline +year_month_weekday_last +operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT +{ + return {ym.year(), ym.month(), wdl}; +} + +CONSTCD11 +inline +year_month_weekday_last +operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT +{ + return {y, mwdl.month(), mwdl.weekday_last()}; +} + +CONSTCD11 +inline +year_month_weekday_last +operator/(int y, const month_weekday_last& mwdl) NOEXCEPT +{ + return year(y) / mwdl; +} + +CONSTCD11 +inline +year_month_weekday_last +operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT +{ + return y / mwdl; +} + +CONSTCD11 +inline +year_month_weekday_last +operator/(const month_weekday_last& mwdl, int y) NOEXCEPT +{ + return year(y) / mwdl; +} + +template +struct fields; + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const fields& fds, const std::string* abbrev = nullptr, + const 
std::chrono::seconds* offset_sec = nullptr); + +template +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + fields& fds, std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr); + +// time_of_day + +enum {am = 1, pm}; + +namespace detail +{ + +// width::value is the number of fractional decimal digits in 1/n +// width<0>::value and width<1>::value are defined to be 0 +// If 1/n takes more than 18 fractional decimal digits, +// the result is truncated to 19. +// Example: width<2>::value == 1 +// Example: width<3>::value == 19 +// Example: width<4>::value == 2 +// Example: width<10>::value == 1 +// Example: width<1000>::value == 3 +template +struct width +{ + static CONSTDATA unsigned value = 1 + width::value; +}; + +template +struct width +{ + static CONSTDATA unsigned value = 0; +}; + +template +struct static_pow10 +{ +private: + static CONSTDATA std::uint64_t h = static_pow10::value; +public: + static CONSTDATA std::uint64_t value = h * h * (exp % 2 ? 
10 : 1); +}; + +template <> +struct static_pow10<0> +{ + static CONSTDATA std::uint64_t value = 1; +}; + +template +struct make_precision +{ + using type = std::chrono::duration::value>>; + static CONSTDATA unsigned width = w; +}; + +template +struct make_precision +{ + using type = std::chrono::duration; + static CONSTDATA unsigned width = 6; +}; + +template ::type::period::den>::value> +class decimal_format_seconds +{ +public: + using rep = typename std::common_type::type::rep; + using precision = typename make_precision::type; + static auto CONSTDATA width = make_precision::width; + +private: + std::chrono::seconds s_; + precision sub_s_; + +public: + CONSTCD11 decimal_format_seconds() + : s_() + , sub_s_() + {} + + CONSTCD11 explicit decimal_format_seconds(const Duration& d) NOEXCEPT + : s_(std::chrono::duration_cast(d)) + , sub_s_(std::chrono::duration_cast(d - s_)) + {} + + CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;} + CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;} + CONSTCD11 precision subseconds() const NOEXCEPT {return sub_s_;} + + CONSTCD14 precision to_duration() const NOEXCEPT + { + return s_ + sub_s_; + } + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + using namespace std::chrono; + return sub_s_ < std::chrono::seconds{1} && s_ < minutes{1}; + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const decimal_format_seconds& x) + { + date::detail::save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << x.s_.count() << + std::use_facet>(os.getloc()).decimal_point(); + os.width(width); + os << static_cast(x.sub_s_.count()); + return os; + } +}; + +template +class decimal_format_seconds +{ + static CONSTDATA unsigned w = 0; +public: + using rep = typename std::common_type::type::rep; + using precision = std::chrono::duration; + static auto CONSTDATA width = make_precision::width; +private: + + std::chrono::seconds s_; + 
+public: + CONSTCD11 decimal_format_seconds() : s_() {} + CONSTCD11 explicit decimal_format_seconds(const precision& s) NOEXCEPT + : s_(s) + {} + + CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;} + CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;} + CONSTCD14 precision to_duration() const NOEXCEPT {return s_;} + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + using namespace std::chrono; + return s_ < minutes{1}; + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const decimal_format_seconds& x) + { + date::detail::save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << x.s_.count(); + return os; + } +}; + +enum class classify +{ + not_valid, + hour, + minute, + second, + subsecond +}; + +template +struct classify_duration +{ + static CONSTDATA classify value = + std::is_convertible::value + ? classify::hour : + std::is_convertible::value + ? classify::minute : + std::is_convertible::value + ? classify::second : + std::chrono::treat_as_floating_point::value + ? classify::not_valid : + classify::subsecond; +}; + +template +inline +CONSTCD11 +typename std::enable_if + < + std::numeric_limits::is_signed, + std::chrono::duration + >::type +abs(std::chrono::duration d) +{ + return d >= d.zero() ? 
d : -d; +} + +template +inline +CONSTCD11 +typename std::enable_if + < + !std::numeric_limits::is_signed, + std::chrono::duration + >::type +abs(std::chrono::duration d) +{ + return d; +} + +class time_of_day_base +{ +protected: + std::chrono::hours h_; + unsigned char mode_; + bool neg_; + + enum {is24hr}; + + CONSTCD11 time_of_day_base() NOEXCEPT + : h_(0) + , mode_(static_cast(is24hr)) + , neg_(false) + {} + + + CONSTCD11 time_of_day_base(std::chrono::hours h, bool neg, unsigned m) NOEXCEPT + : h_(detail::abs(h)) + , mode_(static_cast(m)) + , neg_(neg) + {} + + CONSTCD14 void make24() NOEXCEPT; + CONSTCD14 void make12() NOEXCEPT; + + CONSTCD14 std::chrono::hours to24hr() const; + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + return !neg_ && h_ < days{1}; + } +}; + +CONSTCD14 +inline +std::chrono::hours +time_of_day_base::to24hr() const +{ + auto h = h_; + if (mode_ == am || mode_ == pm) + { + CONSTDATA auto h12 = std::chrono::hours(12); + if (mode_ == pm) + { + if (h != h12) + h = h + h12; + } + else if (h == h12) + h = std::chrono::hours(0); + } + return h; +} + +CONSTCD14 +inline +void +time_of_day_base::make24() NOEXCEPT +{ + h_ = to24hr(); + mode_ = is24hr; +} + +CONSTCD14 +inline +void +time_of_day_base::make12() NOEXCEPT +{ + if (mode_ == is24hr) + { + CONSTDATA auto h12 = std::chrono::hours(12); + if (h_ >= h12) + { + if (h_ > h12) + h_ = h_ - h12; + mode_ = pm; + } + else + { + if (h_ == std::chrono::hours(0)) + h_ = h12; + mode_ = am; + } + } +} + +template ::value> +class time_of_day_storage; + +template +class time_of_day_storage, detail::classify::hour> + : private detail::time_of_day_base +{ + using base = detail::time_of_day_base; + +public: + using precision = std::chrono::hours; + +#if !defined(_MSC_VER) || _MSC_VER >= 1900 + CONSTCD11 time_of_day_storage() NOEXCEPT = default; +#else + CONSTCD11 time_of_day_storage() = default; +#endif /* !defined(_MSC_VER) || _MSC_VER >= 1900 */ + + CONSTCD11 explicit 
time_of_day_storage(std::chrono::hours since_midnight) NOEXCEPT + : base(since_midnight, since_midnight < std::chrono::hours{0}, is24hr) + {} + + CONSTCD11 explicit time_of_day_storage(std::chrono::hours h, unsigned md) NOEXCEPT + : base(h, h < std::chrono::hours{0}, md) + {} + + CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} + CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} + + CONSTCD14 explicit operator precision() const NOEXCEPT + { + auto p = to24hr(); + if (neg_) + p = -p; + return p; + } + + CONSTCD14 precision to_duration() const NOEXCEPT + { + return static_cast(*this); + } + + CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} + CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + return base::in_conventional_range(); + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const time_of_day_storage& t) + { + using namespace std; + detail::save_stream _(os); + if (t.neg_) + os << '-'; + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + if (t.mode_ != am && t.mode_ != pm) + os.width(2); + os << t.h_.count(); + switch (t.mode_) + { + case time_of_day_storage::is24hr: + os << "00"; + break; + case am: + os << "am"; + break; + case pm: + os << "pm"; + break; + } + return os; + } +}; + +template +class time_of_day_storage, detail::classify::minute> + : private detail::time_of_day_base +{ + using base = detail::time_of_day_base; + + std::chrono::minutes m_; + +public: + using precision = std::chrono::minutes; + + CONSTCD11 time_of_day_storage() NOEXCEPT + : base() + , m_(0) + {} + + CONSTCD11 explicit time_of_day_storage(std::chrono::minutes since_midnight) NOEXCEPT + : base(std::chrono::duration_cast(since_midnight), + since_midnight < std::chrono::minutes{0}, is24hr) + , m_(detail::abs(since_midnight) - h_) + {} + + CONSTCD11 explicit time_of_day_storage(std::chrono::hours 
h, std::chrono::minutes m, + unsigned md) NOEXCEPT + : base(h, false, md) + , m_(m) + {} + + CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} + CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} + CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} + + CONSTCD14 explicit operator precision() const NOEXCEPT + { + auto p = to24hr() + m_; + if (neg_) + p = -p; + return p; + } + + CONSTCD14 precision to_duration() const NOEXCEPT + { + return static_cast(*this); + } + + CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} + CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + return base::in_conventional_range() && m_ < std::chrono::hours{1}; + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const time_of_day_storage& t) + { + using namespace std; + detail::save_stream _(os); + if (t.neg_) + os << '-'; + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + if (t.mode_ != am && t.mode_ != pm) + os.width(2); + os << t.h_.count() << ':'; + os.width(2); + os << t.m_.count(); + switch (t.mode_) + { + case am: + os << "am"; + break; + case pm: + os << "pm"; + break; + } + return os; + } +}; + +template +class time_of_day_storage, detail::classify::second> + : private detail::time_of_day_base +{ + using base = detail::time_of_day_base; + using dfs = decimal_format_seconds; + + std::chrono::minutes m_; + dfs s_; + +public: + using precision = std::chrono::seconds; + + CONSTCD11 time_of_day_storage() NOEXCEPT + : base() + , m_(0) + , s_() + {} + + CONSTCD11 explicit time_of_day_storage(std::chrono::seconds since_midnight) NOEXCEPT + : base(std::chrono::duration_cast(since_midnight), + since_midnight < std::chrono::seconds{0}, is24hr) + , m_(std::chrono::duration_cast(detail::abs(since_midnight) - h_)) + , s_(detail::abs(since_midnight) - h_ - m_) + {} + + CONSTCD11 explicit 
time_of_day_storage(std::chrono::hours h, std::chrono::minutes m, + std::chrono::seconds s, unsigned md) NOEXCEPT + : base(h, false, md) + , m_(m) + , s_(s) + {} + + CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} + CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} + CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_.seconds();} + CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();} + CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} + + CONSTCD14 explicit operator precision() const NOEXCEPT + { + auto p = to24hr() + s_.to_duration() + m_; + if (neg_) + p = -p; + return p; + } + + CONSTCD14 precision to_duration() const NOEXCEPT + { + return static_cast(*this); + } + + CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} + CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + return base::in_conventional_range() && m_ < std::chrono::hours{1} && + s_.in_conventional_range(); + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const time_of_day_storage& t) + { + using namespace std; + detail::save_stream _(os); + if (t.neg_) + os << '-'; + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + if (t.mode_ != am && t.mode_ != pm) + os.width(2); + os << t.h_.count() << ':'; + os.width(2); + os << t.m_.count() << ':' << t.s_; + switch (t.mode_) + { + case am: + os << "am"; + break; + case pm: + os << "pm"; + break; + } + return os; + } + + template + friend + std::basic_ostream& + date::to_stream(std::basic_ostream& os, const CharT* fmt, + const fields& fds, const std::string* abbrev, + const std::chrono::seconds* offset_sec); + + template + friend + std::basic_istream& + date::from_stream(std::basic_istream& is, const CharT* fmt, + fields& fds, + std::basic_string* abbrev, std::chrono::minutes* offset); +}; + +template +class 
time_of_day_storage, detail::classify::subsecond> + : private detail::time_of_day_base +{ +public: + using Duration = std::chrono::duration; + using dfs = decimal_format_seconds::type>; + using precision = typename dfs::precision; + +private: + using base = detail::time_of_day_base; + + std::chrono::minutes m_; + dfs s_; + +public: + CONSTCD11 time_of_day_storage() NOEXCEPT + : base() + , m_(0) + , s_() + {} + + CONSTCD11 explicit time_of_day_storage(Duration since_midnight) NOEXCEPT + : base(date::trunc(since_midnight), + since_midnight < Duration{0}, is24hr) + , m_(date::trunc(detail::abs(since_midnight) - h_)) + , s_(detail::abs(since_midnight) - h_ - m_) + {} + + CONSTCD11 explicit time_of_day_storage(std::chrono::hours h, std::chrono::minutes m, + std::chrono::seconds s, precision sub_s, + unsigned md) NOEXCEPT + : base(h, false, md) + , m_(m) + , s_(s + sub_s) + {} + + CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} + CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} + CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_.seconds();} + CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();} + CONSTCD11 precision subseconds() const NOEXCEPT {return s_.subseconds();} + CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} + + CONSTCD14 explicit operator precision() const NOEXCEPT + { + auto p = to24hr() + s_.to_duration() + m_; + if (neg_) + p = -p; + return p; + } + + CONSTCD14 precision to_duration() const NOEXCEPT + { + return static_cast(*this); + } + + CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} + CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} + + CONSTCD11 bool in_conventional_range() const NOEXCEPT + { + return base::in_conventional_range() && m_ < std::chrono::hours{1} && + s_.in_conventional_range(); + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const time_of_day_storage& t) 
+ { + using namespace std; + detail::save_stream _(os); + if (t.neg_) + os << '-'; + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + if (t.mode_ != am && t.mode_ != pm) + os.width(2); + os << t.h_.count() << ':'; + os.width(2); + os << t.m_.count() << ':' << t.s_; + switch (t.mode_) + { + case am: + os << "am"; + break; + case pm: + os << "pm"; + break; + } + return os; + } + + template + friend + std::basic_ostream& + date::to_stream(std::basic_ostream& os, const CharT* fmt, + const fields& fds, const std::string* abbrev, + const std::chrono::seconds* offset_sec); + + template + friend + std::basic_istream& + date::from_stream(std::basic_istream& is, const CharT* fmt, + fields& fds, + std::basic_string* abbrev, std::chrono::minutes* offset); +}; + +} // namespace detail + +template +class time_of_day + : public detail::time_of_day_storage +{ + using base = detail::time_of_day_storage; +public: +#if !defined(_MSC_VER) || _MSC_VER >= 1900 + CONSTCD11 time_of_day() NOEXCEPT = default; +#else + CONSTCD11 time_of_day() = default; +#endif /* !defined(_MSC_VER) || _MSC_VER >= 1900 */ + + CONSTCD11 explicit time_of_day(Duration since_midnight) NOEXCEPT + : base(since_midnight) + {} + + template + CONSTCD11 + explicit time_of_day(Arg0&& arg0, Arg1&& arg1, Args&& ...args) NOEXCEPT + : base(std::forward(arg0), std::forward(arg1), std::forward(args)...) 
+ {} +}; + +template ::value>::type> +CONSTCD11 +inline +time_of_day> +make_time(const std::chrono::duration& d) +{ + return time_of_day>(d); +} + +CONSTCD11 +inline +time_of_day +make_time(const std::chrono::hours& h, unsigned md) +{ + return time_of_day(h, md); +} + +CONSTCD11 +inline +time_of_day +make_time(const std::chrono::hours& h, const std::chrono::minutes& m, + unsigned md) +{ + return time_of_day(h, m, md); +} + +CONSTCD11 +inline +time_of_day +make_time(const std::chrono::hours& h, const std::chrono::minutes& m, + const std::chrono::seconds& s, unsigned md) +{ + return time_of_day(h, m, s, md); +} + +template >::value>::type> +CONSTCD11 +inline +time_of_day> +make_time(const std::chrono::hours& h, const std::chrono::minutes& m, + const std::chrono::seconds& s, const std::chrono::duration& sub_s, + unsigned md) +{ + return time_of_day>(h, m, s, sub_s, md); +} + +template +inline +typename std::enable_if +< + !std::chrono::treat_as_floating_point::value && + std::ratio_less::value + , std::basic_ostream& +>::type +operator<<(std::basic_ostream& os, const sys_time& tp) +{ + auto const dp = date::floor(tp); + return os << year_month_day(dp) << ' ' << make_time(tp-dp); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const sys_days& dp) +{ + return os << year_month_day(dp); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const local_time& ut) +{ + return (os << sys_time{ut.time_since_epoch()}); +} + +// to_stream + +template +struct fields +{ + year_month_day ymd{year{0}/0/0}; + weekday wd{7u}; + time_of_day tod{}; + + fields() = default; + + fields(year_month_day ymd_) : ymd(ymd_) {} + fields(weekday wd_) : wd(wd_) {} + fields(time_of_day tod_) : tod(tod_) {} + + fields(year_month_day ymd_, weekday wd_) : ymd(ymd_), wd(wd_) {} + fields(year_month_day ymd_, time_of_day tod_) : ymd(ymd_), tod(tod_) {} + + fields(weekday wd_, time_of_day tod_) : wd(wd_), tod(tod_) {} + + fields(year_month_day ymd_, 
weekday wd_, time_of_day tod_) + : ymd(ymd_) + , wd(wd_) + , tod(tod_) + {} +}; + +namespace detail +{ + +template +unsigned +extract_weekday(std::basic_ostream& os, const fields& fds) +{ + if (!fds.ymd.ok() && !fds.wd.ok()) + { + // fds does not contain a valid weekday + os.setstate(std::ios::failbit); + return 7; + } + unsigned wd; + if (fds.ymd.ok()) + { + wd = static_cast(weekday{fds.ymd}); + if (fds.wd.ok() && wd != static_cast(fds.wd)) + { + // fds.ymd and fds.wd are inconsistent + os.setstate(std::ios::failbit); + return 7; + } + } + else + wd = static_cast(fds.wd); + return wd; +} + +template +unsigned +extract_month(std::basic_ostream& os, const fields& fds) +{ + if (!fds.ymd.month().ok()) + { + // fds does not contain a valid month + os.setstate(std::ios::failbit); + return 0; + } + return static_cast(fds.ymd.month()); +} + +} // namespace detail + +#if ONLY_C_LOCALE + +namespace detail +{ + +inline +std::pair +weekday_names() +{ + using namespace std; + static const string nm[] = + { + "Sunday", + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sun", + "Mon", + "Tue", + "Wed", + "Thu", + "Fri", + "Sat" + }; + return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); +} + +inline +std::pair +month_names() +{ + using namespace std; + static const string nm[] = + { + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec" + }; + return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); +} + +inline +std::pair +ampm_names() +{ + using namespace std; + static const string nm[] = + { + "AM", + "PM" + }; + return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); +} + +template +FwdIter +scan_keyword(std::basic_istream& is, FwdIter kb, FwdIter ke) +{ + using namespace std; + size_t nkw = static_cast(std::distance(kb, ke)); + const unsigned char doesnt_match 
= '\0'; + const unsigned char might_match = '\1'; + const unsigned char does_match = '\2'; + unsigned char statbuf[100]; + unsigned char* status = statbuf; + unique_ptr stat_hold(0, free); + if (nkw > sizeof(statbuf)) + { + status = (unsigned char*)malloc(nkw); + if (status == nullptr) + throw bad_alloc(); + stat_hold.reset(status); + } + size_t n_might_match = nkw; // At this point, any keyword might match + size_t n_does_match = 0; // but none of them definitely do + // Initialize all statuses to might_match, except for "" keywords are does_match + unsigned char* st = status; + for (auto ky = kb; ky != ke; ++ky, ++st) + { + if (!ky->empty()) + *st = might_match; + else + { + *st = does_match; + --n_might_match; + ++n_does_match; + } + } + // While there might be a match, test keywords against the next CharT + for (size_t indx = 0; is && n_might_match > 0; ++indx) + { + // Peek at the next CharT but don't consume it + auto ic = is.peek(); + if (ic == EOF) + { + is.setstate(ios::eofbit); + break; + } + auto c = static_cast(toupper(ic)); + bool consume = false; + // For each keyword which might match, see if the indx character is c + // If a match if found, consume c + // If a match is found, and that is the last character in the keyword, + // then that keyword matches. + // If the keyword doesn't match this character, then change the keyword + // to doesn't match + st = status; + for (auto ky = kb; ky != ke; ++ky, ++st) + { + if (*st == might_match) + { + if (c == static_cast(toupper((*ky)[indx]))) + { + consume = true; + if (ky->size() == indx+1) + { + *st = does_match; + --n_might_match; + ++n_does_match; + } + } + else + { + *st = doesnt_match; + --n_might_match; + } + } + } + // consume if we matched a character + if (consume) + { + (void)is.get(); + // If we consumed a character and there might be a matched keyword that + // was marked matched on a previous iteration, then such keywords + // are now marked as not matching. 
+ if (n_might_match + n_does_match > 1) + { + st = status; + for (auto ky = kb; ky != ke; ++ky, ++st) + { + if (*st == does_match && ky->size() != indx+1) + { + *st = doesnt_match; + --n_does_match; + } + } + } + } + } + // We've exited the loop because we hit eof and/or we have no more "might matches". + // Return the first matching result + for (st = status; kb != ke; ++kb, ++st) + if (*st == does_match) + break; + if (kb == ke) + is.setstate(ios_base::failbit); + return kb; +} + +} // namespace detail + +#endif // ONLY_C_LOCALE + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const fields& fds, const std::string* abbrev, + const std::chrono::seconds* offset_sec) +{ + using namespace std; + using namespace std::chrono; + using namespace detail; + tm tm{}; +#if !ONLY_C_LOCALE + auto& facet = use_facet>(os.getloc()); +#endif + const CharT* command = nullptr; + CharT modified = CharT{}; + for (; *fmt; ++fmt) + { + switch (*fmt) + { + case 'a': + case 'A': + if (command) + { + if (modified == CharT{}) + { + tm.tm_wday = static_cast(extract_weekday(os, fds)); + if (os.fail()) + return os; +#if !ONLY_C_LOCALE + const CharT f[] = {'%', *fmt}; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); +#else // ONLY_C_LOCALE + os << weekday_names().first[tm.tm_wday+7*(*fmt == 'a')]; +#endif // ONLY_C_LOCALE + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'b': + case 'B': + case 'h': + if (command) + { + if (modified == CharT{}) + { + tm.tm_mon = static_cast(extract_month(os, fds)) - 1; +#if !ONLY_C_LOCALE + const CharT f[] = {'%', *fmt}; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); +#else // ONLY_C_LOCALE + os << month_names().first[tm.tm_mon+12*(*fmt == 'b')]; +#endif // ONLY_C_LOCALE + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'c': + 
case 'x': + if (command) + { + if (modified == CharT{'O'}) + os << CharT{'%'} << modified << *fmt; + else + { +#if !ONLY_C_LOCALE + tm = std::tm{}; + auto const& ymd = fds.ymd; + auto ld = local_days(ymd); + tm.tm_sec = static_cast(fds.tod.seconds().count()); + tm.tm_min = static_cast(fds.tod.minutes().count()); + tm.tm_hour = static_cast(fds.tod.hours().count()); + tm.tm_mday = static_cast(static_cast(ymd.day())); + tm.tm_mon = static_cast(extract_month(os, fds) - 1); + tm.tm_year = static_cast(ymd.year()) - 1900; + tm.tm_wday = static_cast(extract_weekday(os, fds)); + if (os.fail()) + return os; + tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); + CharT f[3] = {'%'}; + auto fe = begin(f) + 1; + if (modified == CharT{'E'}) + *fe++ = modified; + *fe++ = *fmt; + facet.put(os, os, os.fill(), &tm, begin(f), fe); +#else // ONLY_C_LOCALE + if (*fmt == 'c') + { + auto wd = static_cast(extract_weekday(os, fds)); + os << weekday_names().first[static_cast(wd)+7] + << ' '; + os << month_names().first[extract_month(os, fds)-1+12] << ' '; + auto d = static_cast(static_cast(fds.ymd.day())); + if (d < 10) + os << ' '; + os << d << ' ' + << make_time(duration_cast(fds.tod.to_duration())) + << ' ' << fds.ymd.year(); + + } + else // *fmt == 'x' + { + auto const& ymd = fds.ymd; + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << static_cast(ymd.month()) << CharT{'/'}; + os.width(2); + os << static_cast(ymd.day()) << CharT{'/'}; + os.width(2); + os << static_cast(ymd.year()) % 100; + } +#endif // ONLY_C_LOCALE + } + command = nullptr; + modified = CharT{}; + } + else + os << *fmt; + break; + case 'C': + if (command) + { + auto y = static_cast(fds.ymd.year()); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + if (y >= 0) + { + os.width(2); + os << y/100; + } + else + { + os << CharT{'-'}; + os.width(2); + os << 
-(y-99)/100; + } +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'E'}) + { + tm.tm_year = y - 1900; + CharT f[3] = {'%', 'E', 'C'}; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + command = nullptr; + modified = CharT{}; + } + else + os << *fmt; + break; + case 'd': + case 'e': + if (command) + { + auto d = static_cast(static_cast(fds.ymd.day())); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + save_stream _(os); + if (*fmt == CharT{'d'}) + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << d; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + tm.tm_mday = d; + CharT f[3] = {'%', 'O', *fmt}; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + command = nullptr; + modified = CharT{}; + } + else + os << *fmt; + break; + case 'D': + if (command) + { + if (modified == CharT{}) + { + auto const& ymd = fds.ymd; + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << static_cast(ymd.month()) << CharT{'/'}; + os.width(2); + os << static_cast(ymd.day()) << CharT{'/'}; + os.width(2); + os << static_cast(ymd.year()) % 100; + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'F': + if (command) + { + if (modified == CharT{}) + { + auto const& ymd = fds.ymd; + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(4); + os << static_cast(ymd.year()) << CharT{'-'}; + os.width(2); + os << static_cast(ymd.month()) << CharT{'-'}; + os.width(2); + os << static_cast(ymd.day()); + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'g': + case 'G': + if (command) + { + if (modified == CharT{}) + { + auto ld = 
local_days(fds.ymd); + auto y = year_month_day{ld + days{3}}.year(); + auto start = local_days((y - years{1})/date::dec/thu[last]) + (mon-thu); + if (ld < start) + --y; + if (*fmt == CharT{'G'}) + os << y; + else + { + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(2); + os << std::abs(static_cast(y)) % 100; + } + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'H': + case 'I': + if (command) + { + auto hms = fds.tod; +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + if (*fmt == CharT{'I'}) + hms.make12(); + if (hms.hours() < hours{10}) + os << CharT{'0'}; + os << hms.hours().count(); +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_hour = static_cast(hms.hours().count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'j': + if (command) + { + if (modified == CharT{}) + { + auto ld = local_days(fds.ymd); + auto y = fds.ymd.year(); + auto doy = ld - local_days(y/jan/1) + days{1}; + save_stream _(os); + os.fill('0'); + os.flags(std::ios::dec | std::ios::right); + os.width(3); + os << doy.count(); + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'm': + if (command) + { + auto m = static_cast(fds.ymd.month()); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + if (m < 10) + os << CharT{'0'}; + os << m; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_mon = static_cast(m-1); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + 
} + else + os << *fmt; + break; + case 'M': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + if (fds.tod.minutes() < minutes{10}) + os << CharT{'0'}; + os << fds.tod.minutes().count(); +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_min = static_cast(fds.tod.minutes().count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'n': + if (command) + { + if (modified == CharT{}) + os << CharT{'\n'}; + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'p': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { + const CharT f[] = {'%', *fmt}; + tm.tm_hour = static_cast(fds.tod.hours().count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#else + if (fds.tod.hours() < hours{12}) + os << ampm_names().first[0]; + else + os << ampm_names().first[1]; +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'r': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { + const CharT f[] = {'%', *fmt}; + tm.tm_hour = static_cast(fds.tod.hours().count()); + tm.tm_min = static_cast(fds.tod.minutes().count()); + tm.tm_sec = static_cast(fds.tod.seconds().count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#else + time_of_day tod(duration_cast(fds.tod.to_duration())); + tod.make12(); + save_stream _(os); + os.fill('0'); + os.width(2); + os << tod.hours().count() << CharT{':'}; + os.width(2); + os << tod.minutes().count() << CharT{':'}; + os.width(2); + os << tod.seconds().count() << CharT{' '}; + tod.make24(); + if (tod.hours() < hours{12}) + 
os << ampm_names().first[0]; + else + os << ampm_names().first[1]; +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'R': + if (command) + { + if (modified == CharT{}) + { + if (fds.tod.hours() < hours{10}) + os << CharT{'0'}; + os << fds.tod.hours().count() << CharT{':'}; + if (fds.tod.minutes() < minutes{10}) + os << CharT{'0'}; + os << fds.tod.minutes().count(); + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'S': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + os << fds.tod.s_; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_sec = static_cast(fds.tod.s_.seconds().count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 't': + if (command) + { + if (modified == CharT{}) + os << CharT{'\t'}; + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'T': + if (command) + { + if (modified == CharT{}) + { + os << fds.tod; + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 'u': + if (command) + { + auto wd = extract_weekday(os, fds); + if (os.fail()) + return os; +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + os << (wd != 0 ? 
wd : 7u); +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_wday = static_cast(wd); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'U': + if (command) + { + auto const& ymd = fds.ymd; + auto ld = local_days(ymd); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + auto st = local_days(sun[1]/jan/ymd.year()); + if (ld < st) + os << CharT{'0'} << CharT{'0'}; + else + { + auto wn = duration_cast(ld - st).count() + 1; + if (wn < 10) + os << CharT{'0'}; + os << wn; + } + #if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_year = static_cast(ymd.year()) - 1900; + tm.tm_wday = static_cast(extract_weekday(os, fds)); + if (os.fail()) + return os; + tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'V': + if (command) + { + auto ld = local_days(fds.ymd); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + auto y = year_month_day{ld + days{3}}.year(); + auto st = local_days((y - years{1})/12/thu[last]) + (mon-thu); + if (ld < st) + { + --y; + st = local_days((y - years{1})/12/thu[last]) + (mon-thu); + } + auto wn = duration_cast(ld - st).count() + 1; + if (wn < 10) + os << CharT{'0'}; + os << wn; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + auto const& ymd = fds.ymd; + tm.tm_year = static_cast(ymd.year()) - 1900; + tm.tm_wday = static_cast(extract_weekday(os, fds)); + if (os.fail()) + return os; + tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); + facet.put(os, os, os.fill(), 
&tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'w': + if (command) + { + auto wd = extract_weekday(os, fds); + if (os.fail()) + return os; +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + os << wd; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_wday = static_cast(wd); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'W': + if (command) + { + auto const& ymd = fds.ymd; + auto ld = local_days(ymd); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + auto st = local_days(mon[1]/jan/ymd.year()); + if (ld < st) + os << CharT{'0'} << CharT{'0'}; + else + { + auto wn = duration_cast(ld - st).count() + 1; + if (wn < 10) + os << CharT{'0'}; + os << wn; + } +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_year = static_cast(ymd.year()) - 1900; + tm.tm_wday = static_cast(extract_weekday(os, fds)); + if (os.fail()) + return os; + tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'X': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{'O'}) + os << CharT{'%'} << modified << *fmt; + else + { + tm = std::tm{}; + tm.tm_sec = static_cast(fds.tod.seconds().count()); + tm.tm_min = static_cast(fds.tod.minutes().count()); + tm.tm_hour = static_cast(fds.tod.hours().count()); + CharT f[3] = {'%'}; + auto fe = begin(f) + 1; + if (modified == CharT{'E'}) + *fe++ = modified; + *fe++ = *fmt; + facet.put(os, os, os.fill(), &tm, 
begin(f), fe); + } +#else + os << fds.tod; +#endif + command = nullptr; + modified = CharT{}; + } + else + os << *fmt; + break; + case 'y': + if (command) + { + auto y = static_cast(fds.ymd.year()); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + y = std::abs(y) % 100; + if (y < 10) + os << CharT{'0'}; + os << y; +#if !ONLY_C_LOCALE + } + else + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_year = y - 1900; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'Y': + if (command) + { + auto y = fds.ymd.year(); +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + os << y; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'E'}) + { + const CharT f[] = {'%', modified, *fmt}; + tm.tm_year = static_cast(y) - 1900; + facet.put(os, os, os.fill(), &tm, begin(f), end(f)); + } + else + { + os << CharT{'%'} << modified << *fmt; + } +#endif + modified = CharT{}; + command = nullptr; + } + else + os << *fmt; + break; + case 'z': + if (command) + { + if (offset_sec == nullptr) + { + // Can not format %z with unknown offset + os.setstate(ios::failbit); + return os; + } + auto m = duration_cast(*offset_sec); + auto neg = m < minutes{0}; + m = date::abs(m); + auto h = duration_cast(m); + m -= h; + if (neg) + os << CharT{'-'}; + else + os << CharT{'+'}; + if (h < hours{10}) + os << CharT{'0'}; + os << h.count(); + if (modified != CharT{}) + os << CharT{':'}; + if (m < minutes{10}) + os << CharT{'0'}; + os << m.count(); + command = nullptr; + modified = CharT{}; + } + else + os << *fmt; + break; + case 'Z': + if (command) + { + if (modified == CharT{}) + { + if (abbrev == nullptr) + { + // Can not format %Z with unknown time_zone + os.setstate(ios::failbit); + return os; + } + for (auto c : *abbrev) + os << CharT(c); + } + else + { + os << CharT{'%'} << modified << *fmt; + modified = CharT{}; + } + command = nullptr; + } + else + os << *fmt; + break; + case 
'E': + case 'O': + if (command) + { + if (modified == CharT{}) + { + modified = *fmt; + } + else + { + os << CharT{'%'} << modified << *fmt; + command = nullptr; + modified = CharT{}; + } + } + else + os << *fmt; + break; + case '%': + if (command) + { + if (modified == CharT{}) + { + os << CharT{'%'}; + command = nullptr; + } + else + { + os << CharT{'%'} << modified << CharT{'%'}; + command = nullptr; + modified = CharT{}; + } + } + else + command = fmt; + break; + default: + if (command) + { + os << CharT{'%'}; + command = nullptr; + } + if (modified != CharT{}) + { + os << modified; + modified = CharT{}; + } + os << *fmt; + break; + } + } + if (command) + os << CharT{'%'}; + if (modified != CharT{}) + os << modified; + return os; +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const year& y) +{ + using CT = std::chrono::seconds; + fields fds{y/0/0}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const month& m) +{ + using CT = std::chrono::seconds; + fields fds{m/0/0}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const day& d) +{ + using CT = std::chrono::seconds; + fields fds{d/0/0}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const weekday& wd) +{ + using CT = std::chrono::seconds; + fields fds{wd}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const year_month& ym) +{ + using CT = std::chrono::seconds; + fields fds{ym/0}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, const month_day& md) +{ + using CT = std::chrono::seconds; + fields fds{md/0}; + return to_stream(os, fmt, fds); +} + 
+template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const year_month_day& ymd) +{ + using CT = std::chrono::seconds; + fields fds{ymd}; + return to_stream(os, fmt, fds); +} + +template +inline +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const std::chrono::duration& d) +{ + using Duration = std::chrono::duration; + using CT = typename std::common_type::type; + fields fds{time_of_day{d}}; + return to_stream(os, fmt, fds); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const local_time& tp, const std::string* abbrev = nullptr, + const std::chrono::seconds* offset_sec = nullptr) +{ + using CT = typename std::common_type::type; + auto ld = floor(tp); + fields fds{year_month_day{ld}, time_of_day{tp-local_seconds{ld}}}; + return to_stream(os, fmt, fds, abbrev, offset_sec); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const sys_time& tp) +{ + using namespace std::chrono; + using CT = typename std::common_type::type; + const std::string abbrev("UTC"); + CONSTDATA seconds offset{0}; + auto sd = floor(tp); + fields fds{year_month_day{sd}, time_of_day{tp-sys_seconds{sd}}}; + return to_stream(os, fmt, fds, &abbrev, &offset); +} + +// format + +template +auto +format(const std::locale& loc, const CharT* fmt, const Streamable& tp) + -> decltype(to_stream(std::declval&>(), fmt, tp), + std::basic_string{}) +{ + std::basic_ostringstream os; + os.exceptions(std::ios::failbit | std::ios::badbit); + os.imbue(loc); + to_stream(os, fmt, tp); + return os.str(); +} + +template +auto +format(const CharT* fmt, const Streamable& tp) + -> decltype(to_stream(std::declval&>(), fmt, tp), + std::basic_string{}) +{ + std::basic_ostringstream os; + os.exceptions(std::ios::failbit | std::ios::badbit); + to_stream(os, fmt, tp); + return os.str(); +} + +template +auto +format(const std::locale& loc, const std::basic_string& fmt, + const 
Streamable& tp) + -> decltype(to_stream(std::declval&>(), fmt.c_str(), tp), + std::basic_string{}) +{ + std::basic_ostringstream os; + os.exceptions(std::ios::failbit | std::ios::badbit); + os.imbue(loc); + to_stream(os, fmt.c_str(), tp); + return os.str(); +} + +template +auto +format(const std::basic_string& fmt, const Streamable& tp) + -> decltype(to_stream(std::declval&>(), fmt.c_str(), tp), + std::basic_string{}) +{ + std::basic_ostringstream os; + os.exceptions(std::ios::failbit | std::ios::badbit); + to_stream(os, fmt.c_str(), tp); + return os.str(); +} + +// parse + +namespace detail +{ + +template +bool +read_char(std::basic_istream& is, CharT fmt, std::ios::iostate& err) +{ + auto ic = is.get(); + if (Traits::eq_int_type(ic, Traits::eof()) || + !Traits::eq(Traits::to_char_type(ic), fmt)) + { + err |= std::ios::failbit; + is.setstate(std::ios::failbit); + return false; + } + return true; +} + +template +unsigned +read_unsigned(std::basic_istream& is, unsigned m = 1, unsigned M = 10) +{ + unsigned x = 0; + unsigned count = 0; + while (true) + { + auto ic = is.peek(); + if (Traits::eq_int_type(ic, Traits::eof())) + break; + auto c = static_cast(Traits::to_char_type(ic)); + if (!('0' <= c && c <= '9')) + break; + (void)is.get(); + ++count; + x = 10*x + static_cast(c - '0'); + if (count == M) + break; + } + if (count < m) + is.setstate(std::ios::failbit); + return x; +} + +template +int +read_signed(std::basic_istream& is, unsigned m = 1, unsigned M = 10) +{ + auto ic = is.peek(); + if (!Traits::eq_int_type(ic, Traits::eof())) + { + auto c = static_cast(Traits::to_char_type(ic)); + if (('0' <= c && c <= '9') || c == '-' || c == '+') + { + if (c == '-' || c == '+') + (void)is.get(); + auto x = static_cast(read_unsigned(is, std::max(m, 1u), M)); + if (!is.fail()) + { + if (c == '-') + x = -x; + return x; + } + } + } + if (m > 0) + is.setstate(std::ios::failbit); + return 0; +} + +template +long double +read_long_double(std::basic_istream& is, unsigned m = 1, 
unsigned M = 10) +{ + using namespace std; + unsigned count = 0; + auto decimal_point = Traits::to_int_type( + use_facet>(is.getloc()).decimal_point()); + std::string buf; + while (true) + { + auto ic = is.peek(); + if (Traits::eq_int_type(ic, Traits::eof())) + break; + if (Traits::eq_int_type(ic, decimal_point)) + { + buf += '.'; + decimal_point = Traits::eof(); + is.get(); + } + else + { + auto c = static_cast(Traits::to_char_type(ic)); + if (!('0' <= c && c <= '9')) + break; + buf += c; + (void)is.get(); + } + if (++count == M) + break; + } + if (count < m) + { + is.setstate(std::ios::failbit); + return 0; + } + return std::stold(buf); +} + +struct rs +{ + int& i; + unsigned m; + unsigned M; +}; + +struct ru +{ + int& i; + unsigned m; + unsigned M; +}; + +struct rld +{ + long double& i; + unsigned m; + unsigned M; +}; + +template +void +read(std::basic_istream&) +{ +} + +template +void +read(std::basic_istream& is, CharT a0, Args&& ...args); + +template +void +read(std::basic_istream& is, rs a0, Args&& ...args); + +template +void +read(std::basic_istream& is, ru a0, Args&& ...args); + +template +void +read(std::basic_istream& is, int a0, Args&& ...args); + +template +void +read(std::basic_istream& is, rld a0, Args&& ...args); + +template +void +read(std::basic_istream& is, CharT a0, Args&& ...args) +{ + // No-op if a0 == CharT{} + if (a0 != CharT{}) + { + auto ic = is.peek(); + if (Traits::eq_int_type(ic, Traits::eof())) + { + is.setstate(std::ios::failbit | std::ios::eofbit); + return; + } + if (!Traits::eq(Traits::to_char_type(ic), a0)) + { + is.setstate(std::ios::failbit); + return; + } + (void)is.get(); + } + read(is, std::forward(args)...); +} + +template +void +read(std::basic_istream& is, rs a0, Args&& ...args) +{ + auto x = read_signed(is, a0.m, a0.M); + if (is.fail()) + return; + a0.i = x; + read(is, std::forward(args)...); +} + +template +void +read(std::basic_istream& is, ru a0, Args&& ...args) +{ + auto x = read_unsigned(is, a0.m, a0.M); + if 
(is.fail()) + return; + a0.i = static_cast(x); + read(is, std::forward(args)...); +} + +template +void +read(std::basic_istream& is, int a0, Args&& ...args) +{ + if (a0 != -1) + { + auto u = static_cast(a0); + CharT buf[std::numeric_limits::digits10+2] = {}; + auto e = buf; + do + { + *e++ = CharT(u % 10) + CharT{'0'}; + u /= 10; + } while (u > 0); + std::reverse(buf, e); + for (auto p = buf; p != e && is.rdstate() == std::ios::goodbit; ++p) + read(is, *p); + } + if (is.rdstate() == std::ios::goodbit) + read(is, std::forward(args)...); +} + +template +void +read(std::basic_istream& is, rld a0, Args&& ...args) +{ + auto x = read_long_double(is, a0.m, a0.M); + if (is.fail()) + return; + a0.i = x; + read(is, std::forward(args)...); +} + +} // namespace detail; + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + fields& fds, std::basic_string* abbrev, + std::chrono::minutes* offset) +{ + using namespace std; + using namespace std::chrono; + typename basic_istream::sentry ok{is, true}; + if (ok) + { +#if !ONLY_C_LOCALE + auto& f = use_facet>(is.getloc()); + std::tm tm{}; +#endif + std::basic_string temp_abbrev; + minutes temp_offset{}; + const CharT* command = nullptr; + auto modified = CharT{}; + auto width = -1; + CONSTDATA int not_a_year = numeric_limits::min(); + int Y = not_a_year; + CONSTDATA int not_a_century = not_a_year / 100; + int C = not_a_century; + CONSTDATA int not_a_2digit_year = 100; + int y = not_a_2digit_year; + int m{}; + int d{}; + int j{}; + CONSTDATA int not_a_weekday = 7; + int wd = not_a_weekday; + CONSTDATA int not_a_hour_12_value = 0; + int I = not_a_hour_12_value; + hours h{}; + minutes min{}; + Duration s{}; + int g = not_a_2digit_year; + int G = not_a_year; + CONSTDATA int not_a_week_num = 100; + int V = not_a_week_num; + int U = not_a_week_num; + int W = not_a_week_num; + using detail::read; + using detail::rs; + using detail::ru; + using detail::rld; + for (; *fmt && is.rdstate() == 
std::ios::goodbit; ++fmt) + { + switch (*fmt) + { + case 'a': + case 'A': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + wd = tm.tm_wday; + is.setstate(err); +#else + auto nm = detail::weekday_names(); + auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (!is.fail()) + wd = i % 7; +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'b': + case 'B': + case 'h': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + m = tm.tm_mon + 1; + is.setstate(err); +#else + auto nm = detail::month_names(); + auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (!is.fail()) + m = i % 12 + 1; +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'c': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + { + Y = tm.tm_year + 1900; + m = tm.tm_mon + 1; + d = tm.tm_mday; + h = hours{tm.tm_hour}; + min = minutes{tm.tm_min}; + s = duration_cast(seconds{tm.tm_sec}); + } + is.setstate(err); +#else + auto nm = detail::weekday_names(); + auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (is.fail()) + goto broken; + wd = i % 7; + ws(is); + nm = detail::month_names(); + i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (is.fail()) + goto broken; + m = i % 12 + 1; + ws(is); + read(is, rs{d, 1, 2}); + if (is.fail()) + goto broken; + ws(is); + using dfs = detail::decimal_format_seconds; + CONSTDATA auto w = Duration::period::den == 1 ? 
2 : 3 + dfs::width; + int H; + int M; + long double S; + read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, + CharT{':'}, rld{S, 1, w}); + if (is.fail()) + goto broken; + h = hours{H}; + min = minutes{M}; + s = round(duration{S}); + ws(is); + read(is, rs{Y, 1, 4u}); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'x': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + { + Y = tm.tm_year + 1900; + m = tm.tm_mon + 1; + d = tm.tm_mday; + } + is.setstate(err); +#else + read(is, ru{m, 1, 2}, CharT{'/'}, ru{d, 1, 2}, CharT{'/'}, + rs{y, 1, 2}); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'X': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + { + h = hours{tm.tm_hour}; + min = minutes{tm.tm_min}; + s = duration_cast(seconds{tm.tm_sec}); + } + is.setstate(err); +#else + using dfs = detail::decimal_format_seconds; + CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; + int H; + int M; + long double S; + read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, + CharT{':'}, rld{S, 1, w}); + if (!is.fail()) + { + h = hours{H}; + min = minutes{M}; + s = round(duration{S}); + } +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'C': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + read(is, rs{C, 1, width == -1 ? 2u : static_cast(width)}); +#if !ONLY_C_LOCALE + } + else + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + { + auto tY = tm.tm_year + 1900; + C = (tY >= 0 ? 
tY : tY-99) / 100; + } + is.setstate(err); + } +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'D': + if (command) + { + if (modified == CharT{}) + read(is, ru{m, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'}, + ru{d, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'}, + rs{y, 1, 2}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'F': + if (command) + { + if (modified == CharT{}) + read(is, rs{Y, 1, width == -1 ? 4u : static_cast(width)}, + CharT{'-'}, ru{m, 1, 2}, CharT{'-'}, ru{d, 1, 2}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'd': + case 'e': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) +#endif + read(is, rs{d, 1, width == -1 ? 2u : static_cast(width)}); +#if !ONLY_C_LOCALE + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + command = nullptr; + width = -1; + modified = CharT{}; + if ((err & ios::failbit) == 0) + d = tm.tm_mday; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'H': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + int H; + read(is, ru{H, 1, width == -1 ? 
2u : static_cast(width)}); + if (!is.fail()) + h = hours{H}; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + h = hours{tm.tm_hour}; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'I': + if (command) + { + if (modified == CharT{}) + { + // reads in an hour into I, but most be in [1, 12] + read(is, rs{I, 1, width == -1 ? 2u : static_cast(width)}); + if (I != not_a_hour_12_value) + { + if (!(1 <= I && I <= 12)) + { + I = not_a_hour_12_value; + goto broken; + } + } + } + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'j': + if (command) + { + if (modified == CharT{}) + read(is, ru{j, 1, width == -1 ? 3u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'M': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + int M; + read(is, ru{M, 1, width == -1 ? 2u : static_cast(width)}); + if (!is.fail()) + min = minutes{M}; +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + min = minutes{tm.tm_min}; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'm': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) +#endif + read(is, rs{m, 1, width == -1 ? 
2u : static_cast(width)}); +#if !ONLY_C_LOCALE + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + m = tm.tm_mon + 1; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'n': + case 't': + if (command) + { + // %n matches a single white space character + // %t matches 0 or 1 white space characters + auto ic = is.peek(); + if (Traits::eq_int_type(ic, Traits::eof())) + { + ios_base::iostate err = ios_base::eofbit; + if (*fmt == 'n') + err |= ios_base::failbit; + is.setstate(err); + break; + } + if (isspace(ic)) + { + (void)is.get(); + } + else if (*fmt == 'n') + is.setstate(ios_base::failbit); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'p': + // Error if haven't yet seen %I + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { + if (I == not_a_hour_12_value) + goto broken; + tm = std::tm{}; + tm.tm_hour = I; + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if (err & ios::failbit) + goto broken; + h = hours{tm.tm_hour}; + I = not_a_hour_12_value; + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#else + if (I == not_a_hour_12_value) + goto broken; + auto nm = detail::ampm_names(); + auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (is.fail()) + goto broken; + h = hours{I}; + if (i == 1) + { + if (h != hours{12}) + h += hours{12}; + } + else if (h == hours{12}) + h = hours{0}; + I = not_a_hour_12_value; +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + + break; + case 'r': + if (command) + { +#if !ONLY_C_LOCALE + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & 
ios::failbit) == 0) + { + h = hours{tm.tm_hour}; + min = minutes{tm.tm_min}; + s = duration_cast(seconds{tm.tm_sec}); + } + is.setstate(err); +#else + using dfs = detail::decimal_format_seconds; + CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; + int H; + int M; + long double S; + read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, + CharT{':'}, rld{S, 1, w}); + if (is.fail() || !(1 <= H && H <= 12)) + goto broken; + ws(is); + auto nm = detail::ampm_names(); + auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; + if (is.fail()) + goto broken; + h = hours{H}; + if (i == 1) + { + if (h != hours{12}) + h += hours{12}; + } + else if (h == hours{12}) + h = hours{0}; + min = minutes{M}; + s = round(duration{S}); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'R': + if (command) + { + if (modified == CharT{}) + { + int H, M; + read(is, ru{H, 1, 2}, CharT{'\0'}, CharT{':'}, CharT{'\0'}, + ru{M, 1, 2}, CharT{'\0'}); + if (!is.fail()) + { + h = hours{H}; + min = minutes{M}; + } + } + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'S': + if (command) + { + #if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + using dfs = detail::decimal_format_seconds; + CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; + long double S; + read(is, rld{S, 1, width == -1 ? 
w : static_cast(width)}); + if (!is.fail()) + s = round(duration{S}); +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + s = duration_cast(seconds{tm.tm_sec}); + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'T': + if (command) + { + if (modified == CharT{}) + { + using dfs = detail::decimal_format_seconds; + CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; + int H; + int M; + long double S; + read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, + CharT{':'}, rld{S, 1, w}); + if (!is.fail()) + { + h = hours{H}; + min = minutes{M}; + s = round(duration{S}); + } + } + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'Y': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) +#endif + read(is, rs{Y, 1, width == -1 ? 4u : static_cast(width)}); +#if !ONLY_C_LOCALE + else if (modified == CharT{'E'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + Y = tm.tm_year + 1900; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'y': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) +#endif + read(is, ru{y, 1, width == -1 ? 
2u : static_cast(width)}); +#if !ONLY_C_LOCALE + else + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + Y = tm.tm_year + 1900; + is.setstate(err); + } +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'g': + if (command) + { + if (modified == CharT{}) + read(is, ru{g, 1, width == -1 ? 2u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'G': + if (command) + { + if (modified == CharT{}) + read(is, rs{G, 1, width == -1 ? 4u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'U': + if (command) + { + if (modified == CharT{}) + read(is, ru{U, 1, width == -1 ? 2u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'V': + if (command) + { + if (modified == CharT{}) + read(is, ru{V, 1, width == -1 ? 2u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'W': + if (command) + { + if (modified == CharT{}) + read(is, ru{W, 1, width == -1 ? 2u : static_cast(width)}); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'u': + case 'w': + if (command) + { +#if !ONLY_C_LOCALE + if (modified == CharT{}) + { +#endif + read(is, ru{wd, 1, width == -1 ? 
1u : static_cast(width)}); + if (!is.fail() && *fmt == 'u') + { + if (wd == 7) + wd = 0; + else if (wd == 0) + wd = 7; + } +#if !ONLY_C_LOCALE + } + else if (modified == CharT{'O'}) + { + ios_base::iostate err = ios_base::goodbit; + f.get(is, nullptr, is, err, &tm, command, fmt+1); + if ((err & ios::failbit) == 0) + wd = tm.tm_wday; + is.setstate(err); + } + else + read(is, CharT{'%'}, width, modified, *fmt); +#endif + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'E': + case 'O': + if (command) + { + if (modified == CharT{}) + { + modified = *fmt; + } + else + { + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + } + else + read(is, *fmt); + break; + case '%': + if (command) + { + if (modified == CharT{}) + read(is, *fmt); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + command = fmt; + break; + case 'z': + if (command) + { + int H, M; + if (modified == CharT{}) + { + read(is, rs{H, 2, 2}); + if (!is.fail()) + temp_offset = hours{H}; + if (is.good()) + { + auto ic = is.peek(); + if (!Traits::eq_int_type(ic, Traits::eof())) + { + auto c = static_cast(Traits::to_char_type(ic)); + if ('0' <= c && c <= '9') + { + read(is, ru{M, 2, 2}); + if (!is.fail()) + temp_offset += minutes{ H < 0 ? -M : M }; + } + } + } + } + else + { + read(is, rs{H, 1, 2}); + if (!is.fail()) + temp_offset = hours{H}; + if (is.good()) + { + auto ic = is.peek(); + if (!Traits::eq_int_type(ic, Traits::eof())) + { + auto c = static_cast(Traits::to_char_type(ic)); + if (c == ':') + { + (void)is.get(); + read(is, ru{M, 2, 2}); + if (!is.fail()) + temp_offset += minutes{ H < 0 ? 
-M : M }; + } + } + } + } + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + case 'Z': + if (command) + { + if (modified == CharT{}) + { + if (!temp_abbrev.empty()) + is.setstate(ios::failbit); + else + { + while (is.rdstate() == std::ios::goodbit) + { + auto i = is.rdbuf()->sgetc(); + if (Traits::eq_int_type(i, Traits::eof())) + { + is.setstate(ios::eofbit); + break; + } + auto wc = Traits::to_char_type(i); + auto c = static_cast(wc); + // is c a valid time zone name or abbreviation character? + if (!(CharT{1} < wc && wc < CharT{127}) || !(isalnum(c) || + c == '_' || c == '/' || c == '-' || c == '+')) + break; + temp_abbrev.push_back(c); + is.rdbuf()->sbumpc(); + } + if (temp_abbrev.empty()) + is.setstate(ios::failbit); + } + } + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + else + read(is, *fmt); + break; + default: + if (command) + { + if (width == -1 && modified == CharT{} && '0' <= *fmt && *fmt <= '9') + { + width = static_cast(*fmt) - '0'; + while ('0' <= fmt[1] && fmt[1] <= '9') + width = 10*width + static_cast(*++fmt) - '0'; + } + else + { + if (modified == CharT{}) + read(is, CharT{'%'}, width, *fmt); + else + read(is, CharT{'%'}, width, modified, *fmt); + command = nullptr; + width = -1; + modified = CharT{}; + } + } + else // !command + { + if (isspace(*fmt)) + ws(is); // space matches 0 or more white space characters + else + read(is, *fmt); + } + break; + } + } + // is.rdstate() != ios::goodbit || *fmt == CharT{} + if (is.rdstate() == ios::goodbit && command) + { + if (modified == CharT{}) + read(is, CharT{'%'}, width); + else + read(is, CharT{'%'}, width, modified); + } + if (is.rdstate() != ios::goodbit && *fmt != CharT{} && !is.fail()) + is.setstate(ios::failbit); + if (!is.fail()) + { + if (y != not_a_2digit_year) + { + // Convert y and an optional C to Y + if (!(0 <= y && y <= 99)) + goto broken; + if (C == not_a_century) + { + if (Y 
== not_a_year) + { + if (y >= 69) + C = 19; + else + C = 20; + } + else + { + C = (Y >= 0 ? Y : Y-100) / 100; + } + } + int tY; + if (C >= 0) + tY = 100*C + y; + else + tY = 100*(C+1) - (y == 0 ? 100 : y); + if (Y != not_a_year && Y != tY) + goto broken; + Y = tY; + } + if (g != not_a_2digit_year) + { + // Convert g and an optional C to G + if (!(0 <= g && g <= 99)) + goto broken; + if (C == not_a_century) + { + if (G == not_a_year) + { + if (g >= 69) + C = 19; + else + C = 20; + } + else + { + C = (G >= 0 ? G : G-100) / 100; + } + } + int tG; + if (C >= 0) + tG = 100*C + g; + else + tG = 100*(C+1) - (g == 0 ? 100 : g); + if (G != not_a_year && G != tG) + goto broken; + G = tG; + } + if (G != not_a_year) + { + // Convert G, V and wd to Y, m and d + if (V == not_a_week_num || wd == not_a_weekday) + goto broken; + auto ymd = year_month_day{local_days(year{G-1}/dec/thu[last]) + + (mon-thu) + weeks{V-1} + + (weekday{static_cast(wd)}-mon)}; + if (Y == not_a_year) + Y = static_cast(ymd.year()); + else if (year{Y} != ymd.year()) + goto broken; + if (m == 0) + m = static_cast(static_cast(ymd.month())); + else if (month(static_cast(m)) != ymd.month()) + goto broken; + if (d == 0) + d = static_cast(static_cast(ymd.day())); + else if (day(static_cast(d)) != ymd.day()) + goto broken; + } + if (j != 0 && Y != not_a_year) + { + auto ymd = year_month_day{local_days(year{Y}/1/1) + days{j-1}}; + if (m == 0) + m = static_cast(static_cast(ymd.month())); + else if (month(static_cast(m)) != ymd.month()) + goto broken; + if (d == 0) + d = static_cast(static_cast(ymd.day())); + else if (day(static_cast(d)) != ymd.day()) + goto broken; + } + if (U != not_a_week_num && Y != not_a_year) + { + if (wd == not_a_weekday) + goto broken; + sys_days sd; + if (U == 0) + sd = year{Y-1}/dec/weekday{static_cast(wd)}[last]; + else + sd = sys_days(year{Y}/jan/sun[1]) + weeks{U-1} + + (weekday{static_cast(wd)} - sun); + year_month_day ymd = sd; + if (year{Y} != ymd.year()) + goto broken; + if (m == 0) + 
m = static_cast(static_cast(ymd.month())); + else if (month(static_cast(m)) != ymd.month()) + goto broken; + if (d == 0) + d = static_cast(static_cast(ymd.day())); + else if (day(static_cast(d)) != ymd.day()) + goto broken; + } + if (W != not_a_week_num && Y != not_a_year) + { + if (wd == not_a_weekday) + goto broken; + sys_days sd; + if (W == 0) + sd = year{Y-1}/dec/weekday{static_cast(wd)}[last]; + else + sd = sys_days(year{Y}/jan/mon[1]) + weeks{W-1} + + (weekday{static_cast(wd)} - mon); + year_month_day ymd = sd; + if (year{Y} != ymd.year()) + goto broken; + if (m == 0) + m = static_cast(static_cast(ymd.month())); + else if (month(static_cast(m)) != ymd.month()) + goto broken; + if (d == 0) + d = static_cast(static_cast(ymd.day())); + else if (day(static_cast(d)) != ymd.day()) + goto broken; + } + if (Y < static_cast(year::min()) || Y > static_cast(year::max())) + Y = not_a_year; + auto ymd = year{Y}/m/d; + if (wd != not_a_weekday && ymd.ok()) + { + if (weekday{static_cast(wd)} != weekday(ymd)) + goto broken; + } + fds.ymd = ymd; + fds.tod = time_of_day{h}; + fds.tod.m_ = min; + fds.tod.s_ = detail::decimal_format_seconds{s}; + if (wd != not_a_weekday) + fds.wd = weekday{static_cast(wd)}; + if (abbrev != nullptr) + *abbrev = std::move(temp_abbrev); + if (offset != nullptr) + *offset = temp_offset; + } + return is; + } +broken: + is.setstate(ios_base::failbit); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, year& y, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.year().ok()) + is.setstate(ios::failbit); + if (!is.fail()) + y = fds.ymd.year(); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, month& m, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* 
offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.month().ok()) + is.setstate(ios::failbit); + if (!is.fail()) + m = fds.ymd.month(); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, day& d, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.day().ok()) + is.setstate(ios::failbit); + if (!is.fail()) + d = fds.ymd.day(); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, weekday& wd, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.wd.ok()) + is.setstate(ios::failbit); + if (!is.fail()) + wd = fds.wd; + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, year_month& ym, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.month().ok()) + is.setstate(ios::failbit); + if (!is.fail()) + ym = fds.ymd.year()/fds.ymd.month(); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, month_day& md, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.month().ok() || !fds.ymd.day().ok()) + is.setstate(ios::failbit); + if 
(!is.fail()) + md = fds.ymd.month()/fds.ymd.day(); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + year_month_day& ymd, std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = seconds; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.ok()) + is.setstate(ios::failbit); + if (!is.fail()) + ymd = fds.ymd; + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + sys_time& tp, std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + minutes offset_local{}; + auto offptr = offset ? offset : &offset_local; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offptr); + if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) + is.setstate(ios::failbit); + if (!is.fail()) + tp = round(sys_days(fds.ymd) - *offptr + fds.tod.to_duration()); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + local_time& tp, std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) + is.setstate(ios::failbit); + if (!is.fail()) + tp = round(local_seconds{local_days(fds.ymd)} + fds.tod.to_duration()); + return is; +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + std::chrono::duration& d, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using Duration = std::chrono::duration; + using CT = typename common_type::type; + fields 
fds{}; + from_stream(is, fmt, fds, abbrev, offset); + if (!is.fail()) + d = duration_cast(fds.tod.to_duration()); + return is; +} + +template , + class Alloc = std::allocator> +struct parse_manip +{ + const std::basic_string format_; + Parsable& tp_; + std::basic_string* abbrev_; + std::chrono::minutes* offset_; + +public: + parse_manip(std::basic_string format, Parsable& tp, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) + : format_(std::move(format)) + , tp_(tp) + , abbrev_(abbrev) + , offset_(offset) + {} + +}; + +template +std::basic_istream& +operator>>(std::basic_istream& is, + const parse_manip& x) +{ + return from_stream(is, x.format_.c_str(), x.tp_, x.abbrev_, x.offset_); +} + +template +inline +auto +parse(const std::basic_string& format, Parsable& tp) + -> decltype(from_stream(std::declval&>(), + format.c_str(), tp), + parse_manip{format, tp}) +{ + return {format, tp}; +} + +template +inline +auto +parse(const std::basic_string& format, Parsable& tp, + std::basic_string& abbrev) + -> decltype(from_stream(std::declval&>(), + format.c_str(), tp, &abbrev), + parse_manip{format, tp, &abbrev}) +{ + return {format, tp, &abbrev}; +} + +template +inline +auto +parse(const std::basic_string& format, Parsable& tp, + std::chrono::minutes& offset) + -> decltype(from_stream(std::declval&>(), + format.c_str(), tp, nullptr, &offset), + parse_manip{format, tp, nullptr, &offset}) +{ + return {format, tp, nullptr, &offset}; +} + +template +inline +auto +parse(const std::basic_string& format, Parsable& tp, + std::basic_string& abbrev, std::chrono::minutes& offset) + -> decltype(from_stream(std::declval&>(), + format.c_str(), tp, &abbrev, &offset), + parse_manip{format, tp, &abbrev, &offset}) +{ + return {format, tp, &abbrev, &offset}; +} + +// const CharT* formats + +template +inline +auto +parse(const CharT* format, Parsable& tp) + -> decltype(from_stream(std::declval&>(), format, tp), + parse_manip{format, tp}) +{ + return {format, tp}; 
+} + +template +inline +auto +parse(const CharT* format, Parsable& tp, std::basic_string& abbrev) + -> decltype(from_stream(std::declval&>(), format, + tp, &abbrev), + parse_manip{format, tp, &abbrev}) +{ + return {format, tp, &abbrev}; +} + +template +inline +auto +parse(const CharT* format, Parsable& tp, std::chrono::minutes& offset) + -> decltype(from_stream(std::declval&>(), format, + tp, nullptr, &offset), + parse_manip{format, tp, nullptr, &offset}) +{ + return {format, tp, nullptr, &offset}; +} + +template +inline +auto +parse(const CharT* format, Parsable& tp, + std::basic_string& abbrev, std::chrono::minutes& offset) + -> decltype(from_stream(std::declval&>(), format, + tp, &abbrev, &offset), + parse_manip{format, tp, &abbrev, &offset}) +{ + return {format, tp, &abbrev, &offset}; +} + +// duration streaming + +namespace detail +{ + +#if __cplusplus >= 201402 && (!defined(__EDG_VERSION__) || __EDG_VERSION__ > 411) \ + && (!defined(__SUNPRO_CC) || __SUNPRO_CC > 0x5150) + +template +class string_literal +{ + CharT p_[N]; + +public: + using const_iterator = const CharT*; + + string_literal(string_literal const&) = default; + string_literal& operator=(string_literal const&) = delete; + + template > + CONSTCD14 string_literal(CharT c) NOEXCEPT + : p_{c} + { + } + + CONSTCD14 string_literal(const CharT(&a)[N]) NOEXCEPT + : p_{} + { + for (std::size_t i = 0; i < N; ++i) + p_[i] = a[i]; + } + + template > + CONSTCD14 string_literal(const char(&a)[N]) NOEXCEPT + : p_{} + { + for (std::size_t i = 0; i < N; ++i) + p_[i] = a[i]; + } + + template {}>> + CONSTCD14 string_literal(string_literal const& a) NOEXCEPT + : p_{} + { + for (std::size_t i = 0; i < N; ++i) + p_[i] = a[i]; + } + + template > + CONSTCD14 string_literal(const string_literal& x, + const string_literal& y) NOEXCEPT + : p_{} + { + std::size_t i = 0; + for (; i < N1-1; ++i) + p_[i] = x[i]; + for (std::size_t j = 0; j < N2; ++j, ++i) + p_[i] = y[j]; + } + + CONSTCD14 const CharT* data() const NOEXCEPT 
{return p_;} + CONSTCD14 std::size_t size() const NOEXCEPT {return N-1;} + + CONSTCD14 const_iterator begin() const NOEXCEPT {return p_;} + CONSTCD14 const_iterator end() const NOEXCEPT {return p_ + N-1;} + + CONSTCD14 CharT const& operator[](std::size_t n) const NOEXCEPT + { + return p_[n]; + } + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, const string_literal& s) + { + return os << s.p_; + } +}; + +template +CONSTCD14 +inline +string_literal, + N1 + N2 - 1> +operator+(const string_literal& x, const string_literal& y) NOEXCEPT +{ + using CharT = std::conditional_t; + return string_literal{string_literal{x}, + string_literal{y}}; +} + +template +inline +std::basic_string +operator+(std::basic_string x, + const string_literal& y) NOEXCEPT +{ + x.append(y.data(), y.size()); + return x; +} + +template +CONSTCD14 +inline +string_literal +msl(const CharT(&a)[N]) NOEXCEPT +{ + return string_literal{a}; +} + +template {} || + std::is_same{} || + std::is_same{} || + std::is_same{}>> +CONSTCD14 +inline +string_literal +msl(CharT c) NOEXCEPT +{ + return string_literal{c}; +} + +CONSTCD14 +inline +std::size_t +to_string_len(std::intmax_t i) +{ + std::size_t r = 0; + do + { + i /= 10; + ++r; + } while (i > 0); + return r; +} + +template +CONSTCD14 +inline +std::enable_if_t +< + N < 10, + string_literal +> +msl() NOEXCEPT +{ + return msl(char(N % 10 + '0')); +} + +template +CONSTCD14 +inline +std::enable_if_t +< + 10 <= N, + string_literal +> +msl() NOEXCEPT +{ + return msl() + msl(char(N % 10 + '0')); +} + +template +CONSTCD14 +inline +std::enable_if_t +< + std::ratio::type::den != 1, + string_literal::type::num) + + to_string_len(std::ratio::type::den) + 4> +> +msl(std::ratio) NOEXCEPT +{ + using R = typename std::ratio::type; + return msl(CharT{'['}) + msl() + msl(CharT{'/'}) + + msl() + msl(CharT{']'}); +} + +template +CONSTCD14 +inline +std::enable_if_t +< + std::ratio::type::den == 1, + string_literal::type::num) + 3> +> +msl(std::ratio) 
NOEXCEPT +{ + using R = typename std::ratio::type; + return msl(CharT{'['}) + msl() + msl(CharT{']'}); +} + +template +CONSTCD14 +inline +auto +msl(std::atto) NOEXCEPT +{ + return msl(CharT{'a'}); +} + +template +CONSTCD14 +inline +auto +msl(std::femto) NOEXCEPT +{ + return msl(CharT{'f'}); +} + +template +CONSTCD14 +inline +auto +msl(std::pico) NOEXCEPT +{ + return msl(CharT{'p'}); +} + +template +CONSTCD14 +inline +auto +msl(std::nano) NOEXCEPT +{ + return msl(CharT{'n'}); +} + +template +CONSTCD14 +inline +std::enable_if_t +< + std::is_same{}, + string_literal +> +msl(std::micro) NOEXCEPT +{ + return string_literal{"\xC2\xB5"}; +} + +template +CONSTCD14 +inline +std::enable_if_t +< + !std::is_same{}, + string_literal +> +msl(std::micro) NOEXCEPT +{ + return string_literal{CharT{static_cast('\xB5')}}; +} + +template +CONSTCD14 +inline +auto +msl(std::milli) NOEXCEPT +{ + return msl(CharT{'m'}); +} + +template +CONSTCD14 +inline +auto +msl(std::centi) NOEXCEPT +{ + return msl(CharT{'c'}); +} + +template +CONSTCD14 +inline +auto +msl(std::deci) NOEXCEPT +{ + return msl(CharT{'d'}); +} + +template +CONSTCD14 +inline +auto +msl(std::deca) NOEXCEPT +{ + return string_literal{"da"}; +} + +template +CONSTCD14 +inline +auto +msl(std::hecto) NOEXCEPT +{ + return msl(CharT{'h'}); +} + +template +CONSTCD14 +inline +auto +msl(std::kilo) NOEXCEPT +{ + return msl(CharT{'k'}); +} + +template +CONSTCD14 +inline +auto +msl(std::mega) NOEXCEPT +{ + return msl(CharT{'M'}); +} + +template +CONSTCD14 +inline +auto +msl(std::giga) NOEXCEPT +{ + return msl(CharT{'G'}); +} + +template +CONSTCD14 +inline +auto +msl(std::tera) NOEXCEPT +{ + return msl(CharT{'T'}); +} + +template +CONSTCD14 +inline +auto +msl(std::peta) NOEXCEPT +{ + return msl(CharT{'P'}); +} + +template +CONSTCD14 +inline +auto +msl(std::exa) NOEXCEPT +{ + return msl(CharT{'E'}); +} + +template +CONSTCD14 +auto +get_units(Period p) +{ + return msl(p) + string_literal{"s"}; +} + +template +CONSTCD14 +auto 
+get_units(std::ratio<1>) +{ + return string_literal{"s"}; +} + +template +CONSTCD14 +auto +get_units(std::ratio<60>) +{ + return string_literal{"min"}; +} + +template +CONSTCD14 +auto +get_units(std::ratio<3600>) +{ + return string_literal{"h"}; +} + +#else // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411) + +inline +std::string +to_string(std::uint64_t x) +{ + return std::to_string(x); +} + +template +std::basic_string +to_string(std::uint64_t x) +{ + auto y = std::to_string(x); + return std::basic_string(y.begin(), y.end()); +} + +template +inline +typename std::enable_if +< + std::ratio::type::den != 1, + std::basic_string +>::type +msl(std::ratio) +{ + using R = typename std::ratio::type; + return std::basic_string(1, '[') + to_string(R::num) + CharT{'/'} + + to_string(R::den) + CharT{']'}; +} + +template +inline +typename std::enable_if +< + std::ratio::type::den == 1, + std::basic_string +>::type +msl(std::ratio) +{ + using R = typename std::ratio::type; + return std::basic_string(1, '[') + to_string(R::num) + CharT{']'}; +} + +template +inline +std::basic_string +msl(std::atto) +{ + return {'a'}; +} + +template +inline +std::basic_string +msl(std::femto) +{ + return {'f'}; +} + +template +inline +std::basic_string +msl(std::pico) +{ + return {'p'}; +} + +template +inline +std::basic_string +msl(std::nano) +{ + return {'n'}; +} + +template +inline +typename std::enable_if +< + std::is_same::value, + std::string +>::type +msl(std::micro) +{ + return "\xC2\xB5"; +} + +template +inline +typename std::enable_if +< + !std::is_same::value, + std::basic_string +>::type +msl(std::micro) +{ + return {CharT(static_cast('\xB5'))}; +} + +template +inline +std::basic_string +msl(std::milli) +{ + return {'m'}; +} + +template +inline +std::basic_string +msl(std::centi) +{ + return {'c'}; +} + +template +inline +std::basic_string +msl(std::deci) +{ + return {'d'}; +} + +template +inline +std::basic_string +msl(std::deca) +{ + return {'d', 'a'}; 
+} + +template +inline +std::basic_string +msl(std::hecto) +{ + return {'h'}; +} + +template +inline +std::basic_string +msl(std::kilo) +{ + return {'k'}; +} + +template +inline +std::basic_string +msl(std::mega) +{ + return {'M'}; +} + +template +inline +std::basic_string +msl(std::giga) +{ + return {'G'}; +} + +template +inline +std::basic_string +msl(std::tera) +{ + return {'T'}; +} + +template +inline +std::basic_string +msl(std::peta) +{ + return {'P'}; +} + +template +inline +std::basic_string +msl(std::exa) +{ + return {'E'}; +} + +template +std::basic_string +get_units(Period p) +{ + return msl(p) + CharT{'s'}; +} + +template +std::basic_string +get_units(std::ratio<1>) +{ + return {'s'}; +} + +template +std::basic_string +get_units(std::ratio<60>) +{ + return {'m', 'i', 'n'}; +} + +template +std::basic_string +get_units(std::ratio<3600>) +{ + return {'h'}; +} + +#endif // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411) + +template > +struct make_string; + +template <> +struct make_string +{ + template + static + std::string + from(Rep n) + { + return std::to_string(n); + } +}; + +template +struct make_string +{ + template + static + std::basic_string + from(Rep n) + { + auto s = std::to_string(n); + return std::basic_string(s.begin(), s.end()); + } +}; + +template <> +struct make_string +{ + template + static + std::wstring + from(Rep n) + { + return std::to_wstring(n); + } +}; + +template +struct make_string +{ + template + static + std::basic_string + from(Rep n) + { + auto s = std::to_wstring(n); + return std::basic_string(s.begin(), s.end()); + } +}; + +} // namespace detail + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, + const std::chrono::duration& d) +{ + using namespace detail; + return os << make_string::from(d.count()) + + get_units(typename Period::type{}); +} + +} // namespace date +} // namespace arrow_vendored + + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + + +#endif // 
DATE_H diff --git a/r/R/inst/include/arrow/vendored/datetime/ios.h b/r/R/inst/include/arrow/vendored/datetime/ios.h new file mode 100644 index 00000000000..acad28d13b5 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime/ios.h @@ -0,0 +1,53 @@ +// +// ios.h +// DateTimeLib +// +// The MIT License (MIT) +// +// Copyright (c) 2016 Alexander Kormanovsky +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#ifndef ios_hpp +#define ios_hpp + +#if __APPLE__ +# include +# if TARGET_OS_IPHONE +# include + + namespace arrow_vendored + { + namespace date + { + namespace iOSUtils + { + + std::string get_tzdata_path(); + std::string get_current_timezone(); + + } // namespace iOSUtils + } // namespace date + } // namespace arrow_vendored + +# endif // TARGET_OS_IPHONE +#else // !__APPLE__ +# define TARGET_OS_IPHONE 0 +#endif // !__APPLE__ +#endif // ios_hpp diff --git a/r/R/inst/include/arrow/vendored/datetime/tz.h b/r/R/inst/include/arrow/vendored/datetime/tz.h new file mode 100644 index 00000000000..249162b0149 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime/tz.h @@ -0,0 +1,2590 @@ +#ifndef TZ_H +#define TZ_H + +// The MIT License (MIT) +// +// Copyright (c) 2015, 2016, 2017 Howard Hinnant +// Copyright (c) 2017 Jiangang Zhuang +// Copyright (c) 2017 Aaron Bishop +// Copyright (c) 2017 Tomasz Kamiński +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Our apologies. When the previous paragraph was written, lowercase had not yet +// been invented (that would involve another several millennia of evolution). +// We did not mean to shout. + +// Get more recent database at http://www.iana.org/time-zones + +// The notion of "current timezone" is something the operating system is expected to "just +// know". How it knows this is system specific. It's often a value set by the user at OS +// installation time and recorded by the OS somewhere. On Linux and Mac systems the current +// timezone name is obtained by looking at the name or contents of a particular file on +// disk. On Windows the current timezone name comes from the registry. In either method, +// there is no guarantee that the "native" current timezone name obtained will match any +// of the "Standard" names in this library's "database". On Linux, the names usually do +// seem to match so mapping functions to map from native to "Standard" are typically not +// required. On Windows, the names are never "Standard" so mapping is always required. +// Technically any OS may use the mapping process but currently only Windows does use it. 
+ +/////////////////////////////////////////////////// + +// Windows does not support OS timezone database +#ifdef _WIN32 +# define USE_OS_TZDB 0 +#else +# define USE_OS_TZDB 1 +#endif +#define HAS_REMOTE_API 0 + +//////////////////////////////////////////////////// + +#ifndef USE_OS_TZDB +# define USE_OS_TZDB 0 +#endif + +#ifndef HAS_REMOTE_API +# if USE_OS_TZDB == 0 +# ifdef _WIN32 +# define HAS_REMOTE_API 0 +# else +# define HAS_REMOTE_API 1 +# endif +# else // HAS_REMOTE_API makes no since when using the OS timezone database +# define HAS_REMOTE_API 0 +# endif +#endif + +#ifdef __clang__ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wconstant-logical-operand" +#endif + +static_assert(!(USE_OS_TZDB && HAS_REMOTE_API), + "USE_OS_TZDB and HAS_REMOTE_API can not be used together"); + +#ifdef __clang__ +# pragma clang diagnostic pop +#endif + +#ifndef AUTO_DOWNLOAD +# define AUTO_DOWNLOAD HAS_REMOTE_API +#endif + +static_assert(HAS_REMOTE_API == 0 ? AUTO_DOWNLOAD == 0 : true, + "AUTO_DOWNLOAD can not be turned on without HAS_REMOTE_API"); + +#ifndef USE_SHELL_API +# define USE_SHELL_API 1 +#endif + +#if USE_OS_TZDB +# ifdef _WIN32 +# error "USE_OS_TZDB can not be used on Windows" +# endif +# ifndef MISSING_LEAP_SECONDS +# ifdef __APPLE__ +# define MISSING_LEAP_SECONDS 1 +# else +# define MISSING_LEAP_SECONDS 0 +# endif +# endif +#else +# define MISSING_LEAP_SECONDS 0 +#endif + +#ifndef HAS_DEDUCTION_GUIDES +# if __cplusplus >= 201703 +# define HAS_DEDUCTION_GUIDES 1 +# else +# define HAS_DEDUCTION_GUIDES 0 +# endif +#endif // HAS_DEDUCTION_GUIDES + +#include "date.h" + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#include "tz_private.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +# ifdef DATE_BUILD_DLL +# define DATE_API __declspec(dllexport) +# elif defined(DATE_USE_DLL) +# define DATE_API __declspec(dllimport) 
+# else +# define DATE_API +# endif +#else +# ifdef DATE_BUILD_DLL +# define DATE_API __attribute__ ((visibility ("default"))) +# else +# define DATE_API +# endif +#endif + +namespace arrow_vendored +{ +namespace date +{ + +enum class choose {earliest, latest}; + +namespace detail +{ + struct undocumented; +} + +struct sys_info +{ + sys_seconds begin; + sys_seconds end; + std::chrono::seconds offset; + std::chrono::minutes save; + std::string abbrev; +}; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const sys_info& r) +{ + os << r.begin << '\n'; + os << r.end << '\n'; + os << make_time(r.offset) << "\n"; + os << make_time(r.save) << "\n"; + os << r.abbrev << '\n'; + return os; +} + +struct local_info +{ + enum {unique, nonexistent, ambiguous} result; + sys_info first; + sys_info second; +}; + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const local_info& r) +{ + if (r.result == local_info::nonexistent) + os << "nonexistent between\n"; + else if (r.result == local_info::ambiguous) + os << "ambiguous between\n"; + os << r.first; + if (r.result != local_info::unique) + { + os << "and\n"; + os << r.second; + } + return os; +} + +class nonexistent_local_time + : public std::runtime_error +{ +public: + template + nonexistent_local_time(local_time tp, const local_info& i); + +private: + template + static + std::string + make_msg(local_time tp, const local_info& i); +}; + +template +inline +nonexistent_local_time::nonexistent_local_time(local_time tp, + const local_info& i) + : std::runtime_error(make_msg(tp, i)) +{ +} + +template +std::string +nonexistent_local_time::make_msg(local_time tp, const local_info& i) +{ + assert(i.result == local_info::nonexistent); + std::ostringstream os; + os << tp << " is in a gap between\n" + << local_seconds{i.first.end.time_since_epoch()} + i.first.offset << ' ' + << i.first.abbrev << " and\n" + << local_seconds{i.second.begin.time_since_epoch()} + i.second.offset << ' ' + << i.second.abbrev + 
<< " which are both equivalent to\n" + << i.first.end << " UTC"; + return os.str(); +} + +class ambiguous_local_time + : public std::runtime_error +{ +public: + template + ambiguous_local_time(local_time tp, const local_info& i); + +private: + template + static + std::string + make_msg(local_time tp, const local_info& i); +}; + +template +inline +ambiguous_local_time::ambiguous_local_time(local_time tp, const local_info& i) + : std::runtime_error(make_msg(tp, i)) +{ +} + +template +std::string +ambiguous_local_time::make_msg(local_time tp, const local_info& i) +{ + assert(i.result == local_info::ambiguous); + std::ostringstream os; + os << tp << " is ambiguous. It could be\n" + << tp << ' ' << i.first.abbrev << " == " + << tp - i.first.offset << " UTC or\n" + << tp << ' ' << i.second.abbrev << " == " + << tp - i.second.offset << " UTC"; + return os.str(); +} + +class time_zone; + +#if HAS_STRING_VIEW +DATE_API const time_zone* locate_zone(std::string_view tz_name); +#else +DATE_API const time_zone* locate_zone(const std::string& tz_name); +#endif + +DATE_API const time_zone* current_zone(); + +template +struct zoned_traits +{ +}; + +template <> +struct zoned_traits +{ + static + const time_zone* + default_zone() + { + return date::locate_zone("Etc/UTC"); + } + +#if HAS_STRING_VIEW + + static + const time_zone* + locate_zone(std::string_view name) + { + return date::locate_zone(name); + } + +#else // !HAS_STRING_VIEW + + static + const time_zone* + locate_zone(const std::string& name) + { + return date::locate_zone(name); + } + + static + const time_zone* + locate_zone(const char* name) + { + return date::locate_zone(name); + } + +#endif // !HAS_STRING_VIEW +}; + +template +class zoned_time; + +template +bool +operator==(const zoned_time& x, + const zoned_time& y); + +template +class zoned_time +{ +public: + using duration = typename std::common_type::type; + +private: + TimeZonePtr zone_; + sys_time tp_; + +public: +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + 
template ::default_zone())> +#endif + zoned_time(); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::default_zone())> +#endif + zoned_time(const sys_time& st); + explicit zoned_time(TimeZonePtr z); + +#if HAS_STRING_VIEW + template ::locate_zone(std::string_view())) + >::value + >::type> + explicit zoned_time(std::string_view name); +#else +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())) + >::value + >::type> +#endif + explicit zoned_time(const std::string& name); +#endif + + template , + sys_time>::value + >::type> + zoned_time(const zoned_time& zt) NOEXCEPT; + + zoned_time(TimeZonePtr z, const sys_time& st); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ()->to_sys(local_time{})), + sys_time + >::value + >::type> +#endif + zoned_time(TimeZonePtr z, const local_time& tp); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ()->to_sys(local_time{}, + choose::earliest)), + sys_time + >::value + >::type> +#endif + zoned_time(TimeZonePtr z, const local_time& tp, choose c); + + template , + sys_time>::value + >::type> + zoned_time(TimeZonePtr z, const zoned_time& zt); + + template , + sys_time>::value + >::type> + zoned_time(TimeZonePtr z, const zoned_time& zt, choose); + +#if HAS_STRING_VIEW + + template ::locate_zone(std::string_view())), + sys_time + >::value + >::type> + zoned_time(std::string_view name, const sys_time& st); + + template ::locate_zone(std::string_view())), + local_time + >::value + >::type> + zoned_time(std::string_view name, const local_time& tp); + + template ::locate_zone(std::string_view())), + local_time, + choose + >::value + >::type> + zoned_time(std::string_view name, const local_time& tp, choose c); + + template ::locate_zone(std::string_view())), + zoned_time + >::value + >::type> + zoned_time(std::string_view name, const zoned_time& zt); + + template ::locate_zone(std::string_view())), + zoned_time, + choose + >::value + >::type> + zoned_time(std::string_view name, const 
zoned_time& zt, choose); + +#else // !HAS_STRING_VIEW + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + sys_time + >::value + >::type> +#endif + zoned_time(const std::string& name, const sys_time& st); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + sys_time + >::value + >::type> +#endif + zoned_time(const char* name, const sys_time& st); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + local_time + >::value + >::type> +#endif + zoned_time(const std::string& name, const local_time& tp); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + local_time + >::value + >::type> +#endif + zoned_time(const char* name, const local_time& tp); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + local_time, + choose + >::value + >::type> +#endif + zoned_time(const std::string& name, const local_time& tp, choose c); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + local_time, + choose + >::value + >::type> +#endif + zoned_time(const char* name, const local_time& tp, choose c); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + zoned_time + >::value + >::type> +#endif + zoned_time(const std::string& name, const zoned_time& zt); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + zoned_time + >::value + >::type> +#endif + zoned_time(const char* name, const zoned_time& zt); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + zoned_time, + choose + >::value + >::type> +#endif + zoned_time(const std::string& name, const zoned_time& zt, choose); + +#if !defined(_MSC_VER) || (_MSC_VER > 1900) + template ::locate_zone(std::string())), + zoned_time, + choose + >::value + >::type> +#endif + zoned_time(const char* name, const 
zoned_time& zt, choose); + +#endif // !HAS_STRING_VIEW + + zoned_time& operator=(const sys_time& st); + zoned_time& operator=(const local_time& ut); + + explicit operator sys_time() const; + explicit operator local_time() const; + + TimeZonePtr get_time_zone() const; + local_time get_local_time() const; + sys_time get_sys_time() const; + sys_info get_info() const; + + template + friend + bool + operator==(const zoned_time& x, + const zoned_time& y); + + template + friend + std::basic_ostream& + operator<<(std::basic_ostream& os, + const zoned_time& t); + +private: + template friend class zoned_time; +}; + +using zoned_seconds = zoned_time; + +#if HAS_DEDUCTION_GUIDES + +zoned_time() + -> zoned_time; + +template +zoned_time(sys_time) + -> zoned_time>; + +template +zoned_time(TimeZonePtr) + -> zoned_time; + +template +zoned_time(TimeZonePtr, sys_time) + -> zoned_time, TimeZonePtr>; + +template +zoned_time(TimeZonePtr, local_time, choose = choose::earliest) + -> zoned_time, TimeZonePtr>; + +#if HAS_STRING_VIEW + +zoned_time(std::string_view) + -> zoned_time; + +template +zoned_time(std::string_view, sys_time) + -> zoned_time>; + +template +zoned_time(std::string_view, local_time, choose = choose::earliest) + -> zoned_time>; + +#else // !HAS_STRING_VIEW + +zoned_time(std::string) + -> zoned_time; + +template +zoned_time(std::string, sys_time) + -> zoned_time>; + +template +zoned_time(std::string, local_time, choose = choose::earliest) + -> zoned_time>; + +#endif // !HAS_STRING_VIEW + +template +zoned_time(const char*, sys_time) + -> zoned_time>; + +template +zoned_time(const char*, local_time, choose = choose::earliest) + -> zoned_time>; + +template +zoned_time(TimeZonePtr, zoned_time, choose = choose::earliest) + -> zoned_time; + +#endif // HAS_DEDUCTION_GUIDES + +template +inline +bool +operator==(const zoned_time& x, + const zoned_time& y) +{ + return x.zone_ == y.zone_ && x.tp_ == y.tp_; +} + +template +inline +bool +operator!=(const zoned_time& x, + const 
zoned_time& y) +{ + return !(x == y); +} + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + +namespace detail +{ +# if USE_OS_TZDB + struct transition; + struct expanded_ttinfo; +# else // !USE_OS_TZDB + struct zonelet; + class Rule; +# endif // !USE_OS_TZDB +} + +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) + +class time_zone +{ +private: + std::string name_; +#if USE_OS_TZDB + std::vector transitions_; + std::vector ttinfos_; +#else // !USE_OS_TZDB + std::vector zonelets_; +#endif // !USE_OS_TZDB + std::unique_ptr adjusted_; + +public: +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + time_zone(time_zone&&) = default; + time_zone& operator=(time_zone&&) = default; +#else // defined(_MSC_VER) && (_MSC_VER < 1900) + time_zone(time_zone&& src); + time_zone& operator=(time_zone&& src); +#endif // defined(_MSC_VER) && (_MSC_VER < 1900) + + DATE_API explicit time_zone(const std::string& s, detail::undocumented); + + const std::string& name() const NOEXCEPT; + + template sys_info get_info(sys_time st) const; + template local_info get_info(local_time tp) const; + + template + sys_time::type> + to_sys(local_time tp) const; + + template + sys_time::type> + to_sys(local_time tp, choose z) const; + + template + local_time::type> + to_local(sys_time tp) const; + + friend bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT; + friend bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT; + friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone& z); + +#if !USE_OS_TZDB + DATE_API void add(const std::string& s); +#endif // !USE_OS_TZDB + +private: + DATE_API sys_info get_info_impl(sys_seconds tp) const; + DATE_API local_info get_info_impl(local_seconds tp) const; + + template + sys_time::type> + to_sys_impl(local_time tp, choose z, std::false_type) const; + template + sys_time::type> + to_sys_impl(local_time tp, choose, std::true_type) const; + +#if USE_OS_TZDB + DATE_API void init() const; + DATE_API void init_impl(); + DATE_API 
sys_info + load_sys_info(std::vector::const_iterator i) const; + + template + DATE_API void + load_data(std::istream& inf, std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt, + std::int32_t tzh_typecnt, std::int32_t tzh_charcnt); +#else // !USE_OS_TZDB + DATE_API sys_info get_info_impl(sys_seconds tp, int timezone) const; + DATE_API void adjust_infos(const std::vector& rules); + DATE_API void parse_info(std::istream& in); +#endif // !USE_OS_TZDB +}; + +#if defined(_MSC_VER) && (_MSC_VER < 1900) + +inline +time_zone::time_zone(time_zone&& src) + : name_(std::move(src.name_)) + , zonelets_(std::move(src.zonelets_)) + , adjusted_(std::move(src.adjusted_)) + {} + +inline +time_zone& +time_zone::operator=(time_zone&& src) +{ + name_ = std::move(src.name_); + zonelets_ = std::move(src.zonelets_); + adjusted_ = std::move(src.adjusted_); + return *this; +} + +#endif // defined(_MSC_VER) && (_MSC_VER < 1900) + +inline +const std::string& +time_zone::name() const NOEXCEPT +{ + return name_; +} + +template +inline +sys_info +time_zone::get_info(sys_time st) const +{ + using namespace std::chrono; + return get_info_impl(date::floor(st)); +} + +template +inline +local_info +time_zone::get_info(local_time tp) const +{ + using namespace std::chrono; + return get_info_impl(date::floor(tp)); +} + +template +inline +sys_time::type> +time_zone::to_sys(local_time tp) const +{ + return to_sys_impl(tp, choose{}, std::true_type{}); +} + +template +inline +sys_time::type> +time_zone::to_sys(local_time tp, choose z) const +{ + return to_sys_impl(tp, z, std::false_type{}); +} + +template +inline +local_time::type> +time_zone::to_local(sys_time tp) const +{ + using LT = local_time::type>; + auto i = get_info(tp); + return LT{(tp + i.offset).time_since_epoch()}; +} + +inline bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ == y.name_;} +inline bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ < y.name_;} + +inline bool 
operator!=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x == y);} +inline bool operator> (const time_zone& x, const time_zone& y) NOEXCEPT {return y < x;} +inline bool operator<=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(y < x);} +inline bool operator>=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x < y);} + +template +sys_time::type> +time_zone::to_sys_impl(local_time tp, choose z, std::false_type) const +{ + using namespace date; + using namespace std::chrono; + auto i = get_info(tp); + if (i.result == local_info::nonexistent) + { + return i.first.end; + } + else if (i.result == local_info::ambiguous) + { + if (z == choose::latest) + return sys_time{tp.time_since_epoch()} - i.second.offset; + } + return sys_time{tp.time_since_epoch()} - i.first.offset; +} + +template +sys_time::type> +time_zone::to_sys_impl(local_time tp, choose, std::true_type) const +{ + using namespace date; + using namespace std::chrono; + auto i = get_info(tp); + if (i.result == local_info::nonexistent) + throw nonexistent_local_time(tp, i); + else if (i.result == local_info::ambiguous) + throw ambiguous_local_time(tp, i); + return sys_time{tp.time_since_epoch()} - i.first.offset; +} + +#if !USE_OS_TZDB + +class link +{ +private: + std::string name_; + std::string target_; +public: + DATE_API explicit link(const std::string& s); + + const std::string& name() const {return name_;} + const std::string& target() const {return target_;} + + friend bool operator==(const link& x, const link& y) {return x.name_ == y.name_;} + friend bool operator< (const link& x, const link& y) {return x.name_ < y.name_;} + + friend DATE_API std::ostream& operator<<(std::ostream& os, const link& x); +}; + +inline bool operator!=(const link& x, const link& y) {return !(x == y);} +inline bool operator> (const link& x, const link& y) {return y < x;} +inline bool operator<=(const link& x, const link& y) {return !(y < x);} +inline bool operator>=(const link& x, const link& y) 
{return !(x < y);} + +#endif // !USE_OS_TZDB + +#if !MISSING_LEAP_SECONDS + +class leap +{ +private: + sys_seconds date_; + +public: +#if USE_OS_TZDB + DATE_API explicit leap(const sys_seconds& s, detail::undocumented); +#else + DATE_API explicit leap(const std::string& s, detail::undocumented); +#endif + + sys_seconds date() const {return date_;} + + friend bool operator==(const leap& x, const leap& y) {return x.date_ == y.date_;} + friend bool operator< (const leap& x, const leap& y) {return x.date_ < y.date_;} + + template + friend + bool + operator==(const leap& x, const sys_time& y) + { + return x.date_ == y; + } + + template + friend + bool + operator< (const leap& x, const sys_time& y) + { + return x.date_ < y; + } + + template + friend + bool + operator< (const sys_time& x, const leap& y) + { + return x < y.date_; + } + + friend DATE_API std::ostream& operator<<(std::ostream& os, const leap& x); +}; + +inline bool operator!=(const leap& x, const leap& y) {return !(x == y);} +inline bool operator> (const leap& x, const leap& y) {return y < x;} +inline bool operator<=(const leap& x, const leap& y) {return !(y < x);} +inline bool operator>=(const leap& x, const leap& y) {return !(x < y);} + +template +inline +bool +operator==(const sys_time& x, const leap& y) +{ + return y == x; +} + +template +inline +bool +operator!=(const leap& x, const sys_time& y) +{ + return !(x == y); +} + +template +inline +bool +operator!=(const sys_time& x, const leap& y) +{ + return !(x == y); +} + +template +inline +bool +operator> (const leap& x, const sys_time& y) +{ + return y < x; +} + +template +inline +bool +operator> (const sys_time& x, const leap& y) +{ + return y < x; +} + +template +inline +bool +operator<=(const leap& x, const sys_time& y) +{ + return !(y < x); +} + +template +inline +bool +operator<=(const sys_time& x, const leap& y) +{ + return !(y < x); +} + +template +inline +bool +operator>=(const leap& x, const sys_time& y) +{ + return !(x < y); +} + +template 
+inline +bool +operator>=(const sys_time& x, const leap& y) +{ + return !(x < y); +} + +#endif // !MISSING_LEAP_SECONDS + +#ifdef _WIN32 + +namespace detail +{ + +// The time zone mapping is modelled after this data file: +// http://unicode.org/repos/cldr/trunk/common/supplemental/windowsZones.xml +// and the field names match the element names from the mapZone element +// of windowsZones.xml. +// The website displays this file here: +// http://www.unicode.org/cldr/charts/latest/supplemental/zone_tzid.html +// The html view is sorted before being displayed but is otherwise the same +// There is a mapping between the os centric view (in this case windows) +// the html displays uses and the generic view the xml file. +// That mapping is this: +// display column "windows" -> xml field "other". +// display column "region" -> xml field "territory". +// display column "tzid" -> xml field "type". +// This structure uses the generic terminology because it could be +// used to to support other os/native name conversions, not just windows, +// and using the same generic names helps retain the connection to the +// origin of the data that we are using. 
+struct timezone_mapping +{ + timezone_mapping(const char* other, const char* territory, const char* type) + : other(other), territory(territory), type(type) + { + } + timezone_mapping() = default; + std::string other; + std::string territory; + std::string type; +}; + +} // detail + +#endif // _WIN32 + +struct tzdb +{ + std::string version = "unknown"; + std::vector zones; +#if !USE_OS_TZDB + std::vector links; +#endif +#if !MISSING_LEAP_SECONDS + std::vector leaps; +#endif +#if !USE_OS_TZDB + std::vector rules; +#endif +#ifdef _WIN32 + std::vector mappings; +#endif + tzdb* next = nullptr; + + tzdb() = default; +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + tzdb(tzdb&&) = default; + tzdb& operator=(tzdb&&) = default; +#else // defined(_MSC_VER) && (_MSC_VER < 1900) + tzdb(tzdb&& src) + : version(std::move(src.version)) + , zones(std::move(src.zones)) + , links(std::move(src.links)) + , leaps(std::move(src.leaps)) + , rules(std::move(src.rules)) + , mappings(std::move(src.mappings)) + {} + + tzdb& operator=(tzdb&& src) + { + version = std::move(src.version); + zones = std::move(src.zones); + links = std::move(src.links); + leaps = std::move(src.leaps); + rules = std::move(src.rules); + mappings = std::move(src.mappings); + return *this; + } +#endif // defined(_MSC_VER) && (_MSC_VER < 1900) + +#if HAS_STRING_VIEW + const time_zone* locate_zone(std::string_view tz_name) const; +#else + const time_zone* locate_zone(const std::string& tz_name) const; +#endif + const time_zone* current_zone() const; +}; + +using TZ_DB = tzdb; + +DATE_API std::ostream& +operator<<(std::ostream& os, const tzdb& db); + +DATE_API const tzdb& get_tzdb(); + +class tzdb_list +{ + std::atomic head_{nullptr}; + +public: + ~tzdb_list(); + tzdb_list() = default; + tzdb_list(tzdb_list&& x) noexcept; + + const tzdb& front() const noexcept {return *head_;} + tzdb& front() noexcept {return *head_;} + + class const_iterator; + + const_iterator begin() const noexcept; + const_iterator end() const 
noexcept; + + const_iterator cbegin() const noexcept; + const_iterator cend() const noexcept; + + const_iterator erase_after(const_iterator p) noexcept; + + struct undocumented_helper; +private: + void push_front(tzdb* tzdb) noexcept; +}; + +class tzdb_list::const_iterator +{ + tzdb* p_ = nullptr; + + explicit const_iterator(tzdb* p) noexcept : p_{p} {} +public: + const_iterator() = default; + + using iterator_category = std::forward_iterator_tag; + using value_type = tzdb; + using reference = const value_type&; + using pointer = const value_type*; + using difference_type = std::ptrdiff_t; + + reference operator*() const noexcept {return *p_;} + pointer operator->() const noexcept {return p_;} + + const_iterator& operator++() noexcept {p_ = p_->next; return *this;} + const_iterator operator++(int) noexcept {auto t = *this; ++(*this); return t;} + + friend + bool + operator==(const const_iterator& x, const const_iterator& y) noexcept + {return x.p_ == y.p_;} + + friend + bool + operator!=(const const_iterator& x, const const_iterator& y) noexcept + {return !(x == y);} + + friend class tzdb_list; +}; + +inline +tzdb_list::const_iterator +tzdb_list::begin() const noexcept +{ + return const_iterator{head_}; +} + +inline +tzdb_list::const_iterator +tzdb_list::end() const noexcept +{ + return const_iterator{nullptr}; +} + +inline +tzdb_list::const_iterator +tzdb_list::cbegin() const noexcept +{ + return begin(); +} + +inline +tzdb_list::const_iterator +tzdb_list::cend() const noexcept +{ + return end(); +} + +DATE_API tzdb_list& get_tzdb_list(); + +#if !USE_OS_TZDB + +DATE_API const tzdb& reload_tzdb(); +DATE_API void set_install(const std::string& install); + +#endif // !USE_OS_TZDB + +#if HAS_REMOTE_API + +DATE_API std::string remote_version(); +DATE_API bool remote_download(const std::string& version); +DATE_API bool remote_install(const std::string& version); + +#endif + +// zoned_time + +namespace detail +{ + +template +inline +T* +to_raw_pointer(T* p) noexcept +{ + 
return p; +} + +template +inline +auto +to_raw_pointer(Pointer p) noexcept + -> decltype(detail::to_raw_pointer(p.operator->())) +{ + return detail::to_raw_pointer(p.operator->()); +} + +} // namespace detail + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time() + : zone_(zoned_traits::default_zone()) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const sys_time& st) + : zone_(zoned_traits::default_zone()) + , tp_(st) + {} + +template +inline +zoned_time::zoned_time(TimeZonePtr z) + : zone_(std::move(z)) + {assert(detail::to_raw_pointer(zone_) != nullptr);} + +#if HAS_STRING_VIEW + +template +template +inline +zoned_time::zoned_time(std::string_view name) + : zoned_time(zoned_traits::locate_zone(name)) + {} + +#else // !HAS_STRING_VIEW + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name) + : zoned_time(zoned_traits::locate_zone(name)) + {} + +#endif // !HAS_STRING_VIEW + +template +template +inline +zoned_time::zoned_time(const zoned_time& zt) NOEXCEPT + : zone_(zt.zone_) + , tp_(zt.tp_) + {} + +template +inline +zoned_time::zoned_time(TimeZonePtr z, const sys_time& st) + : zone_(std::move(z)) + , tp_(st) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(TimeZonePtr z, const local_time& t) + : zone_(std::move(z)) + , tp_(zone_->to_sys(t)) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(TimeZonePtr z, const local_time& t, + choose c) + : zone_(std::move(z)) + , tp_(zone_->to_sys(t, c)) + {} + +template +template +inline +zoned_time::zoned_time(TimeZonePtr z, + const zoned_time& zt) + : zone_(std::move(z)) + , tp_(zt.tp_) + {} + +template +template +inline +zoned_time::zoned_time(TimeZonePtr z, + const zoned_time& zt, choose) + : 
zoned_time(std::move(z), zt) + {} + +#if HAS_STRING_VIEW + +template +template +inline +zoned_time::zoned_time(std::string_view name, + const sys_time& st) + : zoned_time(zoned_traits::locate_zone(name), st) + {} + +template +template +inline +zoned_time::zoned_time(std::string_view name, + const local_time& t) + : zoned_time(zoned_traits::locate_zone(name), t) + {} + +template +template +inline +zoned_time::zoned_time(std::string_view name, + const local_time& t, choose c) + : zoned_time(zoned_traits::locate_zone(name), t, c) + {} + +template +template +inline +zoned_time::zoned_time(std::string_view name, const zoned_time& zt) + : zoned_time(zoned_traits::locate_zone(name), zt) + {} + +template +template +inline +zoned_time::zoned_time(std::string_view name, + const zoned_time& zt, choose c) + : zoned_time(zoned_traits::locate_zone(name), zt, c) + {} + +#else // !HAS_STRING_VIEW + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name, + const sys_time& st) + : zoned_time(zoned_traits::locate_zone(name), st) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const char* name, + const sys_time& st) + : zoned_time(zoned_traits::locate_zone(name), st) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name, + const local_time& t) + : zoned_time(zoned_traits::locate_zone(name), t) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const char* name, + const local_time& t) + : zoned_time(zoned_traits::locate_zone(name), t) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name, + const local_time& t, choose c) + : zoned_time(zoned_traits::locate_zone(name), t, c) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 
1900) +template +#endif +inline +zoned_time::zoned_time(const char* name, + const local_time& t, choose c) + : zoned_time(zoned_traits::locate_zone(name), t, c) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name, + const zoned_time& zt) + : zoned_time(zoned_traits::locate_zone(name), zt) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const char* name, const zoned_time& zt) + : zoned_time(zoned_traits::locate_zone(name), zt) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const std::string& name, + const zoned_time& zt, choose c) + : zoned_time(zoned_traits::locate_zone(name), zt, c) + {} + +template +#if !defined(_MSC_VER) || (_MSC_VER > 1900) +template +#endif +inline +zoned_time::zoned_time(const char* name, + const zoned_time& zt, choose c) + : zoned_time(zoned_traits::locate_zone(name), zt, c) + {} + +#endif // HAS_STRING_VIEW + +template +inline +zoned_time& +zoned_time::operator=(const sys_time& st) +{ + tp_ = st; + return *this; +} + +template +inline +zoned_time& +zoned_time::operator=(const local_time& ut) +{ + tp_ = zone_->to_sys(ut); + return *this; +} + +template +inline +zoned_time::operator local_time::duration>() const +{ + return get_local_time(); +} + +template +inline +zoned_time::operator sys_time::duration>() const +{ + return get_sys_time(); +} + +template +inline +TimeZonePtr +zoned_time::get_time_zone() const +{ + return zone_; +} + +template +inline +local_time::duration> +zoned_time::get_local_time() const +{ + return zone_->to_local(tp_); +} + +template +inline +sys_time::duration> +zoned_time::get_sys_time() const +{ + return tp_; +} + +template +inline +sys_info +zoned_time::get_info() const +{ + return zone_->get_info(tp_); +} + +// make_zoned_time + +inline +zoned_time +make_zoned() +{ + return zoned_time(); +} + +template 
+inline +zoned_time::type> +make_zoned(const sys_time& tp) +{ + return zoned_time::type>(tp); +} + +template 1900) + , class = typename std::enable_if + < + std::is_class + < + typename std::decay + < + decltype(*detail::to_raw_pointer(std::declval())) + >::type + >{} + >::type +#endif + > +inline +zoned_time +make_zoned(TimeZonePtr z) +{ + return zoned_time(std::move(z)); +} + +inline +zoned_seconds +make_zoned(const std::string& name) +{ + return zoned_seconds(name); +} + +template 1900) + , class = typename std::enable_if + < + std::is_class())>::type>{} + >::type +#endif + > +inline +zoned_time::type, TimeZonePtr> +make_zoned(TimeZonePtr zone, const local_time& tp) +{ + return zoned_time::type, + TimeZonePtr>(std::move(zone), tp); +} + +template 1900) + , class = typename std::enable_if + < + std::is_class())>::type>{} + >::type +#endif + > +inline +zoned_time::type, TimeZonePtr> +make_zoned(TimeZonePtr zone, const local_time& tp, choose c) +{ + return zoned_time::type, + TimeZonePtr>(std::move(zone), tp, c); +} + +template +inline +zoned_time::type> +make_zoned(const std::string& name, const local_time& tp) +{ + return zoned_time::type>(name, tp); +} + +template +inline +zoned_time::type> +make_zoned(const std::string& name, const local_time& tp, choose c) +{ + return zoned_time::type>(name, tp, c); +} + +template +inline +zoned_time +make_zoned(TimeZonePtr zone, const zoned_time& zt) +{ + return zoned_time(std::move(zone), zt); +} + +template +inline +zoned_time +make_zoned(const std::string& name, const zoned_time& zt) +{ + return zoned_time(name, zt); +} + +template +inline +zoned_time +make_zoned(TimeZonePtr zone, const zoned_time& zt, choose c) +{ + return zoned_time(std::move(zone), zt, c); +} + +template +inline +zoned_time +make_zoned(const std::string& name, const zoned_time& zt, choose c) +{ + return zoned_time(name, zt, c); +} + +template 1900) + , class = typename std::enable_if + < + std::is_class())>::type>{} + >::type +#endif + > +inline 
+zoned_time::type, TimeZonePtr> +make_zoned(TimeZonePtr zone, const sys_time& st) +{ + return zoned_time::type, + TimeZonePtr>(std::move(zone), st); +} + +template +inline +zoned_time::type> +make_zoned(const std::string& name, const sys_time& st) +{ + return zoned_time::type>(name, st); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const zoned_time& tp) +{ + using duration = typename zoned_time::duration; + using LT = local_time; + auto const tz = tp.get_time_zone(); + auto const st = tp.get_sys_time(); + auto const info = tz->get_info(st); + return to_stream(os, fmt, LT{(st+info.offset).time_since_epoch()}, + &info.abbrev, &info.offset); +} + +template +inline +std::basic_ostream& +operator<<(std::basic_ostream& os, const zoned_time& t) +{ + const CharT fmt[] = {'%', 'F', ' ', '%', 'T', ' ', '%', 'Z', CharT{}}; + return to_stream(os, fmt, t); +} + +#if !MISSING_LEAP_SECONDS + +class utc_clock +{ +public: + using duration = std::chrono::system_clock::duration; + using rep = duration::rep; + using period = duration::period; + using time_point = std::chrono::time_point; + static CONSTDATA bool is_steady = false; + + static time_point now(); + + template + static + std::chrono::time_point::type> + to_sys(const std::chrono::time_point&); + + template + static + std::chrono::time_point::type> + from_sys(const std::chrono::time_point&); +}; + +template + using utc_time = std::chrono::time_point; + +using utc_seconds = utc_time; + +template +utc_time::type> +utc_clock::from_sys(const sys_time& st) +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + auto const& leaps = get_tzdb().leaps; + auto const lt = std::upper_bound(leaps.begin(), leaps.end(), st); + return utc_time{st.time_since_epoch() + seconds{lt-leaps.begin()}}; +} + +// Return pair +// first is true if ut is during a leap second insertion, otherwise false. 
+// If ut is during a leap second insertion, that leap second is included in the count +template +std::pair +is_leap_second(date::utc_time const& ut) +{ + using namespace date; + using namespace std::chrono; + using duration = typename std::common_type::type; + auto const& leaps = get_tzdb().leaps; + auto tp = sys_time{ut.time_since_epoch()}; + auto const lt = std::upper_bound(leaps.begin(), leaps.end(), tp); + auto ds = seconds{lt-leaps.begin()}; + tp -= ds; + auto ls = false; + if (lt > leaps.begin()) + { + if (tp < lt[-1]) + { + if (tp >= lt[-1].date() - seconds{1}) + ls = true; + else + --ds; + } + } + return {ls, ds}; +} + +template +sys_time::type> +utc_clock::to_sys(const utc_time& ut) +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + auto ls = is_leap_second(ut); + auto tp = sys_time{ut.time_since_epoch() - ls.second}; + if (ls.first) + tp = floor(tp) + seconds{1} - duration{1}; + return tp; +} + +inline +utc_clock::time_point +utc_clock::now() +{ + using namespace std::chrono; + return from_sys(system_clock::now()); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const utc_time& t) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + const string abbrev("UTC"); + CONSTDATA seconds offset{0}; + auto ls = is_leap_second(t); + auto tp = sys_time{t.time_since_epoch() - ls.second}; + auto const sd = floor(tp); + year_month_day ymd = sd; + auto time = make_time(tp - sys_seconds{sd}); + time.seconds() += seconds{ls.first}; + fields fds{ymd, time}; + return to_stream(os, fmt, fds, &abbrev, &offset); +} + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const utc_time& t) +{ + const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; + return to_stream(os, fmt, t); +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + utc_time& tp, std::basic_string* abbrev = nullptr, + 
std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + minutes offset_local{}; + auto offptr = offset ? offset : &offset_local; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offptr); + if (!fds.ymd.ok()) + is.setstate(ios::failbit); + if (!is.fail()) + { + bool is_60_sec = fds.tod.seconds() == seconds{60}; + if (is_60_sec) + fds.tod.seconds() -= seconds{1}; + auto tmp = utc_clock::from_sys(sys_days(fds.ymd) - *offptr + fds.tod.to_duration()); + if (is_60_sec) + tmp += seconds{1}; + if (is_60_sec != is_leap_second(tmp).first || !fds.tod.in_conventional_range()) + { + is.setstate(ios::failbit); + return is; + } + tp = time_point_cast(tmp); + } + return is; +} + +// tai_clock + +class tai_clock +{ +public: + using duration = std::chrono::system_clock::duration; + using rep = duration::rep; + using period = duration::period; + using time_point = std::chrono::time_point; + static const bool is_steady = false; + + static time_point now(); + + template + static + std::chrono::time_point::type> + to_utc(const std::chrono::time_point&) NOEXCEPT; + + template + static + std::chrono::time_point::type> + from_utc(const std::chrono::time_point&) NOEXCEPT; +}; + +template + using tai_time = std::chrono::time_point; + +using tai_seconds = tai_time; + +template +inline +utc_time::type> +tai_clock::to_utc(const tai_time& t) NOEXCEPT +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + return utc_time{t.time_since_epoch()} - + (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1) + seconds{10}); +} + +template +inline +tai_time::type> +tai_clock::from_utc(const utc_time& t) NOEXCEPT +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + return tai_time{t.time_since_epoch()} + + (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1) + seconds{10}); +} + +inline +tai_clock::time_point +tai_clock::now() +{ + using namespace 
std::chrono; + return from_utc(utc_clock::now()); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const tai_time& t) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + const string abbrev("TAI"); + CONSTDATA seconds offset{0}; + auto tp = sys_time{t.time_since_epoch()} - + seconds(sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1)); + auto const sd = floor(tp); + year_month_day ymd = sd; + auto time = make_time(tp - sys_seconds{sd}); + fields fds{ymd, time}; + return to_stream(os, fmt, fds, &abbrev, &offset); +} + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const tai_time& t) +{ + const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; + return to_stream(os, fmt, t); +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + tai_time& tp, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + minutes offset_local{}; + auto offptr = offset ? 
offset : &offset_local; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offptr); + if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) + is.setstate(ios::failbit); + if (!is.fail()) + tp = tai_time{duration_cast( + (sys_days(fds.ymd) + + (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1)) - + *offptr + fds.tod.to_duration()).time_since_epoch())}; + return is; +} + +// gps_clock + +class gps_clock +{ +public: + using duration = std::chrono::system_clock::duration; + using rep = duration::rep; + using period = duration::period; + using time_point = std::chrono::time_point; + static const bool is_steady = false; + + static time_point now(); + + template + static + std::chrono::time_point::type> + to_utc(const std::chrono::time_point&) NOEXCEPT; + + template + static + std::chrono::time_point::type> + from_utc(const std::chrono::time_point&) NOEXCEPT; + +}; + +template + using gps_time = std::chrono::time_point; + +using gps_seconds = gps_time; + +template +inline +utc_time::type> +gps_clock::to_utc(const gps_time& t) NOEXCEPT +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + return utc_time{t.time_since_epoch()} + + (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1) + seconds{9}); +} + +template +inline +gps_time::type> +gps_clock::from_utc(const utc_time& t) NOEXCEPT +{ + using namespace std::chrono; + using duration = typename std::common_type::type; + return gps_time{t.time_since_epoch()} - + (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1) + seconds{9}); +} + +inline +gps_clock::time_point +gps_clock::now() +{ + using namespace std::chrono; + return from_utc(utc_clock::now()); +} + +template +std::basic_ostream& +to_stream(std::basic_ostream& os, const CharT* fmt, + const gps_time& t) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + const string abbrev("GPS"); + CONSTDATA seconds offset{0}; + auto tp = sys_time{t.time_since_epoch()} + + 
seconds(sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1)); + auto const sd = floor(tp); + year_month_day ymd = sd; + auto time = make_time(tp - sys_seconds{sd}); + fields fds{ymd, time}; + return to_stream(os, fmt, fds, &abbrev, &offset); +} + +template +std::basic_ostream& +operator<<(std::basic_ostream& os, const gps_time& t) +{ + const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; + return to_stream(os, fmt, t); +} + +template > +std::basic_istream& +from_stream(std::basic_istream& is, const CharT* fmt, + gps_time& tp, + std::basic_string* abbrev = nullptr, + std::chrono::minutes* offset = nullptr) +{ + using namespace std; + using namespace std::chrono; + using CT = typename common_type::type; + minutes offset_local{}; + auto offptr = offset ? offset : &offset_local; + fields fds{}; + from_stream(is, fmt, fds, abbrev, offptr); + if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) + is.setstate(ios::failbit); + if (!is.fail()) + tp = gps_time{duration_cast( + (sys_days(fds.ymd) - + (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1)) - + *offptr + fds.tod.to_duration()).time_since_epoch())}; + return is; +} + +// clock_time_conversion + +template +struct clock_time_conversion +{}; + +template <> +struct clock_time_conversion +{ + template + sys_time + operator()(const sys_time& st) const + { + return st; + } +}; + +template <> +struct clock_time_conversion +{ + template + utc_time + operator()(const utc_time& ut) const + { + return ut; + } +}; + +template <> +struct clock_time_conversion +{ + template + utc_time::type> + operator()(const sys_time& st) const + { + return utc_clock::from_sys(st); + } +}; + +template <> +struct clock_time_conversion +{ + template + sys_time::type> + operator()(const utc_time& ut) const + { + return utc_clock::to_sys(ut); + } +}; + +template +struct clock_time_conversion +{ + template + std::chrono::time_point + operator()(const std::chrono::time_point& tp) const + { + return tp; + } +}; + +namespace 
ctc_detail +{ + +template + using time_point = std::chrono::time_point; + +using std::declval; +using std::chrono::system_clock; + +//Check if TimePoint is time for given clock, +//if not emits hard error +template +struct return_clock_time +{ + using clock_time_point = time_point; + using type = TimePoint; + + static_assert(std::is_same::value, + "time point with appropariate clock shall be returned"); +}; + +// Check if Clock has to_sys method accepting TimePoint with given duration const& and +// returning sys_time. If so has nested type member equal to return type to_sys. +template +struct return_to_sys +{}; + +template +struct return_to_sys + < + Clock, Duration, + decltype(Clock::to_sys(declval const&>()), void()) + > + : return_clock_time + < + system_clock, + decltype(Clock::to_sys(declval const&>())) + > +{}; + +// Similar to above +template +struct return_from_sys +{}; + +template +struct return_from_sys + < + Clock, Duration, + decltype(Clock::from_sys(declval const&>()), + void()) + > + : return_clock_time + < + Clock, + decltype(Clock::from_sys(declval const&>())) + > +{}; + +// Similar to above +template +struct return_to_utc +{}; + +template +struct return_to_utc + < + Clock, Duration, + decltype(Clock::to_utc(declval const&>()), void()) + > + : return_clock_time + < + utc_clock, + decltype(Clock::to_utc(declval const&>()))> +{}; + +// Similar to above +template +struct return_from_utc +{}; + +template +struct return_from_utc + < + Clock, Duration, + decltype(Clock::from_utc(declval const&>()), + void()) + > + : return_clock_time + < + Clock, + decltype(Clock::from_utc(declval const&>())) + > +{}; + +} // namespace ctc_detail + +template +struct clock_time_conversion +{ + template + typename ctc_detail::return_to_sys::type + operator()(const std::chrono::time_point& tp) const + { + return SrcClock::to_sys(tp); + } +}; + +template +struct clock_time_conversion +{ + template + typename ctc_detail::return_from_sys::type + operator()(const sys_time&
st) const + { + return DstClock::from_sys(st); + } +}; + +template +struct clock_time_conversion +{ + template + typename ctc_detail::return_to_utc::type + operator()(const std::chrono::time_point& tp) const + { + return SrcClock::to_utc(tp); + } +}; + +template +struct clock_time_conversion +{ + template + typename ctc_detail::return_from_utc::type + operator()(const utc_time& ut) const + { + return DstClock::from_utc(ut); + } +}; + +namespace clock_cast_detail +{ + +template + using time_point = std::chrono::time_point; +using std::chrono::system_clock; + +template +auto +conv_clock(const time_point& t) + -> decltype(std::declval>()(t)) +{ + return clock_time_conversion{}(t); +} + +//direct trait conversion, 1st candidate +template +auto +cc_impl(const time_point& t, const time_point*) + -> decltype(conv_clock(t)) +{ + return conv_clock(t); +} + +//conversion through sys, 2nd candidate +template +auto +cc_impl(const time_point& t, const void*) + -> decltype(conv_clock(conv_clock(t))) +{ + return conv_clock(conv_clock(t)); +} + +//conversion through utc, 2nd candidate +template +auto +cc_impl(const time_point& t, const void*) + -> decltype(0, // MSVC_WORKAROUND + conv_clock(conv_clock(t))) +{ + return conv_clock(conv_clock(t)); +} + +//conversion through sys and utc, 3rd candidate +template +auto +cc_impl(const time_point& t, ...) + -> decltype(conv_clock(conv_clock(conv_clock(t)))) +{ + return conv_clock(conv_clock(conv_clock(t))); +} + +//conversion through utc and sys, 3rd candidate +template +auto +cc_impl(const time_point& t, ...) 
+ -> decltype(0, // MSVC_WORKAROUND + conv_clock(conv_clock(conv_clock(t)))) +{ + return conv_clock(conv_clock(conv_clock(t))); +} + +} // namespace clock_cast_detail + +template +auto +clock_cast(const std::chrono::time_point& tp) + -> decltype(clock_cast_detail::cc_impl(tp, &tp)) +{ + return clock_cast_detail::cc_impl(tp, &tp); +} + +// Deprecated API + +template +inline +sys_time::type> +to_sys_time(const utc_time& t) +{ + return utc_clock::to_sys(t); +} + +template +inline +sys_time::type> +to_sys_time(const tai_time& t) +{ + return utc_clock::to_sys(tai_clock::to_utc(t)); +} + +template +inline +sys_time::type> +to_sys_time(const gps_time& t) +{ + return utc_clock::to_sys(gps_clock::to_utc(t)); +} + + +template +inline +utc_time::type> +to_utc_time(const sys_time& t) +{ + return utc_clock::from_sys(t); +} + +template +inline +utc_time::type> +to_utc_time(const tai_time& t) +{ + return tai_clock::to_utc(t); +} + +template +inline +utc_time::type> +to_utc_time(const gps_time& t) +{ + return gps_clock::to_utc(t); +} + + +template +inline +tai_time::type> +to_tai_time(const sys_time& t) +{ + return tai_clock::from_utc(utc_clock::from_sys(t)); +} + +template +inline +tai_time::type> +to_tai_time(const utc_time& t) +{ + return tai_clock::from_utc(t); +} + +template +inline +tai_time::type> +to_tai_time(const gps_time& t) +{ + return tai_clock::from_utc(gps_clock::to_utc(t)); +} + + +template +inline +gps_time::type> +to_gps_time(const sys_time& t) +{ + return gps_clock::from_utc(utc_clock::from_sys(t)); +} + +template +inline +gps_time::type> +to_gps_time(const utc_time& t) +{ + return gps_clock::from_utc(t); +} + +template +inline +gps_time::type> +to_gps_time(const tai_time& t) +{ + return gps_clock::from_utc(tai_clock::to_utc(t)); +} + +#endif // !MISSING_LEAP_SECONDS + +} // namespace date +} // namespace arrow + +#endif // TZ_H diff --git a/r/R/inst/include/arrow/vendored/datetime/tz_private.h b/r/R/inst/include/arrow/vendored/datetime/tz_private.h new file 
mode 100644 index 00000000000..f98c3e79a44 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime/tz_private.h @@ -0,0 +1,321 @@ +#ifndef TZ_PRIVATE_H +#define TZ_PRIVATE_H + +// The MIT License (MIT) +// +// Copyright (c) 2015, 2016 Howard Hinnant +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Our apologies. When the previous paragraph was written, lowercase had not yet +// been invented (that would involve another several millennia of evolution). +// We did not mean to shout. 
+ +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +#include "tz.h" +#else +#include "date.h" +#include +#endif + +namespace arrow_vendored +{ +namespace date +{ + +namespace detail +{ + +#if !USE_OS_TZDB + +enum class tz {utc, local, standard}; + +//forward declare to avoid warnings in gcc 6.2 +class MonthDayTime; +std::istream& operator>>(std::istream& is, MonthDayTime& x); +std::ostream& operator<<(std::ostream& os, const MonthDayTime& x); + + +class MonthDayTime +{ +private: + struct pair + { +#if defined(_MSC_VER) && (_MSC_VER < 1900) + pair() : month_day_(date::jan / 1), weekday_(0U) {} + + pair(const date::month_day& month_day, const date::weekday& weekday) + : month_day_(month_day), weekday_(weekday) {} +#endif + + date::month_day month_day_; + date::weekday weekday_; + }; + + enum Type {month_day, month_last_dow, lteq, gteq}; + + Type type_{month_day}; + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + union U +#else + struct U +#endif + { + date::month_day month_day_; + date::month_weekday_last month_weekday_last_; + pair month_day_weekday_; + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + U() : month_day_{date::jan/1} {} +#else + U() : + month_day_(date::jan/1), + month_weekday_last_(date::month(0U), date::weekday_last(date::weekday(0U))) + {} + +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) + + U& operator=(const date::month_day& x); + U& operator=(const date::month_weekday_last& x); + U& operator=(const pair& x); + } u; + + std::chrono::hours h_{0}; + std::chrono::minutes m_{0}; + std::chrono::seconds s_{0}; + tz zone_{tz::local}; + +public: + MonthDayTime() = default; + MonthDayTime(local_seconds tp, tz timezone); + MonthDayTime(const date::month_day& md, tz timezone); + + date::day day() const; + date::month month() const; + tz zone() const {return zone_;} + + void canonicalize(date::year y); + + sys_seconds + to_sys(date::year y, std::chrono::seconds offset, std::chrono::seconds save) const; + sys_days to_sys_days(date::year y) const; + + 
sys_seconds to_time_point(date::year y) const; + int compare(date::year y, const MonthDayTime& x, date::year yx, + std::chrono::seconds offset, std::chrono::minutes prev_save) const; + + friend std::istream& operator>>(std::istream& is, MonthDayTime& x); + friend std::ostream& operator<<(std::ostream& os, const MonthDayTime& x); +}; + +// A Rule specifies one or more sets of datetimes without using an offset. +// Multiple dates are specified with multiple years. The years in effect +// go from starting_year_ to ending_year_, inclusive. starting_year_ <= +// ending_year_. save_ is in effect for times from the specified time +// onward, including the specified time. When the specified time is +// local, it uses the save_ from the chronologically previous Rule, or if +// there is none, 0. + +//forward declare to avoid warnings in gcc 6.2 +class Rule; +bool operator==(const Rule& x, const Rule& y); +bool operator<(const Rule& x, const Rule& y); +bool operator==(const Rule& x, const date::year& y); +bool operator<(const Rule& x, const date::year& y); +bool operator==(const date::year& x, const Rule& y); +bool operator<(const date::year& x, const Rule& y); +bool operator==(const Rule& x, const std::string& y); +bool operator<(const Rule& x, const std::string& y); +bool operator==(const std::string& x, const Rule& y); +bool operator<(const std::string& x, const Rule& y); +std::ostream& operator<<(std::ostream& os, const Rule& r); + +class Rule +{ +private: + std::string name_; + date::year starting_year_{0}; + date::year ending_year_{0}; + MonthDayTime starting_at_; + std::chrono::minutes save_{0}; + std::string abbrev_; + +public: + Rule() = default; + explicit Rule(const std::string& s); + Rule(const Rule& r, date::year starting_year, date::year ending_year); + + const std::string& name() const {return name_;} + const std::string& abbrev() const {return abbrev_;} + + const MonthDayTime& mdt() const {return starting_at_;} + const date::year& starting_year() const {return
starting_year_;} + const date::year& ending_year() const {return ending_year_;} + const std::chrono::minutes& save() const {return save_;} + + static void split_overlaps(std::vector& rules); + + friend bool operator==(const Rule& x, const Rule& y); + friend bool operator<(const Rule& x, const Rule& y); + friend bool operator==(const Rule& x, const date::year& y); + friend bool operator<(const Rule& x, const date::year& y); + friend bool operator==(const date::year& x, const Rule& y); + friend bool operator<(const date::year& x, const Rule& y); + friend bool operator==(const Rule& x, const std::string& y); + friend bool operator<(const Rule& x, const std::string& y); + friend bool operator==(const std::string& x, const Rule& y); + friend bool operator<(const std::string& x, const Rule& y); + + friend std::ostream& operator<<(std::ostream& os, const Rule& r); + +private: + date::day day() const; + date::month month() const; + static void split_overlaps(std::vector& rules, std::size_t i, std::size_t& e); + static bool overlaps(const Rule& x, const Rule& y); + static void split(std::vector& rules, std::size_t i, std::size_t k, + std::size_t& e); +}; + +inline bool operator!=(const Rule& x, const Rule& y) {return !(x == y);} +inline bool operator> (const Rule& x, const Rule& y) {return y < x;} +inline bool operator<=(const Rule& x, const Rule& y) {return !(y < x);} +inline bool operator>=(const Rule& x, const Rule& y) {return !(x < y);} + +inline bool operator!=(const Rule& x, const date::year& y) {return !(x == y);} +inline bool operator> (const Rule& x, const date::year& y) {return y < x;} +inline bool operator<=(const Rule& x, const date::year& y) {return !(y < x);} +inline bool operator>=(const Rule& x, const date::year& y) {return !(x < y);} + +inline bool operator!=(const date::year& x, const Rule& y) {return !(x == y);} +inline bool operator> (const date::year& x, const Rule& y) {return y < x;} +inline bool operator<=(const date::year& x, const Rule& y) {return 
!(y < x);} +inline bool operator>=(const date::year& x, const Rule& y) {return !(x < y);} + +inline bool operator!=(const Rule& x, const std::string& y) {return !(x == y);} +inline bool operator> (const Rule& x, const std::string& y) {return y < x;} +inline bool operator<=(const Rule& x, const std::string& y) {return !(y < x);} +inline bool operator>=(const Rule& x, const std::string& y) {return !(x < y);} + +inline bool operator!=(const std::string& x, const Rule& y) {return !(x == y);} +inline bool operator> (const std::string& x, const Rule& y) {return y < x;} +inline bool operator<=(const std::string& x, const Rule& y) {return !(y < x);} +inline bool operator>=(const std::string& x, const Rule& y) {return !(x < y);} + +struct zonelet +{ + enum tag {has_rule, has_save, is_empty}; + + std::chrono::seconds gmtoff_; + tag tag_ = has_rule; + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) + union U +#else + struct U +#endif + { + std::string rule_; + std::chrono::minutes save_; + + ~U() {} + U() {} + U(const U&) {} + U& operator=(const U&) = delete; + } u; + + std::string format_; + date::year until_year_{0}; + MonthDayTime until_date_; + sys_seconds until_utc_; + local_seconds until_std_; + local_seconds until_loc_; + std::chrono::minutes initial_save_{}; + std::string initial_abbrev_; + std::pair first_rule_{nullptr, date::year::min()}; + std::pair last_rule_{nullptr, date::year::max()}; + + ~zonelet(); + zonelet(); + zonelet(const zonelet& i); + zonelet& operator=(const zonelet&) = delete; +}; + +#else // USE_OS_TZDB + +struct ttinfo +{ + std::int32_t tt_gmtoff; + unsigned char tt_isdst; + unsigned char tt_abbrind; + unsigned char pad[2]; +}; + +static_assert(sizeof(ttinfo) == 8, ""); + +struct expanded_ttinfo +{ + std::chrono::seconds offset; + std::string abbrev; + bool is_dst; +}; + +struct transition +{ + sys_seconds timepoint; + const expanded_ttinfo* info; + + transition(sys_seconds tp, const expanded_ttinfo* i = nullptr) + : timepoint(tp) + , info(i) + {} + 
+ friend + std::ostream& + operator<<(std::ostream& os, const transition& t) + { + using namespace date; + using namespace std::chrono; + using date::operator<<; + os << t.timepoint << "Z "; + if (t.info->offset >= seconds{0}) + os << '+'; + os << make_time(t.info->offset); + if (t.info->is_dst > 0) + os << " daylight "; + else + os << " standard "; + os << t.info->abbrev; + return os; + } +}; + +#endif // USE_OS_TZDB + +} // namespace detail + +} // namespace date +} // namespace arrow_vendored + +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#include "tz.h" +#endif + +#endif // TZ_PRIVATE_H diff --git a/r/R/inst/include/arrow/vendored/datetime/visibility.h b/r/R/inst/include/arrow/vendored/datetime/visibility.h new file mode 100644 index 00000000000..ae031238d85 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/datetime/visibility.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#if defined(ARROW_STATIC) +// intentionally empty +#elif defined(ARROW_EXPORTING) +#define DATE_BUILD_DLL +#else +#define DATE_USE_DLL +#endif diff --git a/r/R/inst/include/arrow/vendored/xxhash/xxhash.h b/r/R/inst/include/arrow/vendored/xxhash/xxhash.h new file mode 100644 index 00000000000..8c2d5fac1e7 --- /dev/null +++ b/r/R/inst/include/arrow/vendored/xxhash/xxhash.h @@ -0,0 +1,330 @@ +// Vendored from git tag v0.6.5 + +/* + xxHash - Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2016, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MurmurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + +/* **************************** + * API modifier + ******************************/ +/** XXH_INLINE_ALL (and XXH_PRIVATE_API) + * This is useful to include xxhash functions in `static` mode + * in order to inline them, and remove their symbol from the public list. + * Inlining can offer dramatic performance improvement on small keys. + * Methodology : + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * `xxhash.c` is automatically included. + * It's not useful to compile and link it as a separate module.
+ */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/*! XXH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
+ */ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 5 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +typedef unsigned int XXH32_hash_t; + +/*! 
XXH32() : + Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); + +/*====== Streaming ======*/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +/* + * Streaming functions generate the xxHash of an input provided in multiple segments. + * Note that, for small input, they are slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * XXH state must first be allocated, using XXH*_createState() . + * + * Start a new hash by initializing state with a seed, using XXH*_reset(). + * + * Then, feed the hash state by calling XXH*_update() as many times as necessary. + * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using XXH*_digest(). + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a digest, + * and generate some new hashes later on, by calling again XXH*_digest(). 
+ * + * When done, free XXH state space if it was allocated dynamically. + */ + +/*====== Canonical representation ======*/ + +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. + * The canonical representation uses human-readable write convention, aka big-endian (large digits first). + * These functions allow transformation of hash result into and from its canonical format. + * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. + */ + + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +typedef unsigned long long XXH64_hash_t; + +/*! XXH64() : + Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). 
+*/ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); + +/*====== Streaming ======*/ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/*====== Canonical representation ======*/ +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +#endif /* XXH_NO_LONG_LONG */ + + + +#ifdef XXH_STATIC_LINKING_ONLY + +/* ================================================================================================ + This section contains declarations which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! +=================================================================================================== */ + +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
*/ + +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + +struct XXH32_state_s { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; + uint32_t reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + +struct XXH64_state_s { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; + uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ + +# else + +struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; + unsigned memsize; + unsigned reserved; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH32_state_t */ + +# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ +struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; + unsigned memsize; + unsigned reserved[2]; /* never read nor write, might be removed in a future version */ +}; /* typedef'd to XXH64_state_t */ +# endif + +# endif + + +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ +#endif + +#endif /* XXH_STATIC_LINKING_ONLY */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* XXHASH_H_5627135585666179 */ diff --git a/r/R/inst/include/arrow/visitor.h b/r/R/inst/include/arrow/visitor.h new file mode 100644 index 00000000000..1b40ce4efba --- /dev/null +++ b/r/R/inst/include/arrow/visitor.h @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_VISITOR_H +#define ARROW_VISITOR_H + +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class ARROW_EXPORT ArrayVisitor { + public: + virtual ~ArrayVisitor() = default; + + virtual Status Visit(const NullArray& array); + virtual Status Visit(const BooleanArray& array); + virtual Status Visit(const Int8Array& array); + virtual Status Visit(const Int16Array& array); + virtual Status Visit(const Int32Array& array); + virtual Status Visit(const Int64Array& array); + virtual Status Visit(const UInt8Array& array); + virtual Status Visit(const UInt16Array& array); + virtual Status Visit(const UInt32Array& array); + virtual Status Visit(const UInt64Array& array); + virtual Status Visit(const HalfFloatArray& array); + virtual Status Visit(const FloatArray& array); + virtual Status Visit(const DoubleArray& array); + virtual Status Visit(const StringArray& array); + virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const FixedSizeBinaryArray& array); + virtual Status Visit(const Date32Array& array); + virtual Status Visit(const Date64Array& array); + virtual Status Visit(const Time32Array& array); + virtual Status Visit(const Time64Array& array); + 
virtual Status Visit(const TimestampArray& array); + virtual Status Visit(const DayTimeIntervalArray& array); + virtual Status Visit(const MonthIntervalArray& array); + virtual Status Visit(const DurationArray& array); + virtual Status Visit(const Decimal128Array& array); + virtual Status Visit(const ListArray& array); + virtual Status Visit(const FixedSizeListArray& array); + virtual Status Visit(const StructArray& array); + virtual Status Visit(const UnionArray& array); + virtual Status Visit(const DictionaryArray& array); + virtual Status Visit(const ExtensionArray& array); +}; + +class ARROW_EXPORT TypeVisitor { + public: + virtual ~TypeVisitor() = default; + + virtual Status Visit(const NullType& type); + virtual Status Visit(const BooleanType& type); + virtual Status Visit(const Int8Type& type); + virtual Status Visit(const Int16Type& type); + virtual Status Visit(const Int32Type& type); + virtual Status Visit(const Int64Type& type); + virtual Status Visit(const UInt8Type& type); + virtual Status Visit(const UInt16Type& type); + virtual Status Visit(const UInt32Type& type); + virtual Status Visit(const UInt64Type& type); + virtual Status Visit(const HalfFloatType& type); + virtual Status Visit(const FloatType& type); + virtual Status Visit(const DoubleType& type); + virtual Status Visit(const StringType& type); + virtual Status Visit(const BinaryType& type); + virtual Status Visit(const FixedSizeBinaryType& type); + virtual Status Visit(const Date64Type& type); + virtual Status Visit(const Date32Type& type); + virtual Status Visit(const Time32Type& type); + virtual Status Visit(const Time64Type& type); + virtual Status Visit(const TimestampType& type); + virtual Status Visit(const MonthIntervalType& type); + virtual Status Visit(const DayTimeIntervalType& type); + virtual Status Visit(const DurationType& type); + virtual Status Visit(const Decimal128Type& type); + virtual Status Visit(const ListType& type); + virtual Status Visit(const FixedSizeListType& 
type); + virtual Status Visit(const StructType& type); + virtual Status Visit(const UnionType& type); + virtual Status Visit(const DictionaryType& type); + virtual Status Visit(const ExtensionType& type); +}; + +class ARROW_EXPORT ScalarVisitor { + public: + virtual ~ScalarVisitor() = default; + + virtual Status Visit(const NullScalar& scalar); + virtual Status Visit(const BooleanScalar& scalar); + virtual Status Visit(const Int8Scalar& scalar); + virtual Status Visit(const Int16Scalar& scalar); + virtual Status Visit(const Int32Scalar& scalar); + virtual Status Visit(const Int64Scalar& scalar); + virtual Status Visit(const UInt8Scalar& scalar); + virtual Status Visit(const UInt16Scalar& scalar); + virtual Status Visit(const UInt32Scalar& scalar); + virtual Status Visit(const UInt64Scalar& scalar); + virtual Status Visit(const HalfFloatScalar& scalar); + virtual Status Visit(const FloatScalar& scalar); + virtual Status Visit(const DoubleScalar& scalar); + virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const FixedSizeBinaryScalar& scalar); + virtual Status Visit(const Date64Scalar& scalar); + virtual Status Visit(const Date32Scalar& scalar); + virtual Status Visit(const Time32Scalar& scalar); + virtual Status Visit(const Time64Scalar& scalar); + virtual Status Visit(const TimestampScalar& scalar); + virtual Status Visit(const DayTimeIntervalScalar& scalar); + virtual Status Visit(const MonthIntervalScalar& scalar); + virtual Status Visit(const DurationScalar& scalar); + virtual Status Visit(const Decimal128Scalar& scalar); + virtual Status Visit(const ListScalar& scalar); + virtual Status Visit(const FixedSizeListScalar& scalar); + virtual Status Visit(const StructScalar& scalar); + virtual Status Visit(const DictionaryScalar& scalar); +}; + +} // namespace arrow + +#endif // ARROW_VISITOR_H diff --git a/r/R/inst/include/arrow/visitor_inline.h b/r/R/inst/include/arrow/visitor_inline.h new 
file mode 100644 index 00000000000..01bf4426f24 --- /dev/null +++ b/r/R/inst/include/arrow/visitor_inline.h @@ -0,0 +1,277 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Private header, not to be exported + +#ifndef ARROW_VISITOR_INLINE_H +#define ARROW_VISITOR_INLINE_H + +#include "arrow/array.h" +#include "arrow/extension_type.h" +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ + ACTION(Null); \ + ACTION(Boolean); \ + ACTION(Int8); \ + ACTION(UInt8); \ + ACTION(Int16); \ + ACTION(UInt16); \ + ACTION(Int32); \ + ACTION(UInt32); \ + ACTION(Int64); \ + ACTION(UInt64); \ + ACTION(HalfFloat); \ + ACTION(Float); \ + ACTION(Double); \ + ACTION(String); \ + ACTION(Binary); \ + ACTION(FixedSizeBinary); \ + ACTION(Duration); \ + ACTION(Date32); \ + ACTION(Date64); \ + ACTION(Timestamp); \ + ACTION(Time32); \ + ACTION(Time64); \ + ACTION(Decimal128); \ + ACTION(List); \ + ACTION(FixedSizeList); \ + ACTION(Struct); \ + ACTION(Union); \ + ACTION(Dictionary); \ + ACTION(Extension) + +#define 
TYPE_VISIT_INLINE(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return visitor->Visit(internal::checked_cast(type)); + +template +inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) { + switch (type.id()) { + ARROW_GENERATE_FOR_ALL_TYPES(TYPE_VISIT_INLINE); + case Type::INTERVAL: { + const auto& interval_type = dynamic_cast(type); + if (interval_type.interval_type() == IntervalType::MONTHS) { + return visitor->Visit(internal::checked_cast(type)); + } + if (interval_type.interval_type() == IntervalType::DAY_TIME) { + return visitor->Visit(internal::checked_cast(type)); + } + break; + } + default: + break; + } + return Status::NotImplemented("Type not implemented"); +} + +#undef TYPE_VISIT_INLINE + +#define ARRAY_VISIT_INLINE(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return visitor->Visit( \ + internal::checked_cast::ArrayType&>( \ + array)); + +template +inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { + switch (array.type_id()) { + ARROW_GENERATE_FOR_ALL_TYPES(ARRAY_VISIT_INLINE); + case Type::INTERVAL: { + const auto& interval_type = dynamic_cast(*array.type()); + if (interval_type.interval_type() == IntervalType::MONTHS) { + return visitor->Visit(internal::checked_cast(array)); + } + if (interval_type.interval_type() == IntervalType::DAY_TIME) { + return visitor->Visit(internal::checked_cast(array)); + } + break; + } + + default: + break; + } + return Status::NotImplemented("Type not implemented"); +} + +// Visit an array's data values, in order, without overhead. 
+// +// The Visit function's `visitor` argument should define two public methods: +// - Status VisitNull() +// - Status VisitValue() +// +// The scalar value's type depends on the array data type: +// - the type's `c_type`, if any +// - for boolean arrays, a `bool` +// - for binary, string and fixed-size binary arrays, a `util::string_view` + +template +struct ArrayDataVisitor {}; + +template <> +struct ArrayDataVisitor { + template + static Status Visit(const ArrayData& arr, Visitor* visitor) { + if (arr.null_count != 0) { + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); + internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + const bool is_null = valid_reader.IsNotSet(); + if (is_null) { + ARROW_RETURN_NOT_OK(visitor->VisitNull()); + } else { + ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet())); + } + valid_reader.Next(); + value_reader.Next(); + } + } else { + internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet())); + value_reader.Next(); + } + } + return Status::OK(); + } +}; + +template +struct ArrayDataVisitor> { + template + static Status Visit(const ArrayData& arr, Visitor* visitor) { + using c_type = typename T::c_type; + const c_type* data = arr.GetValues(1); + + if (arr.null_count != 0) { + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + const bool is_null = valid_reader.IsNotSet(); + if (is_null) { + ARROW_RETURN_NOT_OK(visitor->VisitNull()); + } else { + ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i])); + } + valid_reader.Next(); + } + } else { + for (int64_t i = 0; i < arr.length; ++i) { + ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i])); + } + } + return Status::OK(); + } +}; + +template +struct 
ArrayDataVisitor> { + template + static Status Visit(const ArrayData& arr, Visitor* visitor) { + constexpr uint8_t empty_value = 0; + + const int32_t* offsets = arr.GetValues(1); + const uint8_t* data; + if (!arr.buffers[2]) { + data = &empty_value; + } else { + data = arr.GetValues(2); + } + + if (arr.null_count != 0) { + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + const bool is_null = valid_reader.IsNotSet(); + valid_reader.Next(); + if (is_null) { + ARROW_RETURN_NOT_OK(visitor->VisitNull()); + } else { + auto value = util::string_view(reinterpret_cast(data + offsets[i]), + offsets[i + 1] - offsets[i]); + ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); + } + } + } else { + for (int64_t i = 0; i < arr.length; ++i) { + auto value = util::string_view(reinterpret_cast(data + offsets[i]), + offsets[i + 1] - offsets[i]); + ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); + } + } + return Status::OK(); + } +}; + +template +struct ArrayDataVisitor> { + template + static Status Visit(const ArrayData& arr, Visitor* visitor) { + const auto& fw_type = internal::checked_cast(*arr.type); + + const int32_t byte_width = fw_type.byte_width(); + const uint8_t* data = arr.GetValues(1); + + if (arr.null_count != 0) { + internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); + for (int64_t i = 0; i < arr.length; ++i) { + const bool is_null = valid_reader.IsNotSet(); + valid_reader.Next(); + if (is_null) { + ARROW_RETURN_NOT_OK(visitor->VisitNull()); + } else { + auto value = util::string_view(reinterpret_cast(data), byte_width); + ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); + } + data += byte_width; + } + } else { + for (int64_t i = 0; i < arr.length; ++i) { + auto value = util::string_view(reinterpret_cast(data), byte_width); + ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); + data += byte_width; + } + } + return Status::OK(); + } +}; + +#define 
SCALAR_VISIT_INLINE(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: \ + return visitor->Visit(internal::checked_cast(scalar)); + +template +inline Status VisitScalarInline(const Scalar& scalar, VISITOR* visitor) { + switch (scalar.type->id()) { + ARROW_GENERATE_FOR_ALL_TYPES(SCALAR_VISIT_INLINE); + default: + break; + } + return Status::NotImplemented("Scalar visitor for type not implemented ", + scalar.type->ToString()); +} + +#undef TYPE_VISIT_INLINE + +} // namespace arrow + +#endif // ARROW_VISITOR_INLINE_H diff --git a/r/R/inst/include/parquet/api/io.h b/r/R/inst/include/parquet/api/io.h new file mode 100644 index 00000000000..f3092a6d7cb --- /dev/null +++ b/r/R/inst/include/parquet/api/io.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_API_IO_H +#define PARQUET_API_IO_H + +#include "parquet/deprecated_io.h" +#include "parquet/exception.h" + +#endif // PARQUET_API_IO_H diff --git a/r/R/inst/include/parquet/api/reader.h b/r/R/inst/include/parquet/api/reader.h new file mode 100644 index 00000000000..b29ca7205c4 --- /dev/null +++ b/r/R/inst/include/parquet/api/reader.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_API_READER_H +#define PARQUET_API_READER_H + +// Column reader API +#include "parquet/column_reader.h" +#include "parquet/column_scanner.h" +#include "parquet/exception.h" +#include "parquet/file_reader.h" +#include "parquet/metadata.h" +#include "parquet/platform.h" +#include "parquet/printer.h" +#include "parquet/properties.h" + +// Schemas +#include "parquet/api/schema.h" + +// IO +#include "parquet/api/io.h" + +#endif // PARQUET_API_READER_H diff --git a/r/R/inst/include/parquet/api/schema.h b/r/R/inst/include/parquet/api/schema.h new file mode 100644 index 00000000000..2e6c3b309ff --- /dev/null +++ b/r/R/inst/include/parquet/api/schema.h @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_API_SCHEMA_H +#define PARQUET_API_SCHEMA_H + +// Schemas +#include "parquet/schema.h" + +#endif // PARQUET_API_SCHEMA_H diff --git a/r/R/inst/include/parquet/api/writer.h b/r/R/inst/include/parquet/api/writer.h new file mode 100644 index 00000000000..3b4e42f7aff --- /dev/null +++ b/r/R/inst/include/parquet/api/writer.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_API_WRITER_H +#define PARQUET_API_WRITER_H + +#include "parquet/api/io.h" +#include "parquet/api/schema.h" +#include "parquet/column_writer.h" +#include "parquet/exception.h" +#include "parquet/file_writer.h" + +#endif // PARQUET_API_WRITER_H diff --git a/r/R/inst/include/parquet/arrow/reader.h b/r/R/inst/include/parquet/arrow/reader.h new file mode 100644 index 00000000000..acdda711071 --- /dev/null +++ b/r/R/inst/include/parquet/arrow/reader.h @@ -0,0 +1,356 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_ARROW_READER_H +#define PARQUET_ARROW_READER_H + +#include +#include +#include +#include + +#include "parquet/platform.h" + +#include "arrow/io/interfaces.h" +#include "arrow/util/macros.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class MemoryPool; +class RecordBatchReader; +class Schema; +class Status; +class Table; + +} // namespace arrow + +namespace parquet { + +class FileMetaData; +class ParquetFileReader; +class ReaderProperties; + +namespace arrow { + +class ColumnChunkReader; +class ColumnReader; +class RowGroupReader; + +static constexpr bool DEFAULT_USE_THREADS = false; + +/// EXPERIMENTAL: Properties for configuring FileReader behavior. 
+class PARQUET_EXPORT ArrowReaderProperties { + public: + explicit ArrowReaderProperties(bool use_threads = DEFAULT_USE_THREADS) + : use_threads_(use_threads), read_dict_indices_() {} + + void set_use_threads(bool use_threads) { use_threads_ = use_threads; } + + bool use_threads() const { return use_threads_; } + + void set_read_dictionary(int column_index, bool read_dict) { + if (read_dict) { + read_dict_indices_.insert(column_index); + } else { + read_dict_indices_.erase(column_index); + } + } + bool read_dictionary(int column_index) const { + if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { + return true; + } else { + return false; + } + } + + private: + bool use_threads_; + std::unordered_set read_dict_indices_; +}; + +/// EXPERIMENTAL: Constructs the default ArrowReaderProperties +PARQUET_EXPORT +ArrowReaderProperties default_arrow_reader_properties(); + +// Arrow read adapter class for deserializing Parquet files as Arrow row +// batches. +// +// This interfaces caters for different use cases and thus provides different +// interfaces. In its most simplistic form, we cater for a user that wants to +// read the whole Parquet at once with the FileReader::ReadTable method. +// +// More advanced users that also want to implement parallelism on top of each +// single Parquet files should do this on the RowGroup level. For this, they can +// call FileReader::RowGroup(i)->ReadTable to receive only the specified +// RowGroup as a table. +// +// In the most advanced situation, where a consumer wants to independently read +// RowGroups in parallel and consume each column individually, they can call +// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column +// instance. +// +// TODO(wesm): nested data does not always make sense with this user +// interface unless you are only reading a single leaf node from a branch of +// a table. 
For example: +// +// repeated group data { +// optional group record { +// optional int32 val1; +// optional byte_array val2; +// optional bool val3; +// } +// optional int32 val4; +// } +// +// In the Parquet file, there are 3 leaf nodes: +// +// * data.record.val1 +// * data.record.val2 +// * data.record.val3 +// * data.val4 +// +// When materializing this data in an Arrow array, we would have: +// +// data: list), +// val3: bool, +// >, +// val4: int32 +// >> +// +// However, in the Parquet format, each leaf node has its own repetition and +// definition levels describing the structure of the intermediate nodes in +// this array structure. Thus, we will need to scan the leaf data for a group +// of leaf nodes part of the same type tree to create a single result Arrow +// nested array structure. +// +// This is additionally complicated "chunky" repeated fields or very large byte +// arrays +class PARQUET_EXPORT FileReader { + public: + FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader, + const ArrowReaderProperties& properties = default_arrow_reader_properties()); + + // Since the distribution of columns amongst a Parquet file's row groups may + // be uneven (the number of values in each column chunk can be different), we + // provide a column-oriented read interface. The ColumnReader hides the + // details of paging through the file's row groups and yielding + // fully-materialized arrow::Array instances + // + // Returns error status if the column of interest is not flat. + ::arrow::Status GetColumn(int i, std::unique_ptr* out); + + /// \brief Return arrow schema by apply selection of column indices. + /// \returns error status if passed wrong indices. + ::arrow::Status GetSchema(const std::vector& indices, + std::shared_ptr<::arrow::Schema>* out); + + // Read column as a whole into an Array. 
+ ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); + + // NOTE: Experimental API + // Reads a specific top level schema field into an Array + // The index i refers the index of the top level schema field, which may + // be nested or flat - e.g. + // + // 0 foo.bar + // foo.bar.baz + // foo.qux + // 1 foo2 + // 2 foo3 + // + // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); + + /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the + /// ordering in row_group_indices matters. + /// \returns error Status if row_group_indices contains invalid index + ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, + std::shared_ptr<::arrow::RecordBatchReader>* out); + + /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, + /// whose columns are selected by column_indices. The ordering in row_group_indices + /// and column_indices matter. + /// \returns error Status if either row_group_indices or column_indices contains invalid + /// index + ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, + const std::vector& column_indices, + std::shared_ptr<::arrow::RecordBatchReader>* out); + + // Read a table of columns into a Table + ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); + + // Read a table of columns into a Table. 
Read only the indicated column + // indices (relative to the schema) + ::arrow::Status ReadTable(const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + + ::arrow::Status ReadRowGroup(int i, const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + + ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out); + + ::arrow::Status ReadRowGroups(const std::vector& row_groups, + const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + + ::arrow::Status ReadRowGroups(const std::vector& row_groups, + std::shared_ptr<::arrow::Table>* out); + + /// \brief Scan file contents with one thread, return number of rows + ::arrow::Status ScanContents(std::vector columns, const int32_t column_batch_size, + int64_t* num_rows); + + /// \brief Return a reader for the RowGroup, this object must not outlive the + /// FileReader. + std::shared_ptr RowGroup(int row_group_index); + + int num_row_groups() const; + + const ParquetFileReader* parquet_reader() const; + + /// Set the number of threads to use during reads of multiple columns. By + /// default only 1 thread is used + /// \deprecated Use set_use_threads instead. + ARROW_DEPRECATED("Use set_use_threads instead") + void set_num_threads(int num_threads); + + /// Set whether to use multiple threads during reads of multiple columns. + /// By default only one thread is used. 
+ void set_use_threads(bool use_threads); + + virtual ~FileReader(); + + private: + friend ColumnChunkReader; + friend RowGroupReader; + + class PARQUET_NO_EXPORT Impl; + std::unique_ptr impl_; +}; + +class PARQUET_EXPORT RowGroupReader { + public: + std::shared_ptr Column(int column_index); + + ::arrow::Status ReadTable(const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); + + virtual ~RowGroupReader(); + + private: + friend FileReader; + RowGroupReader(FileReader::Impl* reader, int row_group_index); + + FileReader::Impl* impl_; + int row_group_index_; +}; + +class PARQUET_EXPORT ColumnChunkReader { + public: + ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); + + virtual ~ColumnChunkReader(); + + private: + friend RowGroupReader; + ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index); + + FileReader::Impl* impl_; + int column_index_; + int row_group_index_; +}; + +// At this point, the column reader is a stream iterator. It only knows how to +// read the next batch of values for a particular column from the file until it +// runs out. +// +// We also do not expose any internal Parquet details, such as row groups. This +// might change in the future. +class PARQUET_EXPORT ColumnReader { + public: + class PARQUET_NO_EXPORT ColumnReaderImpl; + virtual ~ColumnReader(); + + // Scan the next array of the indicated size. The actual size of the + // returned array may be less than the passed size depending how much data is + // available in the file. + // + // When all the data in the file has been exhausted, the result is set to + // nullptr. + // + // Returns Status::OK on a successful read, including if you have exhausted + // the data available in the file. 
+ ::arrow::Status NextBatch(int64_t batch_size, + std::shared_ptr<::arrow::ChunkedArray>* out); + + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); + + private: + std::unique_ptr impl_; + explicit ColumnReader(std::unique_ptr impl); + + friend class FileReader; + friend class PrimitiveImpl; + friend class StructImpl; +}; + +// Helper function to create a file reader from an implementation of an Arrow +// random access file +// +// metadata : separately-computed file metadata, can be nullptr +PARQUET_EXPORT +::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, + ::arrow::MemoryPool* allocator, + const ReaderProperties& properties, + const std::shared_ptr& metadata, + std::unique_ptr* reader); + +PARQUET_EXPORT +::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, + ::arrow::MemoryPool* allocator, + std::unique_ptr* reader); + +PARQUET_EXPORT +::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, + ::arrow::MemoryPool* allocator, + const ArrowReaderProperties& properties, + std::unique_ptr* reader); + +} // namespace arrow +} // namespace parquet + +#endif // PARQUET_ARROW_READER_H diff --git a/r/R/inst/include/parquet/arrow/record_reader.h b/r/R/inst/include/parquet/arrow/record_reader.h new file mode 100644 index 00000000000..2ae26a5a47d --- /dev/null +++ b/r/R/inst/include/parquet/arrow/record_reader.h @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_RECORD_READER_H +#define PARQUET_RECORD_READER_H + +#include +#include +#include + +#include "parquet/platform.h" + +namespace arrow { + +class Array; + +} // namespace arrow + +namespace parquet { + +class ColumnDescriptor; +class PageReader; + +namespace internal { + +/// \brief Stateful column reader that delimits semantic records for both flat +/// and nested columns +/// +/// \note API EXPERIMENTAL +/// \since 1.3.0 +class RecordReader { + public: + // So that we can create subclasses + class RecordReaderImpl; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const bool read_dictionary = false); + + virtual ~RecordReader(); + + /// \brief Decoded definition levels + const int16_t* def_levels() const; + + /// \brief Decoded repetition levels + const int16_t* rep_levels() const; + + /// \brief Decoded values, including nulls, if any + const uint8_t* values() const; + + /// \brief Attempt to read indicated number of records from column chunk + /// \return number of records read + int64_t ReadRecords(int64_t num_records); + + /// \brief Pre-allocate space for data. 
Results in better flat read performance + void Reserve(int64_t num_values); + + /// \brief Clear consumed values and repetition/definition levels as the + /// result of calling ReadRecords + void Reset(); + + std::shared_ptr ReleaseValues(); + std::shared_ptr ReleaseIsValid(); + + /// \brief Number of values written including nulls (if any) + int64_t values_written() const; + + /// \brief Number of definition / repetition levels (from those that have + /// been decoded) that have been consumed inside the reader. + int64_t levels_position() const; + + /// \brief Number of definition / repetition levels that have been written + /// internally in the reader + int64_t levels_written() const; + + /// \brief Number of nulls in the leaf + int64_t null_count() const; + + /// \brief True if the leaf values are nullable + bool nullable_values() const; + + /// \brief Return true if the record reader has more internal data yet to + /// process + bool HasMoreData() const; + + /// \brief Advance record reader to the next row group + /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader + void SetPageReader(std::unique_ptr reader); + + void DebugPrintState(); + + // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output + std::vector> GetBuilderChunks(); + + private: + std::unique_ptr impl_; + explicit RecordReader(RecordReaderImpl* impl); + + static std::shared_ptr MakeByteArrayRecordReader( + const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, + const bool read_dictionary); +}; + +} // namespace internal +} // namespace parquet + +#endif // PARQUET_RECORD_READER_H diff --git a/r/R/inst/include/parquet/arrow/schema.h b/r/R/inst/include/parquet/arrow/schema.h new file mode 100644 index 00000000000..52fb843e6c6 --- /dev/null +++ b/r/R/inst/include/parquet/arrow/schema.h @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_ARROW_SCHEMA_H +#define PARQUET_ARROW_SCHEMA_H + +#include +#include +#include + +#include "parquet/metadata.h" +#include "parquet/platform.h" +#include "parquet/schema.h" + +namespace arrow { + +class Field; +class Schema; +class Status; + +} // namespace arrow + +namespace parquet { + +class WriterProperties; + +namespace arrow { + +class ArrowWriterProperties; + +PARQUET_EXPORT +::arrow::Status NodeToField(const schema::Node& node, + std::shared_ptr<::arrow::Field>* out); + +/// Convert parquet schema to arrow schema with selected indices +/// \param parquet_schema to be converted +/// \param column_indices indices of leaf nodes in parquet schema tree. Appearing ordering +/// matters for the converted schema. Repeated indices are ignored +/// except for the first one +/// \param key_value_metadata optional metadata, can be nullptr +/// \param out the corresponding arrow schema +/// \return Status::OK() on a successful conversion. 
+PARQUET_EXPORT +::arrow::Status FromParquetSchema( + const SchemaDescriptor* parquet_schema, const std::vector& column_indices, + const std::shared_ptr& key_value_metadata, + std::shared_ptr<::arrow::Schema>* out); + +// Without indices +PARQUET_EXPORT +::arrow::Status FromParquetSchema( + const SchemaDescriptor* parquet_schema, + const std::shared_ptr& key_value_metadata, + std::shared_ptr<::arrow::Schema>* out); + +// Without metadata +::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, + const std::vector& column_indices, + std::shared_ptr<::arrow::Schema>* out); + +// Without metadata or indices +::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, + std::shared_ptr<::arrow::Schema>* out); + +::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + schema::NodePtr* out); + +::arrow::Status PARQUET_EXPORT +ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::shared_ptr* out); + +::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + std::shared_ptr* out); + +PARQUET_EXPORT +int32_t DecimalSize(int32_t precision); + +} // namespace arrow + +} // namespace parquet + +#endif // PARQUET_ARROW_SCHEMA_H diff --git a/r/R/inst/include/parquet/arrow/test-util.h b/r/R/inst/include/parquet/arrow/test-util.h new file mode 100644 index 00000000000..b99e28f5e03 --- /dev/null +++ b/r/R/inst/include/parquet/arrow/test-util.h @@ -0,0 +1,485 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + +#include "parquet/arrow/record_reader.h" + +namespace parquet { + +using internal::RecordReader; + +namespace arrow { + +using ::arrow::Array; +using ::arrow::Status; + +template +struct DecimalWithPrecisionAndScale { + static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value"); + + using type = ::arrow::Decimal128Type; + static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id; + static constexpr int32_t precision = PRECISION; + static constexpr int32_t scale = PRECISION - 1; +}; + +template +using is_arrow_float = std::is_floating_point; + +template +using is_arrow_int = std::is_integral; + +template +using is_arrow_date = std::is_same; + +template +using is_arrow_string = std::is_same; + +template +using is_arrow_binary = std::is_same; + +template +using is_arrow_fixed_size_binary = std::is_same; + +template +using is_arrow_bool = std::is_same; + +template +typename std::enable_if::value, Status>::type NonNullArray( + size_t size, std::shared_ptr* out) { + using c_type = typename ArrowType::c_type; + std::vector values; + ::arrow::random_real(size, 0, static_cast(0), static_cast(1), &values); + 
::arrow::NumericBuilder builder; + RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); + return builder.Finish(out); +} + +template +typename std::enable_if< + is_arrow_int::value && !is_arrow_date::value, Status>::type +NonNullArray(size_t size, std::shared_ptr* out) { + std::vector values; + ::arrow::randint(size, 0, 64, &values); + + // Passing data type so this will work with TimestampType too + ::arrow::NumericBuilder builder(std::make_shared(), + ::arrow::default_memory_pool()); + RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); + return builder.Finish(out); +} + +template +typename std::enable_if::value, Status>::type NonNullArray( + size_t size, std::shared_ptr* out) { + std::vector values; + ::arrow::randint(size, 0, 64, &values); + for (size_t i = 0; i < size; i++) { + values[i] *= 86400000; + } + + // Passing data type so this will work with TimestampType too + ::arrow::NumericBuilder builder(std::make_shared(), + ::arrow::default_memory_pool()); + builder.AppendValues(values.data(), values.size()); + return builder.Finish(out); +} + +template +typename std::enable_if< + is_arrow_string::value || is_arrow_binary::value, Status>::type +NonNullArray(size_t size, std::shared_ptr* out) { + using BuilderType = typename ::arrow::TypeTraits::BuilderType; + BuilderType builder; + for (size_t i = 0; i < size; i++) { + RETURN_NOT_OK(builder.Append("test-string")); + } + return builder.Finish(out); +} + +template +typename std::enable_if::value, Status>::type +NonNullArray(size_t size, std::shared_ptr* out) { + using BuilderType = typename ::arrow::TypeTraits::BuilderType; + // set byte_width to the length of "fixed": 5 + // todo: find a way to generate test data with more diversity. 
+ BuilderType builder(::arrow::fixed_size_binary(5)); + for (size_t i = 0; i < size; i++) { + RETURN_NOT_OK(builder.Append("fixed")); + } + return builder.Finish(out); +} + +static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, + uint8_t* out) { + std::default_random_engine gen(seed); + std::uniform_int_distribution d(0, std::numeric_limits::max()); + const int32_t required_bytes = ::arrow::DecimalSize(precision); + constexpr int32_t byte_width = 16; + std::fill(out, out + byte_width * n, '\0'); + + for (int64_t i = 0; i < n; ++i, out += byte_width) { + std::generate(out, out + required_bytes, + [&d, &gen] { return static_cast(d(gen)); }); + + // sign extend if the sign bit is set for the last byte generated + // 0b10000000 == 0x80 == 128 + if ((out[required_bytes - 1] & '\x80') != 0) { + std::fill(out + required_bytes, out + byte_width, '\xFF'); + } + } +} + +template +typename std::enable_if< + std::is_same>::value, Status>::type +NonNullArray(size_t size, std::shared_ptr* out) { + constexpr int32_t kDecimalPrecision = precision; + constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale::scale; + + const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale); + ::arrow::Decimal128Builder builder(type); + const int32_t byte_width = + static_cast(*type).byte_width(); + + constexpr int32_t seed = 0; + + std::shared_ptr out_buf; + RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width, + &out_buf)); + random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data()); + + RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size)); + return builder.Finish(out); +} + +template +typename std::enable_if::value, Status>::type NonNullArray( + size_t size, std::shared_ptr* out) { + std::vector values; + ::arrow::randint(size, 0, 1, &values); + ::arrow::BooleanBuilder builder; + RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); + return builder.Finish(out); +} + +// This helper 
function only supports (size/2) nulls.
+//
+// Shared by every NullableArray overload below: fills `valid_bytes` with
+// `size` entries set to 1 and then clears entries 0, 2, 4, ... until
+// `num_nulls` of them are 0. The last cleared index is 2 * (num_nulls - 1),
+// so a request for more than (size + 1) / 2 nulls would write past the end of
+// the vector; that case is rejected with Status::Invalid instead.
+static inline Status MakeAlternatingValidBytes(size_t size, size_t num_nulls,
+                                               std::vector<uint8_t>* valid_bytes) {
+  if (num_nulls > (size + 1) / 2) {
+    return Status::Invalid(
+        "NullableArray: num_nulls exceeds the (size + 1) / 2 nulls this helper can place");
+  }
+  valid_bytes->assign(size, 1);
+  for (size_t i = 0; i < num_nulls; i++) {
+    (*valid_bytes)[i * 2] = 0;
+  }
+  return Status::OK();
+}
+
+// Floating point: `size` random reals between -1e10 and 1e10 generated from
+// `seed`, with every other slot (starting at 0) null until `num_nulls` nulls
+// have been placed.
+template <typename ArrowType>
+typename std::enable_if<is_arrow_float<ArrowType>::value, Status>::type NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
+  using c_type = typename ArrowType::c_type;
+  std::vector<c_type> values;
+  ::arrow::random_real(size, seed, static_cast<c_type>(-1e10), static_cast<c_type>(1e10),
+                       &values);
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  ::arrow::NumericBuilder<ArrowType> builder;
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls.
+//
+// Integers other than Date64: values come from randint(size, 0, 64); `seed`
+// is accepted for signature parity but not used.
+template <typename ArrowType>
+typename std::enable_if<
+    is_arrow_int<ArrowType>::value && !is_arrow_date<ArrowType>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+  ::arrow::randint(size, 0, 64, &values);
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// Date64: values from randint(size, 0, 64) multiplied by 86400000 (whole days
+// in milliseconds); `seed` is not used. Same null placement as above.
+template <typename ArrowType>
+typename std::enable_if<is_arrow_date<ArrowType>::value, Status>::type NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+  ::arrow::randint(size, 0, 64, &values);
+  for (size_t i = 0; i < size; i++) {
+    values[i] *= 86400000;
+  }
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  // The append status used to be dropped here; propagate it like every other
+  // overload does.
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet.
+//
+// String / binary: each non-null slot holds 10 random bytes generated with
+// seed + i, so slots differ from one another.
+template <typename ArrowType>
+typename std::enable_if<
+    is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, uint32_t seed,
+              std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder;
+
+  const int kBufferSize = 10;
+  uint8_t buffer[kBufferSize];
+  for (size_t i = 0; i < size; i++) {
+    if (!valid_bytes[i]) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
+      RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
+    }
+  }
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet,
+// same as NullableArray<String|Binary>(..)
+//
+// Fixed-size binary with byte_width 10: each non-null slot holds 10 random
+// bytes generated with seed + i.
+template <typename ArrowType>
+typename std::enable_if<is_arrow_fixed_size_binary<ArrowType>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, uint32_t seed,
+              std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  const int byte_width = 10;
+  BuilderType builder(::arrow::fixed_size_binary(byte_width));
+
+  const int kBufferSize = byte_width;
+  uint8_t buffer[kBufferSize];
+  for (size_t i = 0; i < size; i++) {
+    if (!valid_bytes[i]) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
+      RETURN_NOT_OK(builder.Append(buffer));
+    }
+  }
+  return builder.Finish(out);
+}
+
+// Decimal128: `size` values from random_decimals() using `seed`, typed as
+// decimal(precision, precision - 1). Same null placement as above.
+template <typename ArrowType, int32_t precision = ArrowType::precision>
+typename std::enable_if<
+    std::is_same<ArrowType, DecimalWithPrecisionAndScale<precision>>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, uint32_t seed,
+              std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  constexpr int32_t kDecimalPrecision = precision;
+  constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale<precision>::scale;
+  const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale);
+  const int32_t byte_width =
+      static_cast<const ::arrow::Decimal128Type&>(*type).byte_width();
+
+  std::shared_ptr<::arrow::Buffer> out_buf;
+  RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width,
+                                        &out_buf));
+
+  random_decimals(size, seed, precision, out_buf->mutable_data());
+
+  ::arrow::Decimal128Builder builder(type);
+  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet.
+//
+// Boolean: values come from randint(size, 0, 1); `seed` is not used.
+template <typename ArrowType>
+typename std::enable_if<is_arrow_bool<ArrowType>::value, Status>::type NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
+  std::vector<uint8_t> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+
+  ::arrow::randint(size, 0, 1, &values);
+  std::vector<uint8_t> valid_bytes;
+  RETURN_NOT_OK(MakeAlternatingValidBytes(size, num_nulls, &valid_bytes));
+
+  ::arrow::BooleanBuilder builder;
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+/// Wrap an Array into a ListArray by splitting it up into size lists.
+///
+/// This helper function only supports (size/2) nulls.
+Status MakeListArray(const std::shared_ptr& values, int64_t size, + int64_t null_count, bool nullable_values, + std::shared_ptr<::arrow::ListArray>* out) { + // We always include an empty list + int64_t non_null_entries = size - null_count - 1; + int64_t length_per_entry = values->length() / non_null_entries; + + auto offsets = AllocateBuffer(); + RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t))); + int32_t* offsets_ptr = reinterpret_cast(offsets->mutable_data()); + + auto null_bitmap = AllocateBuffer(); + int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size); + RETURN_NOT_OK(null_bitmap->Resize(bitmap_size)); + uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); + memset(null_bitmap_ptr, 0, bitmap_size); + + int32_t current_offset = 0; + for (int64_t i = 0; i < size; i++) { + offsets_ptr[i] = current_offset; + if (!(((i % 2) == 0) && ((i / 2) < null_count))) { + // Non-null list (list with index 1 is always empty). + ::arrow::BitUtil::SetBit(null_bitmap_ptr, i); + if (i != 1) { + current_offset += static_cast(length_per_entry); + } + } + } + offsets_ptr[size] = static_cast(values->length()); + + auto value_field = ::arrow::field("item", values->type(), nullable_values); + *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets, + values, null_bitmap, null_count); + + return Status::OK(); +} + +// Make an array containing only empty lists, with a null values array +Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { + // Allocate an offsets buffer containing only zeroes + std::shared_ptr offsets_buffer; + const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t); + RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes, + &offsets_buffer)); + memset(offsets_buffer->mutable_data(), 0, offsets_nbytes); + + auto value_field = + ::arrow::field("item", ::arrow::float64(), false /* nullable_values */); + auto list_type = ::arrow::list(value_field); + + std::vector> 
child_buffers = {nullptr /* null bitmap */, + nullptr /* values */}; + auto child_data = + ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers)); + + std::vector> buffers = {nullptr /* bitmap */, offsets_buffer}; + auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers)); + array_data->child_data.push_back(child_data); + + *out_array = ::arrow::MakeArray(array_data); + return Status::OK(); +} + +static inline std::shared_ptr<::arrow::Column> MakeColumn( + const std::string& name, const std::shared_ptr& array, bool nullable) { + auto field = ::arrow::field(name, array->type(), nullable); + return std::make_shared<::arrow::Column>(field, array); +} + +static inline std::shared_ptr<::arrow::Column> MakeColumn( + const std::string& name, const std::vector>& arrays, + bool nullable) { + auto field = ::arrow::field(name, arrays[0]->type(), nullable); + return std::make_shared<::arrow::Column>(field, arrays); +} + +std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr& values, + bool nullable) { + std::shared_ptr<::arrow::Column> column = MakeColumn("col", values, nullable); + std::vector> columns({column}); + std::vector> fields({column->field()}); + auto schema = std::make_shared<::arrow::Schema>(fields); + return ::arrow::Table::Make(schema, columns); +} + +template +void ExpectArray(T* expected, Array* result) { + auto p_array = static_cast<::arrow::PrimitiveArray*>(result); + for (int i = 0; i < result->length(); i++) { + EXPECT_EQ(expected[i], reinterpret_cast(p_array->values()->data())[i]); + } +} + +template +void ExpectArrayT(void* expected, Array* result) { + ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result); + for (int64_t i = 0; i < result->length(); i++) { + EXPECT_EQ(reinterpret_cast(expected)[i], + reinterpret_cast( + p_array->values()->data())[i]); + } +} + +template <> +void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) { + 
::arrow::BooleanBuilder builder; + ARROW_EXPECT_OK( + builder.AppendValues(reinterpret_cast(expected), result->length())); + + std::shared_ptr expected_array; + ARROW_EXPECT_OK(builder.Finish(&expected_array)); + EXPECT_TRUE(result->Equals(*expected_array)); +} + +} // namespace arrow + +} // namespace parquet diff --git a/r/R/inst/include/parquet/arrow/writer.h b/r/R/inst/include/parquet/arrow/writer.h new file mode 100644 index 00000000000..97ed0f7a0ae --- /dev/null +++ b/r/R/inst/include/parquet/arrow/writer.h @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_ARROW_WRITER_H +#define PARQUET_ARROW_WRITER_H + +#include +#include + +#include "parquet/platform.h" +#include "parquet/properties.h" +#include "parquet/types.h" + +#include "arrow/type.h" + +namespace arrow { + +class Array; +class ChunkedArray; +class MemoryPool; +class Status; +class Table; + +namespace io { + +class OutputStream; + +} // namespace io + +} // namespace arrow + +namespace parquet { + +class FileMetaData; +class ParquetFileWriter; + +namespace arrow { + +class PARQUET_EXPORT ArrowWriterProperties { + public: + class Builder { + public: + Builder() + : write_timestamps_as_int96_(false), + coerce_timestamps_enabled_(false), + coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), + truncated_timestamps_allowed_(false) {} + virtual ~Builder() {} + + Builder* disable_deprecated_int96_timestamps() { + write_timestamps_as_int96_ = false; + return this; + } + + Builder* enable_deprecated_int96_timestamps() { + write_timestamps_as_int96_ = true; + return this; + } + + Builder* coerce_timestamps(::arrow::TimeUnit::type unit) { + coerce_timestamps_enabled_ = true; + coerce_timestamps_unit_ = unit; + return this; + } + + Builder* allow_truncated_timestamps() { + truncated_timestamps_allowed_ = true; + return this; + } + + Builder* disallow_truncated_timestamps() { + truncated_timestamps_allowed_ = false; + return this; + } + + std::shared_ptr build() { + return std::shared_ptr(new ArrowWriterProperties( + write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, + truncated_timestamps_allowed_)); + } + + private: + bool write_timestamps_as_int96_; + + bool coerce_timestamps_enabled_; + ::arrow::TimeUnit::type coerce_timestamps_unit_; + bool truncated_timestamps_allowed_; + }; + + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } + + bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } + ::arrow::TimeUnit::type coerce_timestamps_unit() const { + return 
coerce_timestamps_unit_; + } + + bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } + + private: + explicit ArrowWriterProperties(bool write_nanos_as_int96, + bool coerce_timestamps_enabled, + ::arrow::TimeUnit::type coerce_timestamps_unit, + bool truncated_timestamps_allowed) + : write_timestamps_as_int96_(write_nanos_as_int96), + coerce_timestamps_enabled_(coerce_timestamps_enabled), + coerce_timestamps_unit_(coerce_timestamps_unit), + truncated_timestamps_allowed_(truncated_timestamps_allowed) {} + + const bool write_timestamps_as_int96_; + const bool coerce_timestamps_enabled_; + const ::arrow::TimeUnit::type coerce_timestamps_unit_; + const bool truncated_timestamps_allowed_; +}; + +std::shared_ptr PARQUET_EXPORT default_arrow_writer_properties(); + +/** + * Iterative API: + * Start a new RowGroup/Chunk with NewRowGroup + * Write column-by-column the whole column chunk + */ +class PARQUET_EXPORT FileWriter { + public: + FileWriter(::arrow::MemoryPool* pool, std::unique_ptr writer, + const std::shared_ptr<::arrow::Schema>& schema, + const std::shared_ptr& arrow_properties = + default_arrow_writer_properties()); + + static ::arrow::Status Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, + const std::shared_ptr<::arrow::io::OutputStream>& sink, + const std::shared_ptr& properties, + std::unique_ptr* writer); + + static ::arrow::Status Open( + const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, + const std::shared_ptr<::arrow::io::OutputStream>& sink, + const std::shared_ptr& properties, + const std::shared_ptr& arrow_properties, + std::unique_ptr* writer); + + /// \brief Write a Table to Parquet. 
+ ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size); + + ::arrow::Status NewRowGroup(int64_t chunk_size); + ::arrow::Status WriteColumnChunk(const ::arrow::Array& data); + + /// \brief Write ColumnChunk in row group using slice of a ChunkedArray + ::arrow::Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data, + const int64_t offset, const int64_t size); + ::arrow::Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data); + ::arrow::Status Close(); + + virtual ~FileWriter(); + + ::arrow::MemoryPool* memory_pool() const; + + const std::shared_ptr metadata() const; + + private: + class PARQUET_NO_EXPORT Impl; + std::unique_ptr impl_; + std::shared_ptr<::arrow::Schema> schema_; +}; + +/// \brief Write Parquet file metadata only to indicated Arrow OutputStream +PARQUET_EXPORT +::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +/** + * Write a Table to Parquet. + * + * The table shall only consist of columns of primitive type or of primitive lists. 
+ */ +::arrow::Status PARQUET_EXPORT WriteTable( + const ::arrow::Table& table, ::arrow::MemoryPool* pool, + const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size, + const std::shared_ptr& properties = default_writer_properties(), + const std::shared_ptr& arrow_properties = + default_arrow_writer_properties()); + +namespace internal { + +/** + * Timestamp conversion constants + */ +constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); + +template +inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { + int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; + (*impala_timestamp).value[2] = (uint32_t)julian_days; + + int64_t last_day_units = time % UnitPerDay; + int64_t* impala_last_day_nanos = reinterpret_cast(impala_timestamp); + *impala_last_day_nanos = last_day_units * NanosecondsPerUnit; +} + +constexpr int64_t kSecondsInNanos = INT64_C(1000000000); + +inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp(seconds, + impala_timestamp); +} + +constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); + +inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + milliseconds, impala_timestamp); +} + +constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); + +inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + microseconds, impala_timestamp); +} + +constexpr int64_t kNanosecondsInNanos = INT64_C(1); + +inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, + Int96* impala_timestamp) { + ArrowTimestampToImpalaTimestamp( + nanoseconds, impala_timestamp); +} + +} // namespace internal + +} // namespace arrow + +} // namespace parquet + +#endif // PARQUET_ARROW_WRITER_H diff --git 
a/r/R/inst/include/parquet/bloom_filter.h b/r/R/inst/include/parquet/bloom_filter.h new file mode 100644 index 00000000000..0285b8f9274 --- /dev/null +++ b/r/R/inst/include/parquet/bloom_filter.h @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_BLOOM_FILTER_H +#define PARQUET_BLOOM_FILTER_H + +#include +#include +#include + +#include "arrow/util/logging.h" +#include "parquet/hasher.h" +#include "parquet/platform.h" +#include "parquet/types.h" + +namespace arrow { + +class MemoryPool; + +} // namespace arrow + +namespace parquet { + +// A Bloom filter is a compact structure to indicate whether an item is not in a set or +// probably in a set. The Bloom filter usually consists of a bit set that represents a +// set of elements, a hash strategy and a Bloom filter algorithm. +class PARQUET_EXPORT BloomFilter { + public: + // Maximum Bloom filter size, it sets to HDFS default block size 128MB + // This value will be reconsidered when implementing Bloom filter producer. + static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024; + + /// Determine whether an element exist in set or not. + /// + /// @param hash the element to contain. 
+ /// @return false if value is definitely not in set, and true means PROBABLY + /// in set. + virtual bool FindHash(uint64_t hash) const = 0; + + /// Insert element to set represented by Bloom filter bitset. + /// @param hash the hash of value to insert into Bloom filter. + virtual void InsertHash(uint64_t hash) = 0; + + /// Write this Bloom filter to an output stream. A Bloom filter structure should + /// include bitset length, hash strategy, algorithm, and bitset. + /// + /// @param sink the output stream to write + virtual void WriteTo(ArrowOutputStream* sink) const = 0; + + /// Get the number of bytes of bitset + virtual uint32_t GetBitsetSize() const = 0; + + /// Compute hash for 32 bits value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(int32_t value) const = 0; + + /// Compute hash for 64 bits value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(int64_t value) const = 0; + + /// Compute hash for float value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(float value) const = 0; + + /// Compute hash for double value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(double value) const = 0; + + /// Compute hash for Int96 value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(const Int96* value) const = 0; + + /// Compute hash for ByteArray value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(const ByteArray* value) const = 0; + + /// Compute hash for fixed byte array value by using its plain encoding result. + /// + /// @param value the value address. 
+ /// @param len the value length. + /// @return hash result. + virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; + + virtual ~BloomFilter() {} + + protected: + // Hash strategy available for Bloom filter. + enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 }; + + // Bloom filter algorithm. + enum class Algorithm : uint32_t { BLOCK = 0 }; +}; + +// The BlockSplitBloomFilter is implemented using block-based Bloom filters from +// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to +// hash the item to a tiny Bloom filter which size fit a single cache line or smaller. +// +// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom +// filter is 32 bytes to take advantage of 32-byte SIMD instructions. +class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { + public: + /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function. + BlockSplitBloomFilter(); + + /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within + /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be + /// rounded up/down to lower/upper bound if num_bytes is out of range and also + /// will be rounded up to a power of 2. + /// + /// @param num_bytes The number of bytes to store Bloom filter bitset. + void Init(uint32_t num_bytes); + + /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying + /// bitset because the given bitset may not satisfy the 32-byte alignment requirement + /// which may lead to segfault when performing SIMD instructions. It is the caller's + /// responsibility to free the bitset passed in. This is used when reconstructing + /// a Bloom filter from a parquet file. + /// + /// @param bitset The given bitset to initialize the Bloom filter. + /// @param num_bytes The number of bytes of given bitset. 
+ void Init(const uint8_t* bitset, uint32_t num_bytes); + + // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter. + static constexpr uint32_t kMinimumBloomFilterBytes = 32; + + /// Calculate optimal size according to the number of distinct values and false + /// positive probability. + /// + /// @param ndv The number of distinct values. + /// @param fpp The false positive probability. + /// @return it always return a value between kMinimumBloomFilterBytes and + /// kMaximumBloomFilterBytes, and the return value is always a power of 2 + static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { + DCHECK(fpp > 0.0 && fpp < 1.0); + const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); + uint32_t num_bits; + + // Handle overflow. + if (m < 0 || m > kMaximumBloomFilterBytes << 3) { + num_bits = static_cast(kMaximumBloomFilterBytes << 3); + } else { + num_bits = static_cast(m); + } + + // Round up to lower bound + if (num_bits < kMinimumBloomFilterBytes << 3) { + num_bits = kMinimumBloomFilterBytes << 3; + } + + // Get next power of 2 if bits is not power of 2. 
+ if ((num_bits & (num_bits - 1)) != 0) { + num_bits = static_cast(::arrow::BitUtil::NextPower2(num_bits)); + } + + // Round down to upper bound + if (num_bits > kMaximumBloomFilterBytes << 3) { + num_bits = kMaximumBloomFilterBytes << 3; + } + + return num_bits; + } + + bool FindHash(uint64_t hash) const override; + void InsertHash(uint64_t hash) override; + void WriteTo(ArrowOutputStream* sink) const override; + uint32_t GetBitsetSize() const override { return num_bytes_; } + + uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); } + uint64_t Hash(float value) const override { return hasher_->Hash(value); } + uint64_t Hash(double value) const override { return hasher_->Hash(value); } + uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); } + uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); } + uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); } + uint64_t Hash(const FLBA* value, uint32_t len) const override { + return hasher_->Hash(value, len); + } + + /// Deserialize the Bloom filter from an input stream. It is used when reconstructing + /// a Bloom filter from a parquet filter. + /// + /// @param input_stream The input stream from which to construct the Bloom filter + /// @return The BlockSplitBloomFilter. + static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream); + + private: + // Bytes in a tiny Bloom filter block. + static constexpr int kBytesPerFilterBlock = 32; + + // The number of bits to be set in each tiny Bloom filter + static constexpr int kBitsSetPerBlock = 8; + + // A mask structure used to set bits in each tiny Bloom filter. + struct BlockMask { + uint32_t item[kBitsSetPerBlock]; + }; + + // The block-based algorithm needs eight odd SALT values to calculate eight indexes + // of bit to set, one bit in each 32-bit word. 
+ static constexpr uint32_t SALT[kBitsSetPerBlock] = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, + 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U}; + + /// Set bits in mask array according to input key. + /// @param key the value to calculate mask values. + /// @param mask the mask array is used to set inside a block + void SetMask(uint32_t key, BlockMask& mask) const; + + // Memory pool to allocate aligned buffer for bitset + ::arrow::MemoryPool* pool_; + + // The underlying buffer of bitset. + std::shared_ptr data_; + + // The number of bytes of Bloom filter bitset. + uint32_t num_bytes_; + + // Hash strategy used in this Bloom filter. + HashStrategy hash_strategy_; + + // Algorithm used in this Bloom filter. + Algorithm algorithm_; + + // The hash pointer points to actual hash class used. + std::unique_ptr hasher_; +}; + +} // namespace parquet + +#endif // PARQUET_BLOOM_FILTER_H diff --git a/r/R/inst/include/parquet/column_page.h b/r/R/inst/include/parquet/column_page.h new file mode 100644 index 00000000000..66a5bf332de --- /dev/null +++ b/r/R/inst/include/parquet/column_page.h @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#ifndef PARQUET_COLUMN_PAGE_H +#define PARQUET_COLUMN_PAGE_H + +#include +#include +#include + +#include "parquet/statistics.h" +#include "parquet/types.h" + +namespace parquet { + +// TODO: Parallel processing is not yet safe because of memory-ownership +// semantics (the PageReader may or may not own the memory referenced by a +// page) +// +// TODO(wesm): In the future Parquet implementations may store the crc code +// in format::PageHeader. parquet-mr currently does not, so we also skip it +// here, both on the read and write path +class Page { + public: + Page(const std::shared_ptr& buffer, PageType::type type) + : buffer_(buffer), type_(type) {} + + PageType::type type() const { return type_; } + + std::shared_ptr buffer() const { return buffer_; } + + // @returns: a pointer to the page's data + const uint8_t* data() const { return buffer_->data(); } + + // @returns: the total size in bytes of the page's data buffer + int32_t size() const { return static_cast(buffer_->size()); } + + private: + std::shared_ptr buffer_; + PageType::type type_; +}; + +/// \brief Base type for DataPageV1 and DataPageV2 including common attributes +class DataPage : public Page { + public: + int32_t num_values() const { return num_values_; } + Encoding::type encoding() const { return encoding_; } + const EncodedStatistics& statistics() const { return statistics_; } + + protected: + DataPage(PageType::type type, const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, + const EncodedStatistics& statistics = EncodedStatistics()) + : Page(buffer, type), + num_values_(num_values), + encoding_(encoding), + statistics_(statistics) {} + + int32_t num_values_; + Encoding::type encoding_; + EncodedStatistics statistics_; +}; + +class DataPageV1 : 
public DataPage { + public: + DataPageV1(const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, Encoding::type definition_level_encoding, + Encoding::type repetition_level_encoding, + const EncodedStatistics& statistics = EncodedStatistics()) + : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, statistics), + definition_level_encoding_(definition_level_encoding), + repetition_level_encoding_(repetition_level_encoding) {} + + Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } + + Encoding::type definition_level_encoding() const { return definition_level_encoding_; } + + private: + Encoding::type definition_level_encoding_; + Encoding::type repetition_level_encoding_; +}; + +class CompressedDataPage : public DataPageV1 { + public: + CompressedDataPage(const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, Encoding::type definition_level_encoding, + Encoding::type repetition_level_encoding, int64_t uncompressed_size, + const EncodedStatistics& statistics = EncodedStatistics()) + : DataPageV1(buffer, num_values, encoding, definition_level_encoding, + repetition_level_encoding, statistics), + uncompressed_size_(uncompressed_size) {} + + int64_t uncompressed_size() const { return uncompressed_size_; } + + private: + int64_t uncompressed_size_; +}; + +class DataPageV2 : public DataPage { + public: + DataPageV2(const std::shared_ptr& buffer, int32_t num_values, int32_t num_nulls, + int32_t num_rows, Encoding::type encoding, + int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, + bool is_compressed = false) + : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding), + num_nulls_(num_nulls), + num_rows_(num_rows), + definition_levels_byte_length_(definition_levels_byte_length), + repetition_levels_byte_length_(repetition_levels_byte_length), + is_compressed_(is_compressed) {} + + int32_t num_nulls() const { return num_nulls_; } + + int32_t 
num_rows() const { return num_rows_; } + + int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } + + int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } + + bool is_compressed() const { return is_compressed_; } + + private: + int32_t num_nulls_; + int32_t num_rows_; + int32_t definition_levels_byte_length_; + int32_t repetition_levels_byte_length_; + bool is_compressed_; + + // TODO(wesm): format::DataPageHeaderV2.statistics +}; + +class DictionaryPage : public Page { + public: + DictionaryPage(const std::shared_ptr& buffer, int32_t num_values, + Encoding::type encoding, bool is_sorted = false) + : Page(buffer, PageType::DICTIONARY_PAGE), + num_values_(num_values), + encoding_(encoding), + is_sorted_(is_sorted) {} + + int32_t num_values() const { return num_values_; } + + Encoding::type encoding() const { return encoding_; } + + bool is_sorted() const { return is_sorted_; } + + private: + int32_t num_values_; + Encoding::type encoding_; + bool is_sorted_; +}; + +} // namespace parquet + +#endif // PARQUET_COLUMN_PAGE_H diff --git a/r/R/inst/include/parquet/column_reader.h b/r/R/inst/include/parquet/column_reader.h new file mode 100644 index 00000000000..e7d6afbb467 --- /dev/null +++ b/r/R/inst/include/parquet/column_reader.h @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace arrow { + +namespace BitUtil { +class BitReader; +} // namespace BitUtil + +namespace util { +class RleDecoder; +} // namespace util + +} // namespace arrow + +namespace parquet { + +class DictionaryPage; +class Page; + +// 16 MB is the default maximum page header size +static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; + +// 16 KB is the default expected page header size +static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; + +class PARQUET_EXPORT LevelDecoder { + public: + LevelDecoder(); + ~LevelDecoder(); + + // Initialize the LevelDecoder state with new data + // and return the number of bytes consumed + int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, + const uint8_t* data); + + // Decodes a batch of levels into an array and returns the number of levels decoded + int Decode(int batch_size, int16_t* levels); + + private: + int bit_width_; + int num_values_remaining_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; + std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; +}; + +// Abstract page iterator interface. 
This way, we can feed column pages to the +// ColumnReader through whatever mechanism we choose +class PARQUET_EXPORT PageReader { + public: + virtual ~PageReader() = default; + + static std::unique_ptr Open( + const std::shared_ptr& stream, int64_t total_num_rows, + Compression::type codec, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr + // containing new Page otherwise + virtual std::shared_ptr NextPage() = 0; + + virtual void set_max_page_header_size(uint32_t size) = 0; +}; + +class PARQUET_EXPORT ColumnReader { + public: + virtual ~ColumnReader() = default; + + static std::shared_ptr Make( + const ColumnDescriptor* descr, std::unique_ptr pager, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + // Returns true if there are still values in this column. + virtual bool HasNext() = 0; + + virtual Type::type type() const = 0; + + virtual const ColumnDescriptor* descr() const = 0; +}; + +// API to read values from a single column. This is a main client facing API. +template +class TypedColumnReader : public ColumnReader { + public: + typedef typename DType::c_type T; + + // Read a batch of repetition levels, definition levels, and values from the + // column. + // + // Since null values are not stored in the values, the number of values read + // may be less than the number of repetition and definition levels. With + // nested data this is almost certainly true. + // + // Set def_levels or rep_levels to nullptr if you want to skip reading them. + // This is only safe if you know through some other source that there are no + // undefined values. + // + // To fully exhaust a row group, you must read batches until the number of + // values read reaches the number of stored values according to the metadata. 
+ // + // This API is the same for both V1 and V2 of the DataPage + // + // @returns: actual number of levels read (see values_read for number of values read) + virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, + T* values, int64_t* values_read) = 0; + + /// Read a batch of repetition levels, definition levels, and values from the + /// column and leave spaces for null entries on the lowest level in the values + /// buffer. + /// + /// In comparision to ReadBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). + /// + /// To fully exhaust a row group, you must read batches until the number of + /// values read reaches the number of stored values according to the metadata. + /// + /// @param batch_size the number of levels to read + /// @param[out] def_levels The Parquet definition levels, output has + /// the length levels_read. + /// @param[out] rep_levels The Parquet repetition levels, output has + /// the length levels_read. + /// @param[out] values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; output has the length + /// values_read. + /// @param[out] valid_bits Memory allocated for a bitmap that indicates if + /// the row is null or on the maximum definition level. For performance + /// reasons the underlying buffer should be able to store 1 bit more than + /// required. If this requires an additional byte, this byte is only read + /// but never written to. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param[out] levels_read The number of repetition/definition levels that were read. 
+ /// @param[out] values_read The number of values read, this includes all + /// non-null entries as well as all null-entries on the lowest level + /// (i.e. definition_level == max_definition_level - 1) + /// @param[out] null_count The number of nulls on the lowest levels. + /// (i.e. (values_read - null_count) is total number of non-null entries) + virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, + int16_t* rep_levels, T* values, uint8_t* valid_bits, + int64_t valid_bits_offset, int64_t* levels_read, + int64_t* values_read, int64_t* null_count) = 0; + + // Skip reading levels + // Returns the number of levels skipped + virtual int64_t Skip(int64_t num_rows_to_skip) = 0; +}; + +namespace internal { + +static inline void DefinitionLevelsToBitmap( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, + valid_bits_offset + num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. 
+ // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { + if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); + } else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + continue; + } + } else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + throw ParquetException("definition level exceeds maximum"); + } + } + + valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} + +} // namespace internal + +namespace internal { + +// TODO(itaiin): another code path split to merge when the general case is done +static inline bool HasSpacedValues(const ColumnDescriptor* descr) { + if (descr->max_repetition_level() > 0) { + // repeated+flat case + return !descr->schema_node()->is_required(); + } else { + // non-repeated+nested case + // Find if a node forces nulls in the lowest level along the hierarchy + const schema::Node* node = descr->schema_node().get(); + while (node) { + if (node->is_optional()) { + return true; + } + node = node->parent(); + } + return false; + } +} + +} // namespace internal + +using BoolReader = TypedColumnReader; +using Int32Reader = TypedColumnReader; +using Int64Reader = TypedColumnReader; +using Int96Reader = TypedColumnReader; +using FloatReader = TypedColumnReader; +using DoubleReader = TypedColumnReader; +using ByteArrayReader = TypedColumnReader; +using FixedLenByteArrayReader = TypedColumnReader; + +} // namespace parquet diff --git a/r/R/inst/include/parquet/column_scanner.h b/r/R/inst/include/parquet/column_scanner.h new file mode 100644 index 00000000000..9f65d1866b9 --- /dev/null +++ b/r/R/inst/include/parquet/column_scanner.h @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_COLUMN_SCANNER_H +#define PARQUET_COLUMN_SCANNER_H + +#include +#include +#include +#include +#include +#include + +#include "parquet/column_reader.h" +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace parquet { + +static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128; + +class PARQUET_EXPORT Scanner { + public: + explicit Scanner(std::shared_ptr reader, + int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : batch_size_(batch_size), + level_offset_(0), + levels_buffered_(0), + value_buffer_(AllocateBuffer(pool)), + value_offset_(0), + values_buffered_(0), + reader_(reader) { + def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0); + rep_levels_.resize(descr()->max_repetition_level() > 0 ? 
batch_size_ : 0); + } + + virtual ~Scanner() {} + + static std::shared_ptr Make( + std::shared_ptr col_reader, + int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0; + + bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); } + + const ColumnDescriptor* descr() const { return reader_->descr(); } + + int64_t batch_size() const { return batch_size_; } + + void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; } + + protected: + int64_t batch_size_; + + std::vector def_levels_; + std::vector rep_levels_; + int level_offset_; + int levels_buffered_; + + std::shared_ptr value_buffer_; + int value_offset_; + int64_t values_buffered_; + + private: + std::shared_ptr reader_; +}; + +template +class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner { + public: + typedef typename DType::c_type T; + + explicit TypedScanner(std::shared_ptr reader, + int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : Scanner(reader, batch_size, pool) { + typed_reader_ = static_cast*>(reader.get()); + int value_byte_size = type_traits::value_byte_size; + PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size)); + values_ = reinterpret_cast(value_buffer_->mutable_data()); + } + + virtual ~TypedScanner() {} + + bool NextLevels(int16_t* def_level, int16_t* rep_level) { + if (level_offset_ == levels_buffered_) { + levels_buffered_ = static_cast( + typed_reader_->ReadBatch(static_cast(batch_size_), def_levels_.data(), + rep_levels_.data(), values_, &values_buffered_)); + + value_offset_ = 0; + level_offset_ = 0; + if (!levels_buffered_) { + return false; + } + } + *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0; + *rep_level = descr()->max_repetition_level() > 0 ? 
rep_levels_[level_offset_] : 0; + level_offset_++; + return true; + } + + bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) { + if (level_offset_ == levels_buffered_) { + if (!HasNext()) { + // Out of data pages + return false; + } + } + + NextLevels(def_level, rep_level); + *is_null = *def_level < descr()->max_definition_level(); + + if (*is_null) { + return true; + } + + if (value_offset_ == values_buffered_) { + throw ParquetException("Value was non-null, but has not been buffered"); + } + *val = values_[value_offset_++]; + return true; + } + + // Returns true if there is a next value + bool NextValue(T* val, bool* is_null) { + if (level_offset_ == levels_buffered_) { + if (!HasNext()) { + // Out of data pages + return false; + } + } + + // Out of values + int16_t def_level = -1; + int16_t rep_level = -1; + NextLevels(&def_level, &rep_level); + *is_null = def_level < descr()->max_definition_level(); + + if (*is_null) { + return true; + } + + if (value_offset_ == values_buffered_) { + throw ParquetException("Value was non-null, but has not been buffered"); + } + *val = values_[value_offset_++]; + return true; + } + + virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) { + T val; + int16_t def_level = -1; + int16_t rep_level = -1; + bool is_null = false; + char buffer[80]; + + if (!Next(&val, &def_level, &rep_level, &is_null)) { + throw ParquetException("No more values buffered"); + } + + if (with_levels) { + out << " D:" << def_level << " R:" << rep_level << " "; + if (!is_null) { + out << "V:"; + } + } + + if (is_null) { + std::string null_fmt = format_fwf(width); + snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL"); + } else { + FormatValue(&val, buffer, sizeof(buffer), width); + } + out << buffer; + } + + private: + // The ownership of this object is expressed through the reader_ variable in the base + TypedColumnReader* typed_reader_; + + inline void FormatValue(void* val, char* buffer, int bufsize, 
int width); + + T* values_; +}; + +template +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast(val)); +} + +template <> +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + std::string result = Int96ToString(*reinterpret_cast(val)); + snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); +} + +template <> +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + std::string result = ByteArrayToString(*reinterpret_cast(val)); + snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); +} + +template <> +inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, + int width) { + std::string fmt = format_fwf(width); + std::string result = FixedLenByteArrayToString( + *reinterpret_cast(val), descr()->type_length()); + snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); +} + +typedef TypedScanner BoolScanner; +typedef TypedScanner Int32Scanner; +typedef TypedScanner Int64Scanner; +typedef TypedScanner Int96Scanner; +typedef TypedScanner FloatScanner; +typedef TypedScanner DoubleScanner; +typedef TypedScanner ByteArrayScanner; +typedef TypedScanner FixedLenByteArrayScanner; + +template +int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels, + uint8_t* values, int64_t* values_buffered, + parquet::ColumnReader* reader) { + typedef typename RType::T Type; + auto typed_reader = static_cast(reader); + auto vals = reinterpret_cast(&values[0]); + return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals, + values_buffered); +} + +int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels, + int16_t* rep_levels, uint8_t* values, + int64_t* values_buffered, + parquet::ColumnReader* reader); + +} // namespace parquet 
+ +#endif // PARQUET_COLUMN_SCANNER_H diff --git a/r/R/inst/include/parquet/column_writer.h b/r/R/inst/include/parquet/column_writer.h new file mode 100644 index 00000000000..023b96585eb --- /dev/null +++ b/r/R/inst/include/parquet/column_writer.h @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/memory_pool.h" + +#include "parquet/column_page.h" +#include "parquet/encoding.h" +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace arrow { + +namespace BitUtil { +class BitWriter; +} // namespace BitUtil + +namespace util { +class RleEncoder; +} // namespace util + +} // namespace arrow + +namespace parquet { + +class ColumnChunkMetaDataBuilder; +class WriterProperties; + +class PARQUET_EXPORT LevelEncoder { + public: + LevelEncoder(); + ~LevelEncoder(); + + static int MaxBufferSize(Encoding::type encoding, int16_t max_level, + int num_buffered_values); + + // Initialize the LevelEncoder. 
+ void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values, + uint8_t* data, int data_size); + + // Encodes a batch of levels from an array and returns the number of levels encoded + int Encode(int batch_size, const int16_t* levels); + + int32_t len() { + if (encoding_ != Encoding::RLE) { + throw ParquetException("Only implemented for RLE encoding"); + } + return rle_length_; + } + + private: + int bit_width_; + int rle_length_; + Encoding::type encoding_; + std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_; + std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_; +}; + +class PARQUET_EXPORT PageWriter { + public: + virtual ~PageWriter() {} + + static std::unique_ptr Open( + const std::shared_ptr& sink, Compression::type codec, + ColumnChunkMetaDataBuilder* metadata, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + bool buffered_row_group = false); + + // The Column Writer decides if dictionary encoding is used if set and + // if the dictionary encoding has fallen back to default encoding on reaching dictionary + // page limit + virtual void Close(bool has_dictionary, bool fallback) = 0; + + virtual int64_t WriteDataPage(const CompressedDataPage& page) = 0; + + virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0; + + virtual bool has_compressor() = 0; + + virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0; +}; + +static constexpr int WRITE_BATCH_SIZE = 1000; +class PARQUET_EXPORT ColumnWriter { + public: + virtual ~ColumnWriter() = default; + + static std::shared_ptr Make(ColumnChunkMetaDataBuilder*, + std::unique_ptr, + const WriterProperties* properties); + + /// \brief Closes the ColumnWriter, commits any buffered values to pages. 
+ /// \return Total size of the column in bytes + virtual int64_t Close() = 0; + + /// \brief The physical Parquet type of the column + virtual Type::type type() const = 0; + + /// \brief The schema for the column + virtual const ColumnDescriptor* descr() const = 0; + + /// \brief The number of rows written so far + virtual int64_t rows_written() const = 0; + + /// \brief The total size of the compressed pages + page headers. Some values + /// might be still buffered an not written to a page yet + virtual int64_t total_compressed_bytes() const = 0; + + /// \brief The total number of bytes written as serialized data and + /// dictionary pages to the ColumnChunk so far + virtual int64_t total_bytes_written() const = 0; + + /// \brief The file-level writer properties + virtual const WriterProperties* properties() = 0; +}; + +// API to write values to a single column. This is the main client facing API. +template +class TypedColumnWriter : public ColumnWriter { + public: + using T = typename DType::c_type; + + // Write a batch of repetition levels, definition levels, and values to the + // column. + virtual void WriteBatch(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels, const T* values) = 0; + + /// Write a batch of repetition levels, definition levels, and values to the + /// column. + /// + /// In comparision to WriteBatch the length of repetition and definition levels + /// is the same as of the number of values read for max_definition_level == 1. + /// In the case of max_definition_level > 1, the repetition and definition + /// levels are larger than the values but the values include the null entries + /// with definition_level == (max_definition_level - 1). Thus we have to differentiate + /// in the parameters of this function if the input has the length of num_values or the + /// _number of rows in the lowest nesting level_. 
+ /// + /// In the case that the most inner node in the Parquet is required, the _number of rows + /// in the lowest nesting level_ is equal to the number of non-null values. If the + /// inner-most schema node is optional, the _number of rows in the lowest nesting level_ + /// also includes all values with definition_level == (max_definition_level - 1). + /// + /// @param num_values number of levels to write. + /// @param def_levels The Parquet definition levels, length is num_values + /// @param rep_levels The Parquet repetition levels, length is num_values + /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting + /// level. The length is number of rows in the lowest nesting level. + /// @param valid_bits_offset The offset in bits of the valid_bits where the + /// first relevant bit resides. + /// @param values The values in the lowest nested level including + /// spacing for nulls on the lowest levels; input has the length + /// of the number of rows on the lowest nesting level.
+ virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels, const uint8_t* valid_bits, + int64_t valid_bits_offset, const T* values) = 0; + + // Estimated size of the values that are not written to a page yet + virtual int64_t EstimatedBufferedValueBytes() const = 0; +}; + +using BoolWriter = TypedColumnWriter; +using Int32Writer = TypedColumnWriter; +using Int64Writer = TypedColumnWriter; +using Int96Writer = TypedColumnWriter; +using FloatWriter = TypedColumnWriter; +using DoubleWriter = TypedColumnWriter; +using ByteArrayWriter = TypedColumnWriter; +using FixedLenByteArrayWriter = TypedColumnWriter; + +} // namespace parquet diff --git a/r/R/inst/include/parquet/deprecated_io.h b/r/R/inst/include/parquet/deprecated_io.h new file mode 100644 index 00000000000..8dfdeda5d24 --- /dev/null +++ b/r/R/inst/include/parquet/deprecated_io.h @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// DEPRECATED IO INTERFACES: We have transitioned to using the Apache +// Arrow file input and output abstract interfaces defined in +// arrow/io/interfaces.h. 
These legacy interfaces are being preserved +// through a wrapper layer for one to two releases + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/memory_pool.h" + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/types.h" + +namespace parquet { + +class PARQUET_EXPORT FileInterface { + public: + virtual ~FileInterface() = default; + + // Close the file + virtual void Close() = 0; + + // Return the current position in the file relative to the start + virtual int64_t Tell() = 0; +}; + +/// It is the responsibility of implementations to mind threadsafety of shared +/// resources +class PARQUET_EXPORT RandomAccessSource : virtual public FileInterface { + public: + virtual ~RandomAccessSource() = default; + + virtual int64_t Size() const = 0; + + // Returns bytes read + virtual int64_t Read(int64_t nbytes, uint8_t* out) = 0; + + virtual std::shared_ptr Read(int64_t nbytes) = 0; + + virtual std::shared_ptr ReadAt(int64_t position, int64_t nbytes) = 0; + + /// Returns bytes read + virtual int64_t ReadAt(int64_t position, int64_t nbytes, uint8_t* out) = 0; +}; + +class PARQUET_EXPORT OutputStream : virtual public FileInterface { + public: + virtual ~OutputStream() = default; + + // Copy bytes into the output stream + virtual void Write(const uint8_t* data, int64_t length) = 0; +}; + +// ---------------------------------------------------------------------- +// Wrapper classes + +class PARQUET_EXPORT ParquetInputWrapper : public ::arrow::io::RandomAccessFile { + public: + explicit ParquetInputWrapper(std::unique_ptr source); + explicit ParquetInputWrapper(RandomAccessSource* source); + + ~ParquetInputWrapper() override; + + // FileInterface + ::arrow::Status Close() override; + ::arrow::Status Tell(int64_t* position) const override; + bool closed() const override; + + // Seekable + ::arrow::Status 
Seek(int64_t position) override; + + // InputStream / RandomAccessFile + ::arrow::Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; + ::arrow::Status Read(int64_t nbytes, std::shared_ptr* out) override; + ::arrow::Status ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) override; + ::arrow::Status GetSize(int64_t* size) override; + + private: + std::unique_ptr owned_source_; + RandomAccessSource* source_; + bool closed_; +}; + +class PARQUET_EXPORT ParquetOutputWrapper : public ::arrow::io::OutputStream { + public: + explicit ParquetOutputWrapper(const std::shared_ptr<::parquet::OutputStream>& sink); + explicit ParquetOutputWrapper(std::unique_ptr<::parquet::OutputStream> sink); + explicit ParquetOutputWrapper(::parquet::OutputStream* sink); + + ~ParquetOutputWrapper() override; + + // FileInterface + ::arrow::Status Close() override; + ::arrow::Status Tell(int64_t* position) const override; + bool closed() const override; + + // Writable + ::arrow::Status Write(const void* data, int64_t nbytes) override; + + private: + std::unique_ptr<::parquet::OutputStream> owned_sink_; + std::shared_ptr<::parquet::OutputStream> shared_sink_; + ::parquet::OutputStream* sink_; + bool closed_; +}; + +} // namespace parquet diff --git a/r/R/inst/include/parquet/encoding.h b/r/R/inst/include/parquet/encoding.h new file mode 100644 index 00000000000..28a9b98716f --- /dev/null +++ b/r/R/inst/include/parquet/encoding.h @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/types.h" + +namespace parquet { + +class ColumnDescriptor; + +// Untyped base for all encoders +class Encoder { + public: + virtual ~Encoder() = default; + + virtual int64_t EstimatedDataEncodedSize() = 0; + virtual std::shared_ptr FlushValues() = 0; + virtual Encoding::type encoding() const = 0; + + virtual ::arrow::MemoryPool* memory_pool() const = 0; +}; + +// Base class for value encoders. Since encoders may or not have state (e.g., +// dictionary encoding) we use a class instance to maintain any state. 
+// +// TODO(wesm): Encode interface API is temporary +template +class TypedEncoder : virtual public Encoder { + public: + typedef typename DType::c_type T; + + virtual void Put(const T* src, int num_values) = 0; + + virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( + this->memory_pool(), num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } +}; + +// Base class for dictionary encoders +template +class DictEncoder : virtual public TypedEncoder { + public: + /// Writes out any buffered indices to buffer preceded by the bit width of this data. + /// Returns the number of bytes written. + /// If the supplied buffer is not big enough, returns -1. + /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize() + /// to size buffer. + virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0; + + virtual int dict_encoded_size() = 0; + // virtual int dict_encoded_size() { return dict_encoded_size_; } + + virtual int bit_width() const = 0; + + /// Writes out the encoded dictionary to buffer. buffer must be preallocated to + /// dict_encoded_size() bytes. + virtual void WriteDict(uint8_t* buffer) = 0; + + virtual int num_entries() const = 0; +}; + +// ---------------------------------------------------------------------- +// Value decoding + +class Decoder { + public: + virtual ~Decoder() = default; + + // Sets the data for a new page. This will be called multiple times on the same + // decoder and should reset all internal state. 
+ virtual void SetData(int num_values, const uint8_t* data, int len) = 0; + + // Returns the number of values left (for the last call to SetData()). This is + // the number of values left in this page. + virtual int values_left() const = 0; + virtual Encoding::type encoding() const = 0; +}; + +template +class TypedDecoder : virtual public Decoder { + public: + using T = typename DType::c_type; + + // Subclasses should override the ones they support. In each of these functions, + // the decoder would decode put to 'max_values', storing the result in 'buffer'. + // The function returns the number of values decoded, which should be max_values + // except for end of the current data page. + virtual int Decode(T* buffer, int max_values) = 0; + + // Decode the values in this data page but leave spaces for null entries. + // + // num_values is the size of the def_levels and buffer arrays including the number of + // null values. + virtual int DecodeSpaced(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + int values_to_read = num_values - null_count; + int values_read = Decode(buffer, values_to_read); + if (values_read != values_to_read) { + throw ParquetException("Number of values / definition_levels read did not match"); + } + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + values_read), 0, + (num_values - values_read) * sizeof(T)); + + // Add spacing for null entries. As we have filled the buffer from the front, + // we need to add the spacing from the back. 
+ int values_to_move = values_read; + for (int i = num_values - 1; i >= 0; i--) { + if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = buffer[--values_to_move]; + } + } + return num_values; + } +}; + +template +class DictDecoder : virtual public TypedDecoder { + public: + virtual void SetDict(TypedDecoder* dictionary) = 0; +}; + +// ---------------------------------------------------------------------- +// TypedEncoder specializations, traits, and factory functions + +class BooleanEncoder : virtual public TypedEncoder { + public: + using TypedEncoder::Put; + virtual void Put(const std::vector& src, int num_values) = 0; +}; + +using Int32Encoder = TypedEncoder; +using Int64Encoder = TypedEncoder; +using Int96Encoder = TypedEncoder; +using FloatEncoder = TypedEncoder; +using DoubleEncoder = TypedEncoder; +class ByteArrayEncoder : virtual public TypedEncoder {}; +class FLBAEncoder : virtual public TypedEncoder {}; + +class BooleanDecoder : virtual public TypedDecoder { + public: + using TypedDecoder::Decode; + virtual int Decode(uint8_t* buffer, int max_values) = 0; +}; + +using Int32Decoder = TypedDecoder; +using Int64Decoder = TypedDecoder; +using Int96Decoder = TypedDecoder; +using FloatDecoder = TypedDecoder; +using DoubleDecoder = TypedDecoder; + +class ByteArrayDecoder : virtual public TypedDecoder { + public: + using TypedDecoder::DecodeSpaced; + + class WrappedBuilderInterface { + public: + virtual void Reserve(int64_t values) = 0; + virtual void Append(const uint8_t* value, uint32_t length) = 0; + virtual void AppendNull() = 0; + virtual ~WrappedBuilderInterface() = default; + }; + + template + class WrappedBuilder : public WrappedBuilderInterface { + public: + explicit WrappedBuilder(Builder* builder) : builder_(builder) {} + + void Reserve(int64_t values) override { + PARQUET_THROW_NOT_OK(builder_->Reserve(values)); + } + void Append(const uint8_t* value, uint32_t length) override { + PARQUET_THROW_NOT_OK(builder_->Append(value, 
length)); + } + + void AppendNull() override { PARQUET_THROW_NOT_OK(builder_->AppendNull()); } + + private: + Builder* builder_; + }; + + template + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, Builder* builder) { + int result = 0; + WrappedBuilder wrapped_builder(builder); + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, &wrapped_builder, &result)); + return result; + } + + template + int DecodeArrowNonNull(int num_values, Builder* builder) { + int result = 0; + WrappedBuilder wrapped_builder(builder); + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, &wrapped_builder, &result)); + return result; + } + + private: + virtual ::arrow::Status DecodeArrow(int num_values, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, + WrappedBuilderInterface* builder, + int* values_decoded) = 0; + + virtual ::arrow::Status DecodeArrowNonNull(int num_values, + WrappedBuilderInterface* builder, + int* values_decoded) = 0; +}; + +class FLBADecoder : virtual public TypedDecoder { + public: + using TypedDecoder::DecodeSpaced; + + // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if + // there is value in adding specialized read methods for + // FIXED_LEN_BYTE_ARRAY. 
If only Decimal data can occur with this data type + // then perhaps not +}; + +template +struct EncodingTraits {}; + +template <> +struct EncodingTraits { + using Encoder = BooleanEncoder; + using Decoder = BooleanDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int32Encoder; + using Decoder = Int32Decoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int64Encoder; + using Decoder = Int64Decoder; +}; + +template <> +struct EncodingTraits { + using Encoder = Int96Encoder; + using Decoder = Int96Decoder; +}; + +template <> +struct EncodingTraits { + using Encoder = FloatEncoder; + using Decoder = FloatDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = DoubleEncoder; + using Decoder = DoubleDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = ByteArrayEncoder; + using Decoder = ByteArrayDecoder; +}; + +template <> +struct EncodingTraits { + using Encoder = FLBAEncoder; + using Decoder = FLBADecoder; +}; + +PARQUET_EXPORT +std::unique_ptr MakeEncoder( + Type::type type_num, Encoding::type encoding, bool use_dictionary = false, + const ColumnDescriptor* descr = NULLPTR, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + +template +std::unique_ptr::Encoder> MakeTypedEncoder( + Encoding::type encoding, bool use_dictionary = false, + const ColumnDescriptor* descr = NULLPTR, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + using OutType = typename EncodingTraits::Encoder; + std::unique_ptr base = + MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool); + return std::unique_ptr(dynamic_cast(base.release())); +} + +PARQUET_EXPORT +std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, + const ColumnDescriptor* descr = NULLPTR); + +namespace detail { + +PARQUET_EXPORT +std::unique_ptr MakeDictDecoder(Type::type type_num, + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool); + +} // namespace detail + +template +std::unique_ptr> 
MakeDictDecoder( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + using OutType = DictDecoder; + auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); + return std::unique_ptr(dynamic_cast(decoder.release())); +} + +template +std::unique_ptr::Decoder> MakeTypedDecoder( + Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) { + using OutType = typename EncodingTraits::Decoder; + std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr); + return std::unique_ptr(dynamic_cast(base.release())); +} + +} // namespace parquet diff --git a/r/R/inst/include/parquet/encryption_internal.h b/r/R/inst/include/parquet/encryption_internal.h new file mode 100644 index 00000000000..af668dc4136 --- /dev/null +++ b/r/R/inst/include/parquet/encryption_internal.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_ENCRYPTION_INTERNAL_H +#define PARQUET_ENCRYPTION_INTERNAL_H + +#include +#include +#include + +#include "parquet/properties.h" +#include "parquet/types.h" + +using parquet::ParquetCipher; + +namespace parquet { +namespace encryption { + +constexpr int kGcmTagLength = 16; +constexpr int kNonceLength = 12; + +// Module types +constexpr int8_t kFooter = 0; +constexpr int8_t kColumnMetaData = 1; +constexpr int8_t kDataPage = 2; +constexpr int8_t kDictionaryPage = 3; +constexpr int8_t kDataPageHeader = 4; +constexpr int8_t kDictionaryPageHeader = 5; +constexpr int8_t kColumnIndex = 6; +constexpr int8_t kOffsetIndex = 7; + +/// Performs AES encryption operations with GCM or CTR ciphers. +class AesEncryptor { + public: + static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::shared_ptr> all_encryptors); + + ~AesEncryptor(); + + /// Size difference between plaintext and ciphertext, for this cipher. + int CiphertextSizeDelta(); + + /// Encrypts plaintext with the key and aad. Key length is passed only for validation. + /// If different from value in constructor, exception will be thrown. + int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, + uint8_t* aad, int aad_len, uint8_t* ciphertext); + + /// Encrypts plaintext footer, in order to compute footer signature (tag). + int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, + int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, + uint8_t* encrypted_footer); + + void WipeOut(); + + private: + /// Can serve one key length only. Possible values: 16, 24, 32 bytes. + explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata); + // PIMPL Idiom + class AesEncryptorImpl; + std::unique_ptr impl_; +}; + +/// Performs AES decryption operations with GCM or CTR ciphers. 
+class AesDecryptor { + public: + static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, + std::shared_ptr> all_decryptors); + + ~AesDecryptor(); + void WipeOut(); + + /// Size difference between plaintext and ciphertext, for this cipher. + int CiphertextSizeDelta(); + + /// Decrypts ciphertext with the key and aad. Key length is passed only for + /// validation. If different from value in constructor, exception will be thrown. + int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, + uint8_t* aad, int aad_len, uint8_t* plaintext); + + private: + /// Can serve one key length only. Possible values: 16, 24, 32 bytes. + explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata); + // PIMPL Idiom + class AesDecryptorImpl; + std::unique_ptr impl_; +}; + +std::string CreateModuleAad(const std::string& file_aad, int8_t module_type, + int16_t row_group_ordinal, int16_t column_ordinal, + int16_t page_ordinal); + +std::string CreateFooterAad(const std::string& aad_prefix_bytes); + +// Update last two bytes of page (or page header) module AAD +void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal); + +} // namespace encryption +} // namespace parquet + +#endif // PARQUET_ENCRYPTION_INTERNAL_H diff --git a/r/R/inst/include/parquet/exception.h b/r/R/inst/include/parquet/exception.h new file mode 100644 index 00000000000..7db3ab756f0 --- /dev/null +++ b/r/R/inst/include/parquet/exception.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_EXCEPTION_H +#define PARQUET_EXCEPTION_H + +#include +#include +#include + +#include "arrow/status.h" +#include "parquet/platform.h" + +// PARQUET-1085 +#if !defined(ARROW_UNUSED) +#define ARROW_UNUSED(x) UNUSED(x) +#endif + +#define PARQUET_CATCH_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) { \ + return ::arrow::Status::IOError(e.what()); \ + } + +#define PARQUET_IGNORE_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + ARROW_UNUSED(_s); \ + } while (0) + +#define PARQUET_THROW_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { \ + std::stringstream ss; \ + ss << "Arrow error: " << _s.ToString(); \ + throw ::parquet::ParquetException(ss.str()); \ + } \ + } while (0) + +namespace parquet { + +class ParquetException : public std::exception { + public: + PARQUET_NORETURN static void EofException(const std::string& msg = "") { + std::stringstream ss; + ss << "Unexpected end of stream"; + if (!msg.empty()) { + ss << ": " << msg; + } + throw ParquetException(ss.str()); + } + + PARQUET_NORETURN static void NYI(const std::string& msg = "") { + std::stringstream ss; + ss << "Not yet implemented: " << msg << "."; + throw ParquetException(ss.str()); + } + + explicit ParquetException(const char* msg) : msg_(msg) {} + + explicit ParquetException(const std::string& msg) : msg_(msg) {} + + explicit ParquetException(const char* msg, std::exception&) : msg_(msg) {} + + ~ParquetException() throw() override {} + + const char* what() const throw() override { return msg_.c_str(); } + + private: 
+ std::string msg_; +}; + +} // namespace parquet + +#endif // PARQUET_EXCEPTION_H diff --git a/r/R/inst/include/parquet/file_reader.h b/r/R/inst/include/parquet/file_reader.h new file mode 100644 index 00000000000..214cf112600 --- /dev/null +++ b/r/R/inst/include/parquet/file_reader.h @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_FILE_READER_H +#define PARQUET_FILE_READER_H + +#include +#include +#include +#include + +#include "parquet/metadata.h" // IWYU pragma:: keep +#include "parquet/platform.h" +#include "parquet/properties.h" + +namespace parquet { + +class ColumnReader; +class FileMetaData; +class PageReader; +class RandomAccessSource; +class RowGroupMetaData; + +class PARQUET_EXPORT RowGroupReader { + public: + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file + struct Contents { + virtual ~Contents() {} + virtual std::unique_ptr GetColumnPageReader(int i) = 0; + virtual const RowGroupMetaData* metadata() const = 0; + virtual const ReaderProperties* properties() const = 0; + }; + + explicit RowGroupReader(std::unique_ptr contents); + + // Returns the rowgroup metadata + const RowGroupMetaData* metadata() const; + + // Construct a ColumnReader for the indicated row group-relative + // column. Ownership is shared with the RowGroupReader. 
+ std::shared_ptr Column(int i); + + std::unique_ptr GetColumnPageReader(int i); + + private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; +}; + +class PARQUET_EXPORT ParquetFileReader { + public: + // Declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file + struct PARQUET_EXPORT Contents { + static std::unique_ptr Open( + const std::shared_ptr<::arrow::io::RandomAccessFile>& source, + const ReaderProperties& props = default_reader_properties(), + const std::shared_ptr& metadata = NULLPTR); + + virtual ~Contents() = default; + // Perform any cleanup associated with the file contents + virtual void Close() = 0; + virtual std::shared_ptr GetRowGroup(int i) = 0; + virtual std::shared_ptr metadata() const = 0; + }; + + ParquetFileReader(); + ~ParquetFileReader(); + + // Create a reader from some implementation of parquet-cpp's generic file + // input interface + // + // If you cannot provide exclusive access to your file resource, create a + // subclass of RandomAccessSource that wraps the shared resource + ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version") + static std::unique_ptr Open( + std::unique_ptr source, + const ReaderProperties& props = default_reader_properties(), + const std::shared_ptr& metadata = NULLPTR); + + // Create a file reader instance from an Arrow file object. Thread-safety is + // the responsibility of the file implementation + static std::unique_ptr Open( + const std::shared_ptr<::arrow::io::RandomAccessFile>& source, + const ReaderProperties& props = default_reader_properties(), + const std::shared_ptr& metadata = NULLPTR); + + // API Convenience to open a serialized Parquet file on disk, using Arrow IO + // interfaces. 
+ static std::unique_ptr OpenFile( + const std::string& path, bool memory_map = true, + const ReaderProperties& props = default_reader_properties(), + const std::shared_ptr& metadata = NULLPTR); + + void Open(std::unique_ptr contents); + void Close(); + + // The RowGroupReader is owned by the FileReader + std::shared_ptr RowGroup(int i); + + // Returns the file metadata. Only one instance is ever created + std::shared_ptr metadata() const; + + private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; +}; + +// Read only Parquet file metadata +std::shared_ptr PARQUET_EXPORT +ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); + +/// \brief Scan all values in file. Useful for performance testing +/// \param[in] columns the column numbers to scan. If empty scans all +/// \param[in] column_batch_size number of values to read at a time when scanning column +/// \param[in] reader a ParquetFileReader instance +/// \return number of semantic rows in file +PARQUET_EXPORT +int64_t ScanFileContents(std::vector columns, const int32_t column_batch_size, + ParquetFileReader* reader); + +} // namespace parquet + +#endif // PARQUET_FILE_READER_H diff --git a/r/R/inst/include/parquet/file_writer.h b/r/R/inst/include/parquet/file_writer.h new file mode 100644 index 00000000000..cd512cf817d --- /dev/null +++ b/r/R/inst/include/parquet/file_writer.h @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_FILE_WRITER_H +#define PARQUET_FILE_WRITER_H + +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/metadata.h" +#include "parquet/platform.h" +#include "parquet/properties.h" +#include "parquet/schema.h" + +namespace arrow { + +class MemoryPool; + +namespace io { + +class OutputStream; + +} // namespace io +} // namespace arrow + +namespace parquet { + +class ColumnWriter; +class OutputStream; + +class PARQUET_EXPORT RowGroupWriter { + public: + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file + struct Contents { + virtual ~Contents() = default; + virtual int num_columns() const = 0; + virtual int64_t num_rows() const = 0; + + // to be used only with ParquetFileWriter::AppendRowGroup + virtual ColumnWriter* NextColumn() = 0; + // to be used only with ParquetFileWriter::AppendBufferedRowGroup + virtual ColumnWriter* column(int i) = 0; + + virtual int current_column() const = 0; + virtual void Close() = 0; + + // total bytes written by the page writer + virtual int64_t total_bytes_written() const = 0; + // total bytes still compressed but not written + virtual int64_t total_compressed_bytes() const = 0; + }; + + explicit RowGroupWriter(std::unique_ptr contents); + + /// Construct a ColumnWriter for the indicated row group-relative column. + /// + /// To be used only with ParquetFileWriter::AppendRowGroup + /// Ownership is solely within the RowGroupWriter. 
The ColumnWriter is only + /// valid until the next call to NextColumn or Close. As the contents are + /// directly written to the sink, once a new column is started, the contents + /// of the previous one cannot be modified anymore. + ColumnWriter* NextColumn(); + /// Index of currently written column + int current_column(); + void Close(); + + int num_columns() const; + + /// Construct a ColumnWriter for the indicated row group column. + /// + /// To be used only with ParquetFileWriter::AppendBufferedRowGroup + /// Ownership is solely within the RowGroupWriter. The ColumnWriter is + /// valid until Close. The contents are buffered in memory and written to sink + /// on Close + ColumnWriter* column(int i); + + /** + * Number of rows that shall be written as part of this RowGroup. + */ + int64_t num_rows() const; + + int64_t total_bytes_written() const; + int64_t total_compressed_bytes() const; + + private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; +}; + +ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") +PARQUET_EXPORT +void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); + +PARQUET_EXPORT +void WriteFileMetaData(const FileMetaData& file_metadata, + ::arrow::io::OutputStream* sink); + +class PARQUET_EXPORT ParquetFileWriter { + public: + // Forward declare a virtual class 'Contents' to aid dependency injection and more + // easily create test fixtures + // An implementation of the Contents class is defined in the .cc file + struct Contents { + Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema, + const std::shared_ptr& key_value_metadata) + : schema_(), key_value_metadata_(key_value_metadata) { + schema_.Init(schema); + } + virtual ~Contents() {} + // Perform any cleanup associated with the file contents + virtual void Close() = 0; + + /// \note Deprecated since 1.3.0 + RowGroupWriter* AppendRowGroup(int64_t num_rows); + + virtual RowGroupWriter* 
AppendRowGroup() = 0; + virtual RowGroupWriter* AppendBufferedRowGroup() = 0; + + virtual int64_t num_rows() const = 0; + virtual int num_columns() const = 0; + virtual int num_row_groups() const = 0; + + virtual const std::shared_ptr& properties() const = 0; + + const std::shared_ptr& key_value_metadata() const { + return key_value_metadata_; + } + + // Return const-pointer to make it clear that this object is not to be copied + const SchemaDescriptor* schema() const { return &schema_; } + + SchemaDescriptor schema_; + + /// This should be the only place this is stored. Everything else is a const reference + std::shared_ptr key_value_metadata_; + + const std::shared_ptr metadata() const { return file_metadata_; } + std::shared_ptr file_metadata_; + }; + + ParquetFileWriter(); + ~ParquetFileWriter(); + + static std::unique_ptr Open( + const std::shared_ptr<::arrow::io::OutputStream>& sink, + const std::shared_ptr& schema, + const std::shared_ptr& properties = default_writer_properties(), + const std::shared_ptr& key_value_metadata = NULLPTR); + + ARROW_DEPRECATED("Use version with arrow::io::OutputStream") + static std::unique_ptr Open( + const std::shared_ptr& sink, + const std::shared_ptr& schema, + const std::shared_ptr& properties = default_writer_properties(), + const std::shared_ptr& key_value_metadata = NULLPTR); + + void Open(std::unique_ptr contents); + void Close(); + + // Construct a RowGroupWriter for the indicated number of rows. + // + // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid + // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. + // @param num_rows The number of rows that are stored in the new RowGroup + // + // \deprecated Since 1.3.0 + RowGroupWriter* AppendRowGroup(int64_t num_rows); + + /// Construct a RowGroupWriter with an arbitrary number of rows. + /// + /// Ownership is solely within the ParquetFileWriter. 
The RowGroupWriter is only valid + /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. + RowGroupWriter* AppendRowGroup(); + + /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. + /// Use this if you want to write a RowGroup based on a certain size + /// + /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid + /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. + RowGroupWriter* AppendBufferedRowGroup(); + + /// Number of columns. + /// + /// This number is fixed during the lifetime of the writer as it is determined via + /// the schema. + int num_columns() const; + + /// Number of rows in the yet started RowGroups. + /// + /// Changes on the addition of a new RowGroup. + int64_t num_rows() const; + + /// Number of started RowGroups. + int num_row_groups() const; + + /// Configuration passed to the writer, e.g. the used Parquet format version. + const std::shared_ptr& properties() const; + + /// Returns the file schema descriptor + const SchemaDescriptor* schema() const; + + /// Returns a column descriptor in schema + const ColumnDescriptor* descr(int i) const; + + /// Returns the file custom metadata + const std::shared_ptr& key_value_metadata() const; + + /// Returns the file metadata, only available after calling Close(). + const std::shared_ptr metadata() const; + + private: + // Holds a pointer to an instance of Contents implementation + std::unique_ptr contents_; + std::shared_ptr file_metadata_; +}; + +} // namespace parquet + +#endif // PARQUET_FILE_WRITER_H diff --git a/r/R/inst/include/parquet/hasher.h b/r/R/inst/include/parquet/hasher.h new file mode 100644 index 00000000000..233262ebdd6 --- /dev/null +++ b/r/R/inst/include/parquet/hasher.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_HASHER_H +#define PARQUET_HASHER_H + +#include +#include "parquet/types.h" + +namespace parquet { +// Abstract class for hash +class Hasher { + public: + /// Compute hash for 32 bits value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(int32_t value) const = 0; + + /// Compute hash for 64 bits value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(int64_t value) const = 0; + + /// Compute hash for float value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(float value) const = 0; + + /// Compute hash for double value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(double value) const = 0; + + /// Compute hash for Int96 value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. + virtual uint64_t Hash(const Int96* value) const = 0; + + /// Compute hash for ByteArray value by using its plain encoding result. + /// + /// @param value the value to hash. + /// @return hash result. 
+ virtual uint64_t Hash(const ByteArray* value) const = 0; + + /// Compute hash for fixed byte array value by using its plain encoding result. + /// + /// @param value the value address. + /// @param len the value length. + virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; + + virtual ~Hasher() = default; +}; + +} // namespace parquet + +#endif // PARQUET_HASHER_H diff --git a/r/R/inst/include/parquet/metadata.h b/r/R/inst/include/parquet/metadata.h new file mode 100644 index 00000000000..4a7ae447bdd --- /dev/null +++ b/r/R/inst/include/parquet/metadata.h @@ -0,0 +1,304 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_FILE_METADATA_H +#define PARQUET_FILE_METADATA_H + +#include +#include +#include +#include + +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" + +#include "parquet/platform.h" +#include "parquet/properties.h" +#include "parquet/types.h" + +namespace parquet { + +class ColumnDescriptor; +class EncodedStatistics; +class Statistics; +class SchemaDescriptor; + +namespace schema { + +class ColumnPath; + +} // namespace schema + +using KeyValueMetadata = ::arrow::KeyValueMetadata; + +class PARQUET_EXPORT ApplicationVersion { + public: + // Known Versions with Issues + static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); + static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); + static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); + static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); + // Regular expression for the version format + // major . minor . patch unknown - prerelease.x + build info + // Eg: 1.5.0ab-cdh5.5.0+cd + static constexpr char const* VERSION_FORMAT = + "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$"; + // Regular expression for the application format + // application_name version VERSION_FORMAT (build build_name) + // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) + static constexpr char const* APPLICATION_FORMAT = + "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)"; + + // Application that wrote the file. e.g. "IMPALA" + std::string application_; + // Build name + std::string build_; + + // Version of the application that wrote the file, expressed as + // (..). Unmatched parts default to 0. 
+ // "1.2.3" => {1, 2, 3} + // "1.2" => {0, 0, 0} + // "1.2-cdh5" => {0, 0, 0} + // TODO (majetideepak): Implement support for pre_release + struct { + int major; + int minor; + int patch; + std::string unknown; + std::string pre_release; + std::string build_info; + } version; + + ApplicationVersion() {} + explicit ApplicationVersion(const std::string& created_by); + ApplicationVersion(const std::string& application, int major, int minor, int patch); + + // Returns true if version is strictly less than other_version + bool VersionLt(const ApplicationVersion& other_version) const; + + // Returns true if version is strictly less than other_version + bool VersionEq(const ApplicationVersion& other_version) const; + + // Checks if the Version has the correct statistics for a given column + bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics, + SortOrder::type sort_order = SortOrder::SIGNED) const; +}; + +class PARQUET_EXPORT ColumnChunkMetaData { + public: + // API convenience to get a MetaData accessor + static std::unique_ptr Make( + const void* metadata, const ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR); + + ~ColumnChunkMetaData(); + + // column chunk + int64_t file_offset() const; + + // parameter is only used when a dataset is spread across multiple files + const std::string& file_path() const; + + // column metadata + Type::type type() const; + int64_t num_values() const; + std::shared_ptr path_in_schema() const; + bool is_stats_set() const; + std::shared_ptr statistics() const; + Compression::type compression() const; + const std::vector& encodings() const; + bool has_dictionary_page() const; + int64_t dictionary_page_offset() const; + int64_t data_page_offset() const; + bool has_index_page() const; + int64_t index_page_offset() const; + int64_t total_compressed_size() const; + int64_t total_uncompressed_size() const; + + private: + explicit ColumnChunkMetaData(const void* metadata, const 
ColumnDescriptor* descr, + const ApplicationVersion* writer_version = NULLPTR); + // PIMPL Idiom + class ColumnChunkMetaDataImpl; + std::unique_ptr impl_; +}; + +class PARQUET_EXPORT RowGroupMetaData { + public: + // API convenience to get a MetaData accessor + static std::unique_ptr Make( + const void* metadata, const SchemaDescriptor* schema, + const ApplicationVersion* writer_version = NULLPTR); + + ~RowGroupMetaData(); + + // row-group metadata + int num_columns() const; + int64_t num_rows() const; + int64_t total_byte_size() const; + // Return const-pointer to make it clear that this object is not to be copied + const SchemaDescriptor* schema() const; + std::unique_ptr ColumnChunk(int i) const; + + private: + explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, + const ApplicationVersion* writer_version = NULLPTR); + // PIMPL Idiom + class RowGroupMetaDataImpl; + std::unique_ptr impl_; +}; + +class FileMetaDataBuilder; + +class PARQUET_EXPORT FileMetaData { + public: + // API convenience to get a MetaData accessor + static std::shared_ptr Make(const void* serialized_metadata, + uint32_t* metadata_len); + + ~FileMetaData(); + + // file metadata + uint32_t size() const; + int num_columns() const; + int64_t num_rows() const; + int num_row_groups() const; + ParquetVersion::type version() const; + const std::string& created_by() const; + int num_schema_elements() const; + std::unique_ptr RowGroup(int i) const; + + const ApplicationVersion& writer_version() const; + + void WriteTo(::arrow::io::OutputStream* dst) const; + + // Return const-pointer to make it clear that this object is not to be copied + const SchemaDescriptor* schema() const; + + std::shared_ptr key_value_metadata() const; + + // Set file_path ColumnChunk fields to a particular value + void set_file_path(const std::string& path); + + private: + friend FileMetaDataBuilder; + explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); + + // PIMPL Idiom + 
FileMetaData(); + class FileMetaDataImpl; + std::unique_ptr impl_; +}; + +// Builder API +class PARQUET_EXPORT ColumnChunkMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr Make( + const std::shared_ptr& props, const ColumnDescriptor* column); + + static std::unique_ptr Make( + const std::shared_ptr& props, const ColumnDescriptor* column, + void* contents); + + ~ColumnChunkMetaDataBuilder(); + + // column chunk + // Used when a dataset is spread across multiple files + void set_file_path(const std::string& path); + // column metadata + void SetStatistics(const EncodedStatistics& stats); + // get the column descriptor + const ColumnDescriptor* descr() const; + // commit the metadata + void Finish(int64_t num_values, int64_t dictonary_page_offset, + int64_t index_page_offset, int64_t data_page_offset, + int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, + bool dictionary_fallback); + + // The metadata contents, suitable for passing to ColumnChunkMetaData::Make + const void* contents() const; + + // For writing metadata at end of column chunk + void WriteTo(::arrow::io::OutputStream* sink); + + private: + explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, + const ColumnDescriptor* column); + explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, + const ColumnDescriptor* column, void* contents); + // PIMPL Idiom + class ColumnChunkMetaDataBuilderImpl; + std::unique_ptr impl_; +}; + +class PARQUET_EXPORT RowGroupMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr Make( + const std::shared_ptr& props, const SchemaDescriptor* schema_, + void* contents); + + ~RowGroupMetaDataBuilder(); + + ColumnChunkMetaDataBuilder* NextColumnChunk(); + int num_columns(); + int64_t num_rows(); + int current_column() const; + + void set_num_rows(int64_t num_rows); + + // commit the metadata + void Finish(int64_t total_bytes_written); + + private: + 
explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, + const SchemaDescriptor* schema_, void* contents); + // PIMPL Idiom + class RowGroupMetaDataBuilderImpl; + std::unique_ptr impl_; +}; + +class PARQUET_EXPORT FileMetaDataBuilder { + public: + // API convenience to get a MetaData reader + static std::unique_ptr Make( + const SchemaDescriptor* schema, const std::shared_ptr& props, + const std::shared_ptr& key_value_metadata = NULLPTR); + + ~FileMetaDataBuilder(); + + // The prior RowGroupMetaDataBuilder (if any) is destroyed + RowGroupMetaDataBuilder* AppendRowGroup(); + + // Complete the Thrift structure + std::unique_ptr Finish(); + + private: + explicit FileMetaDataBuilder( + const SchemaDescriptor* schema, const std::shared_ptr& props, + const std::shared_ptr& key_value_metadata = NULLPTR); + // PIMPL Idiom + class FileMetaDataBuilderImpl; + std::unique_ptr impl_; +}; + +PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); + +} // namespace parquet + +#endif // PARQUET_FILE_METADATA_H diff --git a/r/R/inst/include/parquet/murmur3.h b/r/R/inst/include/parquet/murmur3.h new file mode 100644 index 00000000000..d12ae0238b0 --- /dev/null +++ b/r/R/inst/include/parquet/murmur3.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef PARQUET_MURMURHASH3_H_ +#define PARQUET_MURMURHASH3_H_ + +#include + +#include "parquet/hasher.h" +#include "parquet/platform.h" +#include "parquet/types.h" + +namespace parquet { + +/// Source: +/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class) +class PARQUET_EXPORT MurmurHash3 : public Hasher { + public: + MurmurHash3() : seed_(DEFAULT_SEED) {} + uint64_t Hash(int32_t value) const override; + uint64_t Hash(int64_t value) const override; + uint64_t Hash(float value) const override; + uint64_t Hash(double value) const override; + uint64_t Hash(const Int96* value) const override; + uint64_t Hash(const ByteArray* value) const override; + uint64_t Hash(const FLBA* val, uint32_t len) const override; + + private: + // Default seed for hash which comes from Bloom filter in parquet-mr, it is generated + // by System.nanoTime() of java. + static constexpr int DEFAULT_SEED = 1361930890; + + uint32_t seed_; +}; + +} // namespace parquet + +#endif // PARQUET_MURMURHASH3_H_ diff --git a/r/R/inst/include/parquet/platform.h b/r/R/inst/include/parquet/platform.h new file mode 100644 index 00000000000..25d8dd4d94d --- /dev/null +++ b/r/R/inst/include/parquet/platform.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/buffer.h" // IWYU pragma: export +#include "arrow/io/interfaces.h" // IWYU pragma: export +#include "arrow/io/memory.h" // IWYU pragma: export +#include "arrow/memory_pool.h" // IWYU pragma: export +#include "arrow/status.h" // IWYU pragma: export +#include "arrow/util/bit-util.h" // IWYU pragma: export +#include "arrow/util/macros.h" // IWYU pragma: export +#include "arrow/util/string_view.h" // IWYU pragma: export + +#if defined(_WIN32) || defined(__CYGWIN__) + +#if defined(_MSC_VER) +#pragma warning(push) +// Disable warning for STL types usage in DLL interface +// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports +#pragma warning(disable : 4275 4251) +// Disable diamond inheritance warnings +#pragma warning(disable : 4250) +// Disable macro redefinition warnings +#pragma warning(disable : 4005) +// Disable extern before exported template warnings +#pragma warning(disable : 4910) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef PARQUET_STATIC +#define PARQUET_EXPORT +#elif defined(PARQUET_EXPORTING) +#define PARQUET_EXPORT __declspec(dllexport) +#else +#define PARQUET_EXPORT __declspec(dllimport) +#endif + +#define PARQUET_NO_EXPORT + +#else // Not Windows +#ifndef PARQUET_EXPORT +#define PARQUET_EXPORT 
__attribute__((visibility("default"))) +#endif +#ifndef PARQUET_NO_EXPORT +#define PARQUET_NO_EXPORT __attribute__((visibility("hidden"))) +#endif +#endif // Non-Windows + +// This is a complicated topic, some reading on it: +// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ +#if defined(_MSC_VER) || defined(__clang__) +#define PARQUET_TEMPLATE_CLASS_EXPORT +#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT +#else +#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT +#define PARQUET_TEMPLATE_EXPORT +#endif + +#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN + +#define PARQUET_NORETURN ARROW_NORETURN +#define PARQUET_DEPRECATED ARROW_DEPRECATED + +// If ARROW_VALGRIND set when compiling unit tests, also define +// PARQUET_VALGRIND +#ifdef ARROW_VALGRIND +#define PARQUET_VALGRIND +#endif + +namespace parquet { + +namespace BitUtil = ::arrow::BitUtil; + +using Buffer = ::arrow::Buffer; +using MemoryPool = ::arrow::MemoryPool; +using MutableBuffer = ::arrow::MutableBuffer; +using ResizableBuffer = ::arrow::ResizableBuffer; +using ResizableBuffer = ::arrow::ResizableBuffer; +using ArrowInputFile = ::arrow::io::RandomAccessFile; +using ArrowInputStream = ::arrow::io::InputStream; +using ArrowOutputStream = ::arrow::io::OutputStream; +using string_view = ::arrow::util::string_view; + +constexpr int64_t kDefaultOutputStreamSize = 1024; + +PARQUET_EXPORT +std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream( + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + +PARQUET_EXPORT +std::shared_ptr AllocateBuffer( + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0); + +} // namespace parquet diff --git a/r/R/inst/include/parquet/printer.h b/r/R/inst/include/parquet/printer.h new file mode 100644 index 00000000000..751b8a44d07 --- /dev/null +++ b/r/R/inst/include/parquet/printer.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_FILE_PRINTER_H +#define PARQUET_FILE_PRINTER_H + +#include +#include + +#include "parquet/platform.h" + +namespace parquet { + +class ParquetFileReader; + +class PARQUET_EXPORT ParquetFilePrinter { + private: + ParquetFileReader* fileReader; + + public: + explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {} + ~ParquetFilePrinter() {} + + void DebugPrint(std::ostream& stream, std::list selected_columns, + bool print_values = false, bool format_dump = false, + bool print_key_value_metadata = false, + const char* filename = "No Name"); + + void JSONPrint(std::ostream& stream, std::list selected_columns, + const char* filename = "No Name"); +}; + +} // namespace parquet + +#endif // PARQUET_FILE_PRINTER_H diff --git a/r/R/inst/include/parquet/properties.h b/r/R/inst/include/parquet/properties.h new file mode 100644 index 00000000000..7277f3a61e6 --- /dev/null +++ b/r/R/inst/include/parquet/properties.h @@ -0,0 +1,428 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
// Settings consulted when reading: the memory pool to allocate from, whether
// buffered streams are enabled, and the buffer size.
// NOTE(review): the pointee types of GetStream's return value and `source`
// parameter are not visible in this copy of the header.
class PARQUET_EXPORT ReaderProperties {
 public:
  // `pool` defaults to Arrow's default memory pool. The buffered-stream flag
  // and the buffer size are seeded from DEFAULT_USE_BUFFERED_STREAM and
  // DEFAULT_BUFFER_SIZE.
  // NOTE(review): those two are non-const statics declared in this header, so
  // each translation unit reads its own copy -- confirm that is intended.
  explicit ReaderProperties(::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
      : pool_(pool) {
    buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM;
    buffer_size_ = DEFAULT_BUFFER_SIZE;
  }

  // The pool passed to the constructor.
  ::arrow::MemoryPool* memory_pool() const { return pool_; }

  // Declaration only; takes a source plus a start offset and a byte count.
  std::shared_ptr GetStream(std::shared_ptr source,
                            int64_t start, int64_t num_bytes);

  bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }

  void enable_buffered_stream() { buffered_stream_enabled_ = true; }

  void disable_buffered_stream() { buffered_stream_enabled_ = false; }

  void set_buffer_size(int64_t buf_size) { buffer_size_ = buf_size; }

  int64_t buffer_size() const { return buffer_size_; }

 private:
  ::arrow::MemoryPool* pool_;
  int64_t buffer_size_;
  bool buffered_stream_enabled_;
};
constexpr int64_t DEFAULT_PAGE_SIZE = 1024 * 1024; +static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true; +static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = DEFAULT_PAGE_SIZE; +static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; +static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024; +static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; +static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; +static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; +static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = + ParquetVersion::PARQUET_1_0; +static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; +static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; + +class PARQUET_EXPORT ColumnProperties { + public: + ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, + Compression::type codec = DEFAULT_COMPRESSION_TYPE, + bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, + bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, + size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE) + : encoding_(encoding), + codec_(codec), + dictionary_enabled_(dictionary_enabled), + statistics_enabled_(statistics_enabled), + max_stats_size_(max_stats_size) {} + + void set_encoding(Encoding::type encoding) { encoding_ = encoding; } + + void set_compression(Compression::type codec) { codec_ = codec; } + + void set_dictionary_enabled(bool dictionary_enabled) { + dictionary_enabled_ = dictionary_enabled; + } + + void set_statistics_enabled(bool statistics_enabled) { + statistics_enabled_ = statistics_enabled; + } + + void set_max_statistics_size(size_t max_stats_size) { + max_stats_size_ = max_stats_size; + } + + Encoding::type encoding() const { return encoding_; } + + Compression::type compression() const { return codec_; } + + bool dictionary_enabled() const { return dictionary_enabled_; } + + bool statistics_enabled() const { return statistics_enabled_; } + + size_t 
max_statistics_size() const { return max_stats_size_; } + + private: + Encoding::type encoding_; + Compression::type codec_; + bool dictionary_enabled_; + bool statistics_enabled_; + size_t max_stats_size_; +}; + +class PARQUET_EXPORT WriterProperties { + public: + class Builder { + public: + Builder() + : pool_(::arrow::default_memory_pool()), + dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT), + write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), + max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), + pagesize_(DEFAULT_PAGE_SIZE), + version_(DEFAULT_WRITER_VERSION), + created_by_(DEFAULT_CREATED_BY) {} + virtual ~Builder() {} + + Builder* memory_pool(::arrow::MemoryPool* pool) { + pool_ = pool; + return this; + } + + Builder* enable_dictionary() { + default_column_properties_.set_dictionary_enabled(true); + return this; + } + + Builder* disable_dictionary() { + default_column_properties_.set_dictionary_enabled(false); + return this; + } + + Builder* enable_dictionary(const std::string& path) { + dictionary_enabled_[path] = true; + return this; + } + + Builder* enable_dictionary(const std::shared_ptr& path) { + return this->enable_dictionary(path->ToDotString()); + } + + Builder* disable_dictionary(const std::string& path) { + dictionary_enabled_[path] = false; + return this; + } + + Builder* disable_dictionary(const std::shared_ptr& path) { + return this->disable_dictionary(path->ToDotString()); + } + + Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) { + dictionary_pagesize_limit_ = dictionary_psize_limit; + return this; + } + + Builder* write_batch_size(int64_t write_batch_size) { + write_batch_size_ = write_batch_size; + return this; + } + + Builder* max_row_group_length(int64_t max_row_group_length) { + max_row_group_length_ = max_row_group_length; + return this; + } + + Builder* data_pagesize(int64_t pg_size) { + pagesize_ = pg_size; + return this; + } + + Builder* version(ParquetVersion::type version) { + version_ = version; + 
return this; + } + + Builder* created_by(const std::string& created_by) { + created_by_ = created_by; + return this; + } + + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. + */ + Builder* encoding(Encoding::type encoding_type) { + if (encoding_type == Encoding::PLAIN_DICTIONARY || + encoding_type == Encoding::RLE_DICTIONARY) { + throw ParquetException("Can't use dictionary encoding as fallback encoding"); + } + + default_column_properties_.set_encoding(encoding_type); + return this; + } + + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. + */ + Builder* encoding(const std::string& path, Encoding::type encoding_type) { + if (encoding_type == Encoding::PLAIN_DICTIONARY || + encoding_type == Encoding::RLE_DICTIONARY) { + throw ParquetException("Can't use dictionary encoding as fallback encoding"); + } + + encodings_[path] = encoding_type; + return this; + } + + /** + * Define the encoding that is used when we don't utilise dictionary encoding. + * + * This either apply if dictionary encoding is disabled or if we fallback + * as the dictionary grew too large. 
+ */ + Builder* encoding(const std::shared_ptr& path, + Encoding::type encoding_type) { + return this->encoding(path->ToDotString(), encoding_type); + } + + Builder* compression(Compression::type codec) { + default_column_properties_.set_compression(codec); + return this; + } + + Builder* max_statistics_size(size_t max_stats_sz) { + default_column_properties_.set_max_statistics_size(max_stats_sz); + return this; + } + + Builder* compression(const std::string& path, Compression::type codec) { + codecs_[path] = codec; + return this; + } + + Builder* compression(const std::shared_ptr& path, + Compression::type codec) { + return this->compression(path->ToDotString(), codec); + } + + Builder* enable_statistics() { + default_column_properties_.set_statistics_enabled(true); + return this; + } + + Builder* disable_statistics() { + default_column_properties_.set_statistics_enabled(false); + return this; + } + + Builder* enable_statistics(const std::string& path) { + statistics_enabled_[path] = true; + return this; + } + + Builder* enable_statistics(const std::shared_ptr& path) { + return this->enable_statistics(path->ToDotString()); + } + + Builder* disable_statistics(const std::string& path) { + statistics_enabled_[path] = false; + return this; + } + + Builder* disable_statistics(const std::shared_ptr& path) { + return this->disable_statistics(path->ToDotString()); + } + + std::shared_ptr build() { + std::unordered_map column_properties; + auto get = [&](const std::string& key) -> ColumnProperties& { + auto it = column_properties.find(key); + if (it == column_properties.end()) + return column_properties[key] = default_column_properties_; + else + return it->second; + }; + + for (const auto& item : encodings_) get(item.first).set_encoding(item.second); + for (const auto& item : codecs_) get(item.first).set_compression(item.second); + for (const auto& item : dictionary_enabled_) + get(item.first).set_dictionary_enabled(item.second); + for (const auto& item : 
statistics_enabled_) + get(item.first).set_statistics_enabled(item.second); + + return std::shared_ptr( + new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, + max_row_group_length_, pagesize_, version_, created_by_, + default_column_properties_, column_properties)); + } + + private: + ::arrow::MemoryPool* pool_; + int64_t dictionary_pagesize_limit_; + int64_t write_batch_size_; + int64_t max_row_group_length_; + int64_t pagesize_; + ParquetVersion::type version_; + std::string created_by_; + + // Settings used for each column unless overridden in any of the maps below + ColumnProperties default_column_properties_; + std::unordered_map encodings_; + std::unordered_map codecs_; + std::unordered_map dictionary_enabled_; + std::unordered_map statistics_enabled_; + }; + + inline ::arrow::MemoryPool* memory_pool() const { return pool_; } + + inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; } + + inline int64_t write_batch_size() const { return write_batch_size_; } + + inline int64_t max_row_group_length() const { return max_row_group_length_; } + + inline int64_t data_pagesize() const { return pagesize_; } + + inline ParquetVersion::type version() const { return parquet_version_; } + + inline std::string created_by() const { return parquet_created_by_; } + + inline Encoding::type dictionary_index_encoding() const { + if (parquet_version_ == ParquetVersion::PARQUET_1_0) { + return Encoding::PLAIN_DICTIONARY; + } else { + return Encoding::RLE_DICTIONARY; + } + } + + inline Encoding::type dictionary_page_encoding() const { + if (parquet_version_ == ParquetVersion::PARQUET_1_0) { + return Encoding::PLAIN_DICTIONARY; + } else { + return Encoding::PLAIN; + } + } + + const ColumnProperties& column_properties( + const std::shared_ptr& path) const { + auto it = column_properties_.find(path->ToDotString()); + if (it != column_properties_.end()) return it->second; + return default_column_properties_; + } + + 
Encoding::type encoding(const std::shared_ptr& path) const { + return column_properties(path).encoding(); + } + + Compression::type compression(const std::shared_ptr& path) const { + return column_properties(path).compression(); + } + + bool dictionary_enabled(const std::shared_ptr& path) const { + return column_properties(path).dictionary_enabled(); + } + + bool statistics_enabled(const std::shared_ptr& path) const { + return column_properties(path).statistics_enabled(); + } + + size_t max_statistics_size(const std::shared_ptr& path) const { + return column_properties(path).max_statistics_size(); + } + + private: + explicit WriterProperties( + ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, + int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, + ParquetVersion::type version, const std::string& created_by, + const ColumnProperties& default_column_properties, + const std::unordered_map& column_properties) + : pool_(pool), + dictionary_pagesize_limit_(dictionary_pagesize_limit), + write_batch_size_(write_batch_size), + max_row_group_length_(max_row_group_length), + pagesize_(pagesize), + parquet_version_(version), + parquet_created_by_(created_by), + default_column_properties_(default_column_properties), + column_properties_(column_properties) {} + + ::arrow::MemoryPool* pool_; + int64_t dictionary_pagesize_limit_; + int64_t write_batch_size_; + int64_t max_row_group_length_; + int64_t pagesize_; + ParquetVersion::type parquet_version_; + std::string parquet_created_by_; + ColumnProperties default_column_properties_; + std::unordered_map column_properties_; +}; + +std::shared_ptr PARQUET_EXPORT default_writer_properties(); + +} // namespace parquet + +#endif // PARQUET_COLUMN_PROPERTIES_H diff --git a/r/R/inst/include/parquet/schema-internal.h b/r/R/inst/include/parquet/schema-internal.h new file mode 100644 index 00000000000..42eac097ade --- /dev/null +++ b/r/R/inst/include/parquet/schema-internal.h @@ -0,0 +1,139 @@ +// Licensed 
// Returns true when `str` ends with the six characters "_tuple".
// The comparison is case-sensitive; strings shorter than the suffix never match.
inline bool str_endswith_tuple(const std::string& str) {
  const char kSuffix[] = "_tuple";
  const std::size_t suffix_len = sizeof(kSuffix) - 1;  // drop the trailing NUL

  if (str.size() < suffix_len) return false;

  // Compare the tail of `str` in place against the suffix.
  return str.compare(str.size() - suffix_len, suffix_len, kSuffix) == 0;
}
function is to be deleted once repeated structs are supported +inline bool IsSimpleStruct(const Node* node) { + if (!node->is_group()) return false; + if (node->is_repeated()) return false; + if (node->logical_type() == LogicalType::LIST) return false; + // Special case mentioned in the format spec: + // If the name is array or ends in _tuple, this should be a list of struct + // even for single child elements. + auto group = static_cast(node); + if (group->field_count() == 1 && HasStructListName(*group)) return false; + + return true; +} + +// Coalesce a list of schema fields indices which are the roots of the +// columns referred by a list of column indices +inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr, + const std::vector& column_indices, + std::vector* out) { + const GroupNode* group = descr.group_node(); + std::unordered_set already_added; + out->clear(); + for (auto& column_idx : column_indices) { + auto field_node = descr.GetColumnRoot(column_idx); + auto field_idx = group->FieldIndex(*field_node); + if (field_idx < 0) { + return false; + } + auto insertion = already_added.insert(field_idx); + if (insertion.second) { + out->push_back(field_idx); + } + } + + return true; +} + +// ---------------------------------------------------------------------- +// Conversion from Parquet Thrift metadata + +std::shared_ptr FromParquet( + const std::vector& schema); + +class FlatSchemaConverter { + public: + FlatSchemaConverter(const format::SchemaElement* elements, int length) + : elements_(elements), length_(length), pos_(0), current_id_(0) {} + + std::unique_ptr Convert(); + + private: + const format::SchemaElement* elements_; + int length_; + int pos_; + int current_id_; + + int next_id() { return current_id_++; } + + const format::SchemaElement& Next(); + + std::unique_ptr NextNode(); +}; + +// ---------------------------------------------------------------------- +// Conversion to Parquet Thrift metadata + +void ToParquet(const GroupNode* 
// Converts nested parquet schema back to a flat vector of Thrift structs
//
// Constructed with the group node to flatten and the vector that receives the
// result; both are held as raw pointers (root_, elements_).
// NOTE(review): presumably neither pointer is owned and both must outlive the
// flattener -- confirm against the implementation.
// NOTE(review): the element type of the output vector is not visible in this
// copy of the header.
class SchemaFlattener {
 public:
  SchemaFlattener(const GroupNode* schema, std::vector* out);

  // Declaration only; the definition is not part of this header.
  void Flatten();

 private:
  const GroupNode* root_;
  std::vector* elements_;
};
Since +// the converted type named in the Parquet metadata is ConvertedType::LIST we +// use that terminology here. It also helps distinguish from the *_ARRAY +// primitive types. +// +// One-level encoding: Only allows required lists with required cells +// repeated value_type name +// +// Two-level encoding: Enables optional lists with only required cells +// group list +// repeated value_type item +// +// Three-level encoding: Enables optional lists with optional cells +// group bag +// repeated group list +// value_type item +// +// 2- and 1-level encoding are respectively equivalent to 3-level encoding with +// the non-repeated nodes set to required. +// +// The "official" encoding recommended in the Parquet spec is the 3-level, and +// we use that as the default when creating list types. For semantic completeness +// we allow the other two. Since all types of encodings will occur "in the +// wild" we need to be able to interpret the associated definition levels in +// the context of the actual encoding used in the file. +// +// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated +// SchemaElement, which could make things challenging if we are trying to infer +// that a sequence of nodes semantically represents an array according to one +// of these encodings (versus a struct containing an array). We should refuse +// the temptation to guess, as they say. 
+struct ListEncoding { + enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; +}; + +class PARQUET_EXPORT ColumnPath { + public: + ColumnPath() : path_() {} + explicit ColumnPath(const std::vector& path) : path_(path) {} + explicit ColumnPath(std::vector&& path) : path_(path) {} + + static std::shared_ptr FromDotString(const std::string& dotstring); + static std::shared_ptr FromNode(const Node& node); + + std::shared_ptr extend(const std::string& node_name) const; + std::string ToDotString() const; + const std::vector& ToDotVector() const; + + protected: + std::vector path_; +}; + +class GroupNode; + +// Base class for logical schema types. A type has a name, repetition level, +// and optionally a logical type (ConvertedType in Parquet metadata parlance) +class PARQUET_EXPORT Node { + public: + enum type { PRIMITIVE, GROUP }; + + Node(Node::type type, const std::string& name, Repetition::type repetition, + LogicalType::type logical_type = LogicalType::NONE, int id = -1) + : type_(type), + name_(name), + repetition_(repetition), + logical_type_(logical_type), + id_(id), + parent_(NULLPTR) {} + + Node(Node::type type, const std::string& name, Repetition::type repetition, + std::shared_ptr logical_annotation, int id = -1) + : type_(type), + name_(name), + repetition_(repetition), + logical_annotation_(logical_annotation), + id_(id), + parent_(NULLPTR) {} + + virtual ~Node() {} + + bool is_primitive() const { return type_ == Node::PRIMITIVE; } + + bool is_group() const { return type_ == Node::GROUP; } + + bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } + + bool is_repeated() const { return repetition_ == Repetition::REPEATED; } + + bool is_required() const { return repetition_ == Repetition::REQUIRED; } + + virtual bool Equals(const Node* other) const = 0; + + const std::string& name() const { return name_; } + + Node::type node_type() const { return type_; } + + Repetition::type repetition() const { return repetition_; } + + LogicalType::type 
logical_type() const { return logical_type_; } + + const std::shared_ptr& logical_annotation() const { + return logical_annotation_; + } + + int id() const { return id_; } + + const Node* parent() const { return parent_; } + + const std::shared_ptr path() const; + + virtual void ToParquet(void* element) const = 0; + + // Node::Visitor abstract class for walking schemas with the visitor pattern + class Visitor { + public: + virtual ~Visitor() {} + + virtual void Visit(Node* node) = 0; + }; + class ConstVisitor { + public: + virtual ~ConstVisitor() {} + + virtual void Visit(const Node* node) = 0; + }; + + virtual void Visit(Visitor* visitor) = 0; + virtual void VisitConst(ConstVisitor* visitor) const = 0; + + protected: + friend class GroupNode; + + Node::type type_; + std::string name_; + Repetition::type repetition_; + LogicalType::type logical_type_; + std::shared_ptr logical_annotation_; + int id_; + // Nodes should not be shared, they have a single parent. + const Node* parent_; + + bool EqualsInternal(const Node* other) const; + void SetParent(const Node* p_parent); + + private: + PARQUET_DISALLOW_COPY_AND_ASSIGN(Node); +}; + +// Save our breath all over the place with these typedefs +typedef std::shared_ptr NodePtr; +typedef std::vector NodeVector; + +// A type that is one of the primitive Parquet storage types. 
In addition to +// the other type metadata (name, repetition level, logical type), also has the +// physical storage type and their type-specific metadata (byte width, decimal +// parameters) +class PARQUET_EXPORT PrimitiveNode : public Node { + public: + static std::unique_ptr FromParquet(const void* opaque_element, int id); + + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + Type::type type, + LogicalType::type logical_type = LogicalType::NONE, + int length = -1, int precision = -1, int scale = -1) { + return NodePtr(new PrimitiveNode(name, repetition, type, logical_type, length, + precision, scale)); + } + + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + std::shared_ptr logical_annotation, + Type::type primitive_type, int primitive_length = -1) { + return NodePtr(new PrimitiveNode(name, repetition, logical_annotation, primitive_type, + primitive_length)); + } + + bool Equals(const Node* other) const override; + + Type::type physical_type() const { return physical_type_; } + + ColumnOrder column_order() const { return column_order_; } + + void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; } + + int32_t type_length() const { return type_length_; } + + const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } + + void ToParquet(void* element) const override; + void Visit(Visitor* visitor) override; + void VisitConst(ConstVisitor* visitor) const override; + + private: + PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, + LogicalType::type logical_type = LogicalType::NONE, int length = -1, + int precision = -1, int scale = -1, int id = -1); + + PrimitiveNode(const std::string& name, Repetition::type repetition, + std::shared_ptr logical_annotation, + Type::type primitive_type, int primitive_length = -1, int id = -1); + + Type::type physical_type_; + int32_t type_length_; + DecimalMetadata decimal_metadata_; + 
ColumnOrder column_order_; + + // For FIXED_LEN_BYTE_ARRAY + void SetTypeLength(int32_t length) { type_length_ = length; } + + bool EqualsInternal(const PrimitiveNode* other) const; + + FRIEND_TEST(TestPrimitiveNode, Attrs); + FRIEND_TEST(TestPrimitiveNode, Equals); + FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); + FRIEND_TEST(TestPrimitiveNode, FromParquet); +}; + +class PARQUET_EXPORT GroupNode : public Node { + public: + static std::unique_ptr FromParquet(const void* opaque_element, int id, + const NodeVector& fields); + + static inline NodePtr Make(const std::string& name, Repetition::type repetition, + const NodeVector& fields, + LogicalType::type logical_type = LogicalType::NONE) { + return NodePtr(new GroupNode(name, repetition, fields, logical_type)); + } + + static inline NodePtr Make( + const std::string& name, Repetition::type repetition, const NodeVector& fields, + std::shared_ptr logical_annotation) { + return NodePtr(new GroupNode(name, repetition, fields, logical_annotation)); + } + + bool Equals(const Node* other) const override; + + NodePtr field(int i) const { return fields_[i]; } + // Get the index of a field by its name, or negative value if not found. + // If several fields share the same name, it is unspecified which one + // is returned. + int FieldIndex(const std::string& name) const; + // Get the index of a field by its node, or negative value if not found. 
+ int FieldIndex(const Node& node) const; + + int field_count() const { return static_cast(fields_.size()); } + + void ToParquet(void* element) const override; + void Visit(Visitor* visitor) override; + void VisitConst(ConstVisitor* visitor) const override; + + private: + GroupNode(const std::string& name, Repetition::type repetition, + const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE, + int id = -1); + + GroupNode(const std::string& name, Repetition::type repetition, + const NodeVector& fields, + std::shared_ptr logical_annotation, int id = -1); + + NodeVector fields_; + bool EqualsInternal(const GroupNode* other) const; + + // Mapping between field name to the field index + std::unordered_multimap field_name_to_idx_; + + FRIEND_TEST(TestGroupNode, Attrs); + FRIEND_TEST(TestGroupNode, Equals); + FRIEND_TEST(TestGroupNode, FieldIndex); + FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName); +}; + +// ---------------------------------------------------------------------- +// Convenience primitive type factory functions + +#define PRIMITIVE_FACTORY(FuncName, TYPE) \ + static inline NodePtr FuncName(const std::string& name, \ + Repetition::type repetition = Repetition::OPTIONAL) { \ + return PrimitiveNode::Make(name, repetition, Type::TYPE); \ + } + +PRIMITIVE_FACTORY(Boolean, BOOLEAN); +PRIMITIVE_FACTORY(Int32, INT32); +PRIMITIVE_FACTORY(Int64, INT64); +PRIMITIVE_FACTORY(Int96, INT96); +PRIMITIVE_FACTORY(Float, FLOAT); +PRIMITIVE_FACTORY(Double, DOUBLE); +PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); + +void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream, + int indent_width = 2); + +} // namespace schema + +// The ColumnDescriptor encapsulates information necessary to interpret +// primitive column data in the context of a particular schema. 
// Describes one primitive column: its schema node plus the maximum definition
// and repetition levels supplied at construction. Most accessors forward to
// the column's PrimitiveNode (primitive_node_).
// NOTE(review): the std::shared_ptr pointee types are not visible in this copy
// of the header.
class PARQUET_EXPORT ColumnDescriptor {
 public:
  // `schema_descr` is optional and defaults to NULLPTR. Declaration only.
  ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
                   int16_t max_repetition_level,
                   const SchemaDescriptor* schema_descr = NULLPTR);

  bool Equals(const ColumnDescriptor& other) const;

  int16_t max_definition_level() const { return max_definition_level_; }

  int16_t max_repetition_level() const { return max_repetition_level_; }

  Type::type physical_type() const { return primitive_node_->physical_type(); }

  LogicalType::type logical_type() const { return primitive_node_->logical_type(); }

  const std::shared_ptr& logical_annotation() const {
    return primitive_node_->logical_annotation();
  }

  ColumnOrder column_order() const { return primitive_node_->column_order(); }

  // Uses the logical annotation when one is set (non-null pointer); otherwise
  // falls back to the logical type. Both lookups also take the physical type.
  SortOrder::type sort_order() const {
    auto la = logical_annotation();
    auto pt = physical_type();
    return la ? GetSortOrder(la, pt) : GetSortOrder(logical_type(), pt);
  }

  const std::string& name() const { return primitive_node_->name(); }

  const std::shared_ptr path() const;

  // The node passed to the constructor.
  const schema::NodePtr& schema_node() const { return node_; }

  std::string ToString() const;

  int type_length() const;

  int type_precision() const;

  int type_scale() const;

 private:
  schema::NodePtr node_;
  // NOTE(review): presumably points at the node held by node_ -- confirm in the
  // constructor's definition.
  const schema::PrimitiveNode* primitive_node_;

  int16_t max_definition_level_;
  int16_t max_repetition_level_;
};
+ int ColumnIndex(const schema::Node& node) const; + + bool Equals(const SchemaDescriptor& other) const; + + // The number of physical columns appearing in the file + int num_columns() const { return static_cast(leaves_.size()); } + + const schema::NodePtr& schema_root() const { return schema_; } + + const schema::GroupNode* group_node() const { return group_node_; } + + // Returns the root (child of the schema root) node of the leaf(column) node + const schema::Node* GetColumnRoot(int i) const; + + const std::string& name() const { return group_node_->name(); } + + std::string ToString() const; + + void updateColumnOrders(const std::vector& column_orders); + + private: + friend class ColumnDescriptor; + + // Root Node + schema::NodePtr schema_; + // Root Node + const schema::GroupNode* group_node_; + + void BuildTree(const schema::NodePtr& node, int16_t max_def_level, + int16_t max_rep_level, const schema::NodePtr& base); + + // Result of leaf node / tree analysis + std::vector leaves_; + + // Mapping between leaf nodes and root group of leaf (first node + // below the schema's root group) + // + // For example, the leaf `a.b.c.d` would have a link back to `a` + // + // -- a <------ + // -- -- b | + // -- -- -- c | + // -- -- -- -- d + std::unordered_map leaf_to_base_; + + // Mapping between ColumnPath DotString to the leaf index + std::unordered_multimap leaf_to_idx_; +}; + +} // namespace parquet + +#endif // PARQUET_SCHEMA_TYPES_H diff --git a/r/R/inst/include/parquet/statistics.h b/r/R/inst/include/parquet/statistics.h new file mode 100644 index 00000000000..2dc78da4c3c --- /dev/null +++ b/r/R/inst/include/parquet/statistics.h @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "parquet/platform.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +namespace parquet { + +// ---------------------------------------------------------------------- +// Value comparator interfaces + +/// \brief Base class for value comparators. Generally used with +/// TypedComparator +class PARQUET_EXPORT Comparator { + public: + virtual ~Comparator() {} + + /// \brief Create a comparator explicitly from physical type and + /// sort order + /// \param[in] physical_type the physical type for the typed + /// comparator + /// \param[in] sort_order either SortOrder::SIGNED or + /// SortOrder::UNSIGNED + /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only + static std::shared_ptr Make(Type::type physical_type, + SortOrder::type sort_order, + int type_length = -1); + + /// \brief Create typed comparator inferring default sort order from + /// ColumnDescriptor + /// \param[in] descr the Parquet column schema + static std::shared_ptr Make(const ColumnDescriptor* descr); +}; + +/// \brief Interface for comparison of physical types according to the +/// semantics of a particular logical type. 
+template +class TypedComparator : public Comparator { + public: + using T = typename DType::c_type; + + /// \brief Typed version of Comparator::Make + static std::shared_ptr> Make(Type::type physical_type, + SortOrder::type sort_order, + int type_length = -1) { + return std::static_pointer_cast>( + Comparator::Make(physical_type, sort_order, type_length)); + } + + /// \brief Typed version of Comparator::Make + static std::shared_ptr> Make(const ColumnDescriptor* descr) { + return std::static_pointer_cast>(Comparator::Make(descr)); + } + + /// \brief Scalar comparison of two elements, return true if first + /// is strictly less than the second + virtual bool Compare(const T& a, const T& b) = 0; + + /// \brief Compute maximum and minimum elements in a batch of + /// elements without any nulls + virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0; + + /// \brief Compute maximum and minimum elements in a batch of + /// elements with accompanying bitmap indicating which elements are + /// included (bit set) and excluded (bit not set) + /// + /// \param[in] values the sequence of values + /// \param[in] length the length of the sequence + /// \param[in] valid_bits a bitmap indicating which elements are + /// included (1) or excluded (0) + /// \param[in] valid_bits_offset the bit offset into the bitmap of + /// the first element in the sequence + /// \param[out] out_min the returned minimum element + /// \param[out] out_max the returned maximum element + virtual void GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, + int64_t valid_bits_offset, T* out_min, T* out_max) = 0; +}; + +// ---------------------------------------------------------------------- + +/// \brief Structure represented encoded statistics to be written to +/// and from Parquet serialized metadata +class PARQUET_EXPORT EncodedStatistics { + std::shared_ptr max_, min_; + bool is_signed_ = false; + + public: + EncodedStatistics() + : 
max_(std::make_shared()), min_(std::make_shared()) {} + + const std::string& max() const { return *max_; } + const std::string& min() const { return *min_; } + + int64_t null_count = 0; + int64_t distinct_count = 0; + + bool has_min = false; + bool has_max = false; + bool has_null_count = false; + bool has_distinct_count = false; + + // From parquet-mr + // Don't write stats larger than the max size rather than truncating. The + // rationale is that some engines may use the minimum value in the page as + // the true minimum for aggregations and there is no way to mark that a + // value has been truncated and is a lower bound and not in the page. + void ApplyStatSizeLimits(size_t length) { + if (max_->length() > length) { + has_max = false; + } + if (min_->length() > length) { + has_min = false; + } + } + + inline bool is_set() const { + return has_min || has_max || has_null_count || has_distinct_count; + } + + inline bool is_signed() const { return is_signed_; } + + inline void set_is_signed(bool is_signed) { is_signed_ = is_signed; } + + inline EncodedStatistics& set_max(const std::string& value) { + *max_ = value; + has_max = true; + return *this; + } + + inline EncodedStatistics& set_min(const std::string& value) { + *min_ = value; + has_min = true; + return *this; + } + + inline EncodedStatistics& set_null_count(int64_t value) { + null_count = value; + has_null_count = true; + return *this; + } + + inline EncodedStatistics& set_distinct_count(int64_t value) { + distinct_count = value; + has_distinct_count = true; + return *this; + } +}; + +/// \brief Base type for computing column statistics while writing a file +class PARQUET_EXPORT Statistics { + public: + virtual ~Statistics() {} + + /// \brief Create a new statistics instance given a column schema + /// definition + /// \param[in] descr the column schema + /// \param[in] pool a memory pool to use for any memory allocations, optional + static std::shared_ptr Make( + const ColumnDescriptor* descr, + 
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + /// \brief Create a new statistics instance given a column schema + /// definition and pre-existing state + /// \param[in] descr the column schema + /// \param[in] encoded_min the encoded minimum value + /// \param[in] encoded_max the encoded maximum value + /// \param[in] num_values total number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + /// \param[in] has_min_max whether the min/max statistics are set + /// \param[in] pool a memory pool to use for any memory allocations, optional + static std::shared_ptr Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + + /// \brief The number of null values, may not be set + virtual int64_t null_count() const = 0; + + /// \brief The number of distinct values, may not be set + virtual int64_t distinct_count() const = 0; + + /// \brief The total number of values in the column + virtual int64_t num_values() const = 0; + + /// \brief Return true if the min and max statistics are set. 
Obtain + /// with TypedStatistics::min and max + virtual bool HasMinMax() const = 0; + + /// \brief Reset state of object to initial (no data observed) state + virtual void Reset() = 0; + + /// \brief Plain-encoded minimum value + virtual std::string EncodeMin() = 0; + + /// \brief Plain-encoded maximum value + virtual std::string EncodeMax() = 0; + + /// \brief The finalized encoded form of the statistics for transport + virtual EncodedStatistics Encode() = 0; + + /// \brief The physical type of the column schema + virtual Type::type physical_type() const = 0; + + protected: + static std::shared_ptr Make(Type::type physical_type, const void* min, + const void* max, int64_t num_values, + int64_t null_count, int64_t distinct_count); +}; + +/// \brief A typed implementation of Statistics +template +class TypedStatistics : public Statistics { + public: + using T = typename DType::c_type; + + /// \brief Typed version of Statistics::Make + static std::shared_ptr> Make( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, pool)); + } + + /// \brief Create Statistics initialized to a particular state + /// \param[in] min the minimum value + /// \param[in] max the minimum value + /// \param[in] num_values number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + static std::shared_ptr> Make(const T& min, const T& max, + int64_t num_values, + int64_t null_count, + int64_t distinct_count) { + return std::static_pointer_cast>(Statistics::Make( + DType::type_num, &min, &max, num_values, null_count, distinct_count)); + } + + /// \brief Typed version of Statistics::Make + static std::shared_ptr> Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + ::arrow::MemoryPool* 
pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, pool)); + } + + /// \brief The current minimum value + virtual const T& min() const = 0; + + /// \brief The current maximum value + virtual const T& max() const = 0; + + /// \brief Update state with state of another Statistics object + virtual void Merge(const TypedStatistics& other) = 0; + + /// \brief Batch statistics update + virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0; + + /// \brief Batch statistics update with supplied validity bitmap + virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, + int64_t valid_bits_spaced, int64_t num_not_null, + int64_t num_null) = 0; + + /// \brief Set min and max values to particular values + virtual void SetMinMax(const T& min, const T& max) = 0; +}; + +#ifndef ARROW_NO_DEPRECATED_API +// TODO(wesm): Remove after Arrow 0.14.0 +using RowGroupStatistics = Statistics; +#endif + +using BoolStatistics = TypedStatistics; +using Int32Statistics = TypedStatistics; +using Int64Statistics = TypedStatistics; +using FloatStatistics = TypedStatistics; +using DoubleStatistics = TypedStatistics; +using ByteArrayStatistics = TypedStatistics; +using FLBAStatistics = TypedStatistics; + +} // namespace parquet diff --git a/r/R/inst/include/parquet/test-util.h b/r/R/inst/include/parquet/test-util.h new file mode 100644 index 00000000000..c49dcda181b --- /dev/null +++ b/r/R/inst/include/parquet/test-util.h @@ -0,0 +1,710 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This module defines an abstract interface for iterating through pages in a +// Parquet column chunk within a row group. It could be extended in the future +// to iterate through all data pages in all chunks in a file. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/testing/util.h" + +#include "parquet/column_page.h" +#include "parquet/column_reader.h" +#include "parquet/column_writer.h" +#include "parquet/encoding.h" +#include "parquet/platform.h" + +namespace parquet { + +static constexpr int FLBA_LENGTH = 12; + +inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) { + return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH); +} + +namespace test { + +typedef ::testing::Types + ParquetTypes; + +class ParquetTestException : public parquet::ParquetException { + using ParquetException::ParquetException; +}; + +const char* get_data_dir(); +std::string get_bad_data_dir(); + +std::string get_data_file(const std::string& filename, bool is_good = true); + +template +static inline void assert_vector_equal(const std::vector& left, + const std::vector& right) { + ASSERT_EQ(left.size(), right.size()); + + for (size_t i = 0; i < left.size(); ++i) { + ASSERT_EQ(left[i], right[i]) << i; + } +} + +template +static inline bool vector_equal(const std::vector& left, const std::vector& right) { + if (left.size() != right.size()) { + return false; + } + + for (size_t i = 0; i < left.size(); ++i) { + if (left[i] != right[i]) { + std::cerr << "index " << i << " left was 
" << left[i] << " right was " << right[i] + << std::endl; + return false; + } + } + + return true; +} + +template +static std::vector slice(const std::vector& values, int start, int end) { + if (end < start) { + return std::vector(0); + } + + std::vector out(end - start); + for (int i = start; i < end; ++i) { + out[i - start] = values[i]; + } + return out; +} + +void random_bytes(int n, uint32_t seed, std::vector* out); +void random_bools(int n, double p, uint32_t seed, bool* out); + +template +inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) { + std::default_random_engine gen(seed); + std::uniform_int_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +template <> +inline void random_numbers(int n, uint32_t seed, float min_value, float max_value, + float* out) { + std::default_random_engine gen(seed); + std::uniform_real_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +template <> +inline void random_numbers(int n, uint32_t seed, double min_value, double max_value, + double* out) { + std::default_random_engine gen(seed); + std::uniform_real_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, + Int96* out); + +void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out); + +void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, + int max_size); + +void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size); + +template +std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, + const Sequence& values, int length, + const ColumnDescriptor* descr) { + auto encoder = MakeTypedEncoder(encoding, use_dictionary, descr); + encoder->Put(values, length); + return encoder->FlushValues(); +} + +template +static 
void InitValues(int num_values, std::vector& values, + std::vector& buffer) { + random_numbers(num_values, 0, std::numeric_limits::min(), + std::numeric_limits::max(), values.data()); +} + +template +static void InitDictValues(int num_values, int num_dicts, std::vector& values, + std::vector& buffer) { + int repeat_factor = num_values / num_dicts; + InitValues(num_dicts, values, buffer); + // add some repeated values + for (int j = 1; j < repeat_factor; ++j) { + for (int i = 0; i < num_dicts; ++i) { + std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T)); + } + } + // computed only dict_per_page * repeat_factor - 1 values < num_values + // compute remaining + for (int i = num_dicts * repeat_factor; i < num_values; ++i) { + std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T)); + } +} + +class MockPageReader : public PageReader { + public: + explicit MockPageReader(const std::vector>& pages) + : pages_(pages), page_index_(0) {} + + std::shared_ptr NextPage() override { + if (page_index_ == static_cast(pages_.size())) { + // EOS to consumer + return std::shared_ptr(nullptr); + } + return pages_[page_index_++]; + } + + // No-op + void set_max_page_header_size(uint32_t size) override {} + + private: + std::vector> pages_; + int page_index_; +}; + +// TODO(wesm): this is only used for testing for now. 
Refactor to form part of +// primary file write path +template +class DataPageBuilder { + public: + typedef typename Type::c_type T; + + // This class writes data and metadata to the passed inputs + explicit DataPageBuilder(ArrowOutputStream* sink) + : sink_(sink), + num_values_(0), + encoding_(Encoding::PLAIN), + definition_level_encoding_(Encoding::RLE), + repetition_level_encoding_(Encoding::RLE), + have_def_levels_(false), + have_rep_levels_(false), + have_values_(false) {} + + void AppendDefLevels(const std::vector& levels, int16_t max_level, + Encoding::type encoding = Encoding::RLE) { + AppendLevels(levels, max_level, encoding); + + num_values_ = std::max(static_cast(levels.size()), num_values_); + definition_level_encoding_ = encoding; + have_def_levels_ = true; + } + + void AppendRepLevels(const std::vector& levels, int16_t max_level, + Encoding::type encoding = Encoding::RLE) { + AppendLevels(levels, max_level, encoding); + + num_values_ = std::max(static_cast(levels.size()), num_values_); + repetition_level_encoding_ = encoding; + have_rep_levels_ = true; + } + + void AppendValues(const ColumnDescriptor* d, const std::vector& values, + Encoding::type encoding = Encoding::PLAIN) { + std::shared_ptr values_sink = EncodeValues( + encoding, false, values.data(), static_cast(values.size()), d); + PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size())); + + num_values_ = std::max(static_cast(values.size()), num_values_); + encoding_ = encoding; + have_values_ = true; + } + + int32_t num_values() const { return num_values_; } + + Encoding::type encoding() const { return encoding_; } + + Encoding::type rep_level_encoding() const { return repetition_level_encoding_; } + + Encoding::type def_level_encoding() const { return definition_level_encoding_; } + + private: + ArrowOutputStream* sink_; + + int32_t num_values_; + Encoding::type encoding_; + Encoding::type definition_level_encoding_; + Encoding::type repetition_level_encoding_; + + bool 
have_def_levels_; + bool have_rep_levels_; + bool have_values_; + + // Used internally for both repetition and definition levels + void AppendLevels(const std::vector& levels, int16_t max_level, + Encoding::type encoding) { + if (encoding != Encoding::RLE) { + ParquetException::NYI("only rle encoding currently implemented"); + } + + // TODO: compute a more precise maximum size for the encoded levels + std::vector encode_buffer(levels.size() * 2); + + // We encode into separate memory from the output stream because the + // RLE-encoded bytes have to be preceded in the stream by their absolute + // size. + LevelEncoder encoder; + encoder.Init(encoding, max_level, static_cast(levels.size()), + encode_buffer.data(), static_cast(encode_buffer.size())); + + encoder.Encode(static_cast(levels.size()), levels.data()); + + int32_t rle_bytes = encoder.len(); + PARQUET_THROW_NOT_OK( + sink_->Write(reinterpret_cast(&rle_bytes), sizeof(int32_t))); + PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes)); + } +}; + +template <> +inline void DataPageBuilder::AppendValues(const ColumnDescriptor* d, + const std::vector& values, + Encoding::type encoding) { + if (encoding != Encoding::PLAIN) { + ParquetException::NYI("only plain encoding currently implemented"); + } + + auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, d); + dynamic_cast(encoder.get()) + ->Put(values, static_cast(values.size())); + std::shared_ptr buffer = encoder->FlushValues(); + PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size())); + + num_values_ = std::max(static_cast(values.size()), num_values_); + encoding_ = encoding; + have_values_ = true; +} + +template +static std::shared_ptr MakeDataPage( + const ColumnDescriptor* d, const std::vector& values, + int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size, + const std::vector& def_levels, int16_t max_def_level, + const std::vector& rep_levels, int16_t max_rep_level) { + int num_values = 0; + + auto 
page_stream = CreateOutputStream(); + test::DataPageBuilder page_builder(page_stream.get()); + + if (!rep_levels.empty()) { + page_builder.AppendRepLevels(rep_levels, max_rep_level); + } + if (!def_levels.empty()) { + page_builder.AppendDefLevels(def_levels, max_def_level); + } + + if (encoding == Encoding::PLAIN) { + page_builder.AppendValues(d, values, encoding); + num_values = page_builder.num_values(); + } else { // DICTIONARY PAGES + PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size)); + num_values = std::max(page_builder.num_values(), num_vals); + } + + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(page_stream->Finish(&buffer)); + + return std::make_shared(buffer, num_values, encoding, + page_builder.def_level_encoding(), + page_builder.rep_level_encoding()); +} + +template +class DictionaryPageBuilder { + public: + typedef typename TYPE::c_type TC; + static constexpr int TN = TYPE::type_num; + using SpecializedEncoder = typename EncodingTraits::Encoder; + + // This class writes data and metadata to the passed inputs + explicit DictionaryPageBuilder(const ColumnDescriptor* d) + : num_dict_values_(0), have_values_(false) { + auto encoder = MakeTypedEncoder(Encoding::PLAIN, true, d); + dict_traits_ = dynamic_cast*>(encoder.get()); + encoder_.reset(dynamic_cast(encoder.release())); + } + + ~DictionaryPageBuilder() {} + + std::shared_ptr AppendValues(const std::vector& values) { + int num_values = static_cast(values.size()); + // Dictionary encoding + encoder_->Put(values.data(), num_values); + num_dict_values_ = dict_traits_->num_entries(); + have_values_ = true; + return encoder_->FlushValues(); + } + + std::shared_ptr WriteDict() { + std::shared_ptr dict_buffer = + AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size()); + dict_traits_->WriteDict(dict_buffer->mutable_data()); + return dict_buffer; + } + + int32_t num_values() const { return num_dict_values_; } + + private: + DictEncoder* dict_traits_; + std::unique_ptr 
encoder_; + int32_t num_dict_values_; + bool have_values_; +}; + +template <> +inline DictionaryPageBuilder::DictionaryPageBuilder( + const ColumnDescriptor* d) { + ParquetException::NYI("only plain encoding currently implemented for boolean"); +} + +template <> +inline std::shared_ptr DictionaryPageBuilder::WriteDict() { + ParquetException::NYI("only plain encoding currently implemented for boolean"); + return nullptr; +} + +template <> +inline std::shared_ptr DictionaryPageBuilder::AppendValues( + const std::vector& values) { + ParquetException::NYI("only plain encoding currently implemented for boolean"); + return nullptr; +} + +template +inline static std::shared_ptr MakeDictPage( + const ColumnDescriptor* d, const std::vector& values, + const std::vector& values_per_page, Encoding::type encoding, + std::vector>& rle_indices) { + test::DictionaryPageBuilder page_builder(d); + int num_pages = static_cast(values_per_page.size()); + int value_start = 0; + + for (int i = 0; i < num_pages; i++) { + rle_indices.push_back(page_builder.AppendValues( + slice(values, value_start, value_start + values_per_page[i]))); + value_start += values_per_page[i]; + } + + auto buffer = page_builder.WriteDict(); + + return std::make_shared(buffer, page_builder.num_values(), + Encoding::PLAIN); +} + +// Given def/rep levels and values create multiple dict pages +template +inline static void PaginateDict(const ColumnDescriptor* d, + const std::vector& values, + const std::vector& def_levels, + int16_t max_def_level, + const std::vector& rep_levels, + int16_t max_rep_level, int num_levels_per_page, + const std::vector& values_per_page, + std::vector>& pages, + Encoding::type encoding = Encoding::RLE_DICTIONARY) { + int num_pages = static_cast(values_per_page.size()); + std::vector> rle_indices; + std::shared_ptr dict_page = + MakeDictPage(d, values, values_per_page, encoding, rle_indices); + pages.push_back(dict_page); + int def_level_start = 0; + int def_level_end = 0; + int 
rep_level_start = 0; + int rep_level_end = 0; + for (int i = 0; i < num_pages; i++) { + if (max_def_level > 0) { + def_level_start = i * num_levels_per_page; + def_level_end = (i + 1) * num_levels_per_page; + } + if (max_rep_level > 0) { + rep_level_start = i * num_levels_per_page; + rep_level_end = (i + 1) * num_levels_per_page; + } + std::shared_ptr data_page = MakeDataPage( + d, {}, values_per_page[i], encoding, rle_indices[i]->data(), + static_cast(rle_indices[i]->size()), + slice(def_levels, def_level_start, def_level_end), max_def_level, + slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); + pages.push_back(data_page); + } +} + +// Given def/rep levels and values create multiple plain pages +template +static inline void PaginatePlain(const ColumnDescriptor* d, + const std::vector& values, + const std::vector& def_levels, + int16_t max_def_level, + const std::vector& rep_levels, + int16_t max_rep_level, int num_levels_per_page, + const std::vector& values_per_page, + std::vector>& pages, + Encoding::type encoding = Encoding::PLAIN) { + int num_pages = static_cast(values_per_page.size()); + int def_level_start = 0; + int def_level_end = 0; + int rep_level_start = 0; + int rep_level_end = 0; + int value_start = 0; + for (int i = 0; i < num_pages; i++) { + if (max_def_level > 0) { + def_level_start = i * num_levels_per_page; + def_level_end = (i + 1) * num_levels_per_page; + } + if (max_rep_level > 0) { + rep_level_start = i * num_levels_per_page; + rep_level_end = (i + 1) * num_levels_per_page; + } + std::shared_ptr page = MakeDataPage( + d, slice(values, value_start, value_start + values_per_page[i]), + values_per_page[i], encoding, nullptr, 0, + slice(def_levels, def_level_start, def_level_end), max_def_level, + slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); + pages.push_back(page); + value_start += values_per_page[i]; + } +} + +// Generates pages from randomly generated data +template +static inline int MakePages(const 
ColumnDescriptor* d, int num_pages, int levels_per_page, + std::vector& def_levels, + std::vector& rep_levels, + std::vector& values, + std::vector& buffer, + std::vector>& pages, + Encoding::type encoding = Encoding::PLAIN) { + int num_levels = levels_per_page * num_pages; + int num_values = 0; + uint32_t seed = 0; + int16_t zero = 0; + int16_t max_def_level = d->max_definition_level(); + int16_t max_rep_level = d->max_repetition_level(); + std::vector values_per_page(num_pages, levels_per_page); + // Create definition levels + if (max_def_level > 0) { + def_levels.resize(num_levels); + random_numbers(num_levels, seed, zero, max_def_level, def_levels.data()); + for (int p = 0; p < num_pages; p++) { + int num_values_per_page = 0; + for (int i = 0; i < levels_per_page; i++) { + if (def_levels[i + p * levels_per_page] == max_def_level) { + num_values_per_page++; + num_values++; + } + } + values_per_page[p] = num_values_per_page; + } + } else { + num_values = num_levels; + } + // Create repitition levels + if (max_rep_level > 0) { + rep_levels.resize(num_levels); + random_numbers(num_levels, seed, zero, max_rep_level, rep_levels.data()); + } + // Create values + values.resize(num_values); + if (encoding == Encoding::PLAIN) { + InitValues(num_values, values, buffer); + PaginatePlain(d, values, def_levels, max_def_level, rep_levels, max_rep_level, + levels_per_page, values_per_page, pages); + } else if (encoding == Encoding::RLE_DICTIONARY || + encoding == Encoding::PLAIN_DICTIONARY) { + // Calls InitValues and repeats the data + InitDictValues(num_values, levels_per_page, values, buffer); + PaginateDict(d, values, def_levels, max_def_level, rep_levels, max_rep_level, + levels_per_page, values_per_page, pages); + } + + return num_values; +} + +// ---------------------------------------------------------------------- +// Test data generation + +template <> +void inline InitValues(int num_values, std::vector& values, + std::vector& buffer) { + values = {}; + 
::arrow::random_is_valid(num_values, 1., &values, + static_cast(::arrow::random_seed())); +} + +template <> +inline void InitValues(int num_values, std::vector& values, + std::vector& buffer) { + int max_byte_array_len = 12; + int num_bytes = static_cast(max_byte_array_len + sizeof(uint32_t)); + size_t nbytes = num_values * num_bytes; + buffer.resize(nbytes); + random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len); +} + +inline void InitWideByteArrayValues(int num_values, std::vector& values, + std::vector& buffer, int min_len, + int max_len) { + int num_bytes = static_cast(max_len + sizeof(uint32_t)); + size_t nbytes = num_values * num_bytes; + buffer.resize(nbytes); + random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len); +} + +template <> +inline void InitValues(int num_values, std::vector& values, + std::vector& buffer) { + size_t nbytes = num_values * FLBA_LENGTH; + buffer.resize(nbytes); + random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data()); +} + +template <> +inline void InitValues(int num_values, std::vector& values, + std::vector& buffer) { + random_Int96_numbers(num_values, 0, std::numeric_limits::min(), + std::numeric_limits::max(), values.data()); +} + +inline std::string TestColumnName(int i) { + std::stringstream col_name; + col_name << "column_" << i; + return col_name.str(); +} + +// This class lives here because of its dependency on the InitValues specializations. 
+template +class PrimitiveTypedTest : public ::testing::Test { + public: + typedef typename TestType::c_type T; + + void SetUpSchema(Repetition::type repetition, int num_columns = 1) { + std::vector fields; + + for (int i = 0; i < num_columns; ++i) { + std::string name = TestColumnName(i); + fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num, + LogicalType::NONE, FLBA_LENGTH)); + } + node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields); + schema_.Init(node_); + } + + void GenerateData(int64_t num_values); + void SetupValuesOut(int64_t num_values); + void SyncValuesOut(); + + protected: + schema::NodePtr node_; + SchemaDescriptor schema_; + + // Input buffers + std::vector values_; + + std::vector def_levels_; + + std::vector buffer_; + // Pointer to the values, needed as we cannot use std::vector::data() + T* values_ptr_; + std::vector bool_buffer_; + + // Output buffers + std::vector values_out_; + std::vector bool_buffer_out_; + T* values_out_ptr_; +}; + +template +inline void PrimitiveTypedTest::SyncValuesOut() {} + +template <> +inline void PrimitiveTypedTest::SyncValuesOut() { + std::vector::const_iterator source_iterator = bool_buffer_out_.begin(); + std::vector::iterator destination_iterator = values_out_.begin(); + while (source_iterator != bool_buffer_out_.end()) { + *destination_iterator++ = *source_iterator++ != 0; + } +} + +template +inline void PrimitiveTypedTest::SetupValuesOut(int64_t num_values) { + values_out_.clear(); + values_out_.resize(num_values); + values_out_ptr_ = values_out_.data(); +} + +template <> +inline void PrimitiveTypedTest::SetupValuesOut(int64_t num_values) { + values_out_.clear(); + values_out_.resize(num_values); + + bool_buffer_out_.clear(); + bool_buffer_out_.resize(num_values); + // Write once to all values so we can copy it without getting Valgrind errors + // about uninitialised values. 
+ std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true); + values_out_ptr_ = reinterpret_cast(bool_buffer_out_.data()); +} + +template +inline void PrimitiveTypedTest::GenerateData(int64_t num_values) { + def_levels_.resize(num_values); + values_.resize(num_values); + + InitValues(static_cast(num_values), values_, buffer_); + values_ptr_ = values_.data(); + + std::fill(def_levels_.begin(), def_levels_.end(), 1); +} + +template <> +inline void PrimitiveTypedTest::GenerateData(int64_t num_values) { + def_levels_.resize(num_values); + values_.resize(num_values); + + InitValues(static_cast(num_values), values_, buffer_); + bool_buffer_.resize(num_values); + std::copy(values_.begin(), values_.end(), bool_buffer_.begin()); + values_ptr_ = reinterpret_cast(bool_buffer_.data()); + + std::fill(def_levels_.begin(), def_levels_.end(), 1); +} + +} // namespace test +} // namespace parquet diff --git a/r/R/inst/include/parquet/thrift.h b/r/R/inst/include/parquet/thrift.h new file mode 100644 index 00000000000..ffefd12900c --- /dev/null +++ b/r/R/inst/include/parquet/thrift.h @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/util/windows_compatibility.h" + +#include +// Check if thrift version < 0.11.0 +// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp +#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR) +#include +#else +#include +#endif +#include + +// TCompactProtocol requires some #defines to work right. +#define SIGNED_RIGHT_SHIFT_IS 1 +#define ARITHMETIC_RIGHT_SHIFT 1 +#include +#include +#include + +#include +#include +#include + +#include "arrow/util/logging.h" +#include "parquet/exception.h" +#include "parquet/platform.h" +#include "parquet/statistics.h" + +#include "parquet/parquet_types.h" // IYWU pragma: export + +namespace parquet { + +// Check if thrift version < 0.11.0 +// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp +#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR) +using ::boost::shared_ptr; +#else +using ::std::shared_ptr; +#endif + +// ---------------------------------------------------------------------- +// Convert Thrift enums to / from parquet enums + +static inline Type::type FromThrift(format::Type::type type) { + return static_cast(type); +} + +static inline LogicalType::type FromThrift(format::ConvertedType::type type) { + // item 0 is NONE + return static_cast(static_cast(type) + 1); +} + +static inline Repetition::type FromThrift(format::FieldRepetitionType::type type) { + return static_cast(type); +} + +static inline Encoding::type FromThrift(format::Encoding::type type) { + return static_cast(type); +} + +static inline Compression::type FromThrift(format::CompressionCodec::type type) { + return static_cast(type); +} + +static inline format::Type::type ToThrift(Type::type type) { + return static_cast(type); +} + +static inline format::ConvertedType::type ToThrift(LogicalType::type type) { + // item 0 is NONE + DCHECK_NE(type, LogicalType::NONE); + return static_cast(static_cast(type) - 1); +} + +static inline 
format::FieldRepetitionType::type ToThrift(Repetition::type type) { + return static_cast(type); +} + +static inline format::Encoding::type ToThrift(Encoding::type type) { + return static_cast(type); +} + +static inline format::CompressionCodec::type ToThrift(Compression::type type) { + return static_cast(type); +} + +static inline format::Statistics ToThrift(const EncodedStatistics& stats) { + format::Statistics statistics; + if (stats.has_min) { + statistics.__set_min_value(stats.min()); + // If the order is SIGNED, then the old min value must be set too. + // This for backward compatibility + if (stats.is_signed()) { + statistics.__set_min(stats.min()); + } + } + if (stats.has_max) { + statistics.__set_max_value(stats.max()); + // If the order is SIGNED, then the old max value must be set too. + // This for backward compatibility + if (stats.is_signed()) { + statistics.__set_max(stats.max()); + } + } + if (stats.has_null_count) { + statistics.__set_null_count(stats.null_count); + } + if (stats.has_distinct_count) { + statistics.__set_distinct_count(stats.distinct_count); + } + + return statistics; +} + +// ---------------------------------------------------------------------- +// Thrift struct serialization / deserialization utilities + +using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; + +// Deserialize a thrift message from buf/len. buf/len must at least contain +// all the bytes needed to store the thrift message. On return, len will be +// set to the actual length of the header. +template +inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { + // Deserialize msg bytes into c++ thrift msg using memory transport. 
+ shared_ptr tmem_transport( + new ThriftBuffer(const_cast(buf), *len)); + apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; + shared_ptr tproto = // + tproto_factory.getProtocol(tmem_transport); + try { + deserialized_msg->read(tproto.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't deserialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + uint32_t bytes_left = tmem_transport->available_read(); + *len = *len - bytes_left; +} + +/// Utility class to serialize thrift objects to a binary format. This object +/// should be reused if possible to reuse the underlying memory. +/// Note: thrift will encode NULLs into the serialized buffer so it is not valid +/// to treat it as a string. +class ThriftSerializer { + public: + explicit ThriftSerializer(int initial_buffer_size = 1024) + : mem_buffer_(new ThriftBuffer(initial_buffer_size)) { + apache::thrift::protocol::TCompactProtocolFactoryT factory; + protocol_ = factory.getProtocol(mem_buffer_); + } + + /// Serialize obj into a memory buffer. The result is returned in buffer/len. The + /// memory returned is owned by this object and will be invalid when another object + /// is serialized. 
+ template + void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { + SerializeObject(obj); + mem_buffer_->getBuffer(buffer, len); + } + + template + void SerializeToString(const T* obj, std::string* result) { + SerializeObject(obj); + *result = mem_buffer_->getBufferAsString(); + } + + template + int64_t Serialize(const T* obj, ArrowOutputStream* out) { + uint8_t* out_buffer; + uint32_t out_length; + SerializeToBuffer(obj, &out_length, &out_buffer); + PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); + return static_cast(out_length); + } + + private: + template + void SerializeObject(const T* obj) { + try { + mem_buffer_->resetBuffer(); + obj->write(protocol_.get()); + } catch (std::exception& e) { + std::stringstream ss; + ss << "Couldn't serialize thrift: " << e.what() << "\n"; + throw ParquetException(ss.str()); + } + } + + shared_ptr mem_buffer_; + shared_ptr protocol_; +}; + +} // namespace parquet diff --git a/r/R/inst/include/parquet/types.h b/r/R/inst/include/parquet/types.h new file mode 100644 index 00000000000..779ea6b9b5b --- /dev/null +++ b/r/R/inst/include/parquet/types.h @@ -0,0 +1,662 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PARQUET_TYPES_H +#define PARQUET_TYPES_H + +#include +#include +#include +#include +#include +#include +#include + +#include "parquet/platform.h" + +namespace arrow { +namespace util { + +class Codec; + +} // namespace util +} // namespace arrow + +namespace parquet { + +// ---------------------------------------------------------------------- +// Metadata enums to match Thrift metadata +// +// The reason we maintain our own enums is to avoid transitive dependency on +// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the +// public API. After building parquet-cpp, you should not need to include +// Thrift headers in your application. This means some boilerplate to convert +// between our types and Parquet's Thrift types. +// +// We can also add special values like NONE to distinguish between metadata +// values being set and not set. As an example consider ConvertedType and +// CompressionCodec + +// Mirrors parquet::Type +struct Type { + enum type { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, + // Should always be last element. + UNDEFINED = 8 + }; +}; + +// Mirrors parquet::ConvertedType +struct LogicalType { + enum type { + NONE, + UTF8, + MAP, + MAP_KEY_VALUE, + LIST, + ENUM, + DECIMAL, + DATE, + TIME_MILLIS, + TIME_MICROS, + TIMESTAMP_MILLIS, + TIMESTAMP_MICROS, + UINT_8, + UINT_16, + UINT_32, + UINT_64, + INT_8, + INT_16, + INT_32, + INT_64, + JSON, + BSON, + INTERVAL, + NA = 25, + // Should always be last element. + UNDEFINED = 26 + }; +}; + +namespace format { + +class LogicalType; + +} + +// Mirrors parquet::FieldRepetitionType +struct Repetition { + enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 }; +}; + +// Reference: +// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/ +// format/converter/ParquetMetadataConverter.java +// Sort order for page and column statistics. 
Types are associated with sort +// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are +// aggregated using a sort order. As of parquet-format version 2.3.1, the +// order used to aggregate stats is always SIGNED and is not stored in the +// Parquet file. These stats are discarded for types that need unsigned. +// See PARQUET-686. +struct SortOrder { + enum type { SIGNED, UNSIGNED, UNKNOWN }; +}; + +namespace schema { + +struct DecimalMetadata { + bool isset; + int32_t scale; + int32_t precision; +}; + +} // namespace schema + +/// \brief Implementation of parquet.thrift LogicalType annotations. +class PARQUET_EXPORT LogicalAnnotation { + public: + struct Type { + enum type { + UNKNOWN = 0, + STRING = 1, + MAP, + LIST, + ENUM, + DECIMAL, + DATE, + TIME, + TIMESTAMP, + INTERVAL, + INT, + NIL, // Thrift NullType + JSON, + BSON, + UUID, + NONE + }; + }; + + struct TimeUnit { + enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS }; + }; + + /// \brief If possible, return an annotation equivalent to the given legacy converted + /// type (and decimal metadata if applicable). + static std::shared_ptr FromConvertedType( + const parquet::LogicalType::type converted_type, + const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1, + -1}); + + /// \brief Return the annotation represented by the Thrift intermediary object. + static std::shared_ptr FromThrift( + const parquet::format::LogicalType& thrift_logical_type); + + /// \brief Return the explicitly requested annotation type. 
+ static std::shared_ptr String(); + static std::shared_ptr Map(); + static std::shared_ptr List(); + static std::shared_ptr Enum(); + static std::shared_ptr Decimal(int32_t precision, + int32_t scale = 0); + static std::shared_ptr Date(); + static std::shared_ptr Time( + bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); + static std::shared_ptr Timestamp( + bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); + static std::shared_ptr Interval(); + static std::shared_ptr Int(int bit_width, bool is_signed); + static std::shared_ptr Null(); + static std::shared_ptr JSON(); + static std::shared_ptr BSON(); + static std::shared_ptr UUID(); + static std::shared_ptr None(); + static std::shared_ptr Unknown(); + + /// \brief Return true if this annotation is consistent with the given underlying + /// physical type. + bool is_applicable(parquet::Type::type primitive_type, + int32_t primitive_length = -1) const; + + /// \brief Return true if this annotation is equivalent to the given legacy converted + /// type (and decimal metadata if applicable). + bool is_compatible(parquet::LogicalType::type converted_type, + parquet::schema::DecimalMetadata converted_decimal_metadata = { + false, -1, -1}) const; + + /// \brief If possible, return the legacy converted type (and decimal metadata if + /// applicable) equivalent to this annotation. + parquet::LogicalType::type ToConvertedType( + parquet::schema::DecimalMetadata* out_decimal_metadata) const; + + /// \brief Return a printable representation of this annotation. + std::string ToString() const; + + /// \brief Return a JSON representation of this annotation. + std::string ToJSON() const; + + /// \brief Return a serializable Thrift object for this annotation. + parquet::format::LogicalType ToThrift() const; + + /// \brief Return true if the given annotation is equivalent to this annotation. 
+ bool Equals(const LogicalAnnotation& other) const; + + /// \brief Return the enumerated type of this annotation. + LogicalAnnotation::Type::type type() const; + + /// \brief Return the appropriate sort order for this annotation. + SortOrder::type sort_order() const; + + // Type checks ... + bool is_string() const; + bool is_map() const; + bool is_list() const; + bool is_enum() const; + bool is_decimal() const; + bool is_date() const; + bool is_time() const; + bool is_timestamp() const; + bool is_interval() const; + bool is_int() const; + bool is_null() const; + bool is_JSON() const; + bool is_BSON() const; + bool is_UUID() const; + bool is_none() const; + /// \brief Return true if this annotation is of a known type. + bool is_valid() const; + bool is_invalid() const; + /// \brief Return true if this annotation is suitable for a schema GroupNode. + bool is_nested() const; + bool is_nonnested() const; + /// \brief Return true if this annotation is included in the Thrift output for its node. + bool is_serialized() const; + + LogicalAnnotation(const LogicalAnnotation&) = delete; + LogicalAnnotation& operator=(const LogicalAnnotation&) = delete; + virtual ~LogicalAnnotation() noexcept; + + protected: + LogicalAnnotation(); + + class Impl; + std::unique_ptr impl_; +}; + +/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. +class PARQUET_EXPORT StringAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + StringAnnotation() = default; +}; + +/// \brief Allowed for group nodes only. +class PARQUET_EXPORT MapAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + MapAnnotation() = default; +}; + +/// \brief Allowed for group nodes only. +class PARQUET_EXPORT ListAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + ListAnnotation() = default; +}; + +/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. 
+class PARQUET_EXPORT EnumAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + EnumAnnotation() = default; +}; + +/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY, +/// depending on the precision. +class PARQUET_EXPORT DecimalAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(int32_t precision, + int32_t scale = 0); + int32_t precision() const; + int32_t scale() const; + + private: + DecimalAnnotation() = default; +}; + +/// \brief Allowed for physical type INT32. +class PARQUET_EXPORT DateAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + DateAnnotation() = default; +}; + +/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS). +class PARQUET_EXPORT TimeAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make( + bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); + bool is_adjusted_to_utc() const; + LogicalAnnotation::TimeUnit::unit time_unit() const; + + private: + TimeAnnotation() = default; +}; + +/// \brief Allowed for physical type INT64. +class PARQUET_EXPORT TimestampAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make( + bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); + bool is_adjusted_to_utc() const; + LogicalAnnotation::TimeUnit::unit time_unit() const; + + private: + TimestampAnnotation() = default; +}; + +/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12 +class PARQUET_EXPORT IntervalAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + IntervalAnnotation() = default; +}; + +/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64 +/// (for bit width 64). 
+class PARQUET_EXPORT IntAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(int bit_width, bool is_signed); + int bit_width() const; + bool is_signed() const; + + private: + IntAnnotation() = default; +}; + +/// \brief Allowed for any physical type. +class PARQUET_EXPORT NullAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + NullAnnotation() = default; +}; + +/// \brief Allowed for physical type BYTE_ARRAY. +class PARQUET_EXPORT JSONAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + JSONAnnotation() = default; +}; + +/// \brief Allowed for physical type BYTE_ARRAY. +class PARQUET_EXPORT BSONAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + BSONAnnotation() = default; +}; + +/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16, +/// must encode raw UUID bytes. +class PARQUET_EXPORT UUIDAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + UUIDAnnotation() = default; +}; + +/// \brief Allowed for any physical type. +class PARQUET_EXPORT NoAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + NoAnnotation() = default; +}; + +/// \brief Allowed for any type. +class PARQUET_EXPORT UnknownAnnotation : public LogicalAnnotation { + public: + static std::shared_ptr Make(); + + private: + UnknownAnnotation() = default; +}; + +// Data encodings. 
Mirrors parquet::Encoding +struct Encoding { + enum type { + PLAIN = 0, + PLAIN_DICTIONARY = 2, + RLE = 3, + BIT_PACKED = 4, + DELTA_BINARY_PACKED = 5, + DELTA_LENGTH_BYTE_ARRAY = 6, + DELTA_BYTE_ARRAY = 7, + RLE_DICTIONARY = 8 + }; +}; + +// Compression, mirrors parquet::CompressionCodec +struct Compression { + enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD }; +}; + +PARQUET_EXPORT +std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec); + +struct Encryption { + enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; +}; + +// parquet::PageType +struct PageType { + enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; +}; + +class ColumnOrder { + public: + enum type { UNDEFINED, TYPE_DEFINED_ORDER }; + explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {} + // Default to Type Defined Order + ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {} + ColumnOrder::type get_order() { return column_order_; } + + static ColumnOrder undefined_; + static ColumnOrder type_defined_; + + private: + ColumnOrder::type column_order_; +}; + +// ---------------------------------------------------------------------- + +struct ByteArray { + ByteArray() : len(0), ptr(NULLPTR) {} + ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + uint32_t len; + const uint8_t* ptr; +}; + +inline bool operator==(const ByteArray& left, const ByteArray& right) { + return left.len == right.len && + (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); +} + +inline bool operator!=(const ByteArray& left, const ByteArray& right) { + return !(left == right); +} + +struct FixedLenByteArray { + FixedLenByteArray() : ptr(NULLPTR) {} + explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} + const uint8_t* ptr; +}; + +using FLBA = FixedLenByteArray; + +// Julian day at unix epoch. 
+// +// The Julian Day Number (JDN) is the integer assigned to a whole solar day in +// the Julian day count starting from noon Universal time, with Julian day +// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC, +// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian +// calendar), +constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588); +constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24); +constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000); +constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000); +constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000); + +MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; +STRUCT_END(Int96, 12); + +inline bool operator==(const Int96& left, const Int96& right) { + return std::equal(left.value, left.value + 3, right.value); +} + +inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); } + +static inline std::string ByteArrayToString(const ByteArray& a) { + return std::string(reinterpret_cast(a.ptr), a.len); +} + +static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { + std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); +} + +static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { + int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; + int64_t nanoseconds = 0; + + memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); + return days_since_epoch * kNanosecondsPerDay + nanoseconds; +} + +static inline std::string Int96ToString(const Int96& a) { + std::ostringstream result; + std::copy(a.value, a.value + 3, std::ostream_iterator(result, " ")); + return result.str(); +} + +static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) { + std::ostringstream result; + std::copy(a.ptr, a.ptr + len, std::ostream_iterator(result, " ")); + return result.str(); +} + +template +struct type_traits 
{}; + +template <> +struct type_traits { + using value_type = bool; + + static constexpr int value_byte_size = 1; + static constexpr const char* printf_code = "d"; +}; + +template <> +struct type_traits { + using value_type = int32_t; + + static constexpr int value_byte_size = 4; + static constexpr const char* printf_code = "d"; +}; + +template <> +struct type_traits { + using value_type = int64_t; + + static constexpr int value_byte_size = 8; + static constexpr const char* printf_code = "ld"; +}; + +template <> +struct type_traits { + using value_type = Int96; + + static constexpr int value_byte_size = 12; + static constexpr const char* printf_code = "s"; +}; + +template <> +struct type_traits { + using value_type = float; + + static constexpr int value_byte_size = 4; + static constexpr const char* printf_code = "f"; +}; + +template <> +struct type_traits { + using value_type = double; + + static constexpr int value_byte_size = 8; + static constexpr const char* printf_code = "lf"; +}; + +template <> +struct type_traits { + using value_type = ByteArray; + + static constexpr int value_byte_size = sizeof(ByteArray); + static constexpr const char* printf_code = "s"; +}; + +template <> +struct type_traits { + using value_type = FixedLenByteArray; + + static constexpr int value_byte_size = sizeof(FixedLenByteArray); + static constexpr const char* printf_code = "s"; +}; + +template +struct DataType { + using c_type = typename type_traits::value_type; + static constexpr Type::type type_num = TYPE; +}; + +using BooleanType = DataType; +using Int32Type = DataType; +using Int64Type = DataType; +using Int96Type = DataType; +using FloatType = DataType; +using DoubleType = DataType; +using ByteArrayType = DataType; +using FLBAType = DataType; + +template +inline std::string format_fwf(int width) { + std::stringstream ss; + ss << "%-" << width << type_traits::printf_code; + return ss.str(); +} + +PARQUET_EXPORT std::string CompressionToString(Compression::type t); + 
+PARQUET_EXPORT std::string EncodingToString(Encoding::type t); + +PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t); + +PARQUET_EXPORT std::string TypeToString(Type::type t); + +PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, + const std::string& val); + +/// \deprecated Since 1.5.0 +ARROW_DEPRECATED("Use std::string instead of char* as input") +PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val); + +PARQUET_EXPORT int GetTypeByteSize(Type::type t); + +PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive); + +PARQUET_EXPORT SortOrder::type GetSortOrder(LogicalType::type converted, + Type::type primitive); + +PARQUET_EXPORT SortOrder::type GetSortOrder( + const std::shared_ptr& annotation, Type::type primitive); + +} // namespace parquet + +#endif // PARQUET_TYPES_H diff --git a/r/R/inst/include/parquet/windows_compatibility.h b/r/R/inst/include/parquet/windows_compatibility.h new file mode 100644 index 00000000000..31ca04c8b66 --- /dev/null +++ b/r/R/inst/include/parquet/windows_compatibility.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/util/windows_compatibility.h" + +#ifdef _WIN32 + +// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from +// above, so we undefine it +#ifdef OPTIONAL +#undef OPTIONAL +#endif + +#endif From 80e204db339efeeeb36428e26a12a0222a48d400 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 5 Jun 2019 15:19:35 +0200 Subject: [PATCH 3/4] use the headers from inst/include --- r/R/inst/include/arrow/adapters/orc/adapter.h | 152 - .../include/arrow/adapters/orc/adapter_util.h | 44 - .../arrow/adapters/tensorflow/convert.h | 158 - r/R/inst/include/arrow/allocator.h | 151 - r/R/inst/include/arrow/api.h | 43 - r/R/inst/include/arrow/array.h | 1072 --- .../include/arrow/array/builder_adaptive.h | 175 - r/R/inst/include/arrow/array/builder_base.h | 219 - r/R/inst/include/arrow/array/builder_binary.h | 365 - .../include/arrow/array/builder_decimal.h | 51 - r/R/inst/include/arrow/array/builder_dict.h | 369 - r/R/inst/include/arrow/array/builder_nested.h | 200 - .../include/arrow/array/builder_primitive.h | 427 - r/R/inst/include/arrow/array/builder_time.h | 70 - r/R/inst/include/arrow/array/builder_union.h | 106 - r/R/inst/include/arrow/array/concatenate.h | 39 - r/R/inst/include/arrow/buffer-builder.h | 376 - r/R/inst/include/arrow/buffer.h | 444 - r/R/inst/include/arrow/builder.h | 58 - r/R/inst/include/arrow/compare.h | 101 - r/R/inst/include/arrow/compute/api.h | 33 - .../include/arrow/compute/benchmark-util.h | 74 - r/R/inst/include/arrow/compute/context.h | 82 - r/R/inst/include/arrow/compute/expression.h | 261 - r/R/inst/include/arrow/compute/kernel.h | 271 - .../include/arrow/compute/kernels/aggregate.h | 115 - .../include/arrow/compute/kernels/boolean.h | 76 - r/R/inst/include/arrow/compute/kernels/cast.h | 98 - .../include/arrow/compute/kernels/compare.h | 116 - .../include/arrow/compute/kernels/count.h | 88 - .../include/arrow/compute/kernels/filter.h | 67 - .../kernels/generated/cast-codegen-internal.h | 208 - 
r/R/inst/include/arrow/compute/kernels/hash.h | 105 - r/R/inst/include/arrow/compute/kernels/mean.h | 66 - .../arrow/compute/kernels/sum-internal.h | 207 - r/R/inst/include/arrow/compute/kernels/sum.h | 70 - r/R/inst/include/arrow/compute/kernels/take.h | 83 - .../arrow/compute/kernels/util-internal.h | 144 - r/R/inst/include/arrow/compute/logical_type.h | 308 - r/R/inst/include/arrow/compute/operation.h | 52 - .../include/arrow/compute/operations/cast.h | 46 - .../arrow/compute/operations/literal.h | 45 - r/R/inst/include/arrow/compute/test-util.h | 110 - r/R/inst/include/arrow/compute/type_fwd.h | 38 - r/R/inst/include/arrow/csv/api.h | 24 - r/R/inst/include/arrow/csv/chunker.h | 69 - r/R/inst/include/arrow/csv/column-builder.h | 87 - r/R/inst/include/arrow/csv/converter.h | 68 - r/R/inst/include/arrow/csv/options.h | 98 - r/R/inst/include/arrow/csv/parser.h | 149 - r/R/inst/include/arrow/csv/reader.h | 53 - r/R/inst/include/arrow/csv/test-common.h | 71 - r/R/inst/include/arrow/dbi/hiveserver2/api.h | 27 - .../arrow/dbi/hiveserver2/columnar-row-set.h | 155 - .../include/arrow/dbi/hiveserver2/operation.h | 127 - .../include/arrow/dbi/hiveserver2/service.h | 140 - .../include/arrow/dbi/hiveserver2/session.h | 84 - .../arrow/dbi/hiveserver2/thrift-internal.h | 91 - .../include/arrow/dbi/hiveserver2/types.h | 131 - r/R/inst/include/arrow/dbi/hiveserver2/util.h | 36 - r/R/inst/include/arrow/extension_type.h | 115 - .../include/arrow/filesystem/filesystem.h | 247 - r/R/inst/include/arrow/filesystem/localfs.h | 67 - r/R/inst/include/arrow/filesystem/mockfs.h | 104 - r/R/inst/include/arrow/filesystem/path-util.h | 70 - r/R/inst/include/arrow/filesystem/test-util.h | 126 - .../include/arrow/filesystem/util-internal.h | 36 - r/R/inst/include/arrow/flight/api.h | 24 - r/R/inst/include/arrow/flight/client.h | 178 - r/R/inst/include/arrow/flight/client_auth.h | 62 - .../include/arrow/flight/customize_protobuf.h | 129 - r/R/inst/include/arrow/flight/internal.h | 100 - 
r/R/inst/include/arrow/flight/platform.h | 32 - .../include/arrow/flight/protocol-internal.h | 28 - .../arrow/flight/serialization-internal.h | 67 - r/R/inst/include/arrow/flight/server.h | 207 - r/R/inst/include/arrow/flight/server_auth.h | 78 - r/R/inst/include/arrow/flight/test-util.h | 188 - r/R/inst/include/arrow/flight/types.h | 290 - r/R/inst/include/arrow/flight/visibility.h | 48 - r/R/inst/include/arrow/gpu/cuda_api.h | 26 - r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h | 77 - r/R/inst/include/arrow/gpu/cuda_common.h | 40 - r/R/inst/include/arrow/gpu/cuda_context.h | 168 - r/R/inst/include/arrow/gpu/cuda_memory.h | 232 - r/R/inst/include/arrow/io/api.h | 28 - r/R/inst/include/arrow/io/buffered.h | 160 - r/R/inst/include/arrow/io/compressed.h | 115 - r/R/inst/include/arrow/io/file.h | 246 - r/R/inst/include/arrow/io/hdfs-internal.h | 224 - r/R/inst/include/arrow/io/hdfs.h | 258 - r/R/inst/include/arrow/io/interfaces.h | 206 - r/R/inst/include/arrow/io/memory.h | 172 - r/R/inst/include/arrow/io/mman.h | 181 - r/R/inst/include/arrow/io/readahead.h | 98 - r/R/inst/include/arrow/io/test-common.h | 61 - r/R/inst/include/arrow/ipc/api.h | 28 - r/R/inst/include/arrow/ipc/dictionary.h | 106 - r/R/inst/include/arrow/ipc/feather-internal.h | 235 - r/R/inst/include/arrow/ipc/feather.h | 173 - r/R/inst/include/arrow/ipc/json-integration.h | 133 - r/R/inst/include/arrow/ipc/json-internal.h | 120 - r/R/inst/include/arrow/ipc/json-simple.h | 56 - r/R/inst/include/arrow/ipc/message.h | 241 - .../include/arrow/ipc/metadata-internal.h | 176 - r/R/inst/include/arrow/ipc/reader.h | 291 - r/R/inst/include/arrow/ipc/test-common.h | 134 - r/R/inst/include/arrow/ipc/util.h | 48 - r/R/inst/include/arrow/ipc/writer.h | 366 - r/R/inst/include/arrow/json/api.h | 21 - r/R/inst/include/arrow/json/chunked-builder.h | 76 - r/R/inst/include/arrow/json/chunker.h | 69 - r/R/inst/include/arrow/json/converter.h | 94 - r/R/inst/include/arrow/json/options.h | 63 - 
r/R/inst/include/arrow/json/parser.h | 96 - r/R/inst/include/arrow/json/rapidjson-defs.h | 44 - r/R/inst/include/arrow/json/reader.h | 62 - r/R/inst/include/arrow/json/test-common.h | 183 - r/R/inst/include/arrow/memory_pool-test.h | 90 - r/R/inst/include/arrow/memory_pool.h | 155 - r/R/inst/include/arrow/pretty_print.h | 112 - r/R/inst/include/arrow/python/api.h | 32 - .../include/arrow/python/arrow_to_pandas.h | 97 - r/R/inst/include/arrow/python/benchmark.h | 39 - r/R/inst/include/arrow/python/common.h | 265 - r/R/inst/include/arrow/python/config.h | 42 - r/R/inst/include/arrow/python/decimal.h | 113 - r/R/inst/include/arrow/python/deserialize.h | 92 - r/R/inst/include/arrow/python/flight.h | 207 - r/R/inst/include/arrow/python/helpers.h | 136 - r/R/inst/include/arrow/python/inference.h | 64 - r/R/inst/include/arrow/python/init.h | 29 - r/R/inst/include/arrow/python/io.h | 108 - r/R/inst/include/arrow/python/iterators.h | 157 - .../include/arrow/python/numpy-internal.h | 179 - r/R/inst/include/arrow/python/numpy_convert.h | 74 - r/R/inst/include/arrow/python/numpy_interop.h | 99 - .../include/arrow/python/numpy_to_arrow.h | 75 - r/R/inst/include/arrow/python/platform.h | 34 - r/R/inst/include/arrow/python/pyarrow.h | 86 - r/R/inst/include/arrow/python/pyarrow_api.h | 187 - r/R/inst/include/arrow/python/pyarrow_lib.h | 81 - .../include/arrow/python/python_to_arrow.h | 83 - r/R/inst/include/arrow/python/serialize.h | 136 - r/R/inst/include/arrow/python/type_traits.h | 302 - r/R/inst/include/arrow/python/util/datetime.h | 308 - r/R/inst/include/arrow/python/visibility.h | 39 - r/R/inst/include/arrow/record_batch.h | 190 - r/R/inst/include/arrow/scalar.h | 199 - r/R/inst/include/arrow/sparse_tensor.h | 259 - r/R/inst/include/arrow/status.h | 424 - r/R/inst/include/arrow/stl.h | 373 - r/R/inst/include/arrow/table.h | 377 - r/R/inst/include/arrow/table_builder.h | 113 - r/R/inst/include/arrow/tensor.h | 167 - r/R/inst/include/arrow/testing/gtest_common.h | 133 - 
r/R/inst/include/arrow/testing/gtest_util.h | 302 - r/R/inst/include/arrow/testing/random.h | 272 - r/R/inst/include/arrow/testing/util.h | 126 - r/R/inst/include/arrow/type.h | 1104 --- r/R/inst/include/arrow/type_fwd.h | 225 - r/R/inst/include/arrow/type_traits.h | 590 -- r/R/inst/include/arrow/util/basic_decimal.h | 175 - .../include/arrow/util/bit-stream-utils.h | 416 - r/R/inst/include/arrow/util/bit-util.h | 855 -- r/R/inst/include/arrow/util/bpacking.h | 3308 ------- r/R/inst/include/arrow/util/checked_cast.h | 54 - r/R/inst/include/arrow/util/compiler-util.h | 25 - r/R/inst/include/arrow/util/compression.h | 153 - .../include/arrow/util/compression_brotli.h | 55 - r/R/inst/include/arrow/util/compression_bz2.h | 55 - r/R/inst/include/arrow/util/compression_lz4.h | 55 - .../include/arrow/util/compression_snappy.h | 54 - .../include/arrow/util/compression_zlib.h | 70 - .../include/arrow/util/compression_zstd.h | 55 - r/R/inst/include/arrow/util/cpu-info.h | 101 - r/R/inst/include/arrow/util/decimal.h | 133 - r/R/inst/include/arrow/util/hash-util.h | 310 - r/R/inst/include/arrow/util/hashing.h | 807 -- r/R/inst/include/arrow/util/int-util.h | 89 - r/R/inst/include/arrow/util/io-util.h | 263 - .../include/arrow/util/key_value_metadata.h | 81 - r/R/inst/include/arrow/util/lazy.h | 128 - r/R/inst/include/arrow/util/logging.h | 244 - r/R/inst/include/arrow/util/macros.h | 164 - r/R/inst/include/arrow/util/memory.h | 46 - r/R/inst/include/arrow/util/neon-util.h | 59 - r/R/inst/include/arrow/util/parallel.h | 95 - r/R/inst/include/arrow/util/parsing.h | 512 -- r/R/inst/include/arrow/util/rle-encoding.h | 604 -- r/R/inst/include/arrow/util/sse-util.h | 122 - r/R/inst/include/arrow/util/stl.h | 95 - r/R/inst/include/arrow/util/stopwatch.h | 48 - r/R/inst/include/arrow/util/string.h | 68 - r/R/inst/include/arrow/util/string_builder.h | 69 - r/R/inst/include/arrow/util/string_view.h | 33 - r/R/inst/include/arrow/util/task-group.h | 91 - 
r/R/inst/include/arrow/util/thread-pool.h | 169 - r/R/inst/include/arrow/util/trie.h | 245 - r/R/inst/include/arrow/util/type_traits.h | 48 - r/R/inst/include/arrow/util/ubsan.h | 53 - r/R/inst/include/arrow/util/uri.h | 70 - r/R/inst/include/arrow/util/utf8.h | 176 - r/R/inst/include/arrow/util/variant.h | 36 - r/R/inst/include/arrow/util/visibility.h | 56 - .../arrow/util/windows_compatibility.h | 40 - r/R/inst/include/arrow/vendored/datetime.h | 21 - .../include/arrow/vendored/datetime/date.h | 8025 ----------------- .../include/arrow/vendored/datetime/ios.h | 53 - r/R/inst/include/arrow/vendored/datetime/tz.h | 2590 ------ .../arrow/vendored/datetime/tz_private.h | 321 - .../arrow/vendored/datetime/visibility.h | 26 - .../include/arrow/vendored/xxhash/xxhash.h | 330 - r/R/inst/include/arrow/visitor.h | 138 - r/R/inst/include/arrow/visitor_inline.h | 277 - r/R/inst/include/parquet/api/io.h | 24 - r/R/inst/include/parquet/api/reader.h | 37 - r/R/inst/include/parquet/api/schema.h | 24 - r/R/inst/include/parquet/api/writer.h | 27 - r/R/inst/include/parquet/arrow/reader.h | 356 - .../include/parquet/arrow/record_reader.h | 122 - r/R/inst/include/parquet/arrow/schema.h | 100 - r/R/inst/include/parquet/arrow/test-util.h | 485 - r/R/inst/include/parquet/arrow/writer.h | 250 - r/R/inst/include/parquet/bloom_filter.h | 255 - r/R/inst/include/parquet/column_page.h | 173 - r/R/inst/include/parquet/column_reader.h | 255 - r/R/inst/include/parquet/column_scanner.h | 265 - r/R/inst/include/parquet/column_writer.h | 192 - r/R/inst/include/parquet/deprecated_io.h | 135 - r/R/inst/include/parquet/encoding.h | 358 - .../include/parquet/encryption_internal.h | 114 - r/R/inst/include/parquet/exception.h | 91 - r/R/inst/include/parquet/file_reader.h | 141 - r/R/inst/include/parquet/file_writer.h | 237 - r/R/inst/include/parquet/hasher.h | 75 - r/R/inst/include/parquet/metadata.h | 304 - r/R/inst/include/parquet/murmur3.h | 57 - r/R/inst/include/parquet/platform.h | 112 - 
r/R/inst/include/parquet/printer.h | 49 - r/R/inst/include/parquet/properties.h | 428 - r/R/inst/include/parquet/schema-internal.h | 139 - r/R/inst/include/parquet/schema.h | 470 - r/R/inst/include/parquet/statistics.h | 307 - r/R/inst/include/parquet/test-util.h | 710 -- r/R/inst/include/parquet/thrift.h | 214 - r/R/inst/include/parquet/types.h | 662 -- .../include/parquet/windows_compatibility.h | 30 - r/configure | 15 +- r/man/table.Rd | 5 +- 250 files changed, 10 insertions(+), 53745 deletions(-) delete mode 100644 r/R/inst/include/arrow/adapters/orc/adapter.h delete mode 100644 r/R/inst/include/arrow/adapters/orc/adapter_util.h delete mode 100644 r/R/inst/include/arrow/adapters/tensorflow/convert.h delete mode 100644 r/R/inst/include/arrow/allocator.h delete mode 100644 r/R/inst/include/arrow/api.h delete mode 100644 r/R/inst/include/arrow/array.h delete mode 100644 r/R/inst/include/arrow/array/builder_adaptive.h delete mode 100644 r/R/inst/include/arrow/array/builder_base.h delete mode 100644 r/R/inst/include/arrow/array/builder_binary.h delete mode 100644 r/R/inst/include/arrow/array/builder_decimal.h delete mode 100644 r/R/inst/include/arrow/array/builder_dict.h delete mode 100644 r/R/inst/include/arrow/array/builder_nested.h delete mode 100644 r/R/inst/include/arrow/array/builder_primitive.h delete mode 100644 r/R/inst/include/arrow/array/builder_time.h delete mode 100644 r/R/inst/include/arrow/array/builder_union.h delete mode 100644 r/R/inst/include/arrow/array/concatenate.h delete mode 100644 r/R/inst/include/arrow/buffer-builder.h delete mode 100644 r/R/inst/include/arrow/buffer.h delete mode 100644 r/R/inst/include/arrow/builder.h delete mode 100644 r/R/inst/include/arrow/compare.h delete mode 100644 r/R/inst/include/arrow/compute/api.h delete mode 100644 r/R/inst/include/arrow/compute/benchmark-util.h delete mode 100644 r/R/inst/include/arrow/compute/context.h delete mode 100644 r/R/inst/include/arrow/compute/expression.h delete mode 100644 
r/R/inst/include/arrow/compute/kernel.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/aggregate.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/boolean.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/cast.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/compare.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/count.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/filter.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/hash.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/mean.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/sum-internal.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/sum.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/take.h delete mode 100644 r/R/inst/include/arrow/compute/kernels/util-internal.h delete mode 100644 r/R/inst/include/arrow/compute/logical_type.h delete mode 100644 r/R/inst/include/arrow/compute/operation.h delete mode 100644 r/R/inst/include/arrow/compute/operations/cast.h delete mode 100644 r/R/inst/include/arrow/compute/operations/literal.h delete mode 100644 r/R/inst/include/arrow/compute/test-util.h delete mode 100644 r/R/inst/include/arrow/compute/type_fwd.h delete mode 100644 r/R/inst/include/arrow/csv/api.h delete mode 100644 r/R/inst/include/arrow/csv/chunker.h delete mode 100644 r/R/inst/include/arrow/csv/column-builder.h delete mode 100644 r/R/inst/include/arrow/csv/converter.h delete mode 100644 r/R/inst/include/arrow/csv/options.h delete mode 100644 r/R/inst/include/arrow/csv/parser.h delete mode 100644 r/R/inst/include/arrow/csv/reader.h delete mode 100644 r/R/inst/include/arrow/csv/test-common.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/api.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/operation.h 
delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/service.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/session.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/types.h delete mode 100644 r/R/inst/include/arrow/dbi/hiveserver2/util.h delete mode 100644 r/R/inst/include/arrow/extension_type.h delete mode 100644 r/R/inst/include/arrow/filesystem/filesystem.h delete mode 100644 r/R/inst/include/arrow/filesystem/localfs.h delete mode 100644 r/R/inst/include/arrow/filesystem/mockfs.h delete mode 100644 r/R/inst/include/arrow/filesystem/path-util.h delete mode 100644 r/R/inst/include/arrow/filesystem/test-util.h delete mode 100644 r/R/inst/include/arrow/filesystem/util-internal.h delete mode 100644 r/R/inst/include/arrow/flight/api.h delete mode 100644 r/R/inst/include/arrow/flight/client.h delete mode 100644 r/R/inst/include/arrow/flight/client_auth.h delete mode 100644 r/R/inst/include/arrow/flight/customize_protobuf.h delete mode 100644 r/R/inst/include/arrow/flight/internal.h delete mode 100644 r/R/inst/include/arrow/flight/platform.h delete mode 100644 r/R/inst/include/arrow/flight/protocol-internal.h delete mode 100644 r/R/inst/include/arrow/flight/serialization-internal.h delete mode 100644 r/R/inst/include/arrow/flight/server.h delete mode 100644 r/R/inst/include/arrow/flight/server_auth.h delete mode 100644 r/R/inst/include/arrow/flight/test-util.h delete mode 100644 r/R/inst/include/arrow/flight/types.h delete mode 100644 r/R/inst/include/arrow/flight/visibility.h delete mode 100644 r/R/inst/include/arrow/gpu/cuda_api.h delete mode 100644 r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h delete mode 100644 r/R/inst/include/arrow/gpu/cuda_common.h delete mode 100644 r/R/inst/include/arrow/gpu/cuda_context.h delete mode 100644 r/R/inst/include/arrow/gpu/cuda_memory.h delete mode 100644 r/R/inst/include/arrow/io/api.h delete mode 100644 
r/R/inst/include/arrow/io/buffered.h delete mode 100644 r/R/inst/include/arrow/io/compressed.h delete mode 100644 r/R/inst/include/arrow/io/file.h delete mode 100644 r/R/inst/include/arrow/io/hdfs-internal.h delete mode 100644 r/R/inst/include/arrow/io/hdfs.h delete mode 100644 r/R/inst/include/arrow/io/interfaces.h delete mode 100644 r/R/inst/include/arrow/io/memory.h delete mode 100644 r/R/inst/include/arrow/io/mman.h delete mode 100644 r/R/inst/include/arrow/io/readahead.h delete mode 100644 r/R/inst/include/arrow/io/test-common.h delete mode 100644 r/R/inst/include/arrow/ipc/api.h delete mode 100644 r/R/inst/include/arrow/ipc/dictionary.h delete mode 100644 r/R/inst/include/arrow/ipc/feather-internal.h delete mode 100644 r/R/inst/include/arrow/ipc/feather.h delete mode 100644 r/R/inst/include/arrow/ipc/json-integration.h delete mode 100644 r/R/inst/include/arrow/ipc/json-internal.h delete mode 100644 r/R/inst/include/arrow/ipc/json-simple.h delete mode 100644 r/R/inst/include/arrow/ipc/message.h delete mode 100644 r/R/inst/include/arrow/ipc/metadata-internal.h delete mode 100644 r/R/inst/include/arrow/ipc/reader.h delete mode 100644 r/R/inst/include/arrow/ipc/test-common.h delete mode 100644 r/R/inst/include/arrow/ipc/util.h delete mode 100644 r/R/inst/include/arrow/ipc/writer.h delete mode 100644 r/R/inst/include/arrow/json/api.h delete mode 100644 r/R/inst/include/arrow/json/chunked-builder.h delete mode 100644 r/R/inst/include/arrow/json/chunker.h delete mode 100644 r/R/inst/include/arrow/json/converter.h delete mode 100644 r/R/inst/include/arrow/json/options.h delete mode 100644 r/R/inst/include/arrow/json/parser.h delete mode 100644 r/R/inst/include/arrow/json/rapidjson-defs.h delete mode 100644 r/R/inst/include/arrow/json/reader.h delete mode 100644 r/R/inst/include/arrow/json/test-common.h delete mode 100644 r/R/inst/include/arrow/memory_pool-test.h delete mode 100644 r/R/inst/include/arrow/memory_pool.h delete mode 100644 
r/R/inst/include/arrow/pretty_print.h delete mode 100644 r/R/inst/include/arrow/python/api.h delete mode 100644 r/R/inst/include/arrow/python/arrow_to_pandas.h delete mode 100644 r/R/inst/include/arrow/python/benchmark.h delete mode 100644 r/R/inst/include/arrow/python/common.h delete mode 100644 r/R/inst/include/arrow/python/config.h delete mode 100644 r/R/inst/include/arrow/python/decimal.h delete mode 100644 r/R/inst/include/arrow/python/deserialize.h delete mode 100644 r/R/inst/include/arrow/python/flight.h delete mode 100644 r/R/inst/include/arrow/python/helpers.h delete mode 100644 r/R/inst/include/arrow/python/inference.h delete mode 100644 r/R/inst/include/arrow/python/init.h delete mode 100644 r/R/inst/include/arrow/python/io.h delete mode 100644 r/R/inst/include/arrow/python/iterators.h delete mode 100644 r/R/inst/include/arrow/python/numpy-internal.h delete mode 100644 r/R/inst/include/arrow/python/numpy_convert.h delete mode 100644 r/R/inst/include/arrow/python/numpy_interop.h delete mode 100644 r/R/inst/include/arrow/python/numpy_to_arrow.h delete mode 100644 r/R/inst/include/arrow/python/platform.h delete mode 100644 r/R/inst/include/arrow/python/pyarrow.h delete mode 100644 r/R/inst/include/arrow/python/pyarrow_api.h delete mode 100644 r/R/inst/include/arrow/python/pyarrow_lib.h delete mode 100644 r/R/inst/include/arrow/python/python_to_arrow.h delete mode 100644 r/R/inst/include/arrow/python/serialize.h delete mode 100644 r/R/inst/include/arrow/python/type_traits.h delete mode 100644 r/R/inst/include/arrow/python/util/datetime.h delete mode 100644 r/R/inst/include/arrow/python/visibility.h delete mode 100644 r/R/inst/include/arrow/record_batch.h delete mode 100644 r/R/inst/include/arrow/scalar.h delete mode 100644 r/R/inst/include/arrow/sparse_tensor.h delete mode 100644 r/R/inst/include/arrow/status.h delete mode 100644 r/R/inst/include/arrow/stl.h delete mode 100644 r/R/inst/include/arrow/table.h delete mode 100644 
r/R/inst/include/arrow/table_builder.h delete mode 100644 r/R/inst/include/arrow/tensor.h delete mode 100644 r/R/inst/include/arrow/testing/gtest_common.h delete mode 100644 r/R/inst/include/arrow/testing/gtest_util.h delete mode 100644 r/R/inst/include/arrow/testing/random.h delete mode 100644 r/R/inst/include/arrow/testing/util.h delete mode 100644 r/R/inst/include/arrow/type.h delete mode 100644 r/R/inst/include/arrow/type_fwd.h delete mode 100644 r/R/inst/include/arrow/type_traits.h delete mode 100644 r/R/inst/include/arrow/util/basic_decimal.h delete mode 100644 r/R/inst/include/arrow/util/bit-stream-utils.h delete mode 100644 r/R/inst/include/arrow/util/bit-util.h delete mode 100644 r/R/inst/include/arrow/util/bpacking.h delete mode 100644 r/R/inst/include/arrow/util/checked_cast.h delete mode 100644 r/R/inst/include/arrow/util/compiler-util.h delete mode 100644 r/R/inst/include/arrow/util/compression.h delete mode 100644 r/R/inst/include/arrow/util/compression_brotli.h delete mode 100644 r/R/inst/include/arrow/util/compression_bz2.h delete mode 100644 r/R/inst/include/arrow/util/compression_lz4.h delete mode 100644 r/R/inst/include/arrow/util/compression_snappy.h delete mode 100644 r/R/inst/include/arrow/util/compression_zlib.h delete mode 100644 r/R/inst/include/arrow/util/compression_zstd.h delete mode 100644 r/R/inst/include/arrow/util/cpu-info.h delete mode 100644 r/R/inst/include/arrow/util/decimal.h delete mode 100644 r/R/inst/include/arrow/util/hash-util.h delete mode 100644 r/R/inst/include/arrow/util/hashing.h delete mode 100644 r/R/inst/include/arrow/util/int-util.h delete mode 100644 r/R/inst/include/arrow/util/io-util.h delete mode 100644 r/R/inst/include/arrow/util/key_value_metadata.h delete mode 100644 r/R/inst/include/arrow/util/lazy.h delete mode 100644 r/R/inst/include/arrow/util/logging.h delete mode 100644 r/R/inst/include/arrow/util/macros.h delete mode 100644 r/R/inst/include/arrow/util/memory.h delete mode 100644 
r/R/inst/include/arrow/util/neon-util.h delete mode 100644 r/R/inst/include/arrow/util/parallel.h delete mode 100644 r/R/inst/include/arrow/util/parsing.h delete mode 100644 r/R/inst/include/arrow/util/rle-encoding.h delete mode 100644 r/R/inst/include/arrow/util/sse-util.h delete mode 100644 r/R/inst/include/arrow/util/stl.h delete mode 100644 r/R/inst/include/arrow/util/stopwatch.h delete mode 100644 r/R/inst/include/arrow/util/string.h delete mode 100644 r/R/inst/include/arrow/util/string_builder.h delete mode 100644 r/R/inst/include/arrow/util/string_view.h delete mode 100644 r/R/inst/include/arrow/util/task-group.h delete mode 100644 r/R/inst/include/arrow/util/thread-pool.h delete mode 100644 r/R/inst/include/arrow/util/trie.h delete mode 100644 r/R/inst/include/arrow/util/type_traits.h delete mode 100644 r/R/inst/include/arrow/util/ubsan.h delete mode 100644 r/R/inst/include/arrow/util/uri.h delete mode 100644 r/R/inst/include/arrow/util/utf8.h delete mode 100644 r/R/inst/include/arrow/util/variant.h delete mode 100644 r/R/inst/include/arrow/util/visibility.h delete mode 100644 r/R/inst/include/arrow/util/windows_compatibility.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime/date.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime/ios.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime/tz.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime/tz_private.h delete mode 100644 r/R/inst/include/arrow/vendored/datetime/visibility.h delete mode 100644 r/R/inst/include/arrow/vendored/xxhash/xxhash.h delete mode 100644 r/R/inst/include/arrow/visitor.h delete mode 100644 r/R/inst/include/arrow/visitor_inline.h delete mode 100644 r/R/inst/include/parquet/api/io.h delete mode 100644 r/R/inst/include/parquet/api/reader.h delete mode 100644 r/R/inst/include/parquet/api/schema.h delete mode 100644 r/R/inst/include/parquet/api/writer.h delete mode 100644 
r/R/inst/include/parquet/arrow/reader.h delete mode 100644 r/R/inst/include/parquet/arrow/record_reader.h delete mode 100644 r/R/inst/include/parquet/arrow/schema.h delete mode 100644 r/R/inst/include/parquet/arrow/test-util.h delete mode 100644 r/R/inst/include/parquet/arrow/writer.h delete mode 100644 r/R/inst/include/parquet/bloom_filter.h delete mode 100644 r/R/inst/include/parquet/column_page.h delete mode 100644 r/R/inst/include/parquet/column_reader.h delete mode 100644 r/R/inst/include/parquet/column_scanner.h delete mode 100644 r/R/inst/include/parquet/column_writer.h delete mode 100644 r/R/inst/include/parquet/deprecated_io.h delete mode 100644 r/R/inst/include/parquet/encoding.h delete mode 100644 r/R/inst/include/parquet/encryption_internal.h delete mode 100644 r/R/inst/include/parquet/exception.h delete mode 100644 r/R/inst/include/parquet/file_reader.h delete mode 100644 r/R/inst/include/parquet/file_writer.h delete mode 100644 r/R/inst/include/parquet/hasher.h delete mode 100644 r/R/inst/include/parquet/metadata.h delete mode 100644 r/R/inst/include/parquet/murmur3.h delete mode 100644 r/R/inst/include/parquet/platform.h delete mode 100644 r/R/inst/include/parquet/printer.h delete mode 100644 r/R/inst/include/parquet/properties.h delete mode 100644 r/R/inst/include/parquet/schema-internal.h delete mode 100644 r/R/inst/include/parquet/schema.h delete mode 100644 r/R/inst/include/parquet/statistics.h delete mode 100644 r/R/inst/include/parquet/test-util.h delete mode 100644 r/R/inst/include/parquet/thrift.h delete mode 100644 r/R/inst/include/parquet/types.h delete mode 100644 r/R/inst/include/parquet/windows_compatibility.h diff --git a/r/R/inst/include/arrow/adapters/orc/adapter.h b/r/R/inst/include/arrow/adapters/orc/adapter.h deleted file mode 100644 index 6279f687dc1..00000000000 --- a/r/R/inst/include/arrow/adapters/orc/adapter.h +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor 
license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_ORC_CONVERTER_H -#define ARROW_ORC_CONVERTER_H - -#include -#include -#include - -#include "arrow/io/interfaces.h" -#include "arrow/memory_pool.h" -#include "arrow/record_batch.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -namespace adapters { - -namespace orc { - -/// \class ORCFileReader -/// \brief Read an Arrow Table or RecordBatch from an ORC file. -class ARROW_EXPORT ORCFileReader { - public: - ~ORCFileReader(); - - /// \brief Creates a new ORC reader. - /// - /// \param[in] file the data source - /// \param[in] pool a MemoryPool to use for buffer allocations - /// \param[out] reader the returned reader object - /// \return Status - static Status Open(const std::shared_ptr& file, MemoryPool* pool, - std::unique_ptr* reader); - - /// \brief Return the schema read from the ORC file - /// - /// \param[out] out the returned Schema object - Status ReadSchema(std::shared_ptr* out); - - /// \brief Read the file as a Table - /// - /// The table will be composed of one record batch per stripe. - /// - /// \param[out] out the returned Table - Status Read(std::shared_ptr
* out); - - /// \brief Read the file as a Table - /// - /// The table will be composed of one record batch per stripe. - /// - /// \param[in] schema the Table schema - /// \param[out] out the returned Table - Status Read(const std::shared_ptr& schema, std::shared_ptr
* out); - - /// \brief Read the file as a Table - /// - /// The table will be composed of one record batch per stripe. - /// - /// \param[in] include_indices the selected field indices to read - /// \param[out] out the returned Table - Status Read(const std::vector& include_indices, std::shared_ptr
* out); - - /// \brief Read the file as a Table - /// - /// The table will be composed of one record batch per stripe. - /// - /// \param[in] schema the Table schema - /// \param[in] include_indices the selected field indices to read - /// \param[out] out the returned Table - Status Read(const std::shared_ptr& schema, - const std::vector& include_indices, std::shared_ptr
* out); - - /// \brief Read a single stripe as a RecordBatch - /// - /// \param[in] stripe the stripe index - /// \param[out] out the returned RecordBatch - Status ReadStripe(int64_t stripe, std::shared_ptr* out); - - /// \brief Read a single stripe as a RecordBatch - /// - /// \param[in] stripe the stripe index - /// \param[in] include_indices the selected field indices to read - /// \param[out] out the returned RecordBatch - Status ReadStripe(int64_t stripe, const std::vector& include_indices, - std::shared_ptr* out); - - /// \brief Seek to designated row. Invoke NextStripeReader() after seek - /// will return stripe reader starting from designated row. - /// - /// \param[in] row_number the rows number to seek - Status Seek(int64_t row_number); - - /// \brief Get a stripe level record batch iterator with specified row count - /// in each record batch. NextStripeReader serves as an fine grain - /// alternative to ReadStripe which may cause OOM issue by loading - /// the whole stripes into memory. - /// - /// \param[in] batch_size the number of rows each record batch contains in - /// record batch iteration. - /// \param[out] out the returned stripe reader - Status NextStripeReader(int64_t batch_size, std::shared_ptr* out); - - /// \brief Get a stripe level record batch iterator with specified row count - /// in each record batch. NextStripeReader serves as an fine grain - /// alternative to ReadStripe which may cause OOM issue by loading - /// the whole stripes into memory. - /// - /// \param[in] batch_size Get a stripe level record batch iterator with specified row - /// count in each record batch. 
- /// - /// \param[in] include_indices the selected field indices to read - /// \param[out] out the returned stripe reader - Status NextStripeReader(int64_t batch_size, const std::vector& include_indices, - std::shared_ptr* out); - - /// \brief The number of stripes in the file - int64_t NumberOfStripes(); - - /// \brief The number of rows in the file - int64_t NumberOfRows(); - - private: - class Impl; - std::unique_ptr impl_; - ORCFileReader(); -}; - -} // namespace orc - -} // namespace adapters - -} // namespace arrow - -#endif // ARROW_ORC_CONVERTER_H diff --git a/r/R/inst/include/arrow/adapters/orc/adapter_util.h b/r/R/inst/include/arrow/adapters/orc/adapter_util.h deleted file mode 100644 index eede23051d2..00000000000 --- a/r/R/inst/include/arrow/adapters/orc/adapter_util.h +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_ADAPATER_UTIL_H -#define ARROW_ADAPATER_UTIL_H - -#include -#include - -#include "arrow/array/builder_base.h" -#include "arrow/status.h" -#include "orc/OrcFile.hh" - -namespace liborc = orc; - -namespace arrow { - -namespace adapters { - -namespace orc { - -Status GetArrowType(const liborc::Type* type, std::shared_ptr* out); - -Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, - int64_t offset, int64_t length, ArrayBuilder* builder); -} // namespace orc -} // namespace adapters -} // namespace arrow - -#endif // ARROW_ADAPATER_UTIL_H diff --git a/r/R/inst/include/arrow/adapters/tensorflow/convert.h b/r/R/inst/include/arrow/adapters/tensorflow/convert.h deleted file mode 100644 index 93b7e621ef8..00000000000 --- a/r/R/inst/include/arrow/adapters/tensorflow/convert.h +++ /dev/null @@ -1,158 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TENSORFLOW_CONVERTER_H -#define ARROW_TENSORFLOW_CONVERTER_H - -#include - -#include "tensorflow/core/framework/op.h" - -#include "arrow/type.h" - -// These utilities are supposed to be included in TensorFlow operators -// that need to be compiled separately from Arrow because of ABI issues. 
-// They therefore need to be header-only. - -namespace arrow { - -namespace adapters { - -namespace tensorflow { - -Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr* out) { - switch (dtype) { - case ::tensorflow::DT_BOOL: - *out = arrow::boolean(); - break; - case ::tensorflow::DT_FLOAT: - *out = arrow::float32(); - break; - case ::tensorflow::DT_DOUBLE: - *out = arrow::float64(); - break; - case ::tensorflow::DT_HALF: - *out = arrow::float16(); - break; - case ::tensorflow::DT_INT8: - *out = arrow::int8(); - break; - case ::tensorflow::DT_INT16: - *out = arrow::int16(); - break; - case ::tensorflow::DT_INT32: - *out = arrow::int32(); - break; - case ::tensorflow::DT_INT64: - *out = arrow::int64(); - break; - case ::tensorflow::DT_UINT8: - *out = arrow::uint8(); - break; - case ::tensorflow::DT_UINT16: - *out = arrow::uint16(); - break; - case ::tensorflow::DT_UINT32: - *out = arrow::uint32(); - break; - case ::tensorflow::DT_UINT64: - *out = arrow::uint64(); - break; - case ::tensorflow::DT_BFLOAT16: - case ::tensorflow::DT_COMPLEX64: - case ::tensorflow::DT_COMPLEX128: - case ::tensorflow::DT_INVALID: - case ::tensorflow::DT_QINT8: - case ::tensorflow::DT_QINT16: - case ::tensorflow::DT_QINT32: - case ::tensorflow::DT_QUINT8: - case ::tensorflow::DT_QUINT16: - case ::tensorflow::DT_RESOURCE: - case ::tensorflow::DT_STRING: - case ::tensorflow::DT_VARIANT: - default: - return Status::TypeError("TensorFlow data type is not supported"); - } - return Status::OK(); -} - -Status GetTensorFlowType(std::shared_ptr dtype, ::tensorflow::DataType* out) { - switch (dtype->id()) { - case Type::BOOL: - *out = ::tensorflow::DT_BOOL; - break; - case Type::UINT8: - *out = ::tensorflow::DT_UINT8; - break; - case Type::INT8: - *out = ::tensorflow::DT_INT8; - break; - case Type::UINT16: - *out = ::tensorflow::DT_UINT16; - break; - case Type::INT16: - *out = ::tensorflow::DT_INT16; - break; - case Type::UINT32: - *out = ::tensorflow::DT_UINT32; - break; - case 
Type::INT32: - *out = ::tensorflow::DT_INT32; - break; - case Type::UINT64: - *out = ::tensorflow::DT_UINT64; - break; - case Type::INT64: - *out = ::tensorflow::DT_INT64; - break; - case Type::HALF_FLOAT: - *out = ::tensorflow::DT_HALF; - break; - case Type::FLOAT: - *out = ::tensorflow::DT_FLOAT; - break; - case Type::DOUBLE: - *out = ::tensorflow::DT_DOUBLE; - break; - case Type::STRING: - case Type::BINARY: - case Type::FIXED_SIZE_BINARY: - case Type::DATE32: - case Type::DATE64: - case Type::TIMESTAMP: - case Type::TIME32: - case Type::TIME64: - case Type::INTERVAL: - case Type::DECIMAL: - case Type::LIST: - case Type::STRUCT: - case Type::UNION: - case Type::DICTIONARY: - case Type::MAP: - default: - return Status::TypeError("Arrow data type is not supported"); - } - return arrow::Status::OK(); -} - -} // namespace tensorflow - -} // namespace adapters - -} // namespace arrow - -#endif // ARROW_TENSORFLOW_CONVERTER_H diff --git a/r/R/inst/include/arrow/allocator.h b/r/R/inst/include/arrow/allocator.h deleted file mode 100644 index a02b8e64bb0..00000000000 --- a/r/R/inst/include/arrow/allocator.h +++ /dev/null @@ -1,151 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_ALLOCATOR_H -#define ARROW_ALLOCATOR_H - -#include -#include -#include -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" - -namespace arrow { - -/// \brief A STL allocator delegating allocations to a Arrow MemoryPool -template -class stl_allocator { - public: - using value_type = T; - using pointer = T*; - using const_pointer = const T*; - using reference = T&; - using const_reference = const T&; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - - template - struct rebind { - using other = stl_allocator; - }; - - /// \brief Construct an allocator from the default MemoryPool - stl_allocator() noexcept : pool_(default_memory_pool()) {} - /// \brief Construct an allocator from the given MemoryPool - explicit stl_allocator(MemoryPool* pool) noexcept : pool_(pool) {} - - template - stl_allocator(const stl_allocator& rhs) noexcept : pool_(rhs.pool_) {} - - ~stl_allocator() { pool_ = NULLPTR; } - - pointer address(reference r) const noexcept { return std::addressof(r); } - - const_pointer address(const_reference r) const noexcept { return std::addressof(r); } - - pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) { - uint8_t* data; - Status s = pool_->Allocate(n * sizeof(T), &data); - if (!s.ok()) throw std::bad_alloc(); - return reinterpret_cast(data); - } - - void deallocate(pointer p, size_type n) { - pool_->Free(reinterpret_cast(p), n * sizeof(T)); - } - - size_type size_max() const noexcept { return size_type(-1) / sizeof(T); } - - template - void construct(U* p, Args&&... 
args) { - new (reinterpret_cast(p)) U(std::forward(args)...); - } - - template - void destroy(U* p) { - p->~U(); - } - - MemoryPool* pool() const noexcept { return pool_; } - - private: - MemoryPool* pool_; -}; - -/// \brief A MemoryPool implementation delegating allocations to a STL allocator -/// -/// Note that STL allocators don't provide a resizing operation, and therefore -/// any buffer resizes will do a full reallocation and copy. -template > -class STLMemoryPool : public MemoryPool { - public: - /// \brief Construct a memory pool from the given allocator - explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {} - - Status Allocate(int64_t size, uint8_t** out) override { - try { - *out = alloc_.allocate(size); - } catch (std::bad_alloc& e) { - return Status::OutOfMemory(e.what()); - } - stats_.UpdateAllocatedBytes(size); - return Status::OK(); - } - - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override { - uint8_t* old_ptr = *ptr; - try { - *ptr = alloc_.allocate(new_size); - } catch (std::bad_alloc& e) { - return Status::OutOfMemory(e.what()); - } - memcpy(*ptr, old_ptr, std::min(old_size, new_size)); - alloc_.deallocate(old_ptr, old_size); - stats_.UpdateAllocatedBytes(new_size - old_size); - return Status::OK(); - } - - void Free(uint8_t* buffer, int64_t size) override { - alloc_.deallocate(buffer, size); - stats_.UpdateAllocatedBytes(-size); - } - - int64_t bytes_allocated() const override { return stats_.bytes_allocated(); } - - int64_t max_memory() const override { return stats_.max_memory(); } - - private: - Allocator alloc_; - internal::MemoryPoolStats stats_; -}; - -template -bool operator==(const stl_allocator& lhs, const stl_allocator& rhs) noexcept { - return lhs.pool() == rhs.pool(); -} - -template -bool operator!=(const stl_allocator& lhs, const stl_allocator& rhs) noexcept { - return !(lhs == rhs); -} - -} // namespace arrow - -#endif // ARROW_ALLOCATOR_H diff --git a/r/R/inst/include/arrow/api.h 
b/r/R/inst/include/arrow/api.h deleted file mode 100644 index 3d6a17961b6..00000000000 --- a/r/R/inst/include/arrow/api.h +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Coarse public API while the library is in development - -#ifndef ARROW_API_H -#define ARROW_API_H - -#include "arrow/array.h" // IYWU pragma: export -#include "arrow/buffer.h" // IYWU pragma: export -#include "arrow/builder.h" // IYWU pragma: export -#include "arrow/compare.h" // IYWU pragma: export -#include "arrow/extension_type.h" // IYWU pragma: export -#include "arrow/memory_pool.h" // IYWU pragma: export -#include "arrow/pretty_print.h" // IYWU pragma: export -#include "arrow/record_batch.h" // IYWU pragma: export -#include "arrow/status.h" // IYWU pragma: export -#include "arrow/table.h" // IYWU pragma: export -#include "arrow/table_builder.h" // IYWU pragma: export -#include "arrow/tensor.h" // IYWU pragma: export -#include "arrow/type.h" // IYWU pragma: export -#include "arrow/util/config.h" // IYWU pragma: export -#include "arrow/util/key_value_metadata.h" // IWYU pragma: export -#include "arrow/visitor.h" // IYWU pragma: export - -/// \brief Top-level namespace for Apache Arrow C++ API -namespace arrow 
{} - -#endif // ARROW_API_H diff --git a/r/R/inst/include/arrow/array.h b/r/R/inst/include/arrow/array.h deleted file mode 100644 index de8df2bb031..00000000000 --- a/r/R/inst/include/arrow/array.h +++ /dev/null @@ -1,1072 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_ARRAY_H -#define ARROW_ARRAY_H - -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/compare.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" // IWYU pragma: export -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class ArrayVisitor; - -// When slicing, we do not know the null count of the sliced range without -// doing some computation. To avoid doing this eagerly, we set the null count -// to -1 (any negative number will do). When Array::null_count is called the -// first time, the null count will be computed. 
See ARROW-33 -constexpr int64_t kUnknownNullCount = -1; - -class MemoryPool; -class Status; - -// ---------------------------------------------------------------------- -// Generic array data container - -/// \class ArrayData -/// \brief Mutable container for generic Arrow array data -/// -/// This data structure is a self-contained representation of the memory and -/// metadata inside an Arrow array data structure (called vectors in Java). The -/// classes arrow::Array and its subclasses provide strongly-typed accessors -/// with support for the visitor pattern and other affordances. -/// -/// This class is designed for easy internal data manipulation, analytical data -/// processing, and data transport to and from IPC messages. For example, we -/// could cast from int64 to float64 like so: -/// -/// Int64Array arr = GetMyData(); -/// auto new_data = arr.data()->Copy(); -/// new_data->type = arrow::float64(); -/// DoubleArray double_arr(new_data); -/// -/// This object is also useful in an analytics setting where memory may be -/// reused. For example, if we had a group of operations all returning doubles, -/// say: -/// -/// Log(Sqrt(Expr(arr))) -/// -/// Then the low-level implementations of each of these functions could have -/// the signatures -/// -/// void Log(const ArrayData& values, ArrayData* out); -/// -/// As another example a function may consume one or more memory buffers in an -/// input array and replace them with newly-allocated data, changing the output -/// data type as well. 
-struct ARROW_EXPORT ArrayData { - ArrayData() : length(0), null_count(0), offset(0) {} - - ArrayData(const std::shared_ptr& type, int64_t length, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : type(type), length(length), null_count(null_count), offset(offset) {} - - ArrayData(const std::shared_ptr& type, int64_t length, - const std::vector>& buffers, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : ArrayData(type, length, null_count, offset) { - this->buffers = buffers; - } - - ArrayData(const std::shared_ptr& type, int64_t length, - const std::vector>& buffers, - const std::vector>& child_data, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : ArrayData(type, length, null_count, offset) { - this->buffers = buffers; - this->child_data = child_data; - } - - ArrayData(const std::shared_ptr& type, int64_t length, - std::vector>&& buffers, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : ArrayData(type, length, null_count, offset) { - this->buffers = std::move(buffers); - } - - static std::shared_ptr Make(const std::shared_ptr& type, - int64_t length, - std::vector>&& buffers, - int64_t null_count = kUnknownNullCount, - int64_t offset = 0); - - static std::shared_ptr Make( - const std::shared_ptr& type, int64_t length, - const std::vector>& buffers, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - static std::shared_ptr Make( - const std::shared_ptr& type, int64_t length, - const std::vector>& buffers, - const std::vector>& child_data, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - static std::shared_ptr Make(const std::shared_ptr& type, - int64_t length, - int64_t null_count = kUnknownNullCount, - int64_t offset = 0); - - // Move constructor - ArrayData(ArrayData&& other) noexcept - : type(std::move(other.type)), - length(other.length), - null_count(other.null_count), - offset(other.offset), - buffers(std::move(other.buffers)), - 
child_data(std::move(other.child_data)), - dictionary(std::move(other.dictionary)) {} - - // Copy constructor - ArrayData(const ArrayData& other) noexcept - : type(other.type), - length(other.length), - null_count(other.null_count), - offset(other.offset), - buffers(other.buffers), - child_data(other.child_data), - dictionary(other.dictionary) {} - - // Move assignment - ArrayData& operator=(ArrayData&& other) = default; - - // Copy assignment - ArrayData& operator=(const ArrayData& other) = default; - - std::shared_ptr Copy() const { return std::make_shared(*this); } - - // Access a buffer's data as a typed C pointer - template - inline const T* GetValues(int i, int64_t absolute_offset) const { - if (buffers[i]) { - return reinterpret_cast(buffers[i]->data()) + absolute_offset; - } else { - return NULLPTR; - } - } - - template - inline const T* GetValues(int i) const { - return GetValues(i, offset); - } - - // Access a buffer's data as a typed C pointer - template - inline T* GetMutableValues(int i, int64_t absolute_offset) { - if (buffers[i]) { - return reinterpret_cast(buffers[i]->mutable_data()) + absolute_offset; - } else { - return NULLPTR; - } - } - - template - inline T* GetMutableValues(int i) { - return GetMutableValues(i, offset); - } - - // Construct a zero-copy slice of the data with the indicated offset and length - ArrayData Slice(int64_t offset, int64_t length) const; - - /// \brief Return null count, or compute and set it if it's not known - int64_t GetNullCount() const; - - std::shared_ptr type; - int64_t length; - mutable int64_t null_count; - // The logical start point into the physical buffers (in values, not bytes). - // Note that, for child data, this must be *added* to the child data's own offset. - int64_t offset; - std::vector> buffers; - std::vector> child_data; - - // The dictionary for this Array, if any. 
Only used for dictionary - // type - std::shared_ptr dictionary; -}; - -/// \brief Create a strongly-typed Array instance from generic ArrayData -/// \param[in] data the array contents -/// \return the resulting Array instance -ARROW_EXPORT -std::shared_ptr MakeArray(const std::shared_ptr& data); - -// ---------------------------------------------------------------------- -// User array accessor types - -/// \brief Array base type -/// Immutable data array with some logical type and some length. -/// -/// Any memory is owned by the respective Buffer instance (or its parents). -/// -/// The base class is only required to have a null bitmap buffer if the null -/// count is greater than 0 -/// -/// If known, the null count can be provided in the base Array constructor. If -/// the null count is not known, pass -1 to indicate that the null count is to -/// be computed on the first call to null_count() -class ARROW_EXPORT Array { - public: - virtual ~Array() = default; - - /// \brief Return true if value at index is null. Does not boundscheck - bool IsNull(int64_t i) const { - return null_bitmap_data_ != NULLPTR && - !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); - } - - /// \brief Return true if value at index is valid (not null). Does not - /// boundscheck - bool IsValid(int64_t i) const { - return null_bitmap_data_ == NULLPTR || - BitUtil::GetBit(null_bitmap_data_, i + data_->offset); - } - - /// Size in the number of elements this array contains. - int64_t length() const { return data_->length; } - - /// A relative position into another array's data, to enable zero-copy - /// slicing. This value defaults to zero - int64_t offset() const { return data_->offset; } - - /// The number of null entries in the array. 
If the null count was not known - /// at time of construction (and set to a negative value), then the null - /// count will be computed and cached on the first invocation of this - /// function - int64_t null_count() const; - - std::shared_ptr type() const { return data_->type; } - Type::type type_id() const { return data_->type->id(); } - - /// Buffer for the null bitmap. - /// - /// Note that for `null_count == 0`, this can be null. - /// This buffer does not account for any slice offset - std::shared_ptr null_bitmap() const { return data_->buffers[0]; } - - /// Raw pointer to the null bitmap. - /// - /// Note that for `null_count == 0`, this can be null. - /// This buffer does not account for any slice offset - const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } - - /// Equality comparison with another array - bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; - bool Equals(const std::shared_ptr& arr, - const EqualOptions& = EqualOptions::Defaults()) const; - - /// Approximate equality comparison with another array - /// - /// epsilon is only used if this is FloatArray or DoubleArray - bool ApproxEquals(const std::shared_ptr& arr, - const EqualOptions& = EqualOptions::Defaults()) const; - bool ApproxEquals(const Array& arr, - const EqualOptions& = EqualOptions::Defaults()) const; - - /// Compare if the range of slots specified are equal for the given array and - /// this array. end_idx exclusive. This methods does not bounds check. 
- bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, - const Array& other) const; - bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, - const std::shared_ptr& other) const; - bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, - int64_t other_start_idx) const; - bool RangeEquals(const std::shared_ptr& other, int64_t start_idx, - int64_t end_idx, int64_t other_start_idx) const; - - Status Accept(ArrayVisitor* visitor) const; - - /// Construct a zero-copy slice of the array with the indicated offset and - /// length - /// - /// \param[in] offset the position of the first element in the constructed - /// slice - /// \param[in] length the length of the slice. If there are not enough - /// elements in the array, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr - std::shared_ptr Slice(int64_t offset, int64_t length) const; - - /// Slice from offset until end of the array - std::shared_ptr Slice(int64_t offset) const; - - std::shared_ptr data() const { return data_; } - - int num_fields() const { return static_cast(data_->child_data.size()); } - - /// \return PrettyPrint representation of array suitable for debugging - std::string ToString() const; - - protected: - Array() : null_bitmap_data_(NULLPTR) {} - - std::shared_ptr data_; - const uint8_t* null_bitmap_data_; - - /// Protected method for constructors - inline void SetData(const std::shared_ptr& data) { - if (data->buffers.size() > 0 && data->buffers[0]) { - null_bitmap_data_ = data->buffers[0]->data(); - } else { - null_bitmap_data_ = NULLPTR; - } - data_ = data; - } - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Array); -}; - -using ArrayVector = std::vector>; - -namespace internal { - -/// Given a number of ArrayVectors, treat each ArrayVector as the -/// chunks of a chunked array. Then rechunk each ArrayVector such that -/// all ArrayVectors are chunked identically. 
It is mandatory that -/// all ArrayVectors contain the same total number of elements. -ARROW_EXPORT -std::vector RechunkArraysConsistently(const std::vector&); - -} // namespace internal - -static inline std::ostream& operator<<(std::ostream& os, const Array& x) { - os << x.ToString(); - return os; -} - -/// Base class for non-nested arrays -class ARROW_EXPORT FlatArray : public Array { - protected: - using Array::Array; -}; - -/// Degenerate null type Array -class ARROW_EXPORT NullArray : public FlatArray { - public: - using TypeClass = NullType; - - explicit NullArray(const std::shared_ptr& data) { SetData(data); } - explicit NullArray(int64_t length); - - private: - inline void SetData(const std::shared_ptr& data) { - null_bitmap_data_ = NULLPTR; - data->null_count = data->length; - data_ = data; - } -}; - -/// Base class for arrays of fixed-size logical types -class ARROW_EXPORT PrimitiveArray : public FlatArray { - public: - PrimitiveArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - /// Does not account for any slice offset - std::shared_ptr values() const { return data_->buffers[1]; } - - protected: - PrimitiveArray() : raw_values_(NULLPTR) {} - - inline void SetData(const std::shared_ptr& data) { - auto values = data->buffers[1]; - this->Array::SetData(data); - raw_values_ = values == NULLPTR ? NULLPTR : values->data(); - } - - explicit inline PrimitiveArray(const std::shared_ptr& data) { - SetData(data); - } - - const uint8_t* raw_values_; -}; - -/// Concrete Array class for numeric data. 
-template -class NumericArray : public PrimitiveArray { - public: - using TypeClass = TYPE; - using value_type = typename TypeClass::c_type; - - explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} - - // Only enable this constructor without a type argument for types without additional - // metadata - template - NumericArray( - typename std::enable_if::is_parameter_free, int64_t>::type length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, - null_count, offset) {} - - const value_type* raw_values() const { - return reinterpret_cast(raw_values_) + data_->offset; - } - - value_type Value(int64_t i) const { return raw_values()[i]; } - - // For API compatibility with BinaryArray etc. - value_type GetView(int64_t i) const { return Value(i); } - - protected: - using PrimitiveArray::PrimitiveArray; -}; - -/// Concrete Array class for boolean data -class ARROW_EXPORT BooleanArray : public PrimitiveArray { - public: - using TypeClass = BooleanType; - - explicit BooleanArray(const std::shared_ptr& data); - - BooleanArray(int64_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - bool Value(int64_t i) const { - return BitUtil::GetBit(reinterpret_cast(raw_values_), - i + data_->offset); - } - - bool GetView(int64_t i) const { return Value(i); } - - protected: - using PrimitiveArray::PrimitiveArray; -}; - -// ---------------------------------------------------------------------- -// ListArray - -/// Concrete Array class for list data -class ARROW_EXPORT ListArray : public Array { - public: - using TypeClass = ListType; - - explicit ListArray(const std::shared_ptr& data); - - ListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, - const 
std::shared_ptr& values, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - /// \brief Construct ListArray from array of offsets and child value array - /// - /// This function does the bare minimum of validation of the offsets and - /// input types, and will allocate a new offsets array if necessary (i.e. if - /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed - /// - /// \param[in] offsets Array containing n + 1 offsets encoding length and - /// size. Must be of int32 type - /// \param[in] values Array containing - /// \param[in] pool MemoryPool in case new offsets array needs to be - /// allocated because of null values - /// \param[out] out Will have length equal to offsets.length() - 1 - static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, - std::shared_ptr* out); - - const ListType* list_type() const; - - /// \brief Return array object containing the list's values - std::shared_ptr values() const; - - /// Note that this buffer does not account for any slice offset - std::shared_ptr value_offsets() const { return data_->buffers[1]; } - - std::shared_ptr value_type() const; - - /// Return pointer to raw value offsets accounting for any slice offset - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - - // Neither of these functions will perform boundschecking - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - int32_t value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } - - protected: - void SetData(const std::shared_ptr& data); - const int32_t* raw_value_offsets_; - - private: - std::shared_ptr values_; -}; - -// ---------------------------------------------------------------------- -// FixedSizeListArray - -/// Concrete Array class for fixed size list data -class 
ARROW_EXPORT FixedSizeListArray : public Array { - public: - using TypeClass = FixedSizeListType; - - explicit FixedSizeListArray(const std::shared_ptr& data); - - FixedSizeListArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& values, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - const FixedSizeListType* list_type() const; - - /// \brief Return array object containing the list's values - std::shared_ptr values() const; - - std::shared_ptr value_type() const; - - // Neither of these functions will perform boundschecking - int32_t value_offset(int64_t i) const { - i += data_->offset; - return static_cast(list_size_ * i); - } - int32_t value_length(int64_t i = 0) const { return list_size_; } - - protected: - void SetData(const std::shared_ptr& data); - int32_t list_size_; - - private: - std::shared_ptr values_; -}; - -// ---------------------------------------------------------------------- -// Binary and String - -/// Concrete Array class for variable-size binary data -class ARROW_EXPORT BinaryArray : public FlatArray { - public: - using TypeClass = BinaryType; - - explicit BinaryArray(const std::shared_ptr& data); - - BinaryArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - /// Return the pointer to the given elements bytes - // XXX should GetValue(int64_t i) return a string_view? 
- const uint8_t* GetValue(int64_t i, int32_t* out_length) const { - // Account for base offset - i += data_->offset; - const int32_t pos = raw_value_offsets_[i]; - *out_length = raw_value_offsets_[i + 1] - pos; - return raw_data_ + pos; - } - - /// \brief Get binary value as a string_view - /// - /// \param i the value index - /// \return the view over the selected value - util::string_view GetView(int64_t i) const { - // Account for base offset - i += data_->offset; - const int32_t pos = raw_value_offsets_[i]; - return util::string_view(reinterpret_cast(raw_data_ + pos), - raw_value_offsets_[i + 1] - pos); - } - - /// \brief Get binary value as a std::string - /// - /// \param i the value index - /// \return the value copied into a std::string - std::string GetString(int64_t i) const { return std::string(GetView(i)); } - - /// Note that this buffer does not account for any slice offset - std::shared_ptr value_offsets() const { return data_->buffers[1]; } - - /// Note that this buffer does not account for any slice offset - std::shared_ptr value_data() const { return data_->buffers[2]; } - - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - - // Neither of these functions will perform boundschecking - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - int32_t value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } - - protected: - // For subclasses - BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} - - /// Protected method for constructors - void SetData(const std::shared_ptr& data); - - // Constructor that allows sub-classes/builders to propagate there logical type up the - // class hierarchy. 
- BinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - const int32_t* raw_value_offsets_; - const uint8_t* raw_data_; -}; - -/// Concrete Array class for variable-size string (utf-8) data -class ARROW_EXPORT StringArray : public BinaryArray { - public: - using TypeClass = StringType; - - explicit StringArray(const std::shared_ptr& data); - - StringArray(int64_t length, const std::shared_ptr& value_offsets, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); -}; - -// ---------------------------------------------------------------------- -// Fixed width binary - -/// Concrete Array class for fixed-size binary data -class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { - public: - using TypeClass = FixedSizeBinaryType; - - explicit FixedSizeBinaryArray(const std::shared_ptr& data); - - FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - const uint8_t* GetValue(int64_t i) const; - const uint8_t* Value(int64_t i) const { return GetValue(i); } - - util::string_view GetView(int64_t i) const { - return util::string_view(reinterpret_cast(GetValue(i)), byte_width()); - } - - std::string GetString(int64_t i) const { return std::string(GetView(i)); } - - int32_t byte_width() const { return byte_width_; } - - const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } - - protected: - inline void SetData(const std::shared_ptr& data) { - this->PrimitiveArray::SetData(data); - byte_width_ = - internal::checked_cast(*type()).byte_width(); - } - - int32_t byte_width_; -}; - -/// DayTimeArray -/// 
--------------------- -/// \brief Array of Day and Millisecond values. -class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { - public: - using TypeClass = DayTimeIntervalType; - - explicit DayTimeIntervalArray(const std::shared_ptr& data); - - DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, - const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - TypeClass::DayMilliseconds GetValue(int64_t i) const; - TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } - - // For compability with Take kernel. - TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } - - int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } - - const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } - - protected: - inline void SetData(const std::shared_ptr& data) { - this->PrimitiveArray::SetData(data); - } -}; - -// ---------------------------------------------------------------------- -// Decimal128Array - -/// Concrete Array class for 128-bit decimal data -class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { - public: - using TypeClass = Decimal128Type; - - using FixedSizeBinaryArray::FixedSizeBinaryArray; - - /// \brief Construct Decimal128Array from ArrayData instance - explicit Decimal128Array(const std::shared_ptr& data); - - std::string FormatValue(int64_t i) const; -}; - -// Backward compatibility -using DecimalArray = Decimal128Array; - -// ---------------------------------------------------------------------- -// Struct - -/// Concrete Array class for struct data -class ARROW_EXPORT StructArray : public Array { - public: - using TypeClass = StructType; - - explicit StructArray(const std::shared_ptr& data); - - StructArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - std::shared_ptr null_bitmap = NULLPTR, - int64_t null_count = 
kUnknownNullCount, int64_t offset = 0); - - const StructType* struct_type() const; - - // Return a shared pointer in case the requestor desires to share ownership - // with this array. The returned array has its offset, length and null - // count adjusted. - std::shared_ptr field(int pos) const; - - /// Returns null if name not found - std::shared_ptr GetFieldByName(const std::string& name) const; - - /// \brief Flatten this array as a vector of arrays, one for each field - /// - /// \param[in] pool The pool to allocate null bitmaps from, if necessary - /// \param[out] out The resulting vector of arrays - Status Flatten(MemoryPool* pool, ArrayVector* out) const; - - private: - // For caching boxed child data - mutable std::vector> boxed_fields_; -}; - -// ---------------------------------------------------------------------- -// Union - -/// Concrete Array class for union data -class ARROW_EXPORT UnionArray : public Array { - public: - using TypeClass = UnionType; - using type_id_t = uint8_t; - - explicit UnionArray(const std::shared_ptr& data); - - UnionArray(const std::shared_ptr& type, int64_t length, - const std::vector>& children, - const std::shared_ptr& type_ids, - const std::shared_ptr& value_offsets = NULLPTR, - const std::shared_ptr& null_bitmap = NULLPTR, - int64_t null_count = kUnknownNullCount, int64_t offset = 0); - - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. 
- /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. - /// \param[in] type_codes Vector of type codes. - /// \param[out] out Will have length equal to value_offsets.length() - static Status MakeDense(const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& field_names, - const std::vector& type_codes, - std::shared_ptr* out); - - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. - /// \param[out] out Will have length equal to value_offsets.length() - static Status MakeDense(const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& field_names, - std::shared_ptr* out) { - return MakeDense(type_ids, value_offsets, children, field_names, {}, out); - } - - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. 
- /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] type_codes Vector of type codes. - /// \param[out] out Will have length equal to value_offsets.length() - static Status MakeDense(const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - const std::vector& type_codes, - std::shared_ptr* out) { - return MakeDense(type_ids, value_offsets, children, {}, type_codes, out); - } - - /// \brief Construct Dense UnionArray from types_ids, value_offsets and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. The value_offsets are assumed to be well-formed. - /// - /// The name of each field is filled by the index of the field. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] value_offsets An array of signed int32 values indicating the - /// relative offset into the respective child array for the type in a given slot. - /// The respective offsets for each child value array must be in order / increasing. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[out] out Will have length equal to value_offsets.length() - static Status MakeDense(const Array& type_ids, const Array& value_offsets, - const std::vector>& children, - std::shared_ptr* out) { - return MakeDense(type_ids, value_offsets, children, {}, {}, out); - } - - /// \brief Construct Sparse UnionArray from type_ids and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. 
- /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. - /// \param[in] type_codes Vector of type codes. - /// \param[out] out Will have length equal to type_ids.length() - static Status MakeSparse(const Array& type_ids, - const std::vector>& children, - const std::vector& field_names, - const std::vector& type_codes, - std::shared_ptr* out); - - /// \brief Construct Sparse UnionArray from type_ids and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] field_names Vector of strings containing the name of each field. - /// \param[out] out Will have length equal to type_ids.length() - static Status MakeSparse(const Array& type_ids, - const std::vector>& children, - const std::vector& field_names, - std::shared_ptr* out) { - return MakeSparse(type_ids, children, field_names, {}, out); - } - - /// \brief Construct Sparse UnionArray from type_ids and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[in] type_codes Vector of type codes. 
- /// \param[out] out Will have length equal to type_ids.length() - static Status MakeSparse(const Array& type_ids, - const std::vector>& children, - const std::vector& type_codes, - std::shared_ptr* out) { - return MakeSparse(type_ids, children, {}, type_codes, out); - } - - /// \brief Construct Sparse UnionArray from type_ids and children - /// - /// This function does the bare minimum of validation of the offsets and - /// input types. - /// - /// The name of each field is filled by the index of the field. - /// - /// \param[in] type_ids An array of 8-bit signed integers, enumerated from - /// 0 corresponding to each type. - /// \param[in] children Vector of children Arrays containing the data for each type. - /// \param[out] out Will have length equal to type_ids.length() - static Status MakeSparse(const Array& type_ids, - const std::vector>& children, - std::shared_ptr* out) { - return MakeSparse(type_ids, children, {}, {}, out); - } - - /// Note that this buffer does not account for any slice offset - std::shared_ptr type_ids() const { return data_->buffers[1]; } - - /// Note that this buffer does not account for any slice offset - std::shared_ptr value_offsets() const { return data_->buffers[2]; } - - int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - - const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } - const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } - - UnionMode::type mode() const { - return internal::checked_cast(*type()).mode(); - } - - // Return the given field as an individual array. - // For sparse unions, the returned array has its offset, length and null - // count adjusted. - // For dense unions, the returned array is unchanged. 
- std::shared_ptr child(int pos) const; - - /// Only use this while the UnionArray is in scope - const Array* UnsafeChild(int pos) const; - - protected: - void SetData(const std::shared_ptr& data); - - const type_id_t* raw_type_ids_; - const int32_t* raw_value_offsets_; - - // For caching boxed child data - mutable std::vector> boxed_fields_; -}; - -// ---------------------------------------------------------------------- -// DictionaryArray - -/// \brief Array type for dictionary-encoded data with a -/// data-dependent dictionary -/// -/// A dictionary array contains an array of non-negative integers (the -/// "dictionary indices") along with a data type containing a "dictionary" -/// corresponding to the distinct values represented in the data. -/// -/// For example, the array -/// -/// ["foo", "bar", "foo", "bar", "foo", "bar"] -/// -/// with dictionary ["bar", "foo"], would have dictionary array representation -/// -/// indices: [1, 0, 1, 0, 1, 0] -/// dictionary: ["bar", "foo"] -/// -/// The indices in principle may have any integer type (signed or unsigned), -/// though presently data in IPC exchanges must be signed int32. -class ARROW_EXPORT DictionaryArray : public Array { - public: - using TypeClass = DictionaryType; - - explicit DictionaryArray(const std::shared_ptr& data); - - DictionaryArray(const std::shared_ptr& type, - const std::shared_ptr& indices, - const std::shared_ptr& dictionary); - - /// \brief Construct DictionaryArray from dictionary and indices - /// array and validate - /// - /// This function does the validation of the indices and input type. 
It checks if - /// all indices are non-negative and smaller than the size of the dictionary - /// - /// \param[in] type a dictionary type - /// \param[in] dictionary the dictionary with same value type as the - /// type object - /// \param[in] indices an array of non-negative signed - /// integers smaller than the size of the dictionary - /// \param[out] out the resulting DictionaryArray instance - static Status FromArrays(const std::shared_ptr& type, - const std::shared_ptr& indices, - const std::shared_ptr& dictionary, - std::shared_ptr* out); - - /// \brief Transpose this DictionaryArray - /// - /// This method constructs a new dictionary array with the given dictionary type, - /// transposing indices using the transpose map. - /// The type and the transpose map are typically computed using - /// DictionaryType::Unify. - /// - /// \param[in] pool a pool to allocate the array data from - /// \param[in] type the new type object - /// \param[in] dictionary the new dictionary - /// \param[in] transpose_map a vector transposing this array's indices - /// into the target array's indices - /// \param[out] out the resulting DictionaryArray instance - Status Transpose(MemoryPool* pool, const std::shared_ptr& type, - const std::shared_ptr& dictionary, - const std::vector& transpose_map, - std::shared_ptr* out) const; - - /// \brief Return the dictionary for this array, which is stored as - /// a member of the ArrayData internal structure - std::shared_ptr dictionary() const; - std::shared_ptr indices() const; - - const DictionaryType* dict_type() const { return dict_type_; } - - private: - void SetData(const std::shared_ptr& data); - const DictionaryType* dict_type_; - std::shared_ptr indices_; -}; - -/// \brief Perform any validation checks to determine obvious inconsistencies -/// with the array's internal data -/// -/// This can be an expensive check. 
-/// -/// \param array an Array instance -/// \return Status -ARROW_EXPORT -Status ValidateArray(const Array& array); - -} // namespace arrow - -#endif // ARROW_ARRAY_H diff --git a/r/R/inst/include/arrow/array/builder_adaptive.h b/r/R/inst/include/arrow/array/builder_adaptive.h deleted file mode 100644 index 7f24109526b..00000000000 --- a/r/R/inst/include/arrow/array/builder_adaptive.h +++ /dev/null @@ -1,175 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/array/builder_base.h" - -namespace arrow { - -namespace internal { - -class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { - public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); - - /// \brief Append multiple nulls - /// \param[in] length the number of nulls to append - Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(CommitPendingData()); - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeSetNull(length); - return Status::OK(); - } - - Status AppendNull() final { - pending_data_[pending_pos_] = 0; - pending_valid_[pending_pos_] = 0; - pending_has_nulls_ = true; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - virtual Status CommitPendingData() = 0; - - std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; - - static constexpr int32_t pending_size_ = 1024; - uint8_t pending_valid_[pending_size_]; - uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; -}; - -} // namespace internal - -class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const uint64_t val) { - pending_data_[pending_pos_] = val; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] 
valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const int64_t val) { - auto v = static_cast(val); - - pending_data_[pending_pos_] = v; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const int64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const int64_t* values, int64_t length, - const uint8_t* 
valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_base.h b/r/R/inst/include/arrow/array/builder_base.h deleted file mode 100644 index 36f6c7a2a4d..00000000000 --- a/r/R/inst/include/arrow/array/builder_base.h +++ /dev/null @@ -1,219 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include // IWYU pragma: keep -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer-builder.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -struct ArrayData; -class MemoryPool; - -constexpr int64_t kMinBuilderCapacity = 1 << 5; -constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; - -/// Base class for all data array builders. 
-/// -/// This class provides a facilities for incrementally building the null bitmap -/// (see Append methods) and as a side effect the current number of slots and -/// the null count. -/// -/// \note Users are expected to use builders as one of the concrete types below. -/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. -class ARROW_EXPORT ArrayBuilder { - public: - explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) - : type_(type), pool_(pool), null_bitmap_builder_(pool) {} - - virtual ~ArrayBuilder() = default; - - /// For nested types. Since the objects are owned by this class instance, we - /// skip shared pointers and just return a raw pointer - ArrayBuilder* child(int i) { return children_[i].get(); } - - int num_children() const { return static_cast(children_.size()); } - - int64_t length() const { return length_; } - int64_t null_count() const { return null_count_; } - int64_t capacity() const { return capacity_; } - - /// \brief Ensure that enough memory has been allocated to fit the indicated - /// number of total elements in the builder, including any that have already - /// been appended. Does not account for reallocations that may be due to - /// variable size data, like binary values. To make space for incremental - /// appends, use Reserve instead. - /// - /// \param[in] capacity the minimum number of total array values to - /// accommodate. Must be greater than the current capacity. - /// \return Status - virtual Status Resize(int64_t capacity); - - /// \brief Ensure that there is enough space allocated to add the indicated - /// number of elements without any further calls to Resize. Overallocation is - /// used in order to minimize the impact of incremental Reserve() calls. 
- /// - /// \param[in] additional_capacity the number of additional array values - /// \return Status - Status Reserve(int64_t additional_capacity) { - auto current_capacity = capacity(); - auto min_capacity = length() + additional_capacity; - if (min_capacity <= current_capacity) return Status::OK(); - - // leave growth factor up to BufferBuilder - auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); - return Resize(new_capacity); - } - - /// Reset the builder. - virtual void Reset(); - - virtual Status AppendNull() = 0; - virtual Status AppendNulls(int64_t length) = 0; - - /// For cases where raw data was memcpy'd into the internal buffers, allows us - /// to advance the length of the builder. It is your responsibility to use - /// this function responsibly. - Status Advance(int64_t elements); - - /// \brief Return result of builder as an internal generic ArrayData - /// object. Resets builder except for dictionary builder - /// - /// \param[out] out the finalized ArrayData object - /// \return Status - virtual Status FinishInternal(std::shared_ptr* out) = 0; - - /// \brief Return result of builder as an Array object. - /// - /// The builder is reset except for DictionaryBuilder. - /// - /// \param[out] out the finalized Array object - /// \return Status - Status Finish(std::shared_ptr* out); - - std::shared_ptr type() const { return type_; } - - protected: - /// Append to null bitmap - Status AppendToBitmap(bool is_valid); - - /// Vector append. Treat each zero byte as a null. If valid_bytes is null - /// assume all of length bits are valid. - Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); - - /// Uniform append. Append N times the same validity bit. - Status AppendToBitmap(int64_t num_bits, bool value); - - /// Set the next length bits to not null (i.e. valid). 
- Status SetNotNull(int64_t length); - - // Unsafe operations (don't check capacity/don't resize) - - void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } - - // Append to null bitmap, update the length - void UnsafeAppendToBitmap(bool is_valid) { - null_bitmap_builder_.UnsafeAppend(is_valid); - ++length_; - if (!is_valid) ++null_count_; - } - - // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null - // assume all of length bits are valid. - void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - if (valid_bytes == NULLPTR) { - return UnsafeSetNotNull(length); - } - null_bitmap_builder_.UnsafeAppend(valid_bytes, length); - length_ += length; - null_count_ = null_bitmap_builder_.false_count(); - } - - // Append the same validity value a given number of times. - void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { - if (value) { - UnsafeSetNotNull(num_bits); - } else { - UnsafeSetNull(num_bits); - } - } - - void UnsafeAppendToBitmap(const std::vector& is_valid); - - // Set the next validity bits to not null (i.e. valid). - void UnsafeSetNotNull(int64_t length); - - // Set the next validity bits to null (i.e. invalid). 
- void UnsafeSetNull(int64_t length); - - static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); - - /// \brief Finish to an array of the specified ArrayType - template - Status FinishTyped(std::shared_ptr* out) { - std::shared_ptr out_untyped; - ARROW_RETURN_NOT_OK(Finish(&out_untyped)); - *out = std::static_pointer_cast(std::move(out_untyped)); - return Status::OK(); - } - - static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { - if (new_capacity < 0) { - return Status::Invalid("Resize capacity must be positive"); - } - - if (new_capacity < old_capacity) { - return Status::Invalid("Resize cannot downsize"); - } - - return Status::OK(); - } - - std::shared_ptr type_; - MemoryPool* pool_; - - TypedBufferBuilder null_bitmap_builder_; - int64_t null_count_ = 0; - - // Array length, so far. Also, the index of the next element to be added - int64_t length_ = 0; - int64_t capacity_ = 0; - - // Child value array builders. These are owned by this class - std::vector> children_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_binary.h b/r/R/inst/include/arrow/array/builder_binary.h deleted file mode 100644 index 23a96450366..00000000000 --- a/r/R/inst/include/arrow/array/builder_binary.h +++ /dev/null @@ -1,365 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/array/builder_base.h" -#include "arrow/buffer-builder.h" -#include "arrow/status.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" // IWYU pragma: export - -namespace arrow { - -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; - -// ---------------------------------------------------------------------- -// Binary and String - -/// \class BinaryBuilder -/// \brief Builder class for variable-length binary data -class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { - public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - Status Append(const uint8_t* value, int32_t length) { - ARROW_RETURN_NOT_OK(Reserve(1)); - ARROW_RETURN_NOT_OK(AppendNextOffset()); - // Safety check for UBSAN. 
- if (ARROW_PREDICT_TRUE(length > 0)) { - ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); - } - - UnsafeAppendToBitmap(true); - return Status::OK(); - } - - Status AppendNulls(int64_t length) final { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - return AppendOverflow(num_bytes); - } - ARROW_RETURN_NOT_OK(Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - } - UnsafeAppendToBitmap(length, false); - return Status::OK(); - } - - Status AppendNull() final { - ARROW_RETURN_NOT_OK(AppendNextOffset()); - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - return Status::OK(); - } - - Status Append(const char* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(util::string_view value) { - return Append(value.data(), static_cast(value.size())); - } - - /// \brief Append without checking capacity - /// - /// Offsets and data should have been presized using Reserve() and - /// ReserveData(), respectively. 
- void UnsafeAppend(const uint8_t* value, int32_t length) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(value, length); - UnsafeAppendToBitmap(true); - } - - void UnsafeAppend(const char* value, int32_t length) { - UnsafeAppend(reinterpret_cast(value), length); - } - - void UnsafeAppend(const std::string& value) { - UnsafeAppend(value.c_str(), static_cast(value.size())); - } - - void UnsafeAppend(util::string_view value) { - UnsafeAppend(value.data(), static_cast(value.size())); - } - - void UnsafeAppendNull() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - UnsafeAppendToBitmap(false); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - /// \brief Ensures there is enough allocated capacity to append the indicated - /// number of bytes to the value data buffer without additional allocations - Status ReserveData(int64_t elements); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \return size of values buffer so far - int64_t value_data_length() const { return value_data_builder_.length(); } - /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i, int32_t* out_length) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. 
- util::string_view GetView(int64_t i) const; - - protected: - TypedBufferBuilder offsets_builder_; - TypedBufferBuilder value_data_builder_; - - Status AppendOverflow(int64_t num_bytes); - - Status AppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - return AppendOverflow(num_bytes); - } - return offsets_builder_.Append(static_cast(num_bytes)); - } - - void UnsafeAppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - } -}; - -/// \class StringBuilder -/// \brief Builder class for UTF8 strings -class ARROW_EXPORT StringBuilder : public BinaryBuilder { - public: - using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using BinaryBuilder::Append; - using BinaryBuilder::Reset; - using BinaryBuilder::UnsafeAppend; - - /// \brief Append a sequence of strings in one shot. - /// - /// \param[in] values a vector of strings - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const std::vector& values, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of nul-terminated strings in one shot. - /// If one of the values is NULL, it is processed as a null - /// value even if the corresponding valid_bytes entry is 1. 
- /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } -}; - -// ---------------------------------------------------------------------- -// FixedSizeBinaryBuilder - -class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { - public: - FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - Status Append(const uint8_t* value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(value); - return Status::OK(); - } - - Status Append(const char* value) { - return Append(reinterpret_cast(value)); - } - - Status Append(const util::string_view& view) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(view); - return Status::OK(); - } - - Status Append(const std::string& s) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(s); - return Status::OK(); - } - - template - Status Append(const std::array& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend( - util::string_view(reinterpret_cast(value.data()), value.size())); - return Status::OK(); - } - - Status AppendValues(const uint8_t* data, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status AppendNull() final; - - Status AppendNulls(int64_t length) final; - - void UnsafeAppend(const uint8_t* value) { - UnsafeAppendToBitmap(true); - if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { - byte_builder_.UnsafeAppend(value, byte_width_); - } - } - - void UnsafeAppend(util::string_view value) { -#ifndef NDEBUG - CheckValueSize(static_cast(value.size())); -#endif - UnsafeAppend(reinterpret_cast(value.data())); - } - - void 
UnsafeAppendNull() { - UnsafeAppendToBitmap(false); - byte_builder_.UnsafeAdvance(byte_width_); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \return size of values buffer so far - int64_t value_data_length() const { return byte_builder_.length(); } - - int32_t byte_width() const { return byte_width_; } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - int32_t byte_width_; - BufferBuilder byte_builder_; - -#ifndef NDEBUG - void CheckValueSize(int64_t size); -#endif -}; - -// ---------------------------------------------------------------------- -// Chunked builders: build a sequence of BinaryArray or StringArray that are -// limited to a particular size (to the upper limit of 2GB) - -namespace internal { - -class ARROW_EXPORT ChunkedBinaryBuilder { - public: - ChunkedBinaryBuilder(int32_t max_chunk_size, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - virtual ~ChunkedBinaryBuilder() = default; - - Status Append(const uint8_t* value, int32_t length) { - if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { - // Move onto next chunk, unless the builder length is currently 0, which - // means that max_chunk_size_ is less than the item length - if (builder_->length() > 0) { - ARROW_RETURN_NOT_OK(NextChunk()); - } - // else fall through - } - - chunk_data_size_ += length; - return builder_->Append(value, length); - } - - Status Append(const util::string_view& value) { - return Append(reinterpret_cast(value.data()), - static_cast(value.size())); - } - - 
Status AppendNull() { - if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits::max())) { - ARROW_RETURN_NOT_OK(NextChunk()); - } - return builder_->AppendNull(); - } - - Status Reserve(int64_t values) { return builder_->Reserve(values); } - - virtual Status Finish(ArrayVector* out); - - protected: - Status NextChunk(); - - int64_t max_chunk_size_; - int64_t chunk_data_size_; - - std::unique_ptr builder_; - std::vector> chunks_; -}; - -class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { - public: - using ChunkedBinaryBuilder::ChunkedBinaryBuilder; - - Status Finish(ArrayVector* out) override; -}; - -} // namespace internal - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_decimal.h b/r/R/inst/include/arrow/array/builder_decimal.h deleted file mode 100644 index d5a26ff42f5..00000000000 --- a/r/R/inst/include/arrow/array/builder_decimal.h +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/array/builder_base.h" -#include "arrow/array/builder_binary.h" - -namespace arrow { - -class Decimal128; - -class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { - public: - explicit Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using FixedSizeBinaryBuilder::Append; - using FixedSizeBinaryBuilder::AppendValues; - using FixedSizeBinaryBuilder::Reset; - - Status Append(const Decimal128& val); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } -}; - -using DecimalBuilder = Decimal128Builder; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_dict.h b/r/R/inst/include/arrow/array/builder_dict.h deleted file mode 100644 index 93cad2975a2..00000000000 --- a/r/R/inst/include/arrow/array/builder_dict.h +++ /dev/null @@ -1,369 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/array/builder_adaptive.h" // IWYU pragma: export -#include "arrow/array/builder_base.h" // IWYU pragma: export - -#include "arrow/array.h" - -namespace arrow { - -// ---------------------------------------------------------------------- -// Dictionary builder - -namespace internal { - -template -struct DictionaryScalar { - using type = typename T::c_type; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -class ARROW_EXPORT DictionaryMemoTable { - public: - explicit DictionaryMemoTable(const std::shared_ptr& type); - explicit DictionaryMemoTable(const std::shared_ptr& dictionary); - ~DictionaryMemoTable(); - - int32_t GetOrInsert(const bool& value); - int32_t GetOrInsert(const int8_t& value); - int32_t GetOrInsert(const int16_t& value); - int32_t GetOrInsert(const int32_t& value); - int32_t GetOrInsert(const int64_t& value); - int32_t GetOrInsert(const uint8_t& value); - int32_t GetOrInsert(const uint16_t& value); - int32_t GetOrInsert(const uint32_t& value); - int32_t GetOrInsert(const uint64_t& value); - int32_t GetOrInsert(const float& value); - int32_t GetOrInsert(const double& value); - int32_t GetOrInsert(const util::string_view& value); - - Status GetArrayData(MemoryPool* pool, int64_t start_offset, - std::shared_ptr* out); - - int32_t size() const; - - private: - class DictionaryMemoTableImpl; - std::unique_ptr impl_; -}; - -} // namespace internal - -/// \brief Array builder for created encoded DictionaryArray from -/// dense array -/// -/// Unlike other builders, dictionary builder does not completely -/// reset the state on Finish calls. The arrays built after the -/// initial Finish call will reuse the previously created encoding and -/// build a delta dictionary when new terms occur. 
-/// -/// data -template -class DictionaryBuilder : public ArrayBuilder { - public: - using Scalar = typename internal::DictionaryScalar::type; - - // WARNING: the type given below is the value type, not the DictionaryType. - // The DictionaryType is instantiated on the Finish() call. - template - DictionaryBuilder( - typename std::enable_if::value, - const std::shared_ptr&>::type type, - MemoryPool* pool) - : ArrayBuilder(type, pool), - memo_table_(new internal::DictionaryMemoTable(type)), - delta_offset_(0), - byte_width_(-1), - values_builder_(pool) {} - - template - explicit DictionaryBuilder( - typename std::enable_if::value, - const std::shared_ptr&>::type type, - MemoryPool* pool) - : ArrayBuilder(type, pool), - memo_table_(new internal::DictionaryMemoTable(type)), - delta_offset_(0), - byte_width_(static_cast(*type).byte_width()), - values_builder_(pool) {} - - template - explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) - : ArrayBuilder(dictionary->type(), pool), - memo_table_(new internal::DictionaryMemoTable(dictionary)), - delta_offset_(0), - byte_width_(-1), - values_builder_(pool) {} - - ~DictionaryBuilder() override = default; - - /// \brief Append a scalar value - Status Append(const Scalar& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - - auto memo_index = memo_table_->GetOrInsert(value); - ARROW_RETURN_NOT_OK(values_builder_.Append(memo_index)); - length_ += 1; - - return Status::OK(); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const uint8_t*>::type value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const 
char*>::type value) { - return Append(util::string_view(value, byte_width_)); - } - - /// \brief Append a scalar null value - Status AppendNull() final { - length_ += 1; - null_count_ += 1; - - return values_builder_.AppendNull(); - } - - Status AppendNulls(int64_t length) final { - length_ += length; - null_count_ += length; - - return values_builder_.AppendNulls(length); - } - - /// \brief Append a whole dense array to the builder - template - Status AppendArray( - typename std::enable_if::value, - const Array&>::type array) { - using ArrayType = typename TypeTraits::ArrayType; - - const auto& concrete_array = static_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - ARROW_RETURN_NOT_OK(AppendNull()); - } else { - ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); - } - } - return Status::OK(); - } - - template - Status AppendArray( - typename std::enable_if::value, - const Array&>::type array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid( - "Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& concrete_array = static_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - ARROW_RETURN_NOT_OK(AppendNull()); - } else { - ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); - } - } - return Status::OK(); - } - - void Reset() override { - ArrayBuilder::Reset(); - values_builder_.Reset(); - memo_table_.reset(new internal::DictionaryMemoTable(type_)); - delta_offset_ = 0; - } - - Status Resize(int64_t capacity) override { - ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - if (capacity_ == 0) { - // Initialize hash table - // XXX should we let the user pass additional size heuristics? 
- delta_offset_ = 0; - } - ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override { - // Finalize indices array - ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); - - // Generate dictionary array from hash table contents - std::shared_ptr dictionary_data; - - ARROW_RETURN_NOT_OK( - memo_table_->GetArrayData(pool_, delta_offset_, &dictionary_data)); - - // Set type of array data to the right dictionary type - (*out)->type = dictionary((*out)->type, type_); - (*out)->dictionary = MakeArray(dictionary_data); - - // Update internals for further uses of this DictionaryBuilder - delta_offset_ = memo_table_->size(); - values_builder_.Reset(); - - return Status::OK(); - } - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// is the dictionary builder in the delta building mode - bool is_building_delta() { return delta_offset_ > 0; } - - protected: - std::unique_ptr memo_table_; - - int32_t delta_offset_; - // Only used for FixedSizeBinaryType - int32_t byte_width_; - - AdaptiveIntBuilder values_builder_; -}; - -template <> -class DictionaryBuilder : public ArrayBuilder { - public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), values_builder_(pool) {} - explicit DictionaryBuilder(MemoryPool* pool) - : ArrayBuilder(null(), pool), values_builder_(pool) {} - - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) - : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {} - - /// \brief Append a scalar null value - Status AppendNull() final { - length_ += 1; - null_count_ += 1; - - return values_builder_.AppendNull(); - } - - Status AppendNulls(int64_t length) final { - length_ += length; - null_count_ += length; - - return values_builder_.AppendNulls(length); - } - - /// \brief 
Append a whole dense array to the builder - Status AppendArray(const Array& array) { - for (int64_t i = 0; i < array.length(); i++) { - ARROW_RETURN_NOT_OK(AppendNull()); - } - return Status::OK(); - } - - Status Resize(int64_t capacity) override { - ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override { - std::shared_ptr dictionary = std::make_shared(0); - - ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); - (*out)->type = std::make_shared((*out)->type, type_); - (*out)->dictionary = dictionary; - - return Status::OK(); - } - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - protected: - AdaptiveIntBuilder values_builder_; -}; - -class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -/// \brief Dictionary array builder with convenience methods for strings -class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_nested.h b/r/R/inst/include/arrow/array/builder_nested.h deleted file mode 100644 index 
d3695e525a9..00000000000 --- a/r/R/inst/include/arrow/array/builder_nested.h +++ /dev/null @@ -1,200 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/array/builder_base.h" -#include "arrow/buffer-builder.h" - -namespace arrow { - -// ---------------------------------------------------------------------- -// List builder - -/// \class ListBuilder -/// \brief Builder class for variable-length list array value types -/// -/// To use this class, you must append values to the child array builder and use -/// the Append function to delimit each distinct list value (once the values -/// have been appended to the child array) or use the bulk API to append -/// a sequence of offests and null values. -/// -/// A note on types. Per arrow/type.h all types in the c++ implementation are -/// logical so even though this class always builds list array, this can -/// represent multiple different logical types. If no logical type is provided -/// at construction time, the class defaults to List where t is taken from the -/// value_builder/values that the object is constructed with. 
-class ARROW_EXPORT ListBuilder : public ArrayBuilder { - public: - /// Use this constructor to incrementally build the value array along with offsets and - /// null bitmap. - ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type = NULLPTR); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Start a new variable-length list slot - /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true); - - Status AppendNull() final { return Append(false); } - - Status AppendNulls(int64_t length) final; - - ArrayBuilder* value_builder() const; - - protected: - TypedBufferBuilder offsets_builder_; - std::shared_ptr value_builder_; - std::shared_ptr values_; - - Status CheckNextOffset() const; - Status AppendNextOffset(); - Status AppendNextOffset(int64_t num_repeats); -}; - -// ---------------------------------------------------------------------- -// FixedSizeList builder - -/// \class FixedSizeListBuilder -/// \brief Builder class for fixed-length list array value types -class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { - public: - FixedSizeListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - int32_t list_size); - - FixedSizeListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) 
override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Append a valid fixed length list. - /// - /// This function affects only the validity bitmap; the child values must be appended - /// using the child array builder. - Status Append(); - - /// \brief Vector append - /// - /// If passed, valid_bytes wil be read and any zero byte - /// will cause the corresponding slot to be null - /// - /// This function affects only the validity bitmap; the child values must be appended - /// using the child array builder. This includes appending nulls for null lists. - /// XXX this restriction is confusing, should this method be omitted? - Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a null fixed length list. - /// - /// The child array builder will have the approriate number of nulls appended - /// automatically. - Status AppendNull() final; - - /// \brief Append length null fixed length lists. - /// - /// The child array builder will have the approriate number of nulls appended - /// automatically. - Status AppendNulls(int64_t length) final; - - ArrayBuilder* value_builder() const { return value_builder_.get(); } - - protected: - const int32_t list_size_; - std::shared_ptr value_builder_; -}; - -// ---------------------------------------------------------------------- -// Struct - -// --------------------------------------------------------------------------------- -// StructArray builder -/// Append, Resize and Reserve methods are acting on StructBuilder. -/// Please make sure all these methods of all child-builders' are consistently -/// called to maintain data-structure consistency. 
-class ARROW_EXPORT StructBuilder : public ArrayBuilder { - public: - StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// Null bitmap is of equal length to every child field, and any zero byte - /// will be considered as a null for that field, but users must using app- - /// end methods or advance methods of the child builders' independently to - /// insert data. - Status AppendValues(int64_t length, const uint8_t* valid_bytes) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// Append an element to the Struct. All child-builders' Append method must - /// be called independently to maintain data-structure consistency. - Status Append(bool is_valid = true) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return Status::OK(); - } - - Status AppendNull() final { return Append(false); } - - Status AppendNulls(int64_t length) final; - - void Reset() override; - - ArrayBuilder* field_builder(int i) const { return children_[i].get(); } - - int num_fields() const { return static_cast(children_.size()); } -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_primitive.h b/r/R/inst/include/arrow/array/builder_primitive.h deleted file mode 100644 index 3d566846d19..00000000000 --- a/r/R/inst/include/arrow/array/builder_primitive.h +++ /dev/null @@ -1,427 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/array/builder_base.h" -#include "arrow/type.h" - -namespace arrow { - -class ARROW_EXPORT NullBuilder : public ArrayBuilder { - public: - explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(null(), pool) {} - - /// \brief Append the specified number of null elements - Status AppendNulls(int64_t length) final { - if (length < 0) return Status::Invalid("length must be positive"); - null_count_ += length; - length_ += length; - return Status::OK(); - } - - /// \brief Append a single null element - Status AppendNull() final { return AppendNulls(1); } - - Status Append(std::nullptr_t) { return AppendNull(); } - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } -}; - -/// Base class for all Builders that emit an Array of a scalar numerical type. 
-template -class NumericBuilder : public ArrayBuilder { - public: - using value_type = typename T::c_type; - using ArrayType = typename TypeTraits::ArrayType; - using ArrayBuilder::ArrayBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(TypeTraits::type_singleton(), pool) {} - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - /// The memory at the corresponding data slot is set to 0 to prevent - /// uninitialized memory access - Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, static_cast(0)); - UnsafeSetNull(length); - return Status::OK(); - } - - /// \brief Append a single null element - Status AppendNull() final { - ARROW_RETURN_NOT_OK(Reserve(1)); - data_builder_.UnsafeAppend(static_cast(0)); - UnsafeAppendToBitmap(false); - return Status::OK(); - } - - value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } - - void Reset() override { data_builder_.Reset(); } - - Status Resize(int64_t capacity) override { - ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); - return ArrayBuilder::Resize(capacity); - } - - value_type operator[](int64_t index) const { return GetValue(index); } - - value_type& operator[](int64_t index) { - return reinterpret_cast(data_builder_.mutable_data())[index]; - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes 
where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR) { - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(values, length); - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid) { - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(values, length); - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). 
Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \return Status - Status AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); - } - - Status FinishInternal(std::shared_ptr* out) override { - std::shared_ptr data, null_bitmap; - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); - *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); - capacity_ = length_ = null_count_ = 0; - return Status::OK(); - } - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \return Status - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(values_begin, values_end); - // this updates the length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values. 
- /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(values_begin, values_end); - null_bitmap_builder_.UnsafeAppend( - length, [&valid_begin]() -> bool { return *valid_begin++; }); - length_ = null_bitmap_builder_.length(); - null_count_ = null_bitmap_builder_.false_count(); - return Status::OK(); - } - - // Same as above, with a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(values_begin, values_end); - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - null_bitmap_builder_.UnsafeAppend( - length, [&valid_begin]() -> bool { return *valid_begin++; }); - length_ = null_bitmap_builder_.length(); - null_count_ = null_bitmap_builder_.false_count(); - } - - return Status::OK(); - } - - /// Append a single scalar under the assumption that the underlying Buffer is - /// large enough. - /// - /// This method does not capacity-check; make sure to call Reserve - /// beforehand. 
- void UnsafeAppend(const value_type val) { - ArrayBuilder::UnsafeAppendToBitmap(true); - data_builder_.UnsafeAppend(val); - } - - void UnsafeAppendNull() { - ArrayBuilder::UnsafeAppendToBitmap(false); - data_builder_.UnsafeAppend(0); - } - - protected: - TypedBufferBuilder data_builder_; -}; - -// Builders - -using UInt8Builder = NumericBuilder; -using UInt16Builder = NumericBuilder; -using UInt32Builder = NumericBuilder; -using UInt64Builder = NumericBuilder; - -using Int8Builder = NumericBuilder; -using Int16Builder = NumericBuilder; -using Int32Builder = NumericBuilder; -using Int64Builder = NumericBuilder; - -using HalfFloatBuilder = NumericBuilder; -using FloatBuilder = NumericBuilder; -using DoubleBuilder = NumericBuilder; - -class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { - public: - using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, false); - UnsafeSetNull(length); - return Status::OK(); - } - - Status AppendNull() final { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendNull(); - return Status::OK(); - } - - /// Scalar append - Status Append(const bool val) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - Status Append(const uint8_t val) { return Append(val != 0); } - - /// Scalar append, without checking for capacity - void UnsafeAppend(const bool val) { - data_builder_.UnsafeAppend(val); - UnsafeAppendToBitmap(true); - } - - void UnsafeAppendNull() { - data_builder_.UnsafeAppend(false); - UnsafeAppendToBitmap(false); - } - - void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous 
array of bytes (non-zero is 1) - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). 
Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// or null(0) values - /// \return Status - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend( - length, [&values_begin]() -> bool { return *values_begin++; }); - // this updates length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - data_builder_.UnsafeAppend( - length, [&values_begin]() -> bool { return *values_begin++; }); - null_bitmap_builder_.UnsafeAppend( - length, [&valid_begin]() -> bool { return *valid_begin++; }); - length_ = null_bitmap_builder_.length(); - null_count_ = 
null_bitmap_builder_.false_count(); - return Status::OK(); - } - - // Same as above, for a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend( - length, [&values_begin]() -> bool { return *values_begin++; }); - - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - null_bitmap_builder_.UnsafeAppend( - length, [&valid_begin]() -> bool { return *valid_begin++; }); - } - length_ = null_bitmap_builder_.length(); - null_count_ = null_bitmap_builder_.false_count(); - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - TypedBufferBuilder data_builder_; -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_time.h b/r/R/inst/include/arrow/array/builder_time.h deleted file mode 100644 index 3ff783b1b1c..00000000000 --- a/r/R/inst/include/arrow/array/builder_time.h +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Contains declarations of time related Arrow builder types. - -#pragma once - -#include - -#include "arrow/array.h" -#include "arrow/array/builder_base.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_primitive.h" -#include "arrow/buffer-builder.h" -#include "arrow/status.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" - -namespace arrow { - -class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { - public: - using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; - - explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : DayTimeIntervalBuilder(day_time_interval(), pool) {} - - DayTimeIntervalBuilder(std::shared_ptr type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(type, pool), - builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} - - void Reset() override { builder_.Reset(); } - Status Resize(int64_t capacity) override { return builder_.Resize(capacity); } - Status Append(DayMilliseconds day_millis) { - return builder_.Append(reinterpret_cast(&day_millis)); - } - void UnsafeAppend(DayMilliseconds day_millis) { - builder_.UnsafeAppend(reinterpret_cast(&day_millis)); - } - using ArrayBuilder::UnsafeAppendNull; - Status AppendNull() override { return builder_.AppendNull(); } - Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); } - Status FinishInternal(std::shared_ptr* out) override { - auto result = builder_.FinishInternal(out); - if (*out != NULLPTR) { - (*out)->type = type(); - } - return result; - 
} - - private: - FixedSizeBinaryBuilder builder_; -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/builder_union.h b/r/R/inst/include/arrow/array/builder_union.h deleted file mode 100644 index aac2e54f9a2..00000000000 --- a/r/R/inst/include/arrow/array/builder_union.h +++ /dev/null @@ -1,106 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/array/builder_base.h" -#include "arrow/buffer-builder.h" - -namespace arrow { - -/// \class DenseUnionBuilder -/// -/// You need to call AppendChild for each of the children builders you want -/// to use. The function will return an int8_t, which is the type tag -/// associated with that child. You can then call Append with that tag -/// (followed by an append on the child builder) to add elements to -/// the union array. -/// -/// You can either specify the type when the UnionBuilder is constructed -/// or let the UnionBuilder infer the type at runtime (by omitting the -/// type argument from the constructor). -/// -/// This API is EXPERIMENTAL. 
-class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder { - public: - /// Use this constructor to incrementally build the union array along - /// with types, offsets, and null bitmap. - explicit DenseUnionBuilder(MemoryPool* pool, - const std::shared_ptr& type = NULLPTR); - - Status AppendNull() final { - ARROW_RETURN_NOT_OK(types_builder_.Append(0)); - ARROW_RETURN_NOT_OK(offsets_builder_.Append(0)); - return AppendToBitmap(false); - } - - Status AppendNulls(int64_t length) final { - ARROW_RETURN_NOT_OK(types_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length)); - ARROW_RETURN_NOT_OK(Reserve(length)); - for (int64_t i = 0; i < length; ++i) { - types_builder_.UnsafeAppend(0); - offsets_builder_.UnsafeAppend(0); - } - return AppendToBitmap(length, false); - } - - /// \brief Append an element to the UnionArray. This must be followed - /// by an append to the appropriate child builder. - /// \param[in] type index of the child the value will be appended - /// \param[in] offset offset of the value in that child - Status Append(int8_t type, int32_t offset) { - ARROW_RETURN_NOT_OK(types_builder_.Append(type)); - ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset)); - return AppendToBitmap(true); - } - - Status FinishInternal(std::shared_ptr* out) override; - - /// \cond FALSE - using ArrayBuilder::Finish; - /// \endcond - - Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - - /// \brief Make a new child builder available to the UnionArray - /// - /// \param[in] child the child builder - /// \param[in] field_name the name of the field in the union array type - /// if type inference is used - /// \return child index, which is the "type" argument that needs - /// to be passed to the "Append" method to add a new element to - /// the union array. 
- int8_t AppendChild(const std::shared_ptr& child, - const std::string& field_name = "") { - children_.push_back(child); - field_names_.push_back(field_name); - return static_cast(children_.size() - 1); - } - - private: - TypedBufferBuilder types_builder_; - TypedBufferBuilder offsets_builder_; - std::vector field_names_; -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/array/concatenate.h b/r/R/inst/include/arrow/array/concatenate.h deleted file mode 100644 index 67738d547f4..00000000000 --- a/r/R/inst/include/arrow/array/concatenate.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/array.h" -#include "arrow/memory_pool.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -/// \brief Concatenate arrays -/// -/// \param[in] arrays a vector of arrays to be concatenated -/// \param[in] pool memory to store the result will be allocated from this memory pool -/// \param[out] out the resulting concatenated array -/// \return Status -ARROW_EXPORT -Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, - std::shared_ptr* out); - -} // namespace arrow diff --git a/r/R/inst/include/arrow/buffer-builder.h b/r/R/inst/include/arrow/buffer-builder.h deleted file mode 100644 index f069ea4d7dd..00000000000 --- a/r/R/inst/include/arrow/buffer-builder.h +++ /dev/null @@ -1,376 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_BUFFER_BUILDER_H -#define ARROW_BUFFER_BUILDER_H - -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/status.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/macros.h" -#include "arrow/util/ubsan.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -// ---------------------------------------------------------------------- -// Buffer builder classes - -/// \class BufferBuilder -/// \brief A class for incrementally building a contiguous chunk of in-memory -/// data -class ARROW_EXPORT BufferBuilder { - public: - explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : pool_(pool), - data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ - &util::internal::non_null_filler), - - capacity_(0), - size_(0) {} - - /// \brief Resize the buffer to the nearest multiple of 64 bytes - /// - /// \param new_capacity the new capacity of the of the builder. Will be - /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if - /// new capacity is smaller than the existing size, reallocate internal - /// buffer. Set to false to avoid reallocations when shrinking the builder. 
- /// \return Status - Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { - // Resize(0) is a no-op - if (new_capacity == 0) { - return Status::OK(); - } - if (buffer_ == NULLPTR) { - ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity, &buffer_)); - } else { - ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); - } - capacity_ = buffer_->capacity(); - data_ = buffer_->mutable_data(); - return Status::OK(); - } - - /// \brief Ensure that builder can accommodate the additional number of bytes - /// without the need to perform allocations - /// - /// \param[in] additional_bytes number of additional bytes to make space for - /// \return Status - Status Reserve(const int64_t additional_bytes) { - auto min_capacity = size_ + additional_bytes; - if (min_capacity <= capacity_) { - return Status::OK(); - } - return Resize(GrowByFactor(capacity_, min_capacity), false); - } - - /// \brief Return a capacity expanded by an unspecified growth factor - static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) { - // NOTE: Doubling isn't a great overallocation practice - // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md - // for discussion. - // Grow exactly if a large upsize (the caller might know the exact final size). - // Otherwise overallocate by 1.5 to keep a linear amortized cost. - return std::max(new_capacity, current_capacity * 3 / 2); - } - - /// \brief Append the given data to the buffer - /// - /// The buffer is automatically expanded if necessary. - Status Append(const void* data, const int64_t length) { - if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) { - ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false)); - } - UnsafeAppend(data, length); - return Status::OK(); - } - - /// \brief Append copies of a value to the buffer - /// - /// The buffer is automatically expanded if necessary. 
- Status Append(const int64_t num_copies, uint8_t value) { - ARROW_RETURN_NOT_OK(Reserve(num_copies)); - UnsafeAppend(num_copies, value); - return Status::OK(); - } - - // Advance pointer and zero out memory - Status Advance(const int64_t length) { return Append(length, 0); } - - // Advance pointer, but don't allocate or zero memory - void UnsafeAdvance(const int64_t length) { size_ += length; } - - // Unsafe methods don't check existing size - void UnsafeAppend(const void* data, const int64_t length) { - memcpy(data_ + size_, data, static_cast(length)); - size_ += length; - } - - void UnsafeAppend(const int64_t num_copies, uint8_t value) { - memset(data_ + size_, value, static_cast(num_copies)); - size_ += num_copies; - } - - /// \brief Return result of builder as a Buffer object. - /// - /// The builder is reset and can be reused afterwards. - /// - /// \param[out] out the finalized Buffer object - /// \param shrink_to_fit if the buffer size is smaller than its capacity, - /// reallocate to fit more tightly in memory. Set to false to avoid - /// a reallocation, at the expense of potentially more memory consumption. - /// \return Status - Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { - ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); - if (size_ != 0) buffer_->ZeroPadding(); - *out = buffer_; - Reset(); - return Status::OK(); - } - - void Reset() { - buffer_ = NULLPTR; - capacity_ = size_ = 0; - } - - /// \brief Set size to a smaller value without modifying builder - /// contents. 
For reusable BufferBuilder classes - /// \param[in] position must be non-negative and less than or equal - /// to the current length() - void Rewind(int64_t position) { size_ = position; } - - int64_t capacity() const { return capacity_; } - int64_t length() const { return size_; } - const uint8_t* data() const { return data_; } - uint8_t* mutable_data() { return data_; } - - private: - std::shared_ptr buffer_; - MemoryPool* pool_; - uint8_t* data_; - int64_t capacity_; - int64_t size_; -}; - -template -class TypedBufferBuilder; - -/// \brief A BufferBuilder for building a buffer of arithmetic elements -template -class TypedBufferBuilder::value>::type> { - public: - explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : bytes_builder_(pool) {} - - Status Append(T value) { - return bytes_builder_.Append(reinterpret_cast(&value), sizeof(T)); - } - - Status Append(const T* values, int64_t num_elements) { - return bytes_builder_.Append(reinterpret_cast(values), - num_elements * sizeof(T)); - } - - Status Append(const int64_t num_copies, T value) { - ARROW_RETURN_NOT_OK(Reserve(num_copies + length())); - UnsafeAppend(num_copies, value); - return Status::OK(); - } - - void UnsafeAppend(T value) { - bytes_builder_.UnsafeAppend(reinterpret_cast(&value), sizeof(T)); - } - - void UnsafeAppend(const T* values, int64_t num_elements) { - bytes_builder_.UnsafeAppend(reinterpret_cast(values), - num_elements * sizeof(T)); - } - - template - void UnsafeAppend(Iter values_begin, Iter values_end) { - int64_t num_elements = static_cast(std::distance(values_begin, values_end)); - auto data = mutable_data() + length(); - bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); - std::copy(values_begin, values_end, data); - } - - void UnsafeAppend(const int64_t num_copies, T value) { - auto data = mutable_data() + length(); - bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); - for (const auto end = data + num_copies; data != end; ++data) { - *data = value; - } - 
} - - Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { - return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit); - } - - Status Reserve(const int64_t additional_elements) { - return bytes_builder_.Reserve(additional_elements * sizeof(T)); - } - - Status Advance(const int64_t length) { - return bytes_builder_.Advance(length * sizeof(T)); - } - - Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { - return bytes_builder_.Finish(out, shrink_to_fit); - } - - void Reset() { bytes_builder_.Reset(); } - - int64_t length() const { return bytes_builder_.length() / sizeof(T); } - int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); } - const T* data() const { return reinterpret_cast(bytes_builder_.data()); } - T* mutable_data() { return reinterpret_cast(bytes_builder_.mutable_data()); } - - private: - BufferBuilder bytes_builder_; -}; - -/// \brief A BufferBuilder for building a buffer containing a bitmap -template <> -class TypedBufferBuilder { - public: - explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : bytes_builder_(pool) {} - - Status Append(bool value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(value); - return Status::OK(); - } - - Status Append(const uint8_t* valid_bytes, int64_t num_elements) { - ARROW_RETURN_NOT_OK(Reserve(num_elements)); - UnsafeAppend(valid_bytes, num_elements); - return Status::OK(); - } - - Status Append(const int64_t num_copies, bool value) { - ARROW_RETURN_NOT_OK(Reserve(num_copies)); - UnsafeAppend(num_copies, value); - return Status::OK(); - } - - void UnsafeAppend(bool value) { - BitUtil::SetBitTo(mutable_data(), bit_length_, value); - if (!value) { - ++false_count_; - } - ++bit_length_; - } - - void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { - if (num_elements == 0) return; - int64_t i = 0; - internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { - bool value = bytes[i++]; - false_count_ += 
!value; - return value; - }); - bit_length_ += num_elements; - } - - void UnsafeAppend(const int64_t num_copies, bool value) { - BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); - false_count_ += num_copies * !value; - bit_length_ += num_copies; - } - - template - void UnsafeAppend(const int64_t num_elements, Generator&& gen) { - if (num_elements == 0) return; - - if (count_falses) { - internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { - bool value = gen(); - false_count_ += !value; - return value; - }); - } else { - internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, - std::forward(gen)); - } - bit_length_ += num_elements; - } - - Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { - const int64_t old_byte_capacity = bytes_builder_.capacity(); - ARROW_RETURN_NOT_OK( - bytes_builder_.Resize(BitUtil::BytesForBits(new_capacity), shrink_to_fit)); - // Resize() may have chosen a larger capacity (e.g. for padding), - // so ask it again before calling memset(). - const int64_t new_byte_capacity = bytes_builder_.capacity(); - if (new_byte_capacity > old_byte_capacity) { - // The additional buffer space is 0-initialized for convenience, - // so that other methods can simply bump the length. 
- memset(mutable_data() + old_byte_capacity, 0, - static_cast(new_byte_capacity - old_byte_capacity)); - } - return Status::OK(); - } - - Status Reserve(const int64_t additional_elements) { - return Resize( - BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements), - false); - } - - Status Advance(const int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - bit_length_ += length; - false_count_ += length; - return Status::OK(); - } - - Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { - // set bytes_builder_.size_ == byte size of data - bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) - - bytes_builder_.length()); - bit_length_ = false_count_ = 0; - return bytes_builder_.Finish(out, shrink_to_fit); - } - - void Reset() { - bytes_builder_.Reset(); - bit_length_ = false_count_ = 0; - } - - int64_t length() const { return bit_length_; } - int64_t capacity() const { return bytes_builder_.capacity() * 8; } - const uint8_t* data() const { return bytes_builder_.data(); } - uint8_t* mutable_data() { return bytes_builder_.mutable_data(); } - int64_t false_count() const { return false_count_; } - - private: - BufferBuilder bytes_builder_; - int64_t bit_length_ = 0; - int64_t false_count_ = 0; -}; - -} // namespace arrow - -#endif // ARROW_BUFFER_BUILDER_H diff --git a/r/R/inst/include/arrow/buffer.h b/r/R/inst/include/arrow/buffer.h deleted file mode 100644 index 3eb9b033b92..00000000000 --- a/r/R/inst/include/arrow/buffer.h +++ /dev/null @@ -1,444 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_BUFFER_H -#define ARROW_BUFFER_H - -#include -#include -#include -#include -#include -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -// ---------------------------------------------------------------------- -// Buffer classes - -/// \class Buffer -/// \brief Object containing a pointer to a piece of contiguous memory with a -/// particular size. -/// -/// Buffers have two related notions of length: size and capacity. Size is -/// the number of bytes that might have valid data. Capacity is the number -/// of bytes that were allocated for the buffer in total. -/// -/// The Buffer base class does not own its memory, but subclasses often do. 
-/// -/// The following invariant is always true: Size <= Capacity -class ARROW_EXPORT Buffer { - public: - /// \brief Construct from buffer and size without copying memory - /// - /// \param[in] data a memory buffer - /// \param[in] size buffer size - /// - /// \note The passed memory must be kept alive through some other means - Buffer(const uint8_t* data, int64_t size) - : is_mutable_(false), - data_(data), - mutable_data_(NULLPTR), - size_(size), - capacity_(size) {} - - /// \brief Construct from string_view without copying memory - /// - /// \param[in] data a string_view object - /// - /// \note The memory viewed by data must not be deallocated in the lifetime of the - /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere - explicit Buffer(util::string_view data) - : Buffer(reinterpret_cast(data.data()), - static_cast(data.size())) {} - - virtual ~Buffer() = default; - - /// An offset into data that is owned by another buffer, but we want to be - /// able to retain a valid pointer to it even after other shared_ptr's to the - /// parent buffer have been destroyed - /// - /// This method makes no assertions about alignment or padding of the buffer but - /// in general we expected buffers to be aligned and padded to 64 bytes. In the future - /// we might add utility methods to help determine if a buffer satisfies this contract. - Buffer(const std::shared_ptr& parent, const int64_t offset, const int64_t size) - : Buffer(parent->data() + offset, size) { - parent_ = parent; - } - - uint8_t operator[](std::size_t i) const { return data_[i]; } - - bool is_mutable() const { return is_mutable_; } - - /// \brief Construct a new std::string with a hexadecimal representation of the buffer. 
- /// \return std::string - std::string ToHexString(); - - /// Return true if both buffers are the same size and contain the same bytes - /// up to the number of compared bytes - bool Equals(const Buffer& other, int64_t nbytes) const; - - /// Return true if both buffers are the same size and contain the same bytes - bool Equals(const Buffer& other) const; - - /// Copy a section of the buffer into a new Buffer. - Status Copy(const int64_t start, const int64_t nbytes, MemoryPool* pool, - std::shared_ptr* out) const; - - /// Copy a section of the buffer using the default memory pool into a new Buffer. - Status Copy(const int64_t start, const int64_t nbytes, - std::shared_ptr* out) const; - - /// Zero bytes in padding, i.e. bytes between size_ and capacity_. - void ZeroPadding() { -#ifndef NDEBUG - CheckMutable(); -#endif - // A zero-capacity buffer can have a null data pointer - if (capacity_ != 0) { - memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); - } - } - - /// \brief Construct a new buffer that owns its memory from a std::string - /// - /// \param[in] data a std::string object - /// \param[in] pool a memory pool - /// \param[out] out the created buffer - /// - /// \return Status message - static Status FromString(const std::string& data, MemoryPool* pool, - std::shared_ptr* out); - - /// \brief Construct a new buffer that owns its memory from a std::string - /// using the default memory pool - static Status FromString(const std::string& data, std::shared_ptr* out); - - /// \brief Construct an immutable buffer that takes ownership of the contents - /// of an std::string - /// \param[in] data an rvalue-reference of a string - /// \return a new Buffer instance - static std::shared_ptr FromString(std::string&& data); - - /// \brief Create buffer referencing typed memory with some length without - /// copying - /// \param[in] data the typed memory as C array - /// \param[in] length the number of values in the array - /// \return a new shared_ptr - 
template - static std::shared_ptr Wrap(const T* data, SizeType length) { - return std::make_shared(reinterpret_cast(data), - static_cast(sizeof(T) * length)); - } - - /// \brief Create buffer referencing std::vector with some length without - /// copying - /// \param[in] data the vector to be referenced. If this vector is changed, - /// the buffer may become invalid - /// \return a new shared_ptr - template - static std::shared_ptr Wrap(const std::vector& data) { - return std::make_shared(reinterpret_cast(data.data()), - static_cast(sizeof(T) * data.size())); - } - - /// \brief Copy buffer contents into a new std::string - /// \return std::string - /// \note Can throw std::bad_alloc if buffer is large - std::string ToString() const; - - /// \brief View buffer contents as a util::string_view - /// \return util::string_view - explicit operator util::string_view() const { - return util::string_view(reinterpret_cast(data_), size_); - } - - /// \brief Return a pointer to the buffer's data - const uint8_t* data() const { return data_; } - /// \brief Return a writable pointer to the buffer's data - /// - /// The buffer has to be mutable. Otherwise, an assertion may be thrown - /// or a null pointer may be returned. 
- uint8_t* mutable_data() { -#ifndef NDEBUG - CheckMutable(); -#endif - return mutable_data_; - } - - /// \brief Return the buffer's size in bytes - int64_t size() const { return size_; } - - /// \brief Return the buffer's capacity (number of allocated bytes) - int64_t capacity() const { return capacity_; } - - std::shared_ptr parent() const { return parent_; } - - protected: - bool is_mutable_; - const uint8_t* data_; - uint8_t* mutable_data_; - int64_t size_; - int64_t capacity_; - - // null by default, but may be set - std::shared_ptr parent_; - - void CheckMutable() const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); -}; - -using BufferVector = std::vector>; - -/// \defgroup buffer-slicing-functions Functions for slicing buffers -/// -/// @{ - -/// \brief Construct a view on a buffer at the given offset and length. -/// -/// This function cannot fail and does not check for errors (except in debug builds) -static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, - const int64_t offset, - const int64_t length) { - return std::make_shared(buffer, offset, length); -} - -/// \brief Construct a view on a buffer at the given offset, up to the buffer's end. -/// -/// This function cannot fail and does not check for errors (except in debug builds) -static inline std::shared_ptr SliceBuffer(const std::shared_ptr& buffer, - const int64_t offset) { - int64_t length = buffer->size() - offset; - return SliceBuffer(buffer, offset, length); -} - -/// \brief Like SliceBuffer, but construct a mutable buffer slice. -/// -/// If the parent buffer is not mutable, behavior is undefined (it may abort -/// in debug builds). -ARROW_EXPORT -std::shared_ptr SliceMutableBuffer(const std::shared_ptr& buffer, - const int64_t offset, const int64_t length); - -/// \brief Like SliceBuffer, but construct a mutable buffer slice. -/// -/// If the parent buffer is not mutable, behavior is undefined (it may abort -/// in debug builds). 
-static inline std::shared_ptr SliceMutableBuffer( - const std::shared_ptr& buffer, const int64_t offset) { - int64_t length = buffer->size() - offset; - return SliceMutableBuffer(buffer, offset, length); -} - -/// @} - -/// \class MutableBuffer -/// \brief A Buffer whose contents can be mutated. May or may not own its data. -class ARROW_EXPORT MutableBuffer : public Buffer { - public: - MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { - mutable_data_ = data; - is_mutable_ = true; - } - - MutableBuffer(const std::shared_ptr& parent, const int64_t offset, - const int64_t size); - - /// \brief Create buffer referencing typed memory with some length - /// \param[in] data the typed memory as C array - /// \param[in] length the number of values in the array - /// \return a new shared_ptr - template - static std::shared_ptr Wrap(T* data, SizeType length) { - return std::make_shared(reinterpret_cast(data), - static_cast(sizeof(T) * length)); - } - - protected: - MutableBuffer() : Buffer(NULLPTR, 0) {} -}; - -/// \class ResizableBuffer -/// \brief A mutable buffer that can be resized -class ARROW_EXPORT ResizableBuffer : public MutableBuffer { - public: - /// Change buffer reported size to indicated size, allocating memory if - /// necessary. This will ensure that the capacity of the buffer is a multiple - /// of 64 bytes as defined in Layout.md. - /// Consider using ZeroPadding afterwards, to conform to the Arrow layout - /// specification. - /// - /// @param new_size The new size for the buffer. - /// @param shrink_to_fit Whether to shrink the capacity if new size < current size - virtual Status Resize(const int64_t new_size, bool shrink_to_fit = true) = 0; - - /// Ensure that buffer has enough memory allocated to fit the indicated - /// capacity (and meets the 64 byte padding requirement in Layout.md). - /// It does not change buffer's reported size and doesn't zero the padding. 
- virtual Status Reserve(const int64_t new_capacity) = 0; - - template - Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) { - return Resize(sizeof(T) * new_nb_elements, shrink_to_fit); - } - - template - Status TypedReserve(const int64_t new_nb_elements) { - return Reserve(sizeof(T) * new_nb_elements); - } - - protected: - ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} -}; - -/// \defgroup buffer-allocation-functions Functions for allocating buffers -/// -/// @{ - -/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. -/// -/// \param[in] pool a memory pool -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer (contains padding) -/// -/// \return Status message -ARROW_EXPORT -Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::shared_ptr* out); - -/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding. -/// -/// \param[in] pool a memory pool -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer (contains padding) -/// -/// \return Status message -ARROW_EXPORT -Status AllocateBuffer(MemoryPool* pool, const int64_t size, std::unique_ptr* out); - -/// \brief Allocate a fixed-size mutable buffer from the default memory pool -/// -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer (contains padding) -/// -/// \return Status message -ARROW_EXPORT -Status AllocateBuffer(const int64_t size, std::shared_ptr* out); - -/// \brief Allocate a fixed-size mutable buffer from the default memory pool -/// -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer (contains padding) -/// -/// \return Status message -ARROW_EXPORT -Status AllocateBuffer(const int64_t size, std::unique_ptr* out); - -/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. 
-/// -/// \param[in] pool a memory pool -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, - std::shared_ptr* out); - -/// \brief Allocate a resizeable buffer from a memory pool, zero its padding. -/// -/// \param[in] pool a memory pool -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateResizableBuffer(MemoryPool* pool, const int64_t size, - std::unique_ptr* out); - -/// \brief Allocate a resizeable buffer from the default memory pool -/// -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateResizableBuffer(const int64_t size, std::shared_ptr* out); - -/// \brief Allocate a resizeable buffer from the default memory pool -/// -/// \param[in] size size of buffer to allocate -/// \param[out] out the allocated buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateResizableBuffer(const int64_t size, std::unique_ptr* out); - -/// \brief Allocate a bitmap buffer from a memory pool -/// no guarantee on values is provided. -/// -/// \param[in] pool memory pool to allocate memory from -/// \param[in] length size in bits of bitmap to allocate -/// \param[out] out the resulting buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out); - -/// \brief Allocate a zero-initialized bitmap buffer from a memory pool -/// -/// \param[in] pool memory pool to allocate memory from -/// \param[in] length size in bits of bitmap to allocate -/// \param[out] out the resulting buffer (zero-initialized). 
-/// -/// \return Status message -ARROW_EXPORT -Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, - std::shared_ptr* out); - -/// \brief Allocate a zero-initialized bitmap buffer from the default memory pool -/// -/// \param[in] length size in bits of bitmap to allocate -/// \param[out] out the resulting buffer -/// -/// \return Status message -ARROW_EXPORT -Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); - -/// \brief Concatenate multiple buffers into a single buffer -/// -/// \param[in] buffers to be concatenated -/// \param[in] pool memory pool to allocate the new buffer from -/// \param[out] out the concatenated buffer -/// -/// \return Status -ARROW_EXPORT -Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out); - -/// @} - -} // namespace arrow - -#endif // ARROW_BUFFER_H diff --git a/r/R/inst/include/arrow/builder.h b/r/R/inst/include/arrow/builder.h deleted file mode 100644 index 56c3e2b3716..00000000000 --- a/r/R/inst/include/arrow/builder.h +++ /dev/null @@ -1,58 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/array/builder_adaptive.h" // IWYU pragma: export -#include "arrow/array/builder_base.h" // IWYU pragma: export -#include "arrow/array/builder_binary.h" // IWYU pragma: export -#include "arrow/array/builder_decimal.h" // IWYU pragma: export -#include "arrow/array/builder_dict.h" // IWYU pragma: export -#include "arrow/array/builder_nested.h" // IWYU pragma: export -#include "arrow/array/builder_primitive.h" // IWYU pragma: export -#include "arrow/array/builder_time.h" // IWYU pragma: export -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class DataType; -class MemoryPool; - -/// \brief Construct an empty ArrayBuilder corresponding to the data -/// type -/// \param[in] pool the MemoryPool to use for allocations -/// \param[in] type an instance of DictionaryType -/// \param[out] out the created ArrayBuilder -ARROW_EXPORT -Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, - std::unique_ptr* out); - -/// \brief Construct an empty DictionaryBuilder initialized optionally -/// with a pre-existing dictionary -/// \param[in] pool the MemoryPool to use for allocations -/// \param[in] type an instance of DictionaryType -/// \param[in] dictionary the initial dictionary, if any. May be nullptr -/// \param[out] out the created ArrayBuilder -ARROW_EXPORT -Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& type, - const std::shared_ptr& dictionary, - std::unique_ptr* out); - -} // namespace arrow diff --git a/r/R/inst/include/arrow/compare.h b/r/R/inst/include/arrow/compare.h deleted file mode 100644 index 21da16b79e4..00000000000 --- a/r/R/inst/include/arrow/compare.h +++ /dev/null @@ -1,101 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for comparing Arrow data structures - -#ifndef ARROW_COMPARE_H -#define ARROW_COMPARE_H - -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -class Tensor; -class SparseTensor; -struct Scalar; - -static constexpr double kDefaultAbsoluteTolerance = 1E-5; - -/// A container of options for equality comparisons -class EqualOptions { - public: - /// Whether or not NaNs are considered equal. - bool nans_equal() const { return nans_equal_; } - - /// Return a new EqualOptions object with the "nans_equal" property changed. - EqualOptions nans_equal(bool v) const { - auto res = EqualOptions(*this); - res.nans_equal_ = v; - return res; - } - - /// The absolute tolerance for approximate comparisons of floating-point values. - double atol() const { return atol_; } - - /// Return a new EqualOptions object with the "atol" property changed. 
- EqualOptions atol(double v) const { - auto res = EqualOptions(*this); - res.atol_ = v; - return res; - } - - static EqualOptions Defaults() { return EqualOptions(); } - - protected: - double atol_ = kDefaultAbsoluteTolerance; - bool nans_equal_ = false; -}; - -/// Returns true if the arrays are exactly equal -bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right, - const EqualOptions& = EqualOptions::Defaults()); - -bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right); - -/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal -bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right); - -/// Returns true if the arrays are approximately equal. For non-floating point -/// types, this is equivalent to ArrayEquals(left, right) -bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right, - const EqualOptions& = EqualOptions::Defaults()); - -/// Returns true if indicated equal-length segment of arrays is exactly equal -bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right, - int64_t start_idx, int64_t end_idx, - int64_t other_start_idx); - -/// Returns true if the type metadata are exactly equal -/// \param[in] left a DataType -/// \param[in] right a DataType -/// \param[in] check_metadata whether to compare KeyValueMetadata for child -/// fields -bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right, - bool check_metadata = true); - -/// Returns true if scalars are equal -/// \param[in] left a Scalar -/// \param[in] right a Scalar -bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right); - -} // namespace arrow - -#endif // ARROW_COMPARE_H diff --git a/r/R/inst/include/arrow/compute/api.h b/r/R/inst/include/arrow/compute/api.h deleted file mode 100644 index 2a2e79f1a4c..00000000000 --- a/r/R/inst/include/arrow/compute/api.h +++ /dev/null @@ -1,33 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under 
one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_COMPUTE_API_H -#define ARROW_COMPUTE_API_H - -#include "arrow/compute/context.h" // IWYU pragma: export -#include "arrow/compute/kernel.h" // IWYU pragma: export - -#include "arrow/compute/kernels/boolean.h" // IWYU pragma: export -#include "arrow/compute/kernels/cast.h" // IWYU pragma: export -#include "arrow/compute/kernels/compare.h" // IWYU pragma: export -#include "arrow/compute/kernels/count.h" // IWYU pragma: export -#include "arrow/compute/kernels/hash.h" // IWYU pragma: export -#include "arrow/compute/kernels/mean.h" // IWYU pragma: export -#include "arrow/compute/kernels/sum.h" // IWYU pragma: export -#include "arrow/compute/kernels/take.h" // IWYU pragma: export - -#endif // ARROW_COMPUTE_API_H diff --git a/r/R/inst/include/arrow/compute/benchmark-util.h b/r/R/inst/include/arrow/compute/benchmark-util.h deleted file mode 100644 index ee9cb9504a3..00000000000 --- a/r/R/inst/include/arrow/compute/benchmark-util.h +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/testing/gtest_util.h" -#include "arrow/util/cpu-info.h" - -namespace arrow { -namespace compute { - -using internal::CpuInfo; -static CpuInfo* cpu_info = CpuInfo::GetInstance(); - -static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE); -static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::L2_CACHE); -static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE); -static const int64_t kCantFitInL3Size = kL3Size * 4; -static const std::vector kMemorySizes = {kL1Size, kL2Size, kL3Size, - kCantFitInL3Size}; - -template -struct BenchmarkArgsType; - -// Pattern matching that extracts the vector element type of Benchmark::Args() -template -struct BenchmarkArgsType&)> { - using type = Values; -}; - -// Benchmark changed its parameter type between releases from -// int to int64_t. As it doesn't have version macros, we need -// to apply C++ template magic. 
-using ArgsType = - typename BenchmarkArgsType::type; - -void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench, - const std::vector& sizes = kMemorySizes) { - bench->Unit(benchmark::kMicrosecond); - - for (auto size : sizes) - for (auto nulls : std::vector({0, 1, 10, 50})) - bench->Args({static_cast(size), nulls}); -} - -void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) { - BenchmarkSetArgsWithSizes(bench, kMemorySizes); -} - -void RegressionSetArgs(benchmark::internal::Benchmark* bench) { - // Regression do not need to account for cache hierarchy, thus optimize for - // the best case. - BenchmarkSetArgsWithSizes(bench, {kL1Size}); -} - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/context.h b/r/R/inst/include/arrow/compute/context.h deleted file mode 100644 index 8ac4700b91f..00000000000 --- a/r/R/inst/include/arrow/compute/context.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_COMPUTE_CONTEXT_H -#define ARROW_COMPUTE_CONTEXT_H - -#include -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; - -namespace internal { -class CpuInfo; -} // namespace internal - -namespace compute { - -#define RETURN_IF_ERROR(ctx) \ - if (ARROW_PREDICT_FALSE(ctx->HasError())) { \ - Status s = ctx->status(); \ - ctx->ResetStatus(); \ - return s; \ - } - -/// \brief Container for variables and options used by function evaluation -class ARROW_EXPORT FunctionContext { - public: - explicit FunctionContext(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - MemoryPool* memory_pool() const; - - /// \brief Allocate buffer from the context's memory pool - Status Allocate(const int64_t nbytes, std::shared_ptr* out); - - /// \brief Indicate that an error has occurred, to be checked by a parent caller - /// \param[in] status a Status instance - /// - /// \note Will not overwrite a prior set Status, so we will have the first - /// error that occurred until FunctionContext::ResetStatus is called - void SetStatus(const Status& status); - - /// \brief Clear any error status - void ResetStatus(); - - /// \brief Return true if an error has occurred - bool HasError() const { return !status_.ok(); } - - /// \brief Return the current status of the context - const Status& status() const { return status_; } - - internal::CpuInfo* cpu_info() const { return cpu_info_; } - - private: - Status status_; - MemoryPool* pool_; - internal::CpuInfo* cpu_info_; -}; - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_CONTEXT_H diff --git a/r/R/inst/include/arrow/compute/expression.h b/r/R/inst/include/arrow/compute/expression.h deleted file mode 100644 index cc558141546..00000000000 --- a/r/R/inst/include/arrow/compute/expression.h +++ /dev/null @@ -1,261 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/compute/type_fwd.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - -class LogicalType; -class ExprVisitor; -class Operation; - -/// \brief Base class for all analytic expressions. Expressions may represent -/// data values (scalars, arrays, tables) -class ARROW_EXPORT Expr { - public: - /// \brief Instantiate expression from an abstract operation - /// \param[in] op the operation that generates the expression - explicit Expr(ConstOpPtr op); - - virtual ~Expr() = default; - - /// \brief A unique string identifier for the kind of expression - virtual std::string kind() const = 0; - - /// \brief Accept expression visitor - /// TODO(wesm) - // virtual Status Accept(ExprVisitor* visitor) const = 0; - - /// \brief The underlying operation - ConstOpPtr op() const { return op_; } - - protected: - ConstOpPtr op_; -}; - -/// The value cardinality: one or many. These correspond to the arrow::Scalar -/// and arrow::Array types -enum class ValueRank { SCALAR, ARRAY }; - -/// \brief Base class for a data-generated expression with a fixed and known -/// type. 
This includes arrays and scalars -class ARROW_EXPORT ValueExpr : public Expr { - public: - /// \brief The name of the expression, if any. The default is unnamed - // virtual const ExprName& name() const; - LogicalTypePtr type() const; - - /// \brief The value cardinality (scalar or array) of the expression - virtual ValueRank rank() const = 0; - - protected: - ValueExpr(ConstOpPtr op, LogicalTypePtr type); - - /// \brief The semantic data type of the expression - LogicalTypePtr type_; -}; - -class ARROW_EXPORT ArrayExpr : public ValueExpr { - protected: - using ValueExpr::ValueExpr; - std::string kind() const override; - ValueRank rank() const override; -}; - -class ARROW_EXPORT ScalarExpr : public ValueExpr { - protected: - using ValueExpr::ValueExpr; - std::string kind() const override; - ValueRank rank() const override; -}; - -namespace value { - -// These are mixin classes to provide a type hierarchy for values identify -class ValueMixin {}; -class Null : public ValueMixin {}; -class Bool : public ValueMixin {}; -class Number : public ValueMixin {}; -class Integer : public Number {}; -class SignedInteger : public Integer {}; -class Int8 : public SignedInteger {}; -class Int16 : public SignedInteger {}; -class Int32 : public SignedInteger {}; -class Int64 : public SignedInteger {}; -class UnsignedInteger : public Integer {}; -class UInt8 : public UnsignedInteger {}; -class UInt16 : public UnsignedInteger {}; -class UInt32 : public UnsignedInteger {}; -class UInt64 : public UnsignedInteger {}; -class Floating : public Number {}; -class Float16 : public Floating {}; -class Float32 : public Floating {}; -class Float64 : public Floating {}; -class Binary : public ValueMixin {}; -class Utf8 : public Binary {}; -class List : public ValueMixin {}; -class Struct : public ValueMixin {}; - -} // namespace value - -#define SIMPLE_EXPR_FACTORY(NAME) ARROW_EXPORT ExprPtr NAME(ConstOpPtr op); - -namespace scalar { - -#define DECLARE_SCALAR_EXPR(TYPE) \ - class ARROW_EXPORT 
TYPE : public ScalarExpr, public value::TYPE { \ - public: \ - explicit TYPE(ConstOpPtr op); \ - using ScalarExpr::kind; \ - }; - -DECLARE_SCALAR_EXPR(Null) -DECLARE_SCALAR_EXPR(Bool) -DECLARE_SCALAR_EXPR(Int8) -DECLARE_SCALAR_EXPR(Int16) -DECLARE_SCALAR_EXPR(Int32) -DECLARE_SCALAR_EXPR(Int64) -DECLARE_SCALAR_EXPR(UInt8) -DECLARE_SCALAR_EXPR(UInt16) -DECLARE_SCALAR_EXPR(UInt32) -DECLARE_SCALAR_EXPR(UInt64) -DECLARE_SCALAR_EXPR(Float16) -DECLARE_SCALAR_EXPR(Float32) -DECLARE_SCALAR_EXPR(Float64) -DECLARE_SCALAR_EXPR(Binary) -DECLARE_SCALAR_EXPR(Utf8) - -#undef DECLARE_SCALAR_EXPR - -SIMPLE_EXPR_FACTORY(null); -SIMPLE_EXPR_FACTORY(boolean); -SIMPLE_EXPR_FACTORY(int8); -SIMPLE_EXPR_FACTORY(int16); -SIMPLE_EXPR_FACTORY(int32); -SIMPLE_EXPR_FACTORY(int64); -SIMPLE_EXPR_FACTORY(uint8); -SIMPLE_EXPR_FACTORY(uint16); -SIMPLE_EXPR_FACTORY(uint32); -SIMPLE_EXPR_FACTORY(uint64); -SIMPLE_EXPR_FACTORY(float16); -SIMPLE_EXPR_FACTORY(float32); -SIMPLE_EXPR_FACTORY(float64); -SIMPLE_EXPR_FACTORY(binary); -SIMPLE_EXPR_FACTORY(utf8); - -class ARROW_EXPORT List : public ScalarExpr, public value::List { - public: - List(ConstOpPtr op, LogicalTypePtr type); - using ScalarExpr::kind; -}; - -class ARROW_EXPORT Struct : public ScalarExpr, public value::Struct { - public: - Struct(ConstOpPtr op, LogicalTypePtr type); - using ScalarExpr::kind; -}; - -} // namespace scalar - -namespace array { - -#define DECLARE_ARRAY_EXPR(TYPE) \ - class ARROW_EXPORT TYPE : public ArrayExpr, public value::TYPE { \ - public: \ - explicit TYPE(ConstOpPtr op); \ - using ArrayExpr::kind; \ - }; - -DECLARE_ARRAY_EXPR(Null) -DECLARE_ARRAY_EXPR(Bool) -DECLARE_ARRAY_EXPR(Int8) -DECLARE_ARRAY_EXPR(Int16) -DECLARE_ARRAY_EXPR(Int32) -DECLARE_ARRAY_EXPR(Int64) -DECLARE_ARRAY_EXPR(UInt8) -DECLARE_ARRAY_EXPR(UInt16) -DECLARE_ARRAY_EXPR(UInt32) -DECLARE_ARRAY_EXPR(UInt64) -DECLARE_ARRAY_EXPR(Float16) -DECLARE_ARRAY_EXPR(Float32) -DECLARE_ARRAY_EXPR(Float64) -DECLARE_ARRAY_EXPR(Binary) -DECLARE_ARRAY_EXPR(Utf8) - -#undef 
DECLARE_ARRAY_EXPR - -SIMPLE_EXPR_FACTORY(null); -SIMPLE_EXPR_FACTORY(boolean); -SIMPLE_EXPR_FACTORY(int8); -SIMPLE_EXPR_FACTORY(int16); -SIMPLE_EXPR_FACTORY(int32); -SIMPLE_EXPR_FACTORY(int64); -SIMPLE_EXPR_FACTORY(uint8); -SIMPLE_EXPR_FACTORY(uint16); -SIMPLE_EXPR_FACTORY(uint32); -SIMPLE_EXPR_FACTORY(uint64); -SIMPLE_EXPR_FACTORY(float16); -SIMPLE_EXPR_FACTORY(float32); -SIMPLE_EXPR_FACTORY(float64); -SIMPLE_EXPR_FACTORY(binary); -SIMPLE_EXPR_FACTORY(utf8); - -class ARROW_EXPORT List : public ArrayExpr, public value::List { - public: - List(ConstOpPtr op, LogicalTypePtr type); - using ArrayExpr::kind; -}; - -class ARROW_EXPORT Struct : public ArrayExpr, public value::Struct { - public: - Struct(ConstOpPtr op, LogicalTypePtr type); - using ArrayExpr::kind; -}; - -} // namespace array - -#undef SIMPLE_EXPR_FACTORY - -template -inline bool InheritsFrom(const ObjectType* obj) { - return dynamic_cast(obj) != NULLPTR; -} - -template -inline bool InheritsFrom(const ObjectType& obj) { - return dynamic_cast(&obj) != NULLPTR; -} - -/// \brief Construct a ScalarExpr containing an Operation given a logical type -ARROW_EXPORT -Status GetScalarExpr(ConstOpPtr op, LogicalTypePtr ty, ExprPtr* out); - -/// \brief Construct an ArrayExpr containing an Operation given a logical type -ARROW_EXPORT -Status GetArrayExpr(ConstOpPtr op, LogicalTypePtr ty, ExprPtr* out); - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernel.h b/r/R/inst/include/arrow/compute/kernel.h deleted file mode 100644 index aba659ebdd3..00000000000 --- a/r/R/inst/include/arrow/compute/kernel.h +++ /dev/null @@ -1,271 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_COMPUTE_KERNEL_H -#define ARROW_COMPUTE_KERNEL_H - -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/scalar.h" -#include "arrow/table.h" -#include "arrow/util/macros.h" -#include "arrow/util/memory.h" -#include "arrow/util/variant.h" // IWYU pragma: export -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - -class FunctionContext; - -/// \class OpKernel -/// \brief Base class for operator kernels -/// -/// Note to implementors: -/// Operator kernels are intended to be the lowest level of an analytics/compute -/// engine. They will generally not be exposed directly to end-users. Instead -/// they will be wrapped by higher level constructs (e.g. top-level functions -/// or physical execution plan nodes). These higher level constructs are -/// responsible for user input validation and returning the appropriate -/// error Status. -/// -/// Due to this design, implementations of Call (the execution -/// method on subclasses) should use assertions (i.e. DCHECK) to double-check -/// parameter arguments when in higher level components returning an -/// InvalidArgument error might be more appropriate. 
-/// -class ARROW_EXPORT OpKernel { - public: - virtual ~OpKernel() = default; - /// \brief EXPERIMENTAL The output data type of the kernel - /// \return the output type - virtual std::shared_ptr out_type() const = 0; -}; - -struct Datum; -static inline bool CollectionEquals(const std::vector& left, - const std::vector& right); - -/// \class Datum -/// \brief Variant type for various Arrow C++ data structures -struct ARROW_EXPORT Datum { - enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION }; - - util::variant, std::shared_ptr, - std::shared_ptr, std::shared_ptr, - std::shared_ptr
, std::vector> - value; - - /// \brief Empty datum, to be populated elsewhere - Datum() : value(NULLPTR) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : Datum(value ? value->data() : NULLPTR) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr
& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::vector& value) // NOLINT implicit conversion - : value(value) {} - - // Cast from subtypes of Array to Datum - template ::value>::type> - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : Datum(std::shared_ptr(value)) {} - - // Convenience constructors - explicit Datum(bool value) : value(std::make_shared(value)) {} - explicit Datum(int8_t value) : value(std::make_shared(value)) {} - explicit Datum(uint8_t value) : value(std::make_shared(value)) {} - explicit Datum(int16_t value) : value(std::make_shared(value)) {} - explicit Datum(uint16_t value) : value(std::make_shared(value)) {} - explicit Datum(int32_t value) : value(std::make_shared(value)) {} - explicit Datum(uint32_t value) : value(std::make_shared(value)) {} - explicit Datum(int64_t value) : value(std::make_shared(value)) {} - explicit Datum(uint64_t value) : value(std::make_shared(value)) {} - explicit Datum(float value) : value(std::make_shared(value)) {} - explicit Datum(double value) : value(std::make_shared(value)) {} - - ~Datum() {} - - Datum(const Datum& other) noexcept { this->value = other.value; } - - Datum& operator=(const Datum& other) noexcept { - value = other.value; - return *this; - } - - // Define move constructor and move assignment, for better performance - Datum(Datum&& other) noexcept : value(std::move(other.value)) {} - - Datum& operator=(Datum&& other) noexcept { - value = std::move(other.value); - return *this; - } - - Datum::type kind() const { - switch (this->value.index()) { - case 0: - return Datum::NONE; - case 1: - return Datum::SCALAR; - case 2: - return Datum::ARRAY; - case 3: - return Datum::CHUNKED_ARRAY; - case 4: - return Datum::RECORD_BATCH; - case 5: - return Datum::TABLE; - case 6: - return Datum::COLLECTION; - default: - return Datum::NONE; - } - } - - std::shared_ptr array() const { - return util::get>(this->value); - } - - std::shared_ptr make_array() const { - return 
MakeArray(util::get>(this->value)); - } - - std::shared_ptr chunked_array() const { - return util::get>(this->value); - } - - std::shared_ptr record_batch() const { - return util::get>(this->value); - } - - std::shared_ptr
table() const { - return util::get>(this->value); - } - - const std::vector collection() const { - return util::get>(this->value); - } - - std::shared_ptr scalar() const { - return util::get>(this->value); - } - - bool is_array() const { return this->kind() == Datum::ARRAY; } - - bool is_arraylike() const { - return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY; - } - - bool is_scalar() const { return this->kind() == Datum::SCALAR; } - - /// \brief The value type of the variant, if any - /// - /// \return nullptr if no type - std::shared_ptr type() const { - if (this->kind() == Datum::ARRAY) { - return util::get>(this->value)->type; - } else if (this->kind() == Datum::CHUNKED_ARRAY) { - return util::get>(this->value)->type(); - } else if (this->kind() == Datum::SCALAR) { - return util::get>(this->value)->type; - } - return NULLPTR; - } - - bool Equals(const Datum& other) const { - if (this->kind() != other.kind()) return false; - - switch (this->kind()) { - case Datum::NONE: - return true; - case Datum::SCALAR: - return internal::SharedPtrEquals(this->scalar(), other.scalar()); - case Datum::ARRAY: - return internal::SharedPtrEquals(this->make_array(), other.make_array()); - case Datum::CHUNKED_ARRAY: - return internal::SharedPtrEquals(this->chunked_array(), other.chunked_array()); - case Datum::RECORD_BATCH: - return internal::SharedPtrEquals(this->record_batch(), other.record_batch()); - case Datum::TABLE: - return internal::SharedPtrEquals(this->table(), other.table()); - case Datum::COLLECTION: - return CollectionEquals(this->collection(), other.collection()); - default: - return false; - } - } -}; - -/// \class UnaryKernel -/// \brief An array-valued function of a single input argument. -/// -/// Note to implementors: Try to avoid making kernels that allocate memory if -/// the output size is a deterministic function of the Input Datum's metadata. 
-/// Instead separate the logic of the kernel and allocations necessary into -/// two different kernels. Some reusable kernels that allocate buffers -/// and delegate computation to another kernel are available in util-internal.h. -class ARROW_EXPORT UnaryKernel : public OpKernel { - public: - /// \brief Executes the kernel. - /// - /// \param[in] ctx The function context for the kernel - /// \param[in] input The kernel input data - /// \param[out] out The output of the function. Each implementation of this - /// function might assume different things about the existing contents of out - /// (e.g. which buffers are preallocated). In the future it is expected that - /// there will be a more generic mechansim for understanding the necessary - /// contracts. - virtual Status Call(FunctionContext* ctx, const Datum& input, Datum* out) = 0; -}; - -/// \class BinaryKernel -/// \brief An array-valued function of a two input arguments -class ARROW_EXPORT BinaryKernel : public OpKernel { - public: - virtual Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out) = 0; -}; - -static inline bool CollectionEquals(const std::vector& left, - const std::vector& right) { - if (left.size() != right.size()) return false; - - for (size_t i = 0; i < left.size(); i++) - if (!left[i].Equals(right[i])) return false; - - return true; -} - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_KERNEL_H diff --git a/r/R/inst/include/arrow/compute/kernels/aggregate.h b/r/R/inst/include/arrow/compute/kernels/aggregate.h deleted file mode 100644 index 2fe82636f81..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/aggregate.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" - -namespace arrow { - -class Array; -class Status; - -namespace compute { - -class FunctionContext; -struct Datum; - -/// AggregateFunction is an interface for Aggregates -/// -/// An aggregates transforms an array into single result called a state via the -/// Consume method.. State supports the merge operation via the Merge method. -/// State can be sealed into a final result via the Finalize method. -// -/// State ownership is handled by callers, thus the interface exposes 3 methods -/// for the caller to manage memory: -/// - Size -/// - New (placement new constructor invocation) -/// - Delete (state desctructor) -/// -/// Design inspired by ClickHouse aggregate functions. -class AggregateFunction { - public: - /// \brief Consume an array into a state. - virtual Status Consume(const Array& input, void* state) const = 0; - - /// \brief Merge states. - virtual Status Merge(const void* src, void* dst) const = 0; - - /// \brief Convert state into a final result. - virtual Status Finalize(const void* src, Datum* output) const = 0; - - virtual ~AggregateFunction() {} - - virtual std::shared_ptr out_type() const = 0; - - /// State management methods. 
- virtual int64_t Size() const = 0; - virtual void New(void* ptr) const = 0; - virtual void Delete(void* ptr) const = 0; -}; - -/// AggregateFunction partial implementation for static type state -template -class AggregateFunctionStaticState : public AggregateFunction { - virtual Status Consume(const Array& input, State* state) const = 0; - virtual Status Merge(const State& src, State* dst) const = 0; - virtual Status Finalize(const State& src, Datum* output) const = 0; - - Status Consume(const Array& input, void* state) const final { - return Consume(input, static_cast(state)); - } - - Status Merge(const void* src, void* dst) const final { - return Merge(*static_cast(src), static_cast(dst)); - } - - /// \brief Convert state into a final result. - Status Finalize(const void* src, Datum* output) const final { - return Finalize(*static_cast(src), output); - } - - int64_t Size() const final { return sizeof(State); } - - void New(void* ptr) const final { - // By using placement-new syntax, the constructor of the State is invoked - // in the memory location defined by the caller. This only supports State - // with a parameter-less constructor. 
- new (ptr) State; - } - - void Delete(void* ptr) const final { static_cast(ptr)->~State(); } -}; - -/// \brief UnaryKernel implemented by an AggregateState -class ARROW_EXPORT AggregateUnaryKernel : public UnaryKernel { - public: - explicit AggregateUnaryKernel(std::shared_ptr& aggregate) - : aggregate_function_(aggregate) {} - - Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; - - std::shared_ptr out_type() const override; - - private: - std::shared_ptr aggregate_function_; -}; - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/boolean.h b/r/R/inst/include/arrow/compute/kernels/boolean.h deleted file mode 100644 index fb88659dbc4..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/boolean.h +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_COMPUTE_KERNELS_BOOLEAN_H -#define ARROW_COMPUTE_KERNELS_BOOLEAN_H - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - -struct Datum; -class FunctionContext; - -/// \brief Invert the values of a boolean datum -/// \param[in] context the FunctionContext -/// \param[in] value datum to invert -/// \param[out] out resulting datum -/// -/// \since 0.11.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Invert(FunctionContext* context, const Datum& value, Datum* out); - -/// \brief Element-wise AND of two boolean datums -/// \param[in] context the FunctionContext -/// \param[in] left left operand (array) -/// \param[in] right right operand (array) -/// \param[out] out resulting datum -/// -/// \since 0.11.0 -/// \note API not yet finalized -ARROW_EXPORT -Status And(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); - -/// \brief Element-wise OR of two boolean datums -/// \param[in] context the FunctionContext -/// \param[in] left left operand (array) -/// \param[in] right right operand (array) -/// \param[out] out resulting datum -/// -/// \since 0.11.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Or(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); - -/// \brief Element-wise XOR of two boolean datums -/// \param[in] context the FunctionContext -/// \param[in] left left operand (array) -/// \param[in] right right operand (array) -/// \param[out] out resulting datum -/// -/// \since 0.11.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Xor(FunctionContext* context, const Datum& left, const Datum& right, Datum* out); - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_KERNELS_CAST_H diff --git a/r/R/inst/include/arrow/compute/kernels/cast.h b/r/R/inst/include/arrow/compute/kernels/cast.h deleted file mode 100644 index 5a7c5be93bd..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/cast.h +++ 
/dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_COMPUTE_KERNELS_CAST_H -#define ARROW_COMPUTE_KERNELS_CAST_H - -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; - -namespace compute { - -struct Datum; -class FunctionContext; -class UnaryKernel; - -struct ARROW_EXPORT CastOptions { - CastOptions() - : allow_int_overflow(false), - allow_time_truncate(false), - allow_float_truncate(false), - allow_invalid_utf8(false) {} - - explicit CastOptions(bool safe) - : allow_int_overflow(!safe), - allow_time_truncate(!safe), - allow_float_truncate(!safe), - allow_invalid_utf8(!safe) {} - - static CastOptions Safe() { return CastOptions(true); } - - static CastOptions Unsafe() { return CastOptions(false); } - - bool allow_int_overflow; - bool allow_time_truncate; - bool allow_float_truncate; - // Indicate if conversions from Binary/FixedSizeBinary to string must - // validate the utf8 payload. 
- bool allow_invalid_utf8; -}; - -/// \since 0.7.0 -/// \note API not yet finalized -ARROW_EXPORT -Status GetCastFunction(const DataType& in_type, std::shared_ptr to_type, - const CastOptions& options, std::unique_ptr* kernel); - -/// \brief Cast from one array type to another -/// \param[in] context the FunctionContext -/// \param[in] value array to cast -/// \param[in] to_type type to cast to -/// \param[in] options casting options -/// \param[out] out resulting array -/// -/// \since 0.7.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Cast(FunctionContext* context, const Array& value, - std::shared_ptr to_type, const CastOptions& options, - std::shared_ptr* out); - -/// \brief Cast from one value to another -/// \param[in] context the FunctionContext -/// \param[in] value datum to cast -/// \param[in] to_type type to cast to -/// \param[in] options casting options -/// \param[out] out resulting datum -/// -/// \since 0.8.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Cast(FunctionContext* context, const Datum& value, - std::shared_ptr to_type, const CastOptions& options, Datum* out); - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_KERNELS_CAST_H diff --git a/r/R/inst/include/arrow/compute/kernels/compare.h b/r/R/inst/include/arrow/compute/kernels/compare.h deleted file mode 100644 index a1924512916..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/compare.h +++ /dev/null @@ -1,116 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -struct Scalar; -class Status; - -namespace compute { - -struct Datum; -class FilterFunction; -class FunctionContext; - -enum CompareOperator { - EQUAL, - NOT_EQUAL, - GREATER, - GREATER_EQUAL, - LESS, - LESS_EQUAL, -}; - -template -struct Comparator; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs == rhs; } -}; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs != rhs; } -}; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs > rhs; } -}; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs >= rhs; } -}; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs < rhs; } -}; - -template -struct Comparator { - constexpr static bool Compare(const T& lhs, const T& rhs) { return lhs <= rhs; } -}; - -struct CompareOptions { - explicit CompareOptions(CompareOperator op) : op(op) {} - - enum CompareOperator op; -}; - -/// \brief Return a Compare FilterFunction -/// -/// \param[in] context FunctionContext passing context information -/// \param[in] type required to specialize the kernel -/// \param[in] options required to specify the compare operator -/// -/// \since 0.14.0 -/// \note API not yet finalized -ARROW_EXPORT -std::shared_ptr 
MakeCompareFilterFunction(FunctionContext* context, - const DataType& type, - struct CompareOptions options); - -/// \brief Compare a numeric array with a scalar. -/// -/// \param[in] context the FunctionContext -/// \param[in] left datum to compare, must be an Array -/// \param[in] right datum to compare, must be a Scalar of the same type than -/// left Datum. -/// \param[in] options compare options -/// \param[out] out resulting datum -/// -/// Note on floating point arrays, this uses ieee-754 compare semantics. -/// -/// \since 0.14.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Compare(FunctionContext* context, const Datum& left, const Datum& right, - struct CompareOptions options, Datum* out); - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/count.h b/r/R/inst/include/arrow/compute/kernels/count.h deleted file mode 100644 index c33ac48665a..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/count.h +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; - -namespace compute { - -struct Datum; -class FunctionContext; -class AggregateFunction; - -/// \class CountOptions -/// -/// The user control the Count kernel behavior with this class. By default, the -/// it will count all non-null values. -struct ARROW_EXPORT CountOptions { - enum mode { - // Count all non-null values. - COUNT_ALL = 0, - // Count all null values. - COUNT_NULL, - }; - - explicit CountOptions(enum mode count_mode) : count_mode(count_mode) {} - - enum mode count_mode = COUNT_ALL; -}; - -/// \brief Return Count function aggregate -ARROW_EXPORT -std::shared_ptr MakeCount(FunctionContext* context, - const CountOptions& options); - -/// \brief Count non-null (or null) values in an array. -/// -/// \param[in] context the FunctionContext -/// \param[in] options counting options, see CountOptions for more information -/// \param[in] datum to count -/// \param[out] out resulting datum -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Count(FunctionContext* context, const CountOptions& options, const Datum& datum, - Datum* out); - -/// \brief Count non-null (or null) values in an array. 
-/// -/// \param[in] context the FunctionContext -/// \param[in] options counting options, see CountOptions for more information -/// \param[in] array to count -/// \param[out] out resulting datum -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Count(FunctionContext* context, const CountOptions& options, const Array& array, - Datum* out); - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/filter.h b/r/R/inst/include/arrow/compute/kernels/filter.h deleted file mode 100644 index becd2d5a11a..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/filter.h +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" - -namespace arrow { - -class Array; -struct Scalar; -class Status; - -namespace compute { - -class FunctionContext; -struct Datum; - -/// FilterFunction is an interface for Filters -/// -/// Filters takes an array and emits a selection vector. The selection vector -/// is given in the form of a bitmask as a BooleanArray result. -class ARROW_EXPORT FilterFunction { - public: - /// Filter an array with a scalar argument. 
- virtual Status Filter(const ArrayData& input, const Scalar& scalar, - ArrayData* output) const = 0; - - /// By default, FilterFunction emits a result bitmap. - virtual std::shared_ptr out_type() const { return boolean(); } - - virtual ~FilterFunction() {} -}; - -/// \brief BinaryKernel bound to a filter function -class ARROW_EXPORT FilterBinaryKernel : public BinaryKernel { - public: - explicit FilterBinaryKernel(std::shared_ptr& filter) - : filter_function_(filter) {} - - Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out) override; - - std::shared_ptr out_type() const override; - - private: - std::shared_ptr filter_function_; -}; - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h b/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h deleted file mode 100644 index 77334af36b5..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/generated/cast-codegen-internal.h +++ /dev/null @@ -1,208 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// THIS FILE IS AUTOMATICALLY GENERATED, DO NOT EDIT -// Generated by codegen.py script -#define BOOLEAN_CASES(TEMPLATE) \ - TEMPLATE(BooleanType, UInt8Type) \ - TEMPLATE(BooleanType, Int8Type) \ - TEMPLATE(BooleanType, UInt16Type) \ - TEMPLATE(BooleanType, Int16Type) \ - TEMPLATE(BooleanType, UInt32Type) \ - TEMPLATE(BooleanType, Int32Type) \ - TEMPLATE(BooleanType, UInt64Type) \ - TEMPLATE(BooleanType, Int64Type) \ - TEMPLATE(BooleanType, FloatType) \ - TEMPLATE(BooleanType, DoubleType) - -#define UINT8_CASES(TEMPLATE) \ - TEMPLATE(UInt8Type, BooleanType) \ - TEMPLATE(UInt8Type, Int8Type) \ - TEMPLATE(UInt8Type, UInt16Type) \ - TEMPLATE(UInt8Type, Int16Type) \ - TEMPLATE(UInt8Type, UInt32Type) \ - TEMPLATE(UInt8Type, Int32Type) \ - TEMPLATE(UInt8Type, UInt64Type) \ - TEMPLATE(UInt8Type, Int64Type) \ - TEMPLATE(UInt8Type, FloatType) \ - TEMPLATE(UInt8Type, DoubleType) - -#define INT8_CASES(TEMPLATE) \ - TEMPLATE(Int8Type, BooleanType) \ - TEMPLATE(Int8Type, UInt8Type) \ - TEMPLATE(Int8Type, UInt16Type) \ - TEMPLATE(Int8Type, Int16Type) \ - TEMPLATE(Int8Type, UInt32Type) \ - TEMPLATE(Int8Type, Int32Type) \ - TEMPLATE(Int8Type, UInt64Type) \ - TEMPLATE(Int8Type, Int64Type) \ - TEMPLATE(Int8Type, FloatType) \ - TEMPLATE(Int8Type, DoubleType) - -#define UINT16_CASES(TEMPLATE) \ - TEMPLATE(UInt16Type, BooleanType) \ - TEMPLATE(UInt16Type, UInt8Type) \ - TEMPLATE(UInt16Type, Int8Type) \ - TEMPLATE(UInt16Type, Int16Type) \ - TEMPLATE(UInt16Type, UInt32Type) \ - TEMPLATE(UInt16Type, Int32Type) \ - TEMPLATE(UInt16Type, UInt64Type) \ - TEMPLATE(UInt16Type, Int64Type) \ - TEMPLATE(UInt16Type, FloatType) \ - TEMPLATE(UInt16Type, DoubleType) - -#define INT16_CASES(TEMPLATE) \ - TEMPLATE(Int16Type, BooleanType) \ - TEMPLATE(Int16Type, UInt8Type) \ - TEMPLATE(Int16Type, Int8Type) \ - TEMPLATE(Int16Type, UInt16Type) \ - TEMPLATE(Int16Type, UInt32Type) \ - TEMPLATE(Int16Type, Int32Type) \ - TEMPLATE(Int16Type, UInt64Type) \ - TEMPLATE(Int16Type, Int64Type) \ - 
TEMPLATE(Int16Type, FloatType) \ - TEMPLATE(Int16Type, DoubleType) - -#define UINT32_CASES(TEMPLATE) \ - TEMPLATE(UInt32Type, BooleanType) \ - TEMPLATE(UInt32Type, UInt8Type) \ - TEMPLATE(UInt32Type, Int8Type) \ - TEMPLATE(UInt32Type, UInt16Type) \ - TEMPLATE(UInt32Type, Int16Type) \ - TEMPLATE(UInt32Type, Int32Type) \ - TEMPLATE(UInt32Type, UInt64Type) \ - TEMPLATE(UInt32Type, Int64Type) \ - TEMPLATE(UInt32Type, FloatType) \ - TEMPLATE(UInt32Type, DoubleType) - -#define UINT64_CASES(TEMPLATE) \ - TEMPLATE(UInt64Type, BooleanType) \ - TEMPLATE(UInt64Type, UInt8Type) \ - TEMPLATE(UInt64Type, Int8Type) \ - TEMPLATE(UInt64Type, UInt16Type) \ - TEMPLATE(UInt64Type, Int16Type) \ - TEMPLATE(UInt64Type, UInt32Type) \ - TEMPLATE(UInt64Type, Int32Type) \ - TEMPLATE(UInt64Type, Int64Type) \ - TEMPLATE(UInt64Type, FloatType) \ - TEMPLATE(UInt64Type, DoubleType) - -#define INT32_CASES(TEMPLATE) \ - TEMPLATE(Int32Type, BooleanType) \ - TEMPLATE(Int32Type, UInt8Type) \ - TEMPLATE(Int32Type, Int8Type) \ - TEMPLATE(Int32Type, UInt16Type) \ - TEMPLATE(Int32Type, Int16Type) \ - TEMPLATE(Int32Type, UInt32Type) \ - TEMPLATE(Int32Type, UInt64Type) \ - TEMPLATE(Int32Type, Int64Type) \ - TEMPLATE(Int32Type, FloatType) \ - TEMPLATE(Int32Type, DoubleType) - -#define INT64_CASES(TEMPLATE) \ - TEMPLATE(Int64Type, BooleanType) \ - TEMPLATE(Int64Type, UInt8Type) \ - TEMPLATE(Int64Type, Int8Type) \ - TEMPLATE(Int64Type, UInt16Type) \ - TEMPLATE(Int64Type, Int16Type) \ - TEMPLATE(Int64Type, UInt32Type) \ - TEMPLATE(Int64Type, Int32Type) \ - TEMPLATE(Int64Type, UInt64Type) \ - TEMPLATE(Int64Type, FloatType) \ - TEMPLATE(Int64Type, DoubleType) - -#define FLOAT_CASES(TEMPLATE) \ - TEMPLATE(FloatType, BooleanType) \ - TEMPLATE(FloatType, UInt8Type) \ - TEMPLATE(FloatType, Int8Type) \ - TEMPLATE(FloatType, UInt16Type) \ - TEMPLATE(FloatType, Int16Type) \ - TEMPLATE(FloatType, UInt32Type) \ - TEMPLATE(FloatType, Int32Type) \ - TEMPLATE(FloatType, UInt64Type) \ - TEMPLATE(FloatType, Int64Type) \ - 
TEMPLATE(FloatType, DoubleType) - -#define DOUBLE_CASES(TEMPLATE) \ - TEMPLATE(DoubleType, BooleanType) \ - TEMPLATE(DoubleType, UInt8Type) \ - TEMPLATE(DoubleType, Int8Type) \ - TEMPLATE(DoubleType, UInt16Type) \ - TEMPLATE(DoubleType, Int16Type) \ - TEMPLATE(DoubleType, UInt32Type) \ - TEMPLATE(DoubleType, Int32Type) \ - TEMPLATE(DoubleType, UInt64Type) \ - TEMPLATE(DoubleType, Int64Type) \ - TEMPLATE(DoubleType, FloatType) - -#define DATE32_CASES(TEMPLATE) \ - TEMPLATE(Date32Type, Date64Type) - -#define DATE64_CASES(TEMPLATE) \ - TEMPLATE(Date64Type, Date32Type) - -#define TIME32_CASES(TEMPLATE) \ - TEMPLATE(Time32Type, Time32Type) \ - TEMPLATE(Time32Type, Time64Type) - -#define TIME64_CASES(TEMPLATE) \ - TEMPLATE(Time64Type, Time32Type) \ - TEMPLATE(Time64Type, Time64Type) - -#define TIMESTAMP_CASES(TEMPLATE) \ - TEMPLATE(TimestampType, Date32Type) \ - TEMPLATE(TimestampType, Date64Type) \ - TEMPLATE(TimestampType, TimestampType) - -#define BINARY_CASES(TEMPLATE) \ - TEMPLATE(BinaryType, StringType) - -#define STRING_CASES(TEMPLATE) \ - TEMPLATE(StringType, BooleanType) \ - TEMPLATE(StringType, UInt8Type) \ - TEMPLATE(StringType, Int8Type) \ - TEMPLATE(StringType, UInt16Type) \ - TEMPLATE(StringType, Int16Type) \ - TEMPLATE(StringType, UInt32Type) \ - TEMPLATE(StringType, Int32Type) \ - TEMPLATE(StringType, UInt64Type) \ - TEMPLATE(StringType, Int64Type) \ - TEMPLATE(StringType, FloatType) \ - TEMPLATE(StringType, DoubleType) \ - TEMPLATE(StringType, TimestampType) - -#define DICTIONARY_CASES(TEMPLATE) \ - TEMPLATE(DictionaryType, UInt8Type) \ - TEMPLATE(DictionaryType, Int8Type) \ - TEMPLATE(DictionaryType, UInt16Type) \ - TEMPLATE(DictionaryType, Int16Type) \ - TEMPLATE(DictionaryType, UInt32Type) \ - TEMPLATE(DictionaryType, Int32Type) \ - TEMPLATE(DictionaryType, UInt64Type) \ - TEMPLATE(DictionaryType, Int64Type) \ - TEMPLATE(DictionaryType, FloatType) \ - TEMPLATE(DictionaryType, DoubleType) \ - TEMPLATE(DictionaryType, Date32Type) \ - 
TEMPLATE(DictionaryType, Date64Type) \ - TEMPLATE(DictionaryType, Time32Type) \ - TEMPLATE(DictionaryType, Time64Type) \ - TEMPLATE(DictionaryType, TimestampType) \ - TEMPLATE(DictionaryType, NullType) \ - TEMPLATE(DictionaryType, BinaryType) \ - TEMPLATE(DictionaryType, FixedSizeBinaryType) \ - TEMPLATE(DictionaryType, StringType) \ - TEMPLATE(DictionaryType, Decimal128Type) diff --git a/r/R/inst/include/arrow/compute/kernels/hash.h b/r/R/inst/include/arrow/compute/kernels/hash.h deleted file mode 100644 index edc7c493e46..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/hash.h +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_COMPUTE_KERNELS_HASH_H -#define ARROW_COMPUTE_KERNELS_HASH_H - -#include - -#include "arrow/compute/kernel.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -struct ArrayData; - -namespace compute { - -class FunctionContext; - -/// \brief Compute unique elements from an array-like object -/// -/// Note if a null occurs in the input it will NOT be included in the output. 
-/// -/// \param[in] context the FunctionContext -/// \param[in] datum array-like input -/// \param[out] out result as Array -/// -/// \since 0.8.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Unique(FunctionContext* context, const Datum& datum, std::shared_ptr* out); - -// Constants for accessing the output of ValueCounts -ARROW_EXPORT extern const char kValuesFieldName[]; -ARROW_EXPORT extern const char kCountsFieldName[]; -ARROW_EXPORT extern const int32_t kValuesFieldIndex; -ARROW_EXPORT extern const int32_t kCountsFieldIndex; -/// \brief Return counts of unique elements from an array-like object. -/// -/// Note that the counts do not include counts for nulls in the array. These can be -/// obtained separately from metadata. -/// -/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values -/// which can lead to unexpected results if the input Array has these values. -/// -/// \param[in] context the FunctionContext -/// \param[in] value array-like input -/// \param[out] counts An array of structs. 
-/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status ValueCounts(FunctionContext* context, const Datum& value, - std::shared_ptr* counts); - -/// \brief Dictionary-encode values in an array-like object -/// \param[in] context the FunctionContext -/// \param[in] data array-like input -/// \param[out] out result with same shape and type as input -/// -/// \since 0.8.0 -/// \note API not yet finalized -ARROW_EXPORT -Status DictionaryEncode(FunctionContext* context, const Datum& data, Datum* out); - -// TODO(wesm): Define API for incremental dictionary encoding - -// TODO(wesm): Define API for regularizing DictionaryArray objects with -// different dictionaries - -// -// ARROW_EXPORT -// Status DictionaryEncode(FunctionContext* context, const Datum& data, -// const Array& prior_dictionary, Datum* out); - -// TODO(wesm): Implement these next -// ARROW_EXPORT -// Status Match(FunctionContext* context, const Datum& values, const Datum& member_set, -// Datum* out); - -// ARROW_EXPORT -// Status IsIn(FunctionContext* context, const Datum& values, const Datum& member_set, -// Datum* out); - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_KERNELS_HASH_H diff --git a/r/R/inst/include/arrow/compute/kernels/mean.h b/r/R/inst/include/arrow/compute/kernels/mean.h deleted file mode 100644 index 5074d4e7b7d..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/mean.h +++ /dev/null @@ -1,66 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; - -namespace compute { - -struct Datum; -class FunctionContext; -class AggregateFunction; - -ARROW_EXPORT -std::shared_ptr MakeMeanAggregateFunction(const DataType& type, - FunctionContext* context); - -/// \brief Compute the mean of a numeric array. -/// -/// \param[in] context the FunctionContext -/// \param[in] value datum to compute the mean, expecting Array -/// \param[out] mean datum of the computed mean as a DoubleScalar -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Mean(FunctionContext* context, const Datum& value, Datum* mean); - -/// \brief Compute the mean of a numeric array. -/// -/// \param[in] context the FunctionContext -/// \param[in] array to compute the mean -/// \param[out] mean datum of the computed mean as a DoubleScalar -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Mean(FunctionContext* context, const Array& array, Datum* mean); - -} // namespace compute -}; // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/sum-internal.h b/r/R/inst/include/arrow/compute/kernels/sum-internal.h deleted file mode 100644 index a4e7ea63439..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/sum-internal.h +++ /dev/null @@ -1,207 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/aggregate.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/logging.h" - -namespace arrow { - -class Array; -class DataType; - -namespace compute { - -// Find the largest compatible primitive type for a primitive type. -template -struct FindAccumulatorType {}; - -template -struct FindAccumulatorType> { - using Type = Int64Type; -}; - -template -struct FindAccumulatorType> { - using Type = UInt64Type; -}; - -template -struct FindAccumulatorType> { - using Type = DoubleType; -}; - -template -class SumAggregateFunction final : public AggregateFunctionStaticState { - using CType = typename TypeTraits::CType; - using ArrayType = typename TypeTraits::ArrayType; - - // A small number of elements rounded to the next cacheline. This should - // amount to a maximum of 4 cachelines when dealing with 8 bytes elements. 
- static constexpr int64_t kTinyThreshold = 32; - static_assert(kTinyThreshold >= (2 * CHAR_BIT) + 1, - "ConsumeSparse requires 3 bytes of null bitmap, and 17 is the" - "required minimum number of bits/elements to cover 3 bytes."); - - public: - Status Consume(const Array& input, StateType* state) const override { - const ArrayType& array = static_cast(input); - - if (input.null_count() == 0) { - *state = ConsumeDense(array); - } else if (input.length() <= kTinyThreshold) { - // In order to simplify ConsumeSparse implementation (requires at least 3 - // bytes of bitmap data), small arrays are handled differently. - *state = ConsumeTiny(array); - } else { - *state = ConsumeSparse(array); - } - - return Status::OK(); - } - - Status Merge(const StateType& src, StateType* dst) const override { - *dst += src; - return Status::OK(); - } - - Status Finalize(const StateType& src, Datum* output) const override { - *output = src.Finalize(); - return Status::OK(); - } - - std::shared_ptr out_type() const override { return StateType::out_type(); } - - private: - StateType ConsumeDense(const ArrayType& array) const { - StateType local; - - const auto values = array.raw_values(); - const int64_t length = array.length(); - for (int64_t i = 0; i < length; i++) { - local.sum += values[i]; - } - - local.count = length; - - return local; - } - - StateType ConsumeTiny(const ArrayType& array) const { - StateType local; - - internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), - array.length()); - const auto values = array.raw_values(); - for (int64_t i = 0; i < array.length(); i++) { - if (reader.IsSet()) { - local.sum += values[i]; - local.count++; - } - reader.Next(); - } - - return local; - } - - // While this is not branchless, gcc needs this to be in a different function - // for it to generate cmov which ends to be slightly faster than - // multiplication but safe for handling NaN with doubles. 
- inline CType MaskedValue(bool valid, CType value) const { return valid ? value : 0; } - - inline StateType UnrolledSum(uint8_t bits, const CType* values) const { - StateType local; - - if (bits < 0xFF) { - // Some nulls - for (size_t i = 0; i < 8; i++) { - local.sum += MaskedValue(bits & (1U << i), values[i]); - } - local.count += BitUtil::kBytePopcount[bits]; - } else { - // No nulls - for (size_t i = 0; i < 8; i++) { - local.sum += values[i]; - } - local.count += 8; - } - - return local; - } - - StateType ConsumeSparse(const ArrayType& array) const { - StateType local; - - // Sliced bitmaps on non-byte positions induce problem with the branchless - // unrolled technique. Thus extra padding is added on both left and right - // side of the slice such that both ends are byte-aligned. The first and - // last bitmap are properly masked to ignore extra values induced by - // padding. - // - // The execution is divided in 3 sections. - // - // 1. Compute the sum of the first masked byte. - // 2. Compute the sum of the middle bytes - // 3. Compute the sum of the last masked byte. - - const int64_t length = array.length(); - const int64_t offset = array.offset(); - - // The number of bytes covering the range, this includes partial bytes. - // This number bounded by `<= (length / 8) + 2`, e.g. a possible extra byte - // on the left, and on the right. - const int64_t covering_bytes = BitUtil::CoveringBytes(offset, length); - DCHECK_GE(covering_bytes, 3); - - // Align values to the first batch of 8 elements. Note that raw_values() is - // already adjusted with the offset, thus we rewind a little to align to - // the closest 8-batch offset. - const auto values = array.raw_values() - (offset % 8); - - // Align bitmap at the first consumable byte. - const auto bitmap = array.null_bitmap_data() + BitUtil::RoundDown(offset, 8) / 8; - - // Consume the first (potentially partial) byte. 
- const uint8_t first_mask = BitUtil::kTrailingBitmask[offset % 8]; - local += UnrolledSum(bitmap[0] & first_mask, values); - - // Consume the (full) middle bytes. The loop iterates in unit of - // batches of 8 values and 1 byte of bitmap. - for (int64_t i = 1; i < covering_bytes - 1; i++) { - local += UnrolledSum(bitmap[i], &values[i * 8]); - } - - // Consume the last (potentially partial) byte. - const int64_t last_idx = covering_bytes - 1; - const uint8_t last_mask = BitUtil::kPrecedingWrappingBitmask[(offset + length) % 8]; - local += UnrolledSum(bitmap[last_idx] & last_mask, &values[last_idx * 8]); - - return local; - } -}; // namespace compute - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/sum.h b/r/R/inst/include/arrow/compute/kernels/sum.h deleted file mode 100644 index e6f95490d7c..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/sum.h +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -class Status; - -namespace compute { - -struct Datum; -class FunctionContext; -class AggregateFunction; - -/// \brief Return a Sum Kernel -/// -/// \param[in] type required to specialize the kernel -/// \param[in] context the FunctionContext -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -std::shared_ptr MakeSumAggregateFunction(const DataType& type, - FunctionContext* context); - -/// \brief Sum values of a numeric array. -/// -/// \param[in] context the FunctionContext -/// \param[in] value datum to sum, expecting Array or ChunkedArray -/// \param[out] out resulting datum -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Sum(FunctionContext* context, const Datum& value, Datum* out); - -/// \brief Sum values of a numeric array. -/// -/// \param[in] context the FunctionContext -/// \param[in] array to sum -/// \param[out] out resulting datum -/// -/// \since 0.13.0 -/// \note API not yet finalized -ARROW_EXPORT -Status Sum(FunctionContext* context, const Array& array, Datum* out); - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/take.h b/r/R/inst/include/arrow/compute/kernels/take.h deleted file mode 100644 index 3aa5ed5eedf..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/take.h +++ /dev/null @@ -1,83 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; - -namespace compute { - -class FunctionContext; - -struct ARROW_EXPORT TakeOptions {}; - -/// \brief Take from an array of values at indices in another array -/// -/// The output array will be of the same type as the input values -/// array, with elements taken from the values array at the given -/// indices. If an index is null then the taken element will be null. -/// -/// For example given values = ["a", "b", "c", null, "e", "f"] and -/// indices = [2, 1, null, 3], the output will be -/// = [values[2], values[1], null, values[3]] -/// = ["c", "b", null, null] -/// -/// \param[in] context the FunctionContext -/// \param[in] values array from which to take -/// \param[in] indices which values to take -/// \param[in] options options -/// \param[out] out resulting array -ARROW_EXPORT -Status Take(FunctionContext* context, const Array& values, const Array& indices, - const TakeOptions& options, std::shared_ptr* out); - -/// \brief Take from an array of values at indices in another array -/// -/// \param[in] context the FunctionContext -/// \param[in] values datum from which to take -/// \param[in] indices which values to take -/// \param[in] options options -/// \param[out] out resulting datum -ARROW_EXPORT -Status Take(FunctionContext* context, const Datum& values, const Datum& indices, - const TakeOptions& options, Datum* out); - -/// \brief BinaryKernel implementing Take operation 
-class ARROW_EXPORT TakeKernel : public BinaryKernel { - public: - explicit TakeKernel(const std::shared_ptr& type, TakeOptions options = {}) - : type_(type), options_(options) {} - - Status Call(FunctionContext* ctx, const Datum& values, const Datum& indices, - Datum* out) override; - - std::shared_ptr out_type() const override { return type_; } - - private: - std::shared_ptr type_; - TakeOptions options_; -}; -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/kernels/util-internal.h b/r/R/inst/include/arrow/compute/kernels/util-internal.h deleted file mode 100644 index 25a670c8b25..00000000000 --- a/r/R/inst/include/arrow/compute/kernels/util-internal.h +++ /dev/null @@ -1,144 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H -#define ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H - -#include -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/compute/kernel.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - -class FunctionContext; - -// \brief Make a copy of the buffers into a destination array without carrying -// the type. 
-static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) { - output->length = input.length; - output->null_count = input.null_count; - output->buffers = input.buffers; - output->offset = input.offset; - output->child_data = input.child_data; -} - -namespace detail { - -/// \brief Invoke the kernel on value using the ctx and store results in outputs. -/// -/// \param[in,out] ctx The function context to use when invoking the kernel. -/// \param[in,out] kernel The kernel to execute. -/// \param[in] value The input value to execute the kernel with. -/// \param[out] outputs One ArrayData datum for each ArrayData available in value. -ARROW_EXPORT -Status InvokeUnaryArrayKernel(FunctionContext* ctx, UnaryKernel* kernel, - const Datum& value, std::vector* outputs); - -ARROW_EXPORT -Status InvokeBinaryArrayKernel(FunctionContext* ctx, BinaryKernel* kernel, - const Datum& left, const Datum& right, - std::vector* outputs); -ARROW_EXPORT -Status InvokeBinaryArrayKernel(FunctionContext* ctx, BinaryKernel* kernel, - const Datum& left, const Datum& right, Datum* output); - -/// \brief Assign validity bitmap to output, copying bitmap if necessary, but -/// zero-copy otherwise, so that the same value slots are valid/not-null in the -/// output (sliced arrays). -/// -/// \param[in] ctx the kernel FunctionContext -/// \param[in] input the input array -/// \param[out] output the output array. Must have length set correctly. -ARROW_EXPORT -Status PropagateNulls(FunctionContext* ctx, const ArrayData& input, ArrayData* output); - -/// \brief Set validity bitmap in output with all null values. -/// -/// \param[in] ctx the kernel FunctionContext -/// \param[in] input the input array -/// \param[out] output the output array. Must have length and buffer set correctly. 
-ARROW_EXPORT -Status SetAllNulls(FunctionContext* ctx, const ArrayData& input, ArrayData* output); - -/// \brief Assign validity bitmap to output, taking the intersection of left and right -/// null bitmaps if necessary, but zero-copy otherwise. -/// -/// \param[in] ctx the kernel FunctionContext -/// \param[in] left the left operand -/// \param[in] right the right operand -/// \param[out] output the output array. Must have length set correctly. -ARROW_EXPORT -Status AssignNullIntersection(FunctionContext* ctx, const ArrayData& left, - const ArrayData& right, ArrayData* output); - -ARROW_EXPORT -Datum WrapArraysLike(const Datum& value, - const std::vector>& arrays); - -ARROW_EXPORT -Datum WrapDatumsLike(const Datum& value, const std::vector& datums); - -/// \brief Kernel used to preallocate outputs for primitive types. This -/// does not include allocations for the validity bitmap (PropagateNulls -/// should be used for that). -class ARROW_EXPORT PrimitiveAllocatingUnaryKernel : public UnaryKernel { - public: - // \brief Construct with a delegate that must live longer - // then this object. - explicit PrimitiveAllocatingUnaryKernel(UnaryKernel* delegate); - /// \brief Allocates ArrayData with the necessary data buffers allocated and - /// then written into by the delegate kernel - Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; - - std::shared_ptr out_type() const override; - - private: - UnaryKernel* delegate_; -}; - -/// \brief Kernel used to preallocate outputs for primitive types. -class ARROW_EXPORT PrimitiveAllocatingBinaryKernel : public BinaryKernel { - public: - // \brief Construct with a kernel to delegate operatoions to. - // - // Ownership is not taken of the delegate kernel, it must outlive - // the life time of this object. - explicit PrimitiveAllocatingBinaryKernel(BinaryKernel* delegate); - - /// \brief Sets out to be of type ArrayData with the necessary - /// data buffers prepopulated. 
- Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out) override; - - std::shared_ptr out_type() const override; - - private: - BinaryKernel* delegate_; -}; - -} // namespace detail - -} // namespace compute -} // namespace arrow - -#endif // ARROW_COMPUTE_KERNELS_UTIL_INTERNAL_H diff --git a/r/R/inst/include/arrow/compute/logical_type.h b/r/R/inst/include/arrow/compute/logical_type.h deleted file mode 100644 index 7acbeefe4a5..00000000000 --- a/r/R/inst/include/arrow/compute/logical_type.h +++ /dev/null @@ -1,308 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Metadata objects for creating well-typed expressions. 
These are distinct -// from (and higher level than) arrow::DataType as some type parameters (like -// decimal scale and precision) may not be known at expression build time, and -// these are resolved later on evaluation - -#pragma once - -#include -#include - -#include "arrow/compute/type_fwd.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace compute { - -class Expr; - -/// \brief An object that represents either a single concrete value type or a -/// group of related types, to help with expression type validation and other -/// purposes -class ARROW_EXPORT LogicalType { - public: - enum Id { - ANY, - NUMBER, - INTEGER, - SIGNED_INTEGER, - UNSIGNED_INTEGER, - FLOATING, - NULL_, - BOOL, - UINT8, - INT8, - UINT16, - INT16, - UINT32, - INT32, - UINT64, - INT64, - FLOAT16, - FLOAT32, - FLOAT64, - BINARY, - UTF8, - DATE, - TIME, - TIMESTAMP, - DECIMAL, - LIST, - STRUCT - }; - - Id id() const { return id_; } - - virtual ~LogicalType() = default; - - virtual std::string ToString() const = 0; - - /// \brief Check if expression is an instance of this type class - virtual bool IsInstance(const Expr& expr) const = 0; - - /// \brief Get a logical expression type from a concrete Arrow in-memory - /// array type - static Status FromArrow(const ::arrow::DataType& type, LogicalTypePtr* out); - - protected: - explicit LogicalType(Id id) : id_(id) {} - Id id_; -}; - -namespace type { - -/// \brief Logical type for any value type -class ARROW_EXPORT Any : public LogicalType { - public: - Any() : LogicalType(LogicalType::ANY) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for null -class ARROW_EXPORT Null : public LogicalType { - public: - Null() : LogicalType(LogicalType::NULL_) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for concrete boolean -class ARROW_EXPORT Bool : public 
LogicalType { - public: - Bool() : LogicalType(LogicalType::BOOL) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for any number (integer or floating point) -class ARROW_EXPORT Number : public LogicalType { - public: - Number() : Number(LogicalType::NUMBER) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit Number(Id type_id) : LogicalType(type_id) {} -}; - -/// \brief Logical type for any integer -class ARROW_EXPORT Integer : public Number { - public: - Integer() : Integer(LogicalType::INTEGER) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit Integer(Id type_id) : Number(type_id) {} -}; - -/// \brief Logical type for any floating point number -class ARROW_EXPORT Floating : public Number { - public: - Floating() : Floating(LogicalType::FLOATING) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit Floating(Id type_id) : Number(type_id) {} -}; - -/// \brief Logical type for any signed integer -class ARROW_EXPORT SignedInteger : public Integer { - public: - SignedInteger() : SignedInteger(LogicalType::SIGNED_INTEGER) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit SignedInteger(Id type_id) : Integer(type_id) {} -}; - -/// \brief Logical type for any unsigned integer -class ARROW_EXPORT UnsignedInteger : public Integer { - public: - UnsignedInteger() : UnsignedInteger(LogicalType::UNSIGNED_INTEGER) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit UnsignedInteger(Id type_id) : Integer(type_id) {} -}; - -/// \brief Logical type for int8 -class ARROW_EXPORT Int8 : public SignedInteger { - public: - Int8() : SignedInteger(LogicalType::INT8) 
{} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for int16 -class ARROW_EXPORT Int16 : public SignedInteger { - public: - Int16() : SignedInteger(LogicalType::INT16) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for int32 -class ARROW_EXPORT Int32 : public SignedInteger { - public: - Int32() : SignedInteger(LogicalType::INT32) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for int64 -class ARROW_EXPORT Int64 : public SignedInteger { - public: - Int64() : SignedInteger(LogicalType::INT64) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for uint8 -class ARROW_EXPORT UInt8 : public UnsignedInteger { - public: - UInt8() : UnsignedInteger(LogicalType::UINT8) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for uint16 -class ARROW_EXPORT UInt16 : public UnsignedInteger { - public: - UInt16() : UnsignedInteger(LogicalType::UINT16) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for uint32 -class ARROW_EXPORT UInt32 : public UnsignedInteger { - public: - UInt32() : UnsignedInteger(LogicalType::UINT32) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for uint64 -class ARROW_EXPORT UInt64 : public UnsignedInteger { - public: - UInt64() : UnsignedInteger(LogicalType::UINT64) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for 16-bit floating point -class ARROW_EXPORT Float16 : public Floating { - public: - Float16() : Floating(LogicalType::FLOAT16) {} - 
bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for 32-bit floating point -class ARROW_EXPORT Float32 : public Floating { - public: - Float32() : Floating(LogicalType::FLOAT32) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for 64-bit floating point -class ARROW_EXPORT Float64 : public Floating { - public: - Float64() : Floating(LogicalType::FLOAT64) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -/// \brief Logical type for variable-size binary -class ARROW_EXPORT Binary : public LogicalType { - public: - Binary() : Binary(LogicalType::BINARY) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; - - protected: - explicit Binary(Id type_id) : LogicalType(type_id) {} -}; - -/// \brief Logical type for variable-size binary -class ARROW_EXPORT Utf8 : public Binary { - public: - Utf8() : Binary(LogicalType::UTF8) {} - bool IsInstance(const Expr& expr) const override; - std::string ToString() const override; -}; - -#define SIMPLE_TYPE_FACTORY(NAME) ARROW_EXPORT LogicalTypePtr NAME(); - -SIMPLE_TYPE_FACTORY(any); -SIMPLE_TYPE_FACTORY(null); -SIMPLE_TYPE_FACTORY(boolean); -SIMPLE_TYPE_FACTORY(number); -SIMPLE_TYPE_FACTORY(integer); -SIMPLE_TYPE_FACTORY(signed_integer); -SIMPLE_TYPE_FACTORY(unsigned_integer); -SIMPLE_TYPE_FACTORY(floating); -SIMPLE_TYPE_FACTORY(int8); -SIMPLE_TYPE_FACTORY(int16); -SIMPLE_TYPE_FACTORY(int32); -SIMPLE_TYPE_FACTORY(int64); -SIMPLE_TYPE_FACTORY(uint8); -SIMPLE_TYPE_FACTORY(uint16); -SIMPLE_TYPE_FACTORY(uint32); -SIMPLE_TYPE_FACTORY(uint64); -SIMPLE_TYPE_FACTORY(float16); -SIMPLE_TYPE_FACTORY(float32); -SIMPLE_TYPE_FACTORY(float64); -SIMPLE_TYPE_FACTORY(binary); -SIMPLE_TYPE_FACTORY(utf8); - -#undef SIMPLE_TYPE_FACTORY - -} // namespace type -} // namespace compute -} // namespace arrow diff --git 
a/r/R/inst/include/arrow/compute/operation.h b/r/R/inst/include/arrow/compute/operation.h deleted file mode 100644 index c06f8c311cc..00000000000 --- a/r/R/inst/include/arrow/compute/operation.h +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/compute/type_fwd.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace compute { - -/// \brief An operation is a node in a computation graph, taking input data -/// expression dependencies and emitting an output expression -class ARROW_EXPORT Operation : public std::enable_shared_from_this { - public: - virtual ~Operation() = default; - - /// \brief Check input expression arguments and output the type of resulting - /// expression that this operation produces. If the input arguments are - /// invalid, error Status is returned - /// \param[out] out the returned well-typed expression - /// \return success or failure - virtual Status ToExpr(ExprPtr* out) const = 0; - - /// \brief Return the input expressions used to instantiate the - /// operation. 
The default implementation returns an empty vector - /// \return a vector of expressions - virtual std::vector input_args() const; -}; - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/operations/cast.h b/r/R/inst/include/arrow/compute/operations/cast.h deleted file mode 100644 index 0052ebb6082..00000000000 --- a/r/R/inst/include/arrow/compute/operations/cast.h +++ /dev/null @@ -1,46 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/compute/operation.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - -class LogicalType; - -namespace ops { - -/// \brief A cast operation creates an expression from a known constant -/// scalar value -class ARROW_EXPORT Cast : public Operation { - public: - Cast(std::shared_ptr value, std::shared_ptr out_type); - Status ToExpr(std::shared_ptr* out) const override; - - private: - std::shared_ptr value_; - std::shared_ptr out_type_; -}; - -} // namespace ops -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/operations/literal.h b/r/R/inst/include/arrow/compute/operations/literal.h deleted file mode 100644 index b596b339c89..00000000000 --- a/r/R/inst/include/arrow/compute/operations/literal.h +++ /dev/null @@ -1,45 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/compute/operation.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -struct Scalar; - -namespace compute { -namespace ops { - -/// \brief A literal operation creates an expression from a known constant -/// scalar value -class ARROW_EXPORT Literal : public Operation { - public: - explicit Literal(const std::shared_ptr& value); - Status ToExpr(std::shared_ptr* out) const override; - - private: - std::shared_ptr value_; -}; - -} // namespace ops -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/compute/test-util.h b/r/R/inst/include/arrow/compute/test-util.h deleted file mode 100644 index bec54cc3615..00000000000 --- a/r/R/inst/include/arrow/compute/test-util.h +++ /dev/null @@ -1,110 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_COMPUTE_TEST_UTIL_H -#define ARROW_COMPUTE_TEST_UTIL_H - -#include -#include - -#include - -#include "arrow/array.h" -#include "arrow/memory_pool.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/util.h" -#include "arrow/type.h" - -#include "arrow/compute/context.h" -#include "arrow/compute/kernel.h" - -namespace arrow { -namespace compute { - -class ComputeFixture { - public: - ComputeFixture() : ctx_(default_memory_pool()) {} - - protected: - FunctionContext ctx_; -}; - -class MockUnaryKernel : public UnaryKernel { - public: - MOCK_METHOD3(Call, Status(FunctionContext* ctx, const Datum& input, Datum* out)); - MOCK_CONST_METHOD0(out_type, std::shared_ptr()); -}; - -class MockBinaryKernel : public BinaryKernel { - public: - MOCK_METHOD4(Call, Status(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out)); - MOCK_CONST_METHOD0(out_type, std::shared_ptr()); -}; - -template -std::shared_ptr _MakeArray(const std::shared_ptr& type, - const std::vector& values, - const std::vector& is_valid) { - std::shared_ptr result; - if (is_valid.size() > 0) { - ArrayFromVector(type, is_valid, values, &result); - } else { - ArrayFromVector(type, values, &result); - } - return result; -} - -template -struct DatumEqual {}; - -template -struct DatumEqual::value>::type> { - static constexpr double kArbitraryDoubleErrorBound = 1.0; - using ScalarType = typename TypeTraits::ScalarType; - - static void EnsureEqual(const Datum& lhs, const Datum& rhs) { - ASSERT_EQ(lhs.kind(), rhs.kind()); - if (lhs.kind() == Datum::SCALAR) { - auto left = internal::checked_cast(lhs.scalar().get()); - auto right = internal::checked_cast(rhs.scalar().get()); - ASSERT_EQ(left->is_valid, right->is_valid); - ASSERT_EQ(left->type->id(), right->type->id()); - ASSERT_NEAR(left->value, right->value, kArbitraryDoubleErrorBound); - } - } -}; - -template -struct DatumEqual::value>::type> { - using ScalarType = typename TypeTraits::ScalarType; - static void 
EnsureEqual(const Datum& lhs, const Datum& rhs) { - ASSERT_EQ(lhs.kind(), rhs.kind()); - if (lhs.kind() == Datum::SCALAR) { - auto left = internal::checked_cast(lhs.scalar().get()); - auto right = internal::checked_cast(rhs.scalar().get()); - ASSERT_EQ(left->is_valid, right->is_valid); - ASSERT_EQ(left->type->id(), right->type->id()); - ASSERT_EQ(left->value, right->value); - } - } -}; - -} // namespace compute -} // namespace arrow - -#endif diff --git a/r/R/inst/include/arrow/compute/type_fwd.h b/r/R/inst/include/arrow/compute/type_fwd.h deleted file mode 100644 index 48d45ecd118..00000000000 --- a/r/R/inst/include/arrow/compute/type_fwd.h +++ /dev/null @@ -1,38 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/type_fwd.h" - -namespace arrow { -namespace compute { - -class Expr; -class LogicalType; -class Operation; - -using ArrowTypePtr = std::shared_ptr<::arrow::DataType>; -using ExprPtr = std::shared_ptr; -using ConstOpPtr = std::shared_ptr; -using OpPtr = std::shared_ptr; -using LogicalTypePtr = std::shared_ptr; - -} // namespace compute -} // namespace arrow diff --git a/r/R/inst/include/arrow/csv/api.h b/r/R/inst/include/arrow/csv/api.h deleted file mode 100644 index 8e311844c52..00000000000 --- a/r/R/inst/include/arrow/csv/api.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_CSV_API_H -#define ARROW_CSV_API_H - -#include "arrow/csv/options.h" -#include "arrow/csv/reader.h" - -#endif // ARROW_CSV_API_H diff --git a/r/R/inst/include/arrow/csv/chunker.h b/r/R/inst/include/arrow/csv/chunker.h deleted file mode 100644 index 6c61632614c..00000000000 --- a/r/R/inst/include/arrow/csv/chunker.h +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_CSV_CHUNKER_H -#define ARROW_CSV_CHUNKER_H - -#include - -#include "arrow/csv/options.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace csv { - -/// \class Chunker -/// \brief A reusable block-based chunker for CSV data -/// -/// The chunker takes a block of CSV data and finds a suitable place -/// to cut it up without splitting a row. -/// If the block is truncated (i.e. not all data can be chunked), it is up -/// to the caller to arrange the next block to start with the trailing data. -/// -/// Note: if the previous block ends with CR (0x0d) and a new block starts -/// with LF (0x0a), the chunker will consider the leading newline as an empty line. -class ARROW_EXPORT Chunker { - public: - explicit Chunker(ParseOptions options); - - /// \brief Carve up a chunk in a block of data - /// - /// Process a block of CSV data, reading up to size bytes. - /// The number of bytes in the chunk is returned in out_size. 
- Status Process(const char* data, uint32_t size, uint32_t* out_size); - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); - - // Like Process(), but specialized for some parsing options - template - Status ProcessSpecialized(const char* data, uint32_t size, uint32_t* out_size); - - // Detect a single line from the data pointer. Return the line end, - // or nullptr if the remaining line is truncated. - template - inline const char* ReadLine(const char* data, const char* data_end); - - ParseOptions options_; -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_CHUNKER_H diff --git a/r/R/inst/include/arrow/csv/column-builder.h b/r/R/inst/include/arrow/csv/column-builder.h deleted file mode 100644 index 054a642295c..00000000000 --- a/r/R/inst/include/arrow/csv/column-builder.h +++ /dev/null @@ -1,87 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_CSV_COLUMN_BUILDER_H -#define ARROW_CSV_COLUMN_BUILDER_H - -#include -#include - -#include "arrow/array.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class ChunkedArray; -class DataType; - -namespace internal { - -class TaskGroup; - -} // namespace internal - -namespace csv { - -class BlockParser; -struct ConvertOptions; - -class ARROW_EXPORT ColumnBuilder { - public: - virtual ~ColumnBuilder() = default; - - /// Spawn a task that will try to convert and append the given CSV block. - /// All calls to Append() should happen on the same thread, otherwise - /// call Insert() instead. - virtual void Append(const std::shared_ptr& parser); - - /// Spawn a task that will try to convert and insert the given CSV block - virtual void Insert(int64_t block_index, - const std::shared_ptr& parser) = 0; - - /// Return the final chunked array. The TaskGroup _must_ have finished! - virtual Status Finish(std::shared_ptr* out) = 0; - - /// Change the task group. The previous TaskGroup _must_ have finished! - void SetTaskGroup(const std::shared_ptr& task_group); - - std::shared_ptr task_group() { return task_group_; } - - /// Construct a strictly-typed ColumnBuilder. - static Status Make(const std::shared_ptr& type, int32_t col_index, - const ConvertOptions& options, - const std::shared_ptr& task_group, - std::shared_ptr* out); - - /// Construct a type-inferring ColumnBuilder. 
- static Status Make(int32_t col_index, const ConvertOptions& options, - const std::shared_ptr& task_group, - std::shared_ptr* out); - - protected: - explicit ColumnBuilder(const std::shared_ptr& task_group) - : task_group_(task_group) {} - - std::shared_ptr task_group_; - ArrayVector chunks_; -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_COLUMN_BUILDER_H diff --git a/r/R/inst/include/arrow/csv/converter.h b/r/R/inst/include/arrow/csv/converter.h deleted file mode 100644 index d64fe695d0a..00000000000 --- a/r/R/inst/include/arrow/csv/converter.h +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_CSV_CONVERTER_H -#define ARROW_CSV_CONVERTER_H - -#include -#include - -#include "arrow/csv/options.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -class MemoryPool; -class Status; - -namespace csv { - -class BlockParser; - -class ARROW_EXPORT Converter { - public: - Converter(const std::shared_ptr& type, const ConvertOptions& options, - MemoryPool* pool); - virtual ~Converter() = default; - - virtual Status Convert(const BlockParser& parser, int32_t col_index, - std::shared_ptr* out) = 0; - - std::shared_ptr type() const { return type_; } - - static Status Make(const std::shared_ptr& type, const ConvertOptions& options, - std::shared_ptr* out); - static Status Make(const std::shared_ptr& type, const ConvertOptions& options, - MemoryPool* pool, std::shared_ptr* out); - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); - - virtual Status Initialize() = 0; - - const ConvertOptions options_; - MemoryPool* pool_; - std::shared_ptr type_; -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_CONVERTER_H diff --git a/r/R/inst/include/arrow/csv/options.h b/r/R/inst/include/arrow/csv/options.h deleted file mode 100644 index 9cd312ac079..00000000000 --- a/r/R/inst/include/arrow/csv/options.h +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_CSV_OPTIONS_H -#define ARROW_CSV_OPTIONS_H - -#include -#include -#include -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class DataType; - -namespace csv { - -struct ARROW_EXPORT ParseOptions { - // Parsing options - - // Field delimiter - char delimiter = ','; - // Whether quoting is used - bool quoting = true; - // Quoting character (if `quoting` is true) - char quote_char = '"'; - // Whether a quote inside a value is double-quoted - bool double_quote = true; - // Whether escaping is used - bool escaping = false; - // Escaping character (if `escaping` is true) - char escape_char = '\\'; - // Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters - bool newlines_in_values = false; - // Whether empty lines are ignored. If false, an empty line represents - // a single empty value (assuming a one-column CSV file). - bool ignore_empty_lines = true; - - // XXX Should this be in ReadOptions? 
- // Number of header rows to skip (including the first row containing column names) - int32_t header_rows = 1; - - static ParseOptions Defaults(); -}; - -struct ARROW_EXPORT ConvertOptions { - // Conversion options - - // Whether to check UTF8 validity of string columns - bool check_utf8 = true; - // Optional per-column types (disabling type inference on those columns) - std::unordered_map> column_types; - // Recognized spellings for null values - std::vector null_values; - // Recognized spellings for boolean values - std::vector true_values; - std::vector false_values; - // Whether string / binary columns can have null values. - // If true, then strings in "null_values" are considered null for string columns. - // If false, then all strings are valid string values. - bool strings_can_be_null = false; - - static ConvertOptions Defaults(); -}; - -struct ARROW_EXPORT ReadOptions { - // Reader options - - // Whether to use the global CPU thread pool - bool use_threads = true; - // Block size we request from the IO layer; also determines the size of - // chunks when use_threads is true - int32_t block_size = 1 << 20; // 1 MB - - static ReadOptions Defaults(); -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_OPTIONS_H diff --git a/r/R/inst/include/arrow/csv/parser.h b/r/R/inst/include/arrow/csv/parser.h deleted file mode 100644 index fdddc37a2c0..00000000000 --- a/r/R/inst/include/arrow/csv/parser.h +++ /dev/null @@ -1,149 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_CSV_PARSER_H -#define ARROW_CSV_PARSER_H - -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/csv/options.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; - -namespace csv { - -constexpr int32_t kMaxParserNumRows = 100000; - -/// \class BlockParser -/// \brief A reusable block-based parser for CSV data -/// -/// The parser takes a block of CSV data and delimits rows and fields, -/// unquoting and unescaping them on the fly. Parsed data is own by the -/// parser, so the original buffer can be discarded after Parse() returns. -/// -/// If the block is truncated (i.e. not all data can be parsed), it is up -/// to the caller to arrange the next block to start with the trailing data. -/// Also, if the previous block ends with CR (0x0d) and a new block starts -/// with LF (0x0a), the parser will consider the leading newline as an empty -/// line; the caller should therefore strip it. -class ARROW_EXPORT BlockParser { - public: - explicit BlockParser(ParseOptions options, int32_t num_cols = -1, - int32_t max_num_rows = kMaxParserNumRows); - explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1, - int32_t max_num_rows = kMaxParserNumRows); - - /// \brief Parse a block of data - /// - /// Parse a block of CSV data, ingesting up to max_num_rows rows. - /// The number of bytes actually parsed is returned in out_size. 
- Status Parse(const char* data, uint32_t size, uint32_t* out_size); - - /// \brief Parse the final block of data - /// - /// Like Parse(), but called with the final block in a file. - /// The last row may lack a trailing line separator. - Status ParseFinal(const char* data, uint32_t size, uint32_t* out_size); - - /// \brief Return the number of parsed rows - int32_t num_rows() const { return num_rows_; } - /// \brief Return the number of parsed columns - int32_t num_cols() const { return num_cols_; } - /// \brief Return the total size in bytes of parsed data - uint32_t num_bytes() const { return parsed_size_; } - - /// \brief Visit parsed values in a column - /// - /// The signature of the visitor is - /// Status(const uint8_t* data, uint32_t size, bool quoted) - template - Status VisitColumn(int32_t col_index, Visitor&& visit) const { - for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { - const auto& values_buffer = values_buffers_[buf_index]; - const auto values = reinterpret_cast(values_buffer->data()); - const auto max_pos = - static_cast(values_buffer->size() / sizeof(ValueDesc)) - 1; - for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) { - auto start = values[pos].offset; - auto stop = values[pos + 1].offset; - auto quoted = values[pos + 1].quoted; - ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); - } - } - return Status::OK(); - } - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); - - Status DoParse(const char* data, uint32_t size, bool is_final, uint32_t* out_size); - template - Status DoParseSpecialized(const char* data, uint32_t size, bool is_final, - uint32_t* out_size); - - template - Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer, - const char* data, const char* data_end, bool is_final, - int32_t rows_in_chunk, const char** out_data, bool* finished_parsing); - - // Parse a single line from the data pointer - template - Status ParseLine(ValuesWriter* 
values_writer, ParsedWriter* parsed_writer, - const char* data, const char* data_end, bool is_final, - const char** out_data); - - MemoryPool* pool_; - const ParseOptions options_; - // The number of rows parsed from the block - int32_t num_rows_; - // The number of columns (can be -1 at start) - int32_t num_cols_; - // The maximum number of rows to parse from this block - int32_t max_num_rows_; - - // Linear scratchpad for parsed values - struct ValueDesc { - uint32_t offset : 31; - bool quoted : 1; - }; - - // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes? - // It may help with null parsing... - std::vector> values_buffers_; - std::shared_ptr parsed_buffer_; - const uint8_t* parsed_; - int32_t values_size_; - int32_t parsed_size_; - - class ResizableValuesWriter; - class PresizedValuesWriter; - class PresizedParsedWriter; -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_PARSER_H diff --git a/r/R/inst/include/arrow/csv/reader.h b/r/R/inst/include/arrow/csv/reader.h deleted file mode 100644 index edf6f110980..00000000000 --- a/r/R/inst/include/arrow/csv/reader.h +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_CSV_READER_H -#define ARROW_CSV_READER_H - -#include - -#include "arrow/csv/options.h" // IWYU pragma: keep -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; -class Table; - -namespace io { -class InputStream; -} // namespace io - -namespace csv { - -class ARROW_EXPORT TableReader { - public: - virtual ~TableReader() = default; - - virtual Status Read(std::shared_ptr
* out) = 0; - - // XXX pass optional schema? - static Status Make(MemoryPool* pool, std::shared_ptr input, - const ReadOptions&, const ParseOptions&, const ConvertOptions&, - std::shared_ptr* out); -}; - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_READER_H diff --git a/r/R/inst/include/arrow/csv/test-common.h b/r/R/inst/include/arrow/csv/test-common.h deleted file mode 100644 index 624023f6037..00000000000 --- a/r/R/inst/include/arrow/csv/test-common.h +++ /dev/null @@ -1,71 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_CSV_TEST_COMMON_H -#define ARROW_CSV_TEST_COMMON_H - -#include -#include -#include - -#include "arrow/csv/parser.h" -#include "arrow/testing/gtest_util.h" - -namespace arrow { -namespace csv { - -std::string MakeCSVData(std::vector lines) { - std::string s; - for (const auto& line : lines) { - s += line; - } - return s; -} - -// Make a BlockParser from a vector of lines representing a CSV file -void MakeCSVParser(std::vector lines, ParseOptions options, - std::shared_ptr* out) { - auto csv = MakeCSVData(lines); - auto parser = std::make_shared(options); - uint32_t out_size; - ASSERT_OK(parser->Parse(csv.data(), static_cast(csv.size()), &out_size)); - ASSERT_EQ(out_size, csv.size()) << "trailing CSV data not parsed"; - *out = parser; -} - -void MakeCSVParser(std::vector lines, std::shared_ptr* out) { - MakeCSVParser(lines, ParseOptions::Defaults(), out); -} - -// Make a BlockParser from a vector of strings representing a single CSV column -void MakeColumnParser(std::vector items, std::shared_ptr* out) { - auto options = ParseOptions::Defaults(); - // Need this to test for null (empty) values - options.ignore_empty_lines = false; - std::vector lines; - for (const auto& item : items) { - lines.push_back(item + '\n'); - } - MakeCSVParser(lines, options, out); - ASSERT_EQ((*out)->num_cols(), 1) << "Should have seen only 1 CSV column"; - ASSERT_EQ((*out)->num_rows(), items.size()); -} - -} // namespace csv -} // namespace arrow - -#endif // ARROW_CSV_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/api.h b/r/R/inst/include/arrow/dbi/hiveserver2/api.h deleted file mode 100644 index 6ac849ef87b..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/api.h +++ /dev/null @@ -1,27 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/dbi/hiveserver2/columnar-row-set.h" -#include "arrow/dbi/hiveserver2/operation.h" -#include "arrow/dbi/hiveserver2/service.h" -#include "arrow/dbi/hiveserver2/session.h" -#include "arrow/dbi/hiveserver2/types.h" -#include "arrow/dbi/hiveserver2/util.h" - -#include "arrow/status.h" diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h b/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h deleted file mode 100644 index a62c738020b..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/columnar-row-set.h +++ /dev/null @@ -1,155 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace hiveserver2 { - -// The Column class is used to access data that was fetched in columnar format. -// The contents of the data can be accessed through the data() fn, which returns -// a ptr to a vector containing the contents of this column in the fetched -// results, avoiding copies. This vector will be of size length(). -// -// If any of the values are null, they will be represented in the data vector as -// default values, i.e. 0 for numeric types. The nulls() fn returns a ptr to a -// bit array representing which values are null, and the IsNull() fn is provided -// for convenience when working with this bit array. The user should check -// IsNull() to distinguish between actual instances of the default values and nulls. -// -// A Column object is returned from a ColumnarRowSet and is only valid as long -// as that ColumnarRowSet still exists. -// -// Example: -// unique_ptr col = columnar_row_set->GetInt32Col(); -// for (int i = 0; i < col->length(); i++) { -// if (col->IsNull(i)) { -// cout << "NULL\n"; -// } else { -// cout << col->data()[i] << "\n"; -// } -// } -class ARROW_EXPORT Column { - public: - virtual ~Column() {} - - virtual int64_t length() const = 0; - - const uint8_t* nulls() const { return nulls_; } - int64_t nulls_size() const { return nulls_size_; } - - // Returns true iff the value for the i-th row within this set of data for this - // column is null. - bool IsNull(int64_t i) const { return (nulls_[i / 8] & (1 << (i % 8))) != 0; } - - protected: - explicit Column(const std::string* nulls); - - // The memory for these ptrs is owned by the ColumnarRowSet that - // created this Column. 
- // - // Due to the issue described in HUE-2722, the null bitmap may have fewer - // bytes than expected for some versions of Hive, so we retain the ability to - // check the buffer size in case this happens. - const uint8_t* nulls_; - int64_t nulls_size_; -}; - -template -class ARROW_EXPORT TypedColumn : public Column { - public: - const std::vector& data() const { return *data_; } - int64_t length() const { return data().size(); } - - // Returns the value for the i-th row within this set of data for this column. - const T& GetData(int64_t i) const { return data()[i]; } - - private: - // For access to the c'tor. - friend class ColumnarRowSet; - - TypedColumn(const std::string* nulls, const std::vector* data) - : Column(nulls), data_(data) {} - - const std::vector* data_; -}; - -typedef TypedColumn BoolColumn; -typedef TypedColumn ByteColumn; -typedef TypedColumn Int16Column; -typedef TypedColumn Int32Column; -typedef TypedColumn Int64Column; -typedef TypedColumn DoubleColumn; -typedef TypedColumn StringColumn; -typedef TypedColumn BinaryColumn; - -// A ColumnarRowSet represents the full results returned by a call to -// Operation::Fetch() when a columnar format is being used. -// -// ColumnarRowSet provides access to specific columns by their type and index in -// the results. All Column objects returned from a given ColumnarRowSet will have -// the same length(). A Column object returned by a ColumnarRowSet is only valid -// as long as the ColumnarRowSet still exists. 
-// -// Example: -// unique_ptr op; -// session->ExecuteStatement("select int_col, string_col from tbl", &op); -// unique_ptr columnar_row_set; -// if (op->Fetch(&columnar_row_set).ok()) { -// unique_ptr int32_col = columnar_row_set->GetInt32Col(0); -// unique_ptr string_col = columnar_row_set->GetStringCol(1); -// } -class ARROW_EXPORT ColumnarRowSet { - public: - ~ColumnarRowSet(); - - std::unique_ptr GetBoolCol(int i) const; - std::unique_ptr GetByteCol(int i) const; - std::unique_ptr GetInt16Col(int i) const; - std::unique_ptr GetInt32Col(int i) const; - std::unique_ptr GetInt64Col(int i) const; - std::unique_ptr GetDoubleCol(int i) const; - std::unique_ptr GetStringCol(int i) const; - std::unique_ptr GetBinaryCol(int i) const; - - template - std::unique_ptr GetCol(int i) const; - - private: - // Hides Thrift objects from the header. - struct ColumnarRowSetImpl; - - ARROW_DISALLOW_COPY_AND_ASSIGN(ColumnarRowSet); - - // For access to the c'tor. - friend class Operation; - - explicit ColumnarRowSet(ColumnarRowSetImpl* impl); - - std::unique_ptr impl_; -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/operation.h b/r/R/inst/include/arrow/dbi/hiveserver2/operation.h deleted file mode 100644 index f275592e23d..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/operation.h +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/dbi/hiveserver2/columnar-row-set.h" -#include "arrow/dbi/hiveserver2/types.h" - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace hiveserver2 { - -struct ThriftRPC; - -// Maps directly to TFetchOrientation in the HiveServer2 interface. -enum class FetchOrientation { - NEXT, // supported - PRIOR, // not supported - RELATIVE, // not supported - ABSOLUTE, // not supported - FIRST, // supported if query result caching is enabled in Impala - LAST // not supported -}; - -// Represents a single HiveServer2 operation. Used to monitor the status of an operation -// and to retrieve its results. The only Operation function that will block is Fetch, -// which blocks if there aren't any results ready yet. -// -// Operations are created using Session functions, eg. ExecuteStatement. They must -// have Close called on them before they can be deleted. -// -// This class is not thread-safe. -class ARROW_EXPORT Operation { - public: - // Maps directly to TOperationState in the HiveServer2 interface. - enum class State { - INITIALIZED, - RUNNING, - FINISHED, - CANCELED, - CLOSED, - ERROR, - UNKNOWN, - PENDING, - }; - - ~Operation(); - - // Fetches the current state of this operation. If successful, sets the operation state - // in 'out' and returns an OK status, otherwise an error status is returned. May be - // called after successfully creating the operation and before calling Close. 
- Status GetState(Operation::State* out) const; - - // May be called after successfully creating the operation and before calling Close. - Status GetLog(std::string* out) const; - - // May be called after successfully creating the operation and before calling Close. - Status GetProfile(std::string* out) const; - - // Fetches metadata for the columns in the output of this operation, such as the - // names and types of the columns, and returns it as a list of column descriptions. - // May be called after successfully creating the operation and before calling Close. - Status GetResultSetMetadata(std::vector* column_descs) const; - - // Fetches a batch of results, stores them in 'results', and sets has_more_rows. - // Fetch will block if there aren't any results that are ready. - Status Fetch(std::unique_ptr* results, bool* has_more_rows) const; - Status Fetch(int max_rows, FetchOrientation orientation, - std::unique_ptr* results, bool* has_more_rows) const; - - // May be called after successfully creating the operation and before calling Close. - Status Cancel() const; - - // Closes the operation. Must be called before the operation is deleted. May be safely - // called on an invalid or already closed operation - will only return an error if the - // operation is open but the close rpc fails. - Status Close(); - - // May be called after successfully creating the operation and before calling Close. - bool HasResultSet() const; - - // Returns true iff this operation's results will be returned in a columnar format. - // May be called at any time. - bool IsColumnar() const; - - protected: - // Hides Thrift objects from the header. - struct OperationImpl; - - explicit Operation(const std::shared_ptr& rpc); - - std::unique_ptr impl_; - std::shared_ptr rpc_; - - // True iff this operation has been successfully created and has not been closed yet, - // corresponding to when the operation has a valid operation handle. 
- bool open_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Operation); -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/service.h b/r/R/inst/include/arrow/dbi/hiveserver2/service.h deleted file mode 100644 index bfa7a97db3a..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/service.h +++ /dev/null @@ -1,140 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace hiveserver2 { - -class Session; -struct ThriftRPC; - -// Stores per-session or per-operation configuration parameters. -class HS2ClientConfig { - public: - void SetOption(const std::string& key, const std::string& value) { - config_[key] = value; - } - - bool GetOption(const std::string& key, std::string* value_out) { - if (config_.find(key) != config_.end() && value_out) { - *value_out = config_[key]; - return true; - } - return false; - } - - const std::map& GetConfig() const { return config_; } - - private: - std::map config_; -}; - -// Maps directly to TProtocolVersion in the HiveServer2 interface. 
-enum class ProtocolVersion { - PROTOCOL_V1, // not supported - PROTOCOL_V2, // not supported - PROTOCOL_V3, // not supported - PROTOCOL_V4, // not supported - PROTOCOL_V5, // not supported - PROTOCOL_V6, // supported - PROTOCOL_V7, // supported -}; - -// Manages a connection to a HiveServer2 server. Primarily used to create -// new sessions via OpenSession. -// -// Service objects are created using Service::Connect(). They must -// have Close called on them before they can be deleted. -// -// This class is not thread-safe. -// -// Example: -// unique_ptr service; -// if (Service::Connect(host, port, protocol_version, &service).ok()) { -// // do some work -// service->Close(); -// } -class ARROW_EXPORT Service { - public: - // Creates a new connection to a HS2 service at the given host and port. If - // conn_timeout > 0, connection attempts will timeout after conn_timeout ms, otherwise - // no timeout is used. protocol_version is the HiveServer2 protocol to use, and - // determines whether the results returned by operations from this service are row or - // column oriented. Only column oriented protocols are currently supported. - // - // The client calling Connect has ownership of the new Service that is created. - // Executing RPCs with an Session or Operation corresponding to a particular - // Service after that Service has been closed or deleted in undefined. - static Status Connect(const std::string& host, int port, int conn_timeout, - ProtocolVersion protocol_version, - std::unique_ptr* service); - - ~Service(); - - // Closes the connection. Must be called before the service is deleted. May be - // safely called on an invalid or already closed service - will only return an - // error if the service is open but the close rpc fails. - Status Close(); - - // Returns true iff this service has an active connection to the HiveServer2 server. - bool IsConnected() const; - - // Set the send and receive timeout for Thrift RPCs in ms. 
0 indicates no timeout, - // negative values are ignored. - void SetRecvTimeout(int timeout); - void SetSendTimeout(int timeout); - - // Opens a new HS2 session using this service. - // The client calling OpenSession has ownership of the Session that is created. - // Operations on the Session are undefined once it is closed. - Status OpenSession(const std::string& user, const HS2ClientConfig& config, - std::unique_ptr* session) const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Service); - - // Hides Thrift objects from the header. - struct ServiceImpl; - - Service(const std::string& host, int port, int conn_timeout, - ProtocolVersion protocol_version); - - // Opens the connection to the server. Called by Connect before new service is returned - // to the user. Must be called before OpenSession. - Status Open(); - - std::string host_; - int port_; - int conn_timeout_; - - std::unique_ptr impl_; - std::shared_ptr rpc_; -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/session.h b/r/R/inst/include/arrow/dbi/hiveserver2/session.h deleted file mode 100644 index 4e223de6c17..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/session.h +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/dbi/hiveserver2/operation.h" -#include "arrow/dbi/hiveserver2/service.h" - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace hiveserver2 { - -struct ThriftRPC; - -// Manages a single HiveServer2 session - stores the session handle returned by -// the OpenSession RPC and uses it to create and return operations. -// -// Sessions are created with Service::OpenSession(). They must have Close -// called on them before they can be deleted. -// -// Executing RPCs with an Operation corresponding to a particular Session after -// that Session has been closed or deleted is undefined. -// -// This class is not thread-safe. -class ARROW_EXPORT Session { - public: - ~Session(); - - // Closes the session. Must be called before the session is deleted. May be safely - // called on an invalid or already closed session - will only return an error if the - // session is open but the close rpc fails. - Status Close(); - - Status ExecuteStatement(const std::string& statement, - std::unique_ptr* operation) const; - Status ExecuteStatement(const std::string& statement, - const HS2ClientConfig& conf_overlay, - std::unique_ptr* operation) const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Session); - - // Hides Thrift objects from the header. - struct SessionImpl; - - // For access to the c'tor. - friend class Service; - - explicit Session(const std::shared_ptr& rpc); - - // Performs the RPC that initiates the session and stores the returned handle. - // Must be called before operations can be executed. - Status Open(const HS2ClientConfig& config, const std::string& user); - - std::unique_ptr impl_; - std::shared_ptr rpc_; - - // True if Open has been called and Close has not. 
- bool open_; -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h b/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h deleted file mode 100644 index aad535fc1f3..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/thrift-internal.h +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/dbi/hiveserver2/columnar-row-set.h" -#include "arrow/dbi/hiveserver2/operation.h" -#include "arrow/dbi/hiveserver2/service.h" -#include "arrow/dbi/hiveserver2/types.h" - -#include "arrow/dbi/hiveserver2/ImpalaHiveServer2Service.h" -#include "arrow/dbi/hiveserver2/TCLIService.h" - -namespace arrow { -namespace hiveserver2 { - -// PIMPL structs. 
-struct ColumnarRowSet::ColumnarRowSetImpl { - apache::hive::service::cli::thrift::TFetchResultsResp resp; -}; - -struct Operation::OperationImpl { - apache::hive::service::cli::thrift::TOperationHandle handle; - apache::hive::service::cli::thrift::TSessionHandle session_handle; -}; - -struct ThriftRPC { - std::unique_ptr client; -}; - -const std::string OperationStateToString(const Operation::State& state); - -const std::string TypeIdToString(const ColumnType::TypeId& type_id); - -// Functions for converting Thrift object to hs2client objects and vice-versa. -apache::hive::service::cli::thrift::TFetchOrientation::type -FetchOrientationToTFetchOrientation(FetchOrientation orientation); - -apache::hive::service::cli::thrift::TProtocolVersion::type -ProtocolVersionToTProtocolVersion(ProtocolVersion protocol); - -Operation::State TOperationStateToOperationState( - const apache::hive::service::cli::thrift::TOperationState::type& tstate); - -Status TStatusToStatus(const apache::hive::service::cli::thrift::TStatus& tstatus); - -// Converts a TTypeDesc to a ColumnType. Currently only primitive types are supported. -// The converted type is returned as a pointer to allow for polymorphism with ColumnType -// and its subclasses. 
-std::unique_ptr TTypeDescToColumnType( - const apache::hive::service::cli::thrift::TTypeDesc& ttype_desc); - -ColumnType::TypeId TTypeIdToTypeId( - const apache::hive::service::cli::thrift::TTypeId::type& type_id); - -} // namespace hiveserver2 -} // namespace arrow - -#define TRY_RPC_OR_RETURN(rpc) \ - do { \ - try { \ - (rpc); \ - } catch (apache::thrift::TException & tx) { \ - return Status::IOError(tx.what()); \ - } \ - } while (0) - -#define THRIFT_RETURN_NOT_OK(tstatus) \ - do { \ - if (tstatus.statusCode != hs2::TStatusCode::SUCCESS_STATUS && \ - tstatus.statusCode != hs2::TStatusCode::SUCCESS_WITH_INFO_STATUS) { \ - return TStatusToStatus(tstatus); \ - } \ - } while (0) diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/types.h b/r/R/inst/include/arrow/dbi/hiveserver2/types.h deleted file mode 100644 index 38cebcc2eeb..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/types.h +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -namespace arrow { -namespace hiveserver2 { - -// Represents a column's type. -// -// For now only PrimitiveType is implemented, as thase are the only types Impala will -// currently return. 
In the future, nested types will be represented as other subclasses -// of ColumnType containing ptrs to other ColumnTypes - for example, an ArrayType subclass -// would contain a single ptr to another ColumnType representing the type of objects -// stored in the array. -class ColumnType { - public: - virtual ~ColumnType() = default; - - // Maps directly to TTypeId in the HiveServer2 interface. - enum class TypeId { - BOOLEAN, - TINYINT, - SMALLINT, - INT, - BIGINT, - FLOAT, - DOUBLE, - STRING, - TIMESTAMP, - BINARY, - ARRAY, - MAP, - STRUCT, - UNION, - USER_DEFINED, - DECIMAL, - NULL_TYPE, - DATE, - VARCHAR, - CHAR, - INVALID, - }; - - virtual TypeId type_id() const = 0; - virtual std::string ToString() const = 0; -}; - -class PrimitiveType : public ColumnType { - public: - explicit PrimitiveType(const TypeId& type_id) : type_id_(type_id) {} - - TypeId type_id() const override { return type_id_; } - std::string ToString() const override; - - private: - const TypeId type_id_; -}; - -// Represents CHAR and VARCHAR types. -class CharacterType : public PrimitiveType { - public: - CharacterType(const TypeId& type_id, int max_length) - : PrimitiveType(type_id), max_length_(max_length) {} - - int max_length() const { return max_length_; } - - private: - const int max_length_; -}; - -// Represents DECIMAL types. -class DecimalType : public PrimitiveType { - public: - DecimalType(const TypeId& type_id, int precision, int scale) - : PrimitiveType(type_id), precision_(precision), scale_(scale) {} - - int precision() const { return precision_; } - int scale() const { return scale_; } - - private: - const int precision_; - const int scale_; -}; - -// Represents the metadata for a single column. 
-class ColumnDesc { - public: - ColumnDesc(const std::string& column_name, std::unique_ptr type, - int position, const std::string& comment) - : column_name_(column_name), - type_(move(type)), - position_(position), - comment_(comment) {} - - const std::string& column_name() const { return column_name_; } - const ColumnType* type() const { return type_.get(); } - int position() const { return position_; } - const std::string& comment() const { return comment_; } - - const PrimitiveType* GetPrimitiveType() const; - const CharacterType* GetCharacterType() const; - const DecimalType* GetDecimalType() const; - - private: - const std::string column_name_; - std::unique_ptr type_; - const int position_; - const std::string comment_; -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/dbi/hiveserver2/util.h b/r/R/inst/include/arrow/dbi/hiveserver2/util.h deleted file mode 100644 index a17e7b2286b..00000000000 --- a/r/R/inst/include/arrow/dbi/hiveserver2/util.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/dbi/hiveserver2/operation.h" - -namespace arrow { -namespace hiveserver2 { - -// Utility functions. 
Intended primary for testing purposes - clients should not -// rely on stability of the behavior or API of these functions. -class Util { - public: - // Fetches the operation's results and returns them in a nicely formatted string. - static void PrintResults(const Operation* op, std::ostream& out); -}; - -} // namespace hiveserver2 -} // namespace arrow diff --git a/r/R/inst/include/arrow/extension_type.h b/r/R/inst/include/arrow/extension_type.h deleted file mode 100644 index 48bc1e9bff7..00000000000 --- a/r/R/inst/include/arrow/extension_type.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// User-defined extension types. EXPERIMENTAL in 0.13.0 -/// \since 0.13.0 - -#pragma once - -#include -#include - -#include "arrow/array.h" -#include "arrow/type.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -/// \brief The base class for custom / user-defined types. 
-class ARROW_EXPORT ExtensionType : public DataType { - public: - static constexpr Type::type type_id = Type::EXTENSION; - - /// \brief The type of array used to represent this extension type's data - std::shared_ptr storage_type() const { return storage_type_; } - - std::string ToString() const override; - std::string name() const override; - - /// \brief Unique name of extension type used to identify type for - /// serialization - /// \return the string name of the extension - virtual std::string extension_name() const = 0; - - /// \brief Determine if two instances of the same extension types are - /// equal. Invoked from ExtensionType::Equals - /// \param[in] other the type to compare this type with - /// \return bool true if type instances are equal - virtual bool ExtensionEquals(const ExtensionType& other) const = 0; - - /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance - /// \param[in] data the physical storage for the extension type - virtual std::shared_ptr MakeArray(std::shared_ptr data) const = 0; - - /// \brief Create an instance of the ExtensionType given the actual storage - /// type and the serialized representation - /// \param[in] storage_type the physical storage type of the extension - /// \param[in] serialized_data the serialized representation produced by - /// Serialize - /// \param[out] out the reconstructed extension type - /// \return Status - virtual Status Deserialize(std::shared_ptr storage_type, - const std::string& serialized_data, - std::shared_ptr* out) const = 0; - - /// \brief Create a serialized representation of the extension type's - /// metadata. 
The storage type will be handled automatically in IPC code - /// paths - /// \return the serialized representation - virtual std::string Serialize() const = 0; - - protected: - explicit ExtensionType(std::shared_ptr storage_type) - : DataType(Type::EXTENSION), storage_type_(storage_type) {} - - std::shared_ptr storage_type_; -}; - -/// \brief Base array class for user-defined extension types -class ARROW_EXPORT ExtensionArray : public Array { - public: - explicit ExtensionArray(const std::shared_ptr& data) { SetData(data); } - - /// \brief The physical storage for the extension array - std::shared_ptr storage() const { return storage_; } - - protected: - void SetData(const std::shared_ptr& data); - std::shared_ptr storage_; -}; - -/// \brief Register an extension type globally. The name returned by the type's -/// extension_name() method should be unique. This method is thread-safe -/// \param[in] type an instance of the extension type -/// \return Status -ARROW_EXPORT -Status RegisterExtensionType(std::shared_ptr type); - -/// \brief Delete an extension type from the global registry. This method is -/// thread-safe -/// \param[in] type_name the unique name of a registered extension type -/// \return Status error if the type name is unknown -ARROW_EXPORT -Status UnregisterExtensionType(const std::string& type_name); - -/// \brief Retrieve an extension type from the global registry. Returns nullptr -/// if not found. This method is thread-safe -/// \return the globally-registered extension type -ARROW_EXPORT -std::shared_ptr GetExtensionType(const std::string& type_name); - -} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/filesystem.h b/r/R/inst/include/arrow/filesystem/filesystem.h deleted file mode 100644 index 9a3e5a0dd58..00000000000 --- a/r/R/inst/include/arrow/filesystem/filesystem.h +++ /dev/null @@ -1,247 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -// The Windows API defines macros from *File resolving to either -// *FileA or *FileW. Need to undo them. -#ifdef _WIN32 -#ifdef DeleteFile -#undef DeleteFile -#endif -#ifdef CopyFile -#undef CopyFile -#endif -#endif - -namespace arrow { - -namespace io { - -class InputStream; -class OutputStream; -class RandomAccessFile; - -} // namespace io - -namespace fs { - -// A system clock time point expressed as a 64-bit (or more) number of -// nanoseconds since the epoch. -using TimePoint = - std::chrono::time_point; - -/// \brief EXPERIMENTAL: FileSystem entry type -enum class ARROW_EXPORT FileType { - // Target does not exist - NonExistent, - // Target exists but its type is unknown (could be a special file such - // as a Unix socket or character device, or Windows NUL / CON / ...) 
- Unknown, - // Target is a regular file - File, - // Target is a directory - Directory -}; - -ARROW_EXPORT std::string ToString(FileType); - -static const int64_t kNoSize = -1; -static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); - -/// \brief EXPERIMENTAL: FileSystem entry stats -struct ARROW_EXPORT FileStats { - FileStats() = default; - FileStats(FileStats&&) = default; - FileStats& operator=(FileStats&&) = default; - FileStats(const FileStats&) = default; - FileStats& operator=(const FileStats&) = default; - - // The file type. - FileType type() const { return type_; } - void set_type(FileType type) { type_ = type; } - - // The full file path in the filesystem. - std::string path() const { return path_; } - void set_path(const std::string& path) { path_ = path; } - - // The file base name (component after the last directory separator). - std::string base_name() const; - - // The size in bytes, if available. Only regular files are guaranteed - // to have a size. - int64_t size() const { return size_; } - void set_size(int64_t size) { size_ = size; } - - // The time of last modification, if available. - TimePoint mtime() const { return mtime_; } - void set_mtime(TimePoint mtime) { mtime_ = mtime; } - - protected: - FileType type_ = FileType::Unknown; - std::string path_; - int64_t size_ = kNoSize; - TimePoint mtime_ = kNoTime; -}; - -/// \brief EXPERIMENTAL: file selector -struct ARROW_EXPORT Selector { - // The directory in which to select files. - // If the path exists but doesn't point to a directory, this should be an error. - std::string base_dir; - // The behavior if `base_dir` doesn't exist in the filesystem. If false, - // an error is returned. If true, an empty selection is returned. - bool allow_non_existent = false; - // Whether to recurse into subdirectories. 
- bool recursive = false; - - Selector() {} -}; - -/// \brief EXPERIMENTAL: abstract file system API -class ARROW_EXPORT FileSystem { - public: - virtual ~FileSystem(); - - /// Get statistics for the given target. - /// - /// Any symlink is automatically dereferenced, recursively. - /// A non-existing or unreachable file returns an Ok status and - /// has a FileType of value NonExistent. An error status indicates - /// a truly exceptional condition (low-level I/O error, etc.). - virtual Status GetTargetStats(const std::string& path, FileStats* out) = 0; - /// Same, for many targets at once. - virtual Status GetTargetStats(const std::vector& paths, - std::vector* out); - /// Same, according to a selector. - /// - /// The selector's base directory will not be part of the results, even if - /// it exists. - /// If it doesn't exist, see `Selector::allow_non_existent`. - virtual Status GetTargetStats(const Selector& select, std::vector* out) = 0; - - /// Create a directory and subdirectories. - /// - /// This function succeeds if the directory already exists. - virtual Status CreateDir(const std::string& path, bool recursive = true) = 0; - - /// Delete a directory and its contents, recursively. - virtual Status DeleteDir(const std::string& path) = 0; - - /// Delete a file. - virtual Status DeleteFile(const std::string& path) = 0; - /// Delete many files. - /// - /// The default implementation issues individual delete operations in sequence. - virtual Status DeleteFiles(const std::vector& paths); - - /// Move / rename a file or directory. - /// - /// If the destination exists: - /// - if it is a non-empty directory, an error is returned - /// - otherwise, if it has the same type as the source, it is replaced - /// - otherwise, behavior is unspecified (implementation-dependent). - virtual Status Move(const std::string& src, const std::string& dest) = 0; - - /// Copy a file. - /// - /// If the destination exists and is a directory, an error is returned. 
- /// Otherwise, it is replaced. - virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; - - /// Open an input stream for sequential reading. - virtual Status OpenInputStream(const std::string& path, - std::shared_ptr* out) = 0; - - /// Open an input file for random access reading. - virtual Status OpenInputFile(const std::string& path, - std::shared_ptr* out) = 0; - - /// Open an output stream for sequential writing. - /// - /// If the target already exists, existing data is truncated. - virtual Status OpenOutputStream(const std::string& path, - std::shared_ptr* out) = 0; - - /// Open an output stream for appending. - /// - /// If the target doesn't exist, a new empty file is created. - virtual Status OpenAppendStream(const std::string& path, - std::shared_ptr* out) = 0; -}; - -/// \brief EXPERIMENTAL: a FileSystem implementation that delegates to another -/// implementation after prepending a fixed base path. -/// -/// This is useful to expose a logical view of a subtree of a filesystem, -/// for example a directory in a LocalFileSystem. -/// This makes no security guarantee. For example, symlinks may allow to -/// "escape" the subtree and access other parts of the underlying filesystem. 
-class ARROW_EXPORT SubTreeFileSystem : public FileSystem { - public: - explicit SubTreeFileSystem(const std::string& base_path, - std::shared_ptr base_fs); - ~SubTreeFileSystem() override; - - using FileSystem::GetTargetStats; - Status GetTargetStats(const std::string& path, FileStats* out) override; - Status GetTargetStats(const Selector& select, std::vector* out) override; - - Status CreateDir(const std::string& path, bool recursive = true) override; - - Status DeleteDir(const std::string& path) override; - - Status DeleteFile(const std::string& path) override; - - Status Move(const std::string& src, const std::string& dest) override; - - Status CopyFile(const std::string& src, const std::string& dest) override; - - Status OpenInputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenInputFile(const std::string& path, - std::shared_ptr* out) override; - - Status OpenOutputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenAppendStream(const std::string& path, - std::shared_ptr* out) override; - - protected: - const std::string base_path_; - std::shared_ptr base_fs_; - - std::string PrependBase(const std::string& s) const; - Status PrependBaseNonEmpty(std::string* s) const; - Status StripBase(const std::string& s, std::string* out) const; - Status FixStats(FileStats* st) const; -}; - -} // namespace fs -} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/localfs.h b/r/R/inst/include/arrow/filesystem/localfs.h deleted file mode 100644 index c720ac2b93c..00000000000 --- a/r/R/inst/include/arrow/filesystem/localfs.h +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/filesystem/filesystem.h" - -namespace arrow { -namespace fs { - -/// \brief EXPERIMENTAL: a FileSystem implementation accessing files -/// on the local machine. -/// -/// Details such as symlinks are abstracted away (symlinks are always followed, -/// except when deleting an entry). -class ARROW_EXPORT LocalFileSystem : public FileSystem { - public: - LocalFileSystem(); - ~LocalFileSystem() override; - - using FileSystem::GetTargetStats; - Status GetTargetStats(const std::string& path, FileStats* out) override; - Status GetTargetStats(const Selector& select, std::vector* out) override; - - Status CreateDir(const std::string& path, bool recursive = true) override; - - Status DeleteDir(const std::string& path) override; - - Status DeleteFile(const std::string& path) override; - - Status Move(const std::string& src, const std::string& dest) override; - - Status CopyFile(const std::string& src, const std::string& dest) override; - - Status OpenInputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenInputFile(const std::string& path, - std::shared_ptr* out) override; - - Status OpenOutputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenAppendStream(const std::string& path, - std::shared_ptr* out) override; -}; - -} // namespace fs -} // namespace arrow diff --git 
a/r/R/inst/include/arrow/filesystem/mockfs.h b/r/R/inst/include/arrow/filesystem/mockfs.h deleted file mode 100644 index ba7b57636d3..00000000000 --- a/r/R/inst/include/arrow/filesystem/mockfs.h +++ /dev/null @@ -1,104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "arrow/filesystem/filesystem.h" - -namespace arrow { -namespace fs { -namespace internal { - -struct DirInfo { - std::string full_path; - TimePoint mtime; - - bool operator==(const DirInfo& other) const { - return mtime == other.mtime && full_path == other.full_path; - } - - friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const DirInfo&); -}; - -struct FileInfo { - std::string full_path; - TimePoint mtime; - std::string data; - - bool operator==(const FileInfo& other) const { - return mtime == other.mtime && full_path == other.full_path && data == other.data; - } - - friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const FileInfo&); -}; - -/// A mock FileSystem implementation that holds its contents in memory. -/// -/// Useful for validating the FileSystem API, writing conformance suite, -/// and bootstrapping FileSystem-based APIs. 
-class ARROW_EXPORT MockFileSystem : public FileSystem { - public: - explicit MockFileSystem(TimePoint current_time); - ~MockFileSystem() override; - - // XXX It's not very practical to have to explicitly declare inheritance - // of default overrides. - using FileSystem::GetTargetStats; - Status GetTargetStats(const std::string& path, FileStats* out) override; - Status GetTargetStats(const Selector& select, std::vector* out) override; - - Status CreateDir(const std::string& path, bool recursive = true) override; - - Status DeleteDir(const std::string& path) override; - - Status DeleteFile(const std::string& path) override; - - Status Move(const std::string& src, const std::string& dest) override; - - Status CopyFile(const std::string& src, const std::string& dest) override; - - Status OpenInputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenInputFile(const std::string& path, - std::shared_ptr* out) override; - - Status OpenOutputStream(const std::string& path, - std::shared_ptr* out) override; - - Status OpenAppendStream(const std::string& path, - std::shared_ptr* out) override; - - // Contents-dumping helpers to ease testing. - // Output is lexicographically-ordered by full path. - std::vector AllDirs(); - std::vector AllFiles(); - - class Impl; - - protected: - std::unique_ptr impl_; -}; - -} // namespace internal -} // namespace fs -} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/path-util.h b/r/R/inst/include/arrow/filesystem/path-util.h deleted file mode 100644 index 444451d32ab..00000000000 --- a/r/R/inst/include/arrow/filesystem/path-util.h +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/status.h" - -namespace arrow { -namespace fs { -namespace internal { - -constexpr char kSep = '/'; - -// Computations on abstract paths (not local paths with system-dependent behaviour). -// Abstract paths are typically used in URIs. - -// Split an abstract path into its individual components. -ARROW_EXPORT -std::vector SplitAbstractPath(const std::string& s); - -// Return the parent directory and basename of an abstract path. Both values may be -// empty. -ARROW_EXPORT -std::pair GetAbstractPathParent(const std::string& s); - -// Validate the components of an abstract path. -ARROW_EXPORT -Status ValidateAbstractPathParts(const std::vector& parts); - -// Append a non-empty stem to an abstract path. -ARROW_EXPORT -std::string ConcatAbstractPath(const std::string& base, const std::string& stem); - -ARROW_EXPORT -std::string EnsureTrailingSlash(const std::string& s); - -// Join the components of an abstract path. 
-template -std::string JoinAbstractPath(StringIt it, StringIt end) { - std::string path; - for (; it != end; ++it) { - if (!path.empty()) { - path += kSep; - } - path += *it; - } - return path; -} - -} // namespace internal -} // namespace fs -} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/test-util.h b/r/R/inst/include/arrow/filesystem/test-util.h deleted file mode 100644 index 179b08cf7e7..00000000000 --- a/r/R/inst/include/arrow/filesystem/test-util.h +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include -#include -#include - -#include "arrow/filesystem/filesystem.h" - -namespace arrow { -namespace fs { - -static constexpr double kTimeSlack = 2.0; // In seconds - -ARROW_EXPORT -void AssertFileStats(const FileStats& st, const std::string& path, FileType type); - -ARROW_EXPORT -void AssertFileStats(const FileStats& st, const std::string& path, FileType type, - TimePoint mtime); - -ARROW_EXPORT -void AssertFileStats(const FileStats& st, const std::string& path, FileType type, - TimePoint mtime, int64_t size); - -ARROW_EXPORT -void AssertFileStats(const FileStats& st, const std::string& path, FileType type, - int64_t size); - -ARROW_EXPORT -void CreateFile(FileSystem* fs, const std::string& path, const std::string& data); - -// Sort of vector of FileStats by lexicographic path order -ARROW_EXPORT -void SortStats(std::vector* stats); - -template -void AssertDurationBetween(Duration d, double min_secs, double max_secs) { - auto seconds = std::chrono::duration_cast>(d); - ASSERT_GE(seconds.count(), min_secs); - ASSERT_LE(seconds.count(), max_secs); -} - -// Generic tests for FileSystem implementations. -// To use this class, subclass both from it and ::testing::Test, -// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS() -// to define the various tests. 
-class ARROW_EXPORT GenericFileSystemTest { - public: - virtual ~GenericFileSystemTest(); - - void TestEmpty(); - void TestCreateDir(); - void TestDeleteDir(); - void TestDeleteFile(); - void TestDeleteFiles(); - void TestMoveFile(); - void TestMoveDir(); - void TestCopyFile(); - void TestGetTargetStatsSingle(); - void TestGetTargetStatsVector(); - void TestGetTargetStatsSelector(); - void TestOpenOutputStream(); - void TestOpenAppendStream(); - void TestOpenInputStream(); - void TestOpenInputFile(); - - protected: - virtual std::shared_ptr GetEmptyFileSystem() = 0; - - void TestEmpty(FileSystem* fs); - void TestCreateDir(FileSystem* fs); - void TestDeleteDir(FileSystem* fs); - void TestDeleteFile(FileSystem* fs); - void TestDeleteFiles(FileSystem* fs); - void TestMoveFile(FileSystem* fs); - void TestMoveDir(FileSystem* fs); - void TestCopyFile(FileSystem* fs); - void TestGetTargetStatsSingle(FileSystem* fs); - void TestGetTargetStatsVector(FileSystem* fs); - void TestGetTargetStatsSelector(FileSystem* fs); - void TestOpenOutputStream(FileSystem* fs); - void TestOpenAppendStream(FileSystem* fs); - void TestOpenInputStream(FileSystem* fs); - void TestOpenInputFile(FileSystem* fs); -}; - -#define GENERIC_FS_TEST_FUNCTION(TEST_CLASS, NAME) \ - TEST_F(TEST_CLASS, NAME) { Test##NAME(); } - -#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, Empty) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, CreateDir) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteDir) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteFile) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, DeleteFiles) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, MoveFile) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, MoveDir) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, CopyFile) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsSingle) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsVector) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, GetTargetStatsSelector) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, 
OpenOutputStream) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenAppendStream) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenInputStream) \ - GENERIC_FS_TEST_FUNCTION(TEST_CLASS, OpenInputFile) - -} // namespace fs -} // namespace arrow diff --git a/r/R/inst/include/arrow/filesystem/util-internal.h b/r/R/inst/include/arrow/filesystem/util-internal.h deleted file mode 100644 index eabdad4a6fa..00000000000 --- a/r/R/inst/include/arrow/filesystem/util-internal.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/io/interfaces.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace fs { -namespace internal { - -ARROW_EXPORT -Status CopyStream(const std::shared_ptr& src, - const std::shared_ptr& dest, int64_t chunk_size); - -} // namespace internal -} // namespace fs -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/api.h b/r/R/inst/include/arrow/flight/api.h deleted file mode 100644 index 855ef7c3553..00000000000 --- a/r/R/inst/include/arrow/flight/api.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/flight/client.h" -#include "arrow/flight/client_auth.h" -#include "arrow/flight/server.h" -#include "arrow/flight/server_auth.h" -#include "arrow/flight/types.h" diff --git a/r/R/inst/include/arrow/flight/client.h b/r/R/inst/include/arrow/flight/client.h deleted file mode 100644 index 689c9f8c5b5..00000000000 --- a/r/R/inst/include/arrow/flight/client.h +++ /dev/null @@ -1,178 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// \brief Implementation of Flight RPC client using gRPC. 
API should be -// considered experimental for now - -#pragma once - -#include -#include -#include -#include - -#include "arrow/ipc/writer.h" -#include "arrow/status.h" - -#include "arrow/flight/types.h" // IWYU pragma: keep -#include "arrow/flight/visibility.h" - -namespace arrow { - -class MemoryPool; -class RecordBatch; -class RecordBatchReader; -class Schema; - -namespace flight { - -class ClientAuthHandler; - -/// \brief A duration type for Flight call timeouts. -typedef std::chrono::duration TimeoutDuration; - -/// \brief Hints to the underlying RPC layer for Arrow Flight calls. -class ARROW_FLIGHT_EXPORT FlightCallOptions { - public: - /// Create a default set of call options. - FlightCallOptions(); - - /// \brief An optional timeout for this call. Negative durations - /// mean an implementation-defined default behavior will be used - /// instead. This is the default value. - TimeoutDuration timeout; -}; - -class ARROW_FLIGHT_EXPORT FlightClientOptions { - public: - std::string tls_root_certs; -}; - -/// \brief Client class for Arrow Flight RPC services (gRPC-based). -/// API experimental for now -class ARROW_FLIGHT_EXPORT FlightClient { - public: - ~FlightClient(); - - /// \brief Connect to an unauthenticated flight service - /// \param[in] location the URI - /// \param[out] client the created FlightClient - /// \return Status OK status may not indicate that the connection was - /// successful - static Status Connect(const Location& location, std::unique_ptr* client); - - /// \brief Connect to an unauthenticated flight service - /// \param[in] location the URI - /// \param[in] options Other options for setting up the client - /// \param[out] client the created FlightClient - /// \return Status OK status may not indicate that the connection was - /// successful - static Status Connect(const Location& location, const FlightClientOptions& options, - std::unique_ptr* client); - - /// \brief Authenticate to the server using the given handler. 
- /// \param[in] options Per-RPC options - /// \param[in] auth_handler The authentication mechanism to use - /// \return Status OK if the client authenticated successfully - Status Authenticate(const FlightCallOptions& options, - std::unique_ptr auth_handler); - - /// \brief Perform the indicated action, returning an iterator to the stream - /// of results, if any - /// \param[in] options Per-RPC options - /// \param[in] action the action to be performed - /// \param[out] results an iterator object for reading the returned results - /// \return Status - Status DoAction(const FlightCallOptions& options, const Action& action, - std::unique_ptr* results); - Status DoAction(const Action& action, std::unique_ptr* results) { - return DoAction({}, action, results); - } - - /// \brief Retrieve a list of available Action types - /// \param[in] options Per-RPC options - /// \param[out] actions the available actions - /// \return Status - Status ListActions(const FlightCallOptions& options, std::vector* actions); - Status ListActions(std::vector* actions) { - return ListActions({}, actions); - } - - /// \brief Request access plan for a single flight, which may be an existing - /// dataset or a command to be executed - /// \param[in] options Per-RPC options - /// \param[in] descriptor the dataset request, whether a named dataset or - /// command - /// \param[out] info the FlightInfo describing where to access the dataset - /// \return Status - Status GetFlightInfo(const FlightCallOptions& options, - const FlightDescriptor& descriptor, - std::unique_ptr* info); - Status GetFlightInfo(const FlightDescriptor& descriptor, - std::unique_ptr* info) { - return GetFlightInfo({}, descriptor, info); - } - - /// \brief List all available flights known to the server - /// \param[out] listing an iterator that returns a FlightInfo for each flight - /// \return Status - Status ListFlights(std::unique_ptr* listing); - - /// \brief List available flights given indicated filter criteria - /// 
\param[in] options Per-RPC options - /// \param[in] criteria the filter criteria (opaque) - /// \param[out] listing an iterator that returns a FlightInfo for each flight - /// \return Status - Status ListFlights(const FlightCallOptions& options, const Criteria& criteria, - std::unique_ptr* listing); - - /// \brief Given a flight ticket and schema, request to be sent the - /// stream. Returns record batch stream reader - /// \param[in] options Per-RPC options - /// \param[in] ticket The flight ticket to use - /// \param[out] stream the returned RecordBatchReader - /// \return Status - Status DoGet(const FlightCallOptions& options, const Ticket& ticket, - std::unique_ptr* stream); - Status DoGet(const Ticket& ticket, std::unique_ptr* stream) { - return DoGet({}, ticket, stream); - } - - /// \brief Upload data to a Flight described by the given - /// descriptor. The caller must call Close() on the returned stream - /// once they are done writing. - /// \param[in] options Per-RPC options - /// \param[in] descriptor the descriptor of the stream - /// \param[in] schema the schema for the data to upload - /// \param[out] stream a writer to write record batches to - /// \return Status - Status DoPut(const FlightCallOptions& options, const FlightDescriptor& descriptor, - const std::shared_ptr& schema, - std::unique_ptr* stream); - Status DoPut(const FlightDescriptor& descriptor, const std::shared_ptr& schema, - std::unique_ptr* stream) { - return DoPut({}, descriptor, schema, stream); - } - - private: - FlightClient(); - class FlightClientImpl; - std::unique_ptr impl_; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/client_auth.h b/r/R/inst/include/arrow/flight/client_auth.h deleted file mode 100644 index 9dad36aa094..00000000000 --- a/r/R/inst/include/arrow/flight/client_auth.h +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/flight/visibility.h" -#include "arrow/status.h" - -namespace arrow { - -namespace flight { - -/// \brief A reader for messages from the server during an -/// authentication handshake. -class ARROW_FLIGHT_EXPORT ClientAuthReader { - public: - virtual ~ClientAuthReader() = default; - virtual Status Read(std::string* response) = 0; -}; - -/// \brief A writer for messages to the server during an -/// authentication handshake. -class ARROW_FLIGHT_EXPORT ClientAuthSender { - public: - virtual ~ClientAuthSender() = default; - virtual Status Write(const std::string& token) = 0; -}; - -/// \brief An authentication implementation for a Flight service. -/// Authentication includes both an initial negotiation and a per-call -/// token validation. Implementations may choose to use either or both -/// mechanisms. -class ARROW_FLIGHT_EXPORT ClientAuthHandler { - public: - virtual ~ClientAuthHandler() = default; - /// \brief Authenticate the client on initial connection. The client - /// can send messages to/read responses from the server at any time. - /// \return Status OK if authenticated successfully - virtual Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) = 0; - /// \brief Get a per-call token. 
- /// \param[out] token The token to send to the server. - virtual Status GetToken(std::string* token) = 0; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/customize_protobuf.h b/r/R/inst/include/arrow/flight/customize_protobuf.h deleted file mode 100644 index f27ab0b6878..00000000000 --- a/r/R/inst/include/arrow/flight/customize_protobuf.h +++ /dev/null @@ -1,129 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/flight/platform.h" -#include "arrow/util/config.h" - -// Silence protobuf warnings -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4244) -#endif - -#ifdef GRPCPP_PP_INCLUDE -#include -#else -#include -#endif - -// It is necessary to undefined this macro so that the protobuf -// SerializationTraits specialization is not declared in proto_utils.h. 
We've -// copied that specialization below and modified it to exclude -// protocol::FlightData from the default implementation so we can specialize -// for our faster serialization-deserialization path -#undef GRPC_OPEN_SOURCE_PROTO - -#ifdef GRPCPP_PP_INCLUDE -#include -#else -#include -#endif - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -namespace grpc { - -class ByteBuffer; - -} // namespace grpc - -namespace arrow { -namespace flight { - -struct FlightPayload; - -namespace internal { - -struct FlightData; - -// Those two functions are defined in serialization-internal.cc - -// Write FlightData to a grpc::ByteBuffer without extra copying -grpc::Status FlightDataSerialize(const FlightPayload& msg, grpc::ByteBuffer* out, - bool* own_buffer); - -// Read internal::FlightData from grpc::ByteBuffer containing FlightData -// protobuf without copying -grpc::Status FlightDataDeserialize(grpc::ByteBuffer* buffer, FlightData* out); - -} // namespace internal - -namespace protocol { - -class FlightData; - -} // namespace protocol -} // namespace flight -} // namespace arrow - -namespace grpc { - -// This class provides a protobuf serializer. It translates between protobuf -// objects and grpc_byte_buffers. More information about SerializationTraits can -// be found in include/grpcpp/impl/codegen/serialization_traits.h. -template -class SerializationTraits< - T, typename std::enable_if< - std::is_base_of::value && - !std::is_same::value>::type> { - public: - static Status Serialize(const grpc::protobuf::Message& msg, ByteBuffer* bb, - bool* own_buffer) { - return GenericSerialize(msg, bb, own_buffer); - } - - static Status Deserialize(ByteBuffer* buffer, grpc::protobuf::Message* msg) { - return GenericDeserialize(buffer, msg); - } -}; - -template -class SerializationTraits::value>::type> { - public: - // In the functions below, we cast back the Message argument to its real - // type (see ReadPayload() and WritePayload() for the initial cast). 
- static Status Serialize(const grpc::protobuf::Message& msg, ByteBuffer* bb, - bool* own_buffer) { - return arrow::flight::internal::FlightDataSerialize( - *reinterpret_cast(&msg), bb, own_buffer); - } - - static Status Deserialize(ByteBuffer* buffer, grpc::protobuf::Message* msg) { - return arrow::flight::internal::FlightDataDeserialize( - buffer, reinterpret_cast(msg)); - } -}; - -} // namespace grpc diff --git a/r/R/inst/include/arrow/flight/internal.h b/r/R/inst/include/arrow/flight/internal.h deleted file mode 100644 index 784e8ebae1c..00000000000 --- a/r/R/inst/include/arrow/flight/internal.h +++ /dev/null @@ -1,100 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/flight/protocol-internal.h" // IWYU pragma: keep -#include "arrow/flight/types.h" -#include "arrow/util/macros.h" - -namespace grpc { - -class Status; - -} // namespace grpc - -namespace arrow { - -class Schema; -class Status; - -namespace pb = arrow::flight::protocol; - -namespace ipc { - -class Message; - -} // namespace ipc - -namespace flight { - -#define GRPC_RETURN_NOT_OK(expr) \ - do { \ - ::arrow::Status _s = (expr); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - return ::arrow::flight::internal::ToGrpcStatus(_s); \ - } \ - } while (0) - -#define GRPC_RETURN_NOT_GRPC_OK(expr) \ - do { \ - ::grpc::Status _s = (expr); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - return _s; \ - } \ - } while (0) - -namespace internal { - -static const char* AUTH_HEADER = "auth-token-bin"; - -ARROW_FLIGHT_EXPORT -Status SchemaToString(const Schema& schema, std::string* out); - -ARROW_FLIGHT_EXPORT -Status FromGrpcStatus(const grpc::Status& grpc_status); - -ARROW_FLIGHT_EXPORT -grpc::Status ToGrpcStatus(const Status& arrow_status); - -// These functions depend on protobuf types which are not exported in the Flight DLL. 
- -Status FromProto(const pb::ActionType& pb_type, ActionType* type); -Status FromProto(const pb::Action& pb_action, Action* action); -Status FromProto(const pb::Result& pb_result, Result* result); -Status FromProto(const pb::Criteria& pb_criteria, Criteria* criteria); -Status FromProto(const pb::Location& pb_location, Location* location); -Status FromProto(const pb::Ticket& pb_ticket, Ticket* ticket); -Status FromProto(const pb::FlightData& pb_data, FlightDescriptor* descriptor, - std::unique_ptr* message); -Status FromProto(const pb::FlightDescriptor& pb_descr, FlightDescriptor* descr); -Status FromProto(const pb::FlightEndpoint& pb_endpoint, FlightEndpoint* endpoint); -Status FromProto(const pb::FlightInfo& pb_info, FlightInfo::Data* info); - -Status ToProto(const FlightDescriptor& descr, pb::FlightDescriptor* pb_descr); -Status ToProto(const FlightInfo& info, pb::FlightInfo* pb_info); -Status ToProto(const ActionType& type, pb::ActionType* pb_type); -Status ToProto(const Action& action, pb::Action* pb_action); -Status ToProto(const Result& result, pb::Result* pb_result); -void ToProto(const Ticket& ticket, pb::Ticket* pb_ticket); - -} // namespace internal -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/platform.h b/r/R/inst/include/arrow/flight/platform.h deleted file mode 100644 index 7f1b0954d84..00000000000 --- a/r/R/inst/include/arrow/flight/platform.h +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Internal header. Platform-specific definitions for gRPC. - -#pragma once - -#ifdef _MSC_VER - -// The protobuf documentation says that C4251 warnings when using the -// library are spurious and suppressed when the build the library and -// compiler, but must be also suppressed in downstream projects -#pragma warning(disable : 4251) - -#endif // _MSC_VER - -#include "arrow/util/config.h" // IWYU pragma: keep -#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep diff --git a/r/R/inst/include/arrow/flight/protocol-internal.h b/r/R/inst/include/arrow/flight/protocol-internal.h deleted file mode 100644 index 98bf9238809..00000000000 --- a/r/R/inst/include/arrow/flight/protocol-internal.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations - -#pragma once - -// This addresses platform-specific defines, e.g. on Windows -#include "arrow/flight/platform.h" // IWYU pragma: keep - -// This header holds the Flight protobuf definitions. - -// Need to include this first to get our gRPC customizations -#include "arrow/flight/customize_protobuf.h" // IWYU pragma: export - -#include "arrow/flight/Flight.grpc.pb.h" // IWYU pragma: export -#include "arrow/flight/Flight.pb.h" // IWYU pragma: export diff --git a/r/R/inst/include/arrow/flight/serialization-internal.h b/r/R/inst/include/arrow/flight/serialization-internal.h deleted file mode 100644 index aa47af6ae35..00000000000 --- a/r/R/inst/include/arrow/flight/serialization-internal.h +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// (De)serialization utilities that hook into gRPC, efficiently -// handling Arrow-encoded data in a gRPC call. 
- -#pragma once - -#include - -#include "arrow/flight/internal.h" -#include "arrow/flight/types.h" -#include "arrow/ipc/message.h" -#include "arrow/status.h" - -namespace arrow { - -class Buffer; - -namespace flight { -namespace internal { - -/// Internal, not user-visible type used for memory-efficient reads from gRPC -/// stream -struct FlightData { - /// Used only for puts, may be null - std::unique_ptr descriptor; - - /// Non-length-prefixed Message header as described in format/Message.fbs - std::shared_ptr metadata; - - /// Message body - std::shared_ptr body; - - /// Open IPC message from the metadata and body - Status OpenMessage(std::unique_ptr* message); -}; - -/// Write Flight message on gRPC stream with zero-copy optimizations. -/// True is returned on success, false if some error occurred (connection closed?). -bool WritePayload(const FlightPayload& payload, - grpc::ClientWriter* writer); -bool WritePayload(const FlightPayload& payload, - grpc::ServerWriter* writer); - -/// Read Flight message from gRPC stream with zero-copy optimizations. -/// True is returned on success, false if stream ended. -bool ReadPayload(grpc::ClientReader* reader, FlightData* data); -bool ReadPayload(grpc::ServerReader* reader, FlightData* data); - -} // namespace internal -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/server.h b/r/R/inst/include/arrow/flight/server.h deleted file mode 100644 index 7164b64c4ab..00000000000 --- a/r/R/inst/include/arrow/flight/server.h +++ /dev/null @@ -1,207 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now - -#pragma once - -#include -#include -#include - -#include "arrow/flight/server_auth.h" -#include "arrow/flight/types.h" // IWYU pragma: keep -#include "arrow/flight/visibility.h" // IWYU pragma: keep -#include "arrow/ipc/dictionary.h" -#include "arrow/memory_pool.h" -#include "arrow/record_batch.h" - -namespace arrow { - -class MemoryPool; -class Schema; -class Status; - -namespace flight { - -/// \brief Interface that produces a sequence of IPC payloads to be sent in -/// FlightData protobuf messages -class ARROW_FLIGHT_EXPORT FlightDataStream { - public: - virtual ~FlightDataStream(); - - virtual std::shared_ptr schema() = 0; - - /// \brief Compute FlightPayload containing serialized RecordBatch schema - virtual Status GetSchemaPayload(FlightPayload* payload) = 0; - - // When the stream is completed, the last payload written will have null - // metadata - virtual Status Next(FlightPayload* payload) = 0; -}; - -/// \brief A basic implementation of FlightDataStream that will provide -/// a sequence of FlightData messages to be written to a gRPC stream -class ARROW_FLIGHT_EXPORT RecordBatchStream : public FlightDataStream { - public: - /// \param[in] reader produces a sequence of record batches - /// \param[in,out] pool a MemoryPool to use for allocations - explicit RecordBatchStream(const std::shared_ptr& reader, - MemoryPool* pool = default_memory_pool()); - ~RecordBatchStream() override; - - std::shared_ptr schema() override; - Status 
GetSchemaPayload(FlightPayload* payload) override; - Status Next(FlightPayload* payload) override; - - private: - class RecordBatchStreamImpl; - std::unique_ptr impl_; -}; - -// Silence warning -// "non dll-interface class RecordBatchReader used as base for dll-interface class" -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4275) -#endif - -/// \brief A reader for IPC payloads uploaded by a client -class ARROW_FLIGHT_EXPORT FlightMessageReader : public RecordBatchReader { - public: - /// \brief Get the descriptor for this upload. - virtual const FlightDescriptor& descriptor() const = 0; -}; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/// \brief Call state/contextual data. -class ARROW_FLIGHT_EXPORT ServerCallContext { - public: - virtual ~ServerCallContext() = default; - /// \brief The name of the authenticated peer (may be the empty string) - virtual const std::string& peer_identity() const = 0; -}; - -class ARROW_FLIGHT_EXPORT FlightServerOptions { - public: - explicit FlightServerOptions(const Location& location_); - - Location location; - std::unique_ptr auth_handler; - std::string tls_cert_chain; - std::string tls_private_key; -}; - -/// \brief Skeleton RPC server implementation which can be used to create -/// custom servers by implementing its abstract methods -class ARROW_FLIGHT_EXPORT FlightServerBase { - public: - FlightServerBase(); - virtual ~FlightServerBase(); - - // Lifecycle methods. - - /// \brief Initialize a Flight server listening at the given location. - /// This method must be called before any other method. - /// \param[in] options The configuration for this server. - Status Init(FlightServerOptions& options); - - /// \brief Set the server to stop when receiving any of the given signal - /// numbers. - /// This method must be called before Serve(). - Status SetShutdownOnSignals(const std::vector sigs); - - /// \brief Start serving. 
- /// This method blocks until either Shutdown() is called or one of the signals - /// registered in SetShutdownOnSignals() is received. - Status Serve(); - - /// \brief Query whether Serve() was interrupted by a signal. - /// This method must be called after Serve() has returned. - /// - /// \return int the signal number that interrupted Serve(), if any, otherwise 0 - int GotSignal() const; - - /// \brief Shut down the server. Can be called from signal handler or another - /// thread while Serve() blocks. - /// - /// TODO(wesm): Shutdown with deadline - void Shutdown(); - - // Implement these methods to create your own server. The default - // implementations will return a not-implemented result to the client - - /// \brief Retrieve a list of available fields given an optional opaque - /// criteria - /// \param[in] context The call context. - /// \param[in] criteria may be null - /// \param[out] listings the returned listings iterator - /// \return Status - virtual Status ListFlights(const ServerCallContext& context, const Criteria* criteria, - std::unique_ptr* listings); - - /// \brief Retrieve the schema and an access plan for the indicated - /// descriptor - /// \param[in] context The call context. - /// \param[in] request may be null - /// \param[out] info the returned flight info provider - /// \return Status - virtual Status GetFlightInfo(const ServerCallContext& context, - const FlightDescriptor& request, - std::unique_ptr* info); - - /// \brief Get a stream of IPC payloads to put on the wire - /// \param[in] context The call context. - /// \param[in] request an opaque ticket - /// \param[out] stream the returned stream provider - /// \return Status - virtual Status DoGet(const ServerCallContext& context, const Ticket& request, - std::unique_ptr* stream); - - /// \brief Process a stream of IPC payloads sent from a client - /// \param[in] context The call context. 
- /// \param[in] reader a sequence of uploaded record batches - /// \return Status - virtual Status DoPut(const ServerCallContext& context, - std::unique_ptr reader); - - /// \brief Execute an action, return stream of zero or more results - /// \param[in] context The call context. - /// \param[in] action the action to execute, with type and body - /// \param[out] result the result iterator - /// \return Status - virtual Status DoAction(const ServerCallContext& context, const Action& action, - std::unique_ptr* result); - - /// \brief Retrieve the list of available actions - /// \param[in] context The call context. - /// \param[out] actions a vector of available action types - /// \return Status - virtual Status ListActions(const ServerCallContext& context, - std::vector* actions); - - private: - struct Impl; - std::unique_ptr impl_; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/server_auth.h b/r/R/inst/include/arrow/flight/server_auth.h deleted file mode 100644 index b1ccb096d7b..00000000000 --- a/r/R/inst/include/arrow/flight/server_auth.h +++ /dev/null @@ -1,78 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -/// \brief Server-side APIs to implement authentication for Flight. - -#pragma once - -#include - -#include "arrow/flight/visibility.h" -#include "arrow/status.h" - -namespace arrow { - -namespace flight { - -/// \brief A reader for messages from the client during an -/// authentication handshake. -class ARROW_FLIGHT_EXPORT ServerAuthReader { - public: - virtual ~ServerAuthReader() = default; - virtual Status Read(std::string* token) = 0; -}; - -/// \brief A writer for messages to the client during an -/// authentication handshake. -class ARROW_FLIGHT_EXPORT ServerAuthSender { - public: - virtual ~ServerAuthSender() = default; - virtual Status Write(const std::string& message) = 0; -}; - -/// \brief An authentication implementation for a Flight service. -/// Authentication includes both an initial negotiation and a per-call -/// token validation. Implementations may choose to use either or both -/// mechanisms. -/// An implementation may need to track some state, e.g. a mapping of -/// client tokens to authenticated identities. -class ARROW_FLIGHT_EXPORT ServerAuthHandler { - public: - virtual ~ServerAuthHandler(); - /// \brief Authenticate the client on initial connection. The server - /// can send and read responses from the client at any time. - virtual Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) = 0; - /// \brief Validate a per-call client token. - /// \param[in] token The client token. May be the empty string if - /// the client does not provide a token. - /// \param[out] peer_identity The identity of the peer, if this - /// authentication method supports it. - /// \return Status OK if the token is valid, any other status if - /// validation failed - virtual Status IsValid(const std::string& token, std::string* peer_identity) = 0; -}; - -/// \brief An authentication mechanism that does nothing. 
-class ARROW_FLIGHT_EXPORT NoOpAuthHandler : public ServerAuthHandler { - public: - ~NoOpAuthHandler() override; - Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) override; - Status IsValid(const std::string& token, std::string* peer_identity) override; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/test-util.h b/r/R/inst/include/arrow/flight/test-util.h deleted file mode 100644 index 2e1f4b0ed15..00000000000 --- a/r/R/inst/include/arrow/flight/test-util.h +++ /dev/null @@ -1,188 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include - -#include "arrow/status.h" - -#include "arrow/flight/client_auth.h" -#include "arrow/flight/server_auth.h" -#include "arrow/flight/types.h" -#include "arrow/flight/visibility.h" - -namespace boost { -namespace process { - -class child; - -} // namespace process -} // namespace boost - -namespace arrow { -namespace flight { - -// ---------------------------------------------------------------------- -// Fixture to use for running test servers - -// Get a TCP port number to listen on. 
This is a different number every time, -// as reusing the same port accross tests can produce spurious "Stream removed" -// errors as Windows. -ARROW_FLIGHT_EXPORT -int GetListenPort(); - -class ARROW_FLIGHT_EXPORT TestServer { - public: - explicit TestServer(const std::string& executable_name) - : executable_name_(executable_name), port_(GetListenPort()) {} - explicit TestServer(const std::string& executable_name, int port) - : executable_name_(executable_name), port_(port) {} - - void Start(); - - int Stop(); - - bool IsRunning(); - - int port() const; - - private: - std::string executable_name_; - int port_; - std::shared_ptr<::boost::process::child> server_process_; -}; - -class ARROW_FLIGHT_EXPORT InProcessTestServer { - public: - explicit InProcessTestServer(std::unique_ptr server, - const Location& location) - : server_(std::move(server)), location_(location), thread_() {} - ~InProcessTestServer(); - Status Start(); - void Stop(); - const Location& location() const; - - private: - std::unique_ptr server_; - Location location_; - std::thread thread_; -}; - -// ---------------------------------------------------------------------- -// A RecordBatchReader for serving a sequence of in-memory record batches - -// Silence warning -// "non dll-interface class RecordBatchReader used as base for dll-interface class" -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4275) -#endif - -class ARROW_FLIGHT_EXPORT BatchIterator : public RecordBatchReader { - public: - BatchIterator(const std::shared_ptr& schema, - const std::vector>& batches) - : schema_(schema), batches_(batches), position_(0) {} - - std::shared_ptr schema() const override { return schema_; } - - Status ReadNext(std::shared_ptr* out) override { - if (position_ >= batches_.size()) { - *out = nullptr; - } else { - *out = batches_[position_++]; - } - return Status::OK(); - } - - private: - std::shared_ptr schema_; - std::vector> batches_; - size_t position_; -}; - -#ifdef _MSC_VER -#pragma 
warning(pop) -#endif - -// ---------------------------------------------------------------------- -// Example data for test-server and unit tests - -using BatchVector = std::vector>; - -ARROW_FLIGHT_EXPORT -std::shared_ptr ExampleIntSchema(); - -ARROW_FLIGHT_EXPORT -std::shared_ptr ExampleStringSchema(); - -ARROW_FLIGHT_EXPORT -std::shared_ptr ExampleDictSchema(); - -ARROW_FLIGHT_EXPORT -Status ExampleIntBatches(BatchVector* out); - -ARROW_FLIGHT_EXPORT -Status ExampleDictBatches(BatchVector* out); - -ARROW_FLIGHT_EXPORT -std::vector ExampleFlightInfo(); - -ARROW_FLIGHT_EXPORT -std::vector ExampleActionTypes(); - -ARROW_FLIGHT_EXPORT -Status MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor, - const std::vector& endpoints, int64_t total_records, - int64_t total_bytes, FlightInfo::Data* out); - -// ---------------------------------------------------------------------- -// A pair of authentication handlers that check for a predefined password -// and set the peer identity to a predefined username. 
- -class ARROW_FLIGHT_EXPORT TestServerAuthHandler : public ServerAuthHandler { - public: - explicit TestServerAuthHandler(const std::string& username, - const std::string& password); - ~TestServerAuthHandler() override; - Status Authenticate(ServerAuthSender* outgoing, ServerAuthReader* incoming) override; - Status IsValid(const std::string& token, std::string* peer_identity) override; - - private: - std::string username_; - std::string password_; -}; - -class ARROW_FLIGHT_EXPORT TestClientAuthHandler : public ClientAuthHandler { - public: - explicit TestClientAuthHandler(const std::string& username, - const std::string& password); - ~TestClientAuthHandler() override; - Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override; - Status GetToken(std::string* token) override; - - private: - std::string username_; - std::string password_; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/types.h b/r/R/inst/include/arrow/flight/types.h deleted file mode 100644 index 8d372252636..00000000000 --- a/r/R/inst/include/arrow/flight/types.h +++ /dev/null @@ -1,290 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Data structure for Flight RPC. 
API should be considered experimental for now - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "arrow/flight/visibility.h" -#include "arrow/ipc/writer.h" - -namespace arrow { - -class Buffer; -class Schema; -class Status; - -namespace ipc { - -class DictionaryMemo; - -} // namespace ipc - -namespace internal { - -class Uri; - -} // namespace internal - -namespace flight { - -/// \brief A type of action that can be performed with the DoAction RPC -struct ARROW_FLIGHT_EXPORT ActionType { - /// Name of action - std::string type; - - /// Opaque action description - std::string description; -}; - -/// \brief Opaque selection critera for ListFlights RPC -struct ARROW_FLIGHT_EXPORT Criteria { - /// Opaque criteria expression, dependent on server implementation - std::string expression; -}; - -/// \brief An action to perform with the DoAction RPC -struct ARROW_FLIGHT_EXPORT Action { - /// The action type - std::string type; - - /// The action content as a Buffer - std::shared_ptr body; -}; - -/// \brief Opaque result returned after executing an action -struct ARROW_FLIGHT_EXPORT Result { - std::shared_ptr body; -}; - -/// \brief A message received after completing a DoPut stream -struct ARROW_FLIGHT_EXPORT PutResult {}; - -/// \brief A request to retrieve or generate a dataset -struct ARROW_FLIGHT_EXPORT FlightDescriptor { - enum DescriptorType { - UNKNOWN = 0, /// Unused - PATH = 1, /// Named path identifying a dataset - CMD = 2 /// Opaque command to generate a dataset - }; - - /// The descriptor type - DescriptorType type; - - /// Opaque value used to express a command. Should only be defined when type - /// is CMD - std::string cmd; - - /// List of strings identifying a particular dataset. 
Should only be defined - /// when type is PATH - std::vector path; - - bool Equals(const FlightDescriptor& other) const; - - std::string ToString() const; - - // Convenience factory functions - - static FlightDescriptor Command(const std::string& c) { - return FlightDescriptor{CMD, c, {}}; - } - - static FlightDescriptor Path(const std::vector& p) { - return FlightDescriptor{PATH, "", p}; - } -}; - -/// \brief Data structure providing an opaque identifier or credential to use -/// when requesting a data stream with the DoGet RPC -struct ARROW_FLIGHT_EXPORT Ticket { - std::string ticket; -}; - -class FlightClient; -class FlightServerBase; - -static const char* kSchemeGrpc = "grpc"; -static const char* kSchemeGrpcTcp = "grpc+tcp"; -static const char* kSchemeGrpcUnix = "grpc+unix"; -static const char* kSchemeGrpcTls = "grpc+tls"; - -/// \brief A host location (a URI) -struct ARROW_FLIGHT_EXPORT Location { - public: - /// \brief Initialize a blank location. - Location(); - - /// \brief Initialize a location by parsing a URI string - static Status Parse(const std::string& uri_string, Location* location); - - /// \brief Initialize a location for a non-TLS, gRPC-based Flight - /// service from a host and port - /// \param[in] host The hostname to connect to - /// \param[in] port The port - /// \param[out] location The resulting location - static Status ForGrpcTcp(const std::string& host, const int port, Location* location); - - /// \brief Initialize a location for a domain socket-based Flight - /// service - /// \param[in] path The path to the domain socket - /// \param[out] location The resulting location - static Status ForGrpcUnix(const std::string& path, Location* location); - - /// \brief Get a representation of this URI as a string. - std::string ToString() const; - - /// \brief Get the scheme of this URI. 
- std::string scheme() const; - - bool Equals(const Location& other) const; - - friend bool operator==(const Location& left, const Location& right) { - return left.Equals(right); - } - friend bool operator!=(const Location& left, const Location& right) { - return !(left == right); - } - - private: - friend class FlightClient; - friend class FlightServerBase; - std::shared_ptr uri_; -}; - -/// \brief A flight ticket and list of locations where the ticket can be -/// redeemed -struct ARROW_FLIGHT_EXPORT FlightEndpoint { - /// Opaque ticket identify; use with DoGet RPC - Ticket ticket; - - /// List of locations where ticket can be redeemed. If the list is empty, the - /// ticket can only be redeemed on the current service where the ticket was - /// generated - std::vector locations; -}; - -/// \brief Staging data structure for messages about to be put on the wire -/// -/// This structure corresponds to FlightData in the protocol. -struct ARROW_FLIGHT_EXPORT FlightPayload { - std::shared_ptr descriptor; - ipc::internal::IpcPayload ipc_message; -}; - -/// \brief The access coordinates for retireval of a dataset, returned by -/// GetFlightInfo -class ARROW_FLIGHT_EXPORT FlightInfo { - public: - struct Data { - std::string schema; - FlightDescriptor descriptor; - std::vector endpoints; - int64_t total_records; - int64_t total_bytes; - }; - - explicit FlightInfo(const Data& data) : data_(data), reconstructed_schema_(false) {} - explicit FlightInfo(Data&& data) - : data_(std::move(data)), reconstructed_schema_(false) {} - - /// \brief Deserialize the Arrow schema of the dataset, to be passed - /// to each call to DoGet. 
Populate any dictionary encoded fields - /// into a DictionaryMemo for bookkeeping - /// \param[in,out] dictionary_memo for dictionary bookkeeping, will - /// be modified - /// \param[out] out the reconstructed Schema - Status GetSchema(ipc::DictionaryMemo* dictionary_memo, - std::shared_ptr* out) const; - - const std::string& serialized_schema() const { return data_.schema; } - - /// The descriptor associated with this flight, may not be set - const FlightDescriptor& descriptor() const { return data_.descriptor; } - - /// A list of endpoints associated with the flight (dataset). To consume the - /// whole flight, all endpoints must be consumed - const std::vector& endpoints() const { return data_.endpoints; } - - /// The total number of records (rows) in the dataset. If unknown, set to -1 - int64_t total_records() const { return data_.total_records; } - - /// The total number of bytes in the dataset. If unknown, set to -1 - int64_t total_bytes() const { return data_.total_bytes; } - - private: - Data data_; - mutable std::shared_ptr schema_; - mutable bool reconstructed_schema_; -}; - -/// \brief An iterator to FlightInfo instances returned by ListFlights -class ARROW_FLIGHT_EXPORT FlightListing { - public: - virtual ~FlightListing() = default; - - /// \brief Retrieve the next FlightInfo from the iterator. Returns nullptr - /// when there are none left - /// \param[out] info a single FlightInfo - /// \return Status - virtual Status Next(std::unique_ptr* info) = 0; -}; - -/// \brief An iterator to Result instances returned by DoAction -class ARROW_FLIGHT_EXPORT ResultStream { - public: - virtual ~ResultStream() = default; - - /// \brief Retrieve the next Result from the iterator. Returns nullptr - /// when there are none left - /// \param[out] info a single Result - /// \return Status - virtual Status Next(std::unique_ptr* info) = 0; -}; - -// \brief Create a FlightListing from a vector of FlightInfo objects. 
This can -// be iterated once, then it is consumed -class ARROW_FLIGHT_EXPORT SimpleFlightListing : public FlightListing { - public: - explicit SimpleFlightListing(const std::vector& flights); - explicit SimpleFlightListing(std::vector&& flights); - - Status Next(std::unique_ptr* info) override; - - private: - int position_; - std::vector flights_; -}; - -class ARROW_FLIGHT_EXPORT SimpleResultStream : public ResultStream { - public: - explicit SimpleResultStream(std::vector&& results); - Status Next(std::unique_ptr* result) override; - - private: - std::vector results_; - size_t position_; -}; - -} // namespace flight -} // namespace arrow diff --git a/r/R/inst/include/arrow/flight/visibility.h b/r/R/inst/include/arrow/flight/visibility.h deleted file mode 100644 index bdee8b751d8..00000000000 --- a/r/R/inst/include/arrow/flight/visibility.h +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef ARROW_FLIGHT_STATIC -#define ARROW_FLIGHT_EXPORT -#elif defined(ARROW_FLIGHT_EXPORTING) -#define ARROW_FLIGHT_EXPORT __declspec(dllexport) -#else -#define ARROW_FLIGHT_EXPORT __declspec(dllimport) -#endif - -#define ARROW_FLIGHT_NO_EXPORT -#else // Not Windows -#ifndef ARROW_FLIGHT_EXPORT -#define ARROW_FLIGHT_EXPORT __attribute__((visibility("default"))) -#endif -#ifndef ARROW_FLIGHT_NO_EXPORT -#define ARROW_FLIGHT_NO_EXPORT __attribute__((visibility("hidden"))) -#endif -#endif // Non-Windows - -#if defined(_MSC_VER) -#pragma warning(pop) -#endif diff --git a/r/R/inst/include/arrow/gpu/cuda_api.h b/r/R/inst/include/arrow/gpu/cuda_api.h deleted file mode 100644 index c63b77e8721..00000000000 --- a/r/R/inst/include/arrow/gpu/cuda_api.h +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_GPU_CUDA_API_H -#define ARROW_GPU_CUDA_API_H - -#include "arrow/gpu/cuda_arrow_ipc.h" -#include "arrow/gpu/cuda_context.h" -#include "arrow/gpu/cuda_memory.h" -#include "arrow/gpu/cuda_version.h" - -#endif // ARROW_GPU_CUDA_API_H diff --git a/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h b/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h deleted file mode 100644 index 4eb85e797c7..00000000000 --- a/r/R/inst/include/arrow/gpu/cuda_arrow_ipc.h +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_GPU_CUDA_ARROW_IPC_H -#define ARROW_GPU_CUDA_ARROW_IPC_H - -#include -#include - -#include "arrow/buffer.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -#include "arrow/gpu/cuda_memory.h" - -namespace arrow { - -class MemoryPool; -class RecordBatch; -class Schema; - -namespace ipc { - -class Message; - -} // namespace ipc - -namespace cuda { - -/// \brief Write record batch message to GPU device memory -/// \param[in] batch record batch to write -/// \param[in] ctx CudaContext to allocate device memory from -/// \param[out] out the returned device buffer which contains the record batch message -/// \return Status -ARROW_EXPORT -Status SerializeRecordBatch(const RecordBatch& batch, CudaContext* ctx, - std::shared_ptr* out); - -/// \brief Read Arrow IPC message located on GPU device -/// \param[in] reader a CudaBufferReader -/// \param[in] pool a MemoryPool to allocate CPU memory for the metadata -/// \param[out] message the deserialized message, body still on device -/// -/// This function reads the message metadata into host memory, but leaves the -/// message body on the device -ARROW_EXPORT -Status ReadMessage(CudaBufferReader* reader, MemoryPool* pool, - std::unique_ptr* message); - -/// \brief ReadRecordBatch specialized to handle metadata on CUDA device -/// \param[in] schema the Schema for the record batch -/// \param[in] buffer a CudaBuffer containing the complete IPC message -/// \param[in] pool a MemoryPool to use for allocating space for the metadata -/// \param[out] out the reconstructed RecordBatch, with device pointers -ARROW_EXPORT -Status ReadRecordBatch(const std::shared_ptr& schema, - const std::shared_ptr& buffer, MemoryPool* pool, - std::shared_ptr* out); - -} // namespace cuda -} // namespace arrow - -#endif // ARROW_GPU_CUDA_ARROW_IPC_H diff --git a/r/R/inst/include/arrow/gpu/cuda_common.h b/r/R/inst/include/arrow/gpu/cuda_common.h deleted file mode 100644 index 87371ce20ad..00000000000 --- 
a/r/R/inst/include/arrow/gpu/cuda_common.h +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Non-public header - -#ifndef ARROW_GPU_CUDA_COMMON_H -#define ARROW_GPU_CUDA_COMMON_H - -#include - -namespace arrow { -namespace cuda { - -#define CU_RETURN_NOT_OK(STMT) \ - do { \ - CUresult ret = (STMT); \ - if (ret != CUDA_SUCCESS) { \ - return Status::IOError("Cuda Driver API call in ", __FILE__, " at line ", \ - __LINE__, " failed with code ", ret, ": ", #STMT); \ - } \ - } while (0) - -} // namespace cuda -} // namespace arrow - -#endif // ARROW_GPU_CUDA_COMMON_H diff --git a/r/R/inst/include/arrow/gpu/cuda_context.h b/r/R/inst/include/arrow/gpu/cuda_context.h deleted file mode 100644 index 99c3fc2ba42..00000000000 --- a/r/R/inst/include/arrow/gpu/cuda_context.h +++ /dev/null @@ -1,168 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_GPU_CUDA_CONTEXT_H -#define ARROW_GPU_CUDA_CONTEXT_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -#include "arrow/gpu/cuda_memory.h" - -namespace arrow { -namespace cuda { - -// Forward declaration -class CudaContext; - -class ARROW_EXPORT CudaDeviceManager { - public: - static Status GetInstance(CudaDeviceManager** manager); - - /// \brief Get the CUDA driver context for a particular device - /// \param[in] device_number the CUDA device - /// \param[out] out cached context - Status GetContext(int device_number, std::shared_ptr* out); - - /// \brief Get the shared CUDA driver context for a particular device - /// \param[in] device_number the CUDA device - /// \param[in] handle CUDA context handler created by another library - /// \param[out] out shared context - Status GetSharedContext(int device_number, void* handle, - std::shared_ptr* out); - - /// \brief Allocate host memory with fast access to given GPU device - /// \param[in] device_number the CUDA device - /// \param[in] nbytes number of bytes - /// \param[out] out the allocated buffer - Status AllocateHost(int device_number, int64_t nbytes, - std::shared_ptr* out); - - Status FreeHost(void* data, int64_t nbytes); - - int num_devices() const; - - private: - CudaDeviceManager(); - static std::unique_ptr instance_; - - class CudaDeviceManagerImpl; - std::unique_ptr impl_; - - friend CudaContext; -}; - -struct ARROW_EXPORT CudaDeviceInfo {}; - -/// \class CudaContext -/// \brief Friendlier interface to the CUDA driver API -class 
ARROW_EXPORT CudaContext : public std::enable_shared_from_this { - public: - ~CudaContext(); - - Status Close(); - - /// \brief Allocate CUDA memory on GPU device for this context - /// \param[in] nbytes number of bytes - /// \param[out] out the allocated buffer - /// \return Status - Status Allocate(int64_t nbytes, std::shared_ptr* out); - - /// \brief Create a view of CUDA memory on GPU device of this context - /// \param[in] data the starting device address - /// \param[in] nbytes number of bytes - /// \param[out] out the view buffer - /// \return Status - /// - /// \note The caller is responsible for allocating and freeing the - /// memory as well as ensuring that the memory belongs to the CUDA - /// context that this CudaContext instance holds. - Status View(uint8_t* data, int64_t nbytes, std::shared_ptr* out); - - /// \brief Open existing CUDA IPC memory handle - /// \param[in] ipc_handle opaque pointer to CUipcMemHandle (driver API) - /// \param[out] out a CudaBuffer referencing the IPC segment - /// \return Status - Status OpenIpcBuffer(const CudaIpcMemHandle& ipc_handle, - std::shared_ptr* out); - - /// \brief Close memory mapped with IPC buffer - /// \param[in] buffer a CudaBuffer referencing - /// \return Status - Status CloseIpcBuffer(CudaBuffer* buffer); - - /// \brief Block until the all device tasks are completed. - Status Synchronize(void); - - int64_t bytes_allocated() const; - - /// \brief Expose CUDA context handle to other libraries - void* handle() const; - - /// \brief Return device number - int device_number() const; - - /// \brief Return the device address that is reachable from kernels - /// running in the context - /// \param[in] addr device or host memory address - /// \param[out] devaddr the device address - /// \return Status - /// - /// The device address is defined as a memory address accessible by - /// device. 
While it is often a device memory address, it can be - /// also a host memory address, for instance, when the memory is - /// allocated as host memory (using cudaMallocHost or cudaHostAlloc) - /// or as managed memory (using cudaMallocManaged) or the host - /// memory is page-locked (using cudaHostRegister). - Status GetDeviceAddress(uint8_t* addr, uint8_t** devaddr); - - /// \brief Release CUDA memory on GPU device for this context - /// \param[in] device_ptr the buffer address - /// \param[in] nbytes number of bytes - /// \return Status - Status Free(void* device_ptr, int64_t nbytes); - - private: - CudaContext(); - - Status ExportIpcBuffer(void* data, int64_t size, - std::shared_ptr* handle); - Status CopyHostToDevice(void* dst, const void* src, int64_t nbytes); - Status CopyDeviceToHost(void* dst, const void* src, int64_t nbytes); - Status CopyDeviceToDevice(void* dst, const void* src, int64_t nbytes); - Status CopyDeviceToAnotherDevice(const std::shared_ptr& dst_ctx, void* dst, - const void* src, int64_t nbytes); - - class CudaContextImpl; - std::unique_ptr impl_; - - friend CudaBuffer; - friend CudaBufferReader; - friend CudaBufferWriter; - /// \cond FALSE - // (note: emits warning on Doxygen < 1.8.15) - friend CudaDeviceManager::CudaDeviceManagerImpl; - /// \endcond -}; - -} // namespace cuda -} // namespace arrow - -#endif // ARROW_GPU_CUDA_CONTEXT_H diff --git a/r/R/inst/include/arrow/gpu/cuda_memory.h b/r/R/inst/include/arrow/gpu/cuda_memory.h deleted file mode 100644 index 6b9f04cc6de..00000000000 --- a/r/R/inst/include/arrow/gpu/cuda_memory.h +++ /dev/null @@ -1,232 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_GPU_CUDA_MEMORY_H -#define ARROW_GPU_CUDA_MEMORY_H - -#include -#include - -#include "arrow/buffer.h" -#include "arrow/io/memory.h" -#include "arrow/memory_pool.h" -#include "arrow/status.h" - -namespace arrow { -namespace cuda { - -class CudaContext; -class CudaIpcMemHandle; - -/// \class CudaBuffer -/// \brief An Arrow buffer located on a GPU device -/// -/// Be careful using this in any Arrow code which may not be GPU-aware -class ARROW_EXPORT CudaBuffer : public Buffer { - public: - CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr& context, - bool own_data = false, bool is_ipc = false); - - CudaBuffer(const std::shared_ptr& parent, const int64_t offset, - const int64_t size); - - ~CudaBuffer(); - - /// \brief Convert back generic buffer into CudaBuffer - /// \param[in] buffer buffer to convert - /// \param[out] out conversion result - /// \return Status - /// - /// \note This function returns an error if the buffer isn't backed - /// by GPU memory - static Status FromBuffer(std::shared_ptr buffer, - std::shared_ptr* out); - - /// \brief Copy memory from GPU device to CPU host - /// \param[in] position start position inside buffer to copy bytes from - /// \param[in] nbytes number of bytes to copy - /// \param[out] out start address of the host memory area to copy to - /// \return Status - Status CopyToHost(const int64_t position, const int64_t nbytes, void* out) const; - - /// \brief Copy memory to device at position - /// \param[in] position start position to copy bytes to - /// \param[in] data the host data to 
copy - /// \param[in] nbytes number of bytes to copy - /// \return Status - Status CopyFromHost(const int64_t position, const void* data, int64_t nbytes); - - /// \brief Copy memory from device to device at position - /// \param[in] position start position inside buffer to copy bytes to - /// \param[in] data start address of the device memory area to copy from - /// \param[in] nbytes number of bytes to copy - /// \return Status - /// - /// \note It is assumed that both source and destination device - /// memories have been allocated within the same context. - Status CopyFromDevice(const int64_t position, const void* data, int64_t nbytes); - - /// \brief Copy memory from another device to device at position - /// \param[in] src_ctx context of the source device memory - /// \param[in] position start position inside buffer to copy bytes to - /// \param[in] data start address of the another device memory area to copy from - /// \param[in] nbytes number of bytes to copy - /// \return Status - Status CopyFromAnotherDevice(const std::shared_ptr& src_ctx, - const int64_t position, const void* data, int64_t nbytes); - - /// \brief Expose this device buffer as IPC memory which can be used in other processes - /// \param[out] handle the exported IPC handle - /// \return Status - /// - /// \note After calling this function, this device memory will not be freed - /// when the CudaBuffer is destructed - virtual Status ExportForIpc(std::shared_ptr* handle); - - std::shared_ptr context() const { return context_; } - - protected: - std::shared_ptr context_; - bool own_data_; - bool is_ipc_; - - virtual Status Close(); -}; - -/// \class CudaHostBuffer -/// \brief Device-accessible CPU memory created using cudaHostAlloc -class ARROW_EXPORT CudaHostBuffer : public MutableBuffer { - public: - using MutableBuffer::MutableBuffer; - ~CudaHostBuffer(); -}; - -/// \class CudaIpcHandle -/// \brief A container for a CUDA IPC handle -class ARROW_EXPORT CudaIpcMemHandle { - public: - 
~CudaIpcMemHandle(); - - /// \brief Create CudaIpcMemHandle from opaque buffer (e.g. from another process) - /// \param[in] opaque_handle a CUipcMemHandle as a const void* - /// \param[out] handle the CudaIpcMemHandle instance - /// \return Status - static Status FromBuffer(const void* opaque_handle, - std::shared_ptr* handle); - - /// \brief Write CudaIpcMemHandle to a Buffer - /// \param[in] pool a MemoryPool to allocate memory from - /// \param[out] out the serialized buffer - /// \return Status - Status Serialize(MemoryPool* pool, std::shared_ptr* out) const; - - private: - explicit CudaIpcMemHandle(const void* handle); - CudaIpcMemHandle(int64_t memory_size, const void* cu_handle); - - struct CudaIpcMemHandleImpl; - std::unique_ptr impl_; - - const void* handle() const; - int64_t memory_size() const; - - friend CudaBuffer; - friend CudaContext; -}; - -/// \class CudaBufferReader -/// \brief File interface for zero-copy read from CUDA buffers -/// -/// Note: Reads return pointers to device memory. 
This means you must be -/// careful using this interface with any Arrow code which may expect to be -/// able to do anything other than pointer arithmetic on the returned buffers -class ARROW_EXPORT CudaBufferReader : public io::BufferReader { - public: - explicit CudaBufferReader(const std::shared_ptr& buffer); - ~CudaBufferReader() override; - - /// \brief Read bytes into pre-allocated host memory - /// \param[in] nbytes number of bytes to read - /// \param[out] bytes_read actual number of bytes read - /// \param[out] buffer pre-allocated memory to write into - Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; - - /// \brief Zero-copy read from device memory - /// \param[in] nbytes number of bytes to read - /// \param[out] out a Buffer referencing device memory - /// \return Status - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - private: - std::shared_ptr cuda_buffer_; - std::shared_ptr context_; -}; - -/// \class CudaBufferWriter -/// \brief File interface for writing to CUDA buffers, with optional buffering -class ARROW_EXPORT CudaBufferWriter : public io::WritableFile { - public: - explicit CudaBufferWriter(const std::shared_ptr& buffer); - ~CudaBufferWriter() override; - - /// \brief Close writer and flush buffered bytes to GPU - Status Close() override; - - bool closed() const override; - - /// \brief Flush buffered bytes to GPU - Status Flush() override; - - Status Seek(int64_t position) override; - - Status Write(const void* data, int64_t nbytes) override; - - Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; - - Status Tell(int64_t* position) const override; - - /// \brief Set CPU buffer size to limit calls to cudaMemcpy - /// \param[in] buffer_size the size of CPU buffer to allocate - /// \return Status - /// - /// By default writes are unbuffered - Status SetBufferSize(const int64_t buffer_size); - - /// \brief Returns size of host (CPU) buffer, 0 for unbuffered - int64_t buffer_size() 
const; - - /// \brief Returns number of bytes buffered on host - int64_t num_bytes_buffered() const; - - private: - class CudaBufferWriterImpl; - std::unique_ptr impl_; -}; - -/// \brief Allocate CUDA-accessible memory on CPU host -/// \param[in] device_number device to expose host memory -/// \param[in] size number of bytes -/// \param[out] out the allocated buffer -/// \return Status -ARROW_EXPORT -Status AllocateCudaHostBuffer(int device_number, const int64_t size, - std::shared_ptr* out); - -} // namespace cuda -} // namespace arrow - -#endif // ARROW_GPU_CUDA_MEMORY_H diff --git a/r/R/inst/include/arrow/io/api.h b/r/R/inst/include/arrow/io/api.h deleted file mode 100644 index cf1be337fd1..00000000000 --- a/r/R/inst/include/arrow/io/api.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_IO_API_H -#define ARROW_IO_API_H - -#include "arrow/io/buffered.h" -#include "arrow/io/compressed.h" -#include "arrow/io/file.h" -#include "arrow/io/hdfs.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" - -#endif // ARROW_IO_API_H diff --git a/r/R/inst/include/arrow/io/buffered.h b/r/R/inst/include/arrow/io/buffered.h deleted file mode 100644 index 03ea1c7f757..00000000000 --- a/r/R/inst/include/arrow/io/buffered.h +++ /dev/null @@ -1,160 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Buffered stream implementations - -#ifndef ARROW_IO_BUFFERED_H -#define ARROW_IO_BUFFERED_H - -#include -#include - -#include "arrow/io/interfaces.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -namespace io { - -class ARROW_EXPORT BufferedOutputStream : public OutputStream { - public: - ~BufferedOutputStream() override; - - /// \brief Create a buffered output stream wrapping the given output stream. 
- /// \param[in] buffer_size the size of the temporary write buffer - /// \param[in] pool a MemoryPool to use for allocations - /// \param[in] raw another OutputStream - /// \param[out] out the created BufferedOutputStream - /// \return Status - static Status Create(int64_t buffer_size, MemoryPool* pool, - std::shared_ptr raw, - std::shared_ptr* out); - - /// \brief Resize internal buffer - /// \param[in] new_buffer_size the new buffer size - /// \return Status - Status SetBufferSize(int64_t new_buffer_size); - - /// \brief Return the current size of the internal buffer - int64_t buffer_size() const; - - /// \brief Flush any buffered writes and release the raw - /// OutputStream. Further operations on this object are invalid - /// \param[out] raw the underlying OutputStream - /// \return Status - Status Detach(std::shared_ptr* raw); - - // OutputStream interface - - /// \brief Close the buffered output stream. This implicitly closes the - /// underlying raw output stream. - Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - // Write bytes to the stream. Thread-safe - Status Write(const void* data, int64_t nbytes) override; - - Status Flush() override; - - /// \brief Return the underlying raw output stream. 
- std::shared_ptr raw() const; - - private: - explicit BufferedOutputStream(std::shared_ptr raw, MemoryPool* pool); - - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -/// \class BufferedInputStream -/// \brief An InputStream that performs buffered reads from an unbuffered -/// InputStream, which can mitigate the overhead of many small reads in some -/// cases -class ARROW_EXPORT BufferedInputStream : public InputStream { - public: - ~BufferedInputStream() override; - - /// \brief Create a BufferedInputStream from a raw InputStream - /// \param[in] buffer_size the size of the temporary read buffer - /// \param[in] pool a MemoryPool to use for allocations - /// \param[in] raw a raw InputStream - /// \param[out] out the created BufferedInputStream - /// \param[in] raw_read_bound a bound on the maximum number of bytes - /// to read from the raw input stream. The default -1 indicates that - /// it is unbounded - static Status Create(int64_t buffer_size, MemoryPool* pool, - std::shared_ptr raw, - std::shared_ptr* out, - int64_t raw_read_bound = -1); - - /// \brief Resize internal read buffer; calls to Read(...) will read at least - /// \param[in] new_buffer_size the new read buffer size - /// \return Status - Status SetBufferSize(int64_t new_buffer_size); - - /// \brief Return the number of remaining bytes in the read buffer - int64_t bytes_buffered() const; - - /// \brief Return the current size of the internal buffer - int64_t buffer_size() const; - - /// \brief Release the raw InputStream. Any data buffered will be - /// discarded. Further operations on this object are invalid - /// \return raw the underlying InputStream - std::shared_ptr Detach(); - - /// \brief Return the unbuffered InputStream - std::shared_ptr raw() const; - - // InputStream APIs - - /// \brief Return a zero-copy string view referencing buffered data, - /// but do not advance the position of the stream. 
Buffers data and - /// expands the buffer size if necessary - Status Peek(int64_t nbytes, util::string_view* out) override; - - Status Close() override; - bool closed() const override; - - /// \brief Returns the position of the buffered stream, though the position - /// of the unbuffered stream may be further advanced - Status Tell(int64_t* position) const override; - - Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - - /// \brief Read into buffer. If the read is already buffered, then this will - /// return a slice into the buffer - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - private: - explicit BufferedInputStream(std::shared_ptr raw, MemoryPool* pool, - int64_t raw_total_bytes_bound); - - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_BUFFERED_H diff --git a/r/R/inst/include/arrow/io/compressed.h b/r/R/inst/include/arrow/io/compressed.h deleted file mode 100644 index ffb18d929ab..00000000000 --- a/r/R/inst/include/arrow/io/compressed.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Compressed stream implementations - -#ifndef ARROW_IO_COMPRESSED_H -#define ARROW_IO_COMPRESSED_H - -#include -#include - -#include "arrow/io/interfaces.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; -class Status; - -namespace util { - -class Codec; - -} // namespace util - -namespace io { - -class ARROW_EXPORT CompressedOutputStream : public OutputStream { - public: - ~CompressedOutputStream() override; - - /// \brief Create a compressed output stream wrapping the given output stream. - static Status Make(util::Codec* codec, const std::shared_ptr& raw, - std::shared_ptr* out); - static Status Make(MemoryPool* pool, util::Codec* codec, - const std::shared_ptr& raw, - std::shared_ptr* out); - - // OutputStream interface - - /// \brief Close the compressed output stream. This implicitly closes the - /// underlying raw output stream. - Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Write(const void* data, int64_t nbytes) override; - Status Flush() override; - - /// \brief Return the underlying raw output stream. - std::shared_ptr raw() const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedOutputStream); - - CompressedOutputStream() = default; - - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -class ARROW_EXPORT CompressedInputStream : public InputStream { - public: - ~CompressedInputStream() override; - - /// \brief Create a compressed input stream wrapping the given input stream. - static Status Make(util::Codec* codec, const std::shared_ptr& raw, - std::shared_ptr* out); - static Status Make(MemoryPool* pool, util::Codec* codec, - const std::shared_ptr& raw, - std::shared_ptr* out); - - // InputStream interface - - /// \brief Close the compressed input stream. This implicitly closes the - /// underlying raw input stream. 
- Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - /// \brief Return the underlying raw input stream. - std::shared_ptr raw() const; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedInputStream); - - CompressedInputStream() = default; - - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_COMPRESSED_H diff --git a/r/R/inst/include/arrow/io/file.h b/r/R/inst/include/arrow/io/file.h deleted file mode 100644 index e9ac13f4c6a..00000000000 --- a/r/R/inst/include/arrow/io/file.h +++ /dev/null @@ -1,246 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// IO interface implementations for OS files - -#ifndef ARROW_IO_FILE_H -#define ARROW_IO_FILE_H - -#include -#include -#include - -#include "arrow/io/interfaces.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -namespace io { - -class ARROW_EXPORT FileOutputStream : public OutputStream { - public: - ~FileOutputStream() override; - - /// \brief Open a local file for writing, truncating any existing file - /// \param[in] path with UTF8 encoding - /// \param[out] out a base interface OutputStream instance - /// - /// When opening a new file, any existing file with the indicated path is - /// truncated to 0 bytes, deleting any existing data - static Status Open(const std::string& path, std::shared_ptr* out); - - /// \brief Open a local file for writing - /// \param[in] path with UTF8 encoding - /// \param[in] append append to existing file, otherwise truncate to 0 bytes - /// \param[out] out a base interface OutputStream instance - static Status Open(const std::string& path, bool append, - std::shared_ptr* out); - - /// \brief Open a file descriptor for writing. The underlying file isn't - /// truncated. - /// \param[in] fd file descriptor - /// \param[out] out a base interface OutputStream instance - /// - /// The file descriptor becomes owned by the OutputStream, and will be closed - /// on Close() or destruction. 
- static Status Open(int fd, std::shared_ptr* out); - - /// \brief Open a local file for writing, truncating any existing file - /// \param[in] path with UTF8 encoding - /// \param[out] file a FileOutputStream instance - /// - /// When opening a new file, any existing file with the indicated path is - /// truncated to 0 bytes, deleting any existing data - static Status Open(const std::string& path, std::shared_ptr* file); - - /// \brief Open a local file for writing - /// \param[in] path with UTF8 encoding - /// \param[in] append append to existing file, otherwise truncate to 0 bytes - /// \param[out] file a FileOutputStream instance - static Status Open(const std::string& path, bool append, - std::shared_ptr* file); - - /// \brief Open a file descriptor for writing. The underlying file isn't - /// truncated. - /// \param[in] fd file descriptor - /// \param[out] out a FileOutputStream instance - /// - /// The file descriptor becomes owned by the OutputStream, and will be closed - /// on Close() or destruction. - static Status Open(int fd, std::shared_ptr* out); - - // OutputStream interface - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - - // Write bytes to the stream. 
Thread-safe - Status Write(const void* data, int64_t nbytes) override; - - using Writable::Write; - - int file_descriptor() const; - - private: - FileOutputStream(); - - class ARROW_NO_EXPORT FileOutputStreamImpl; - std::unique_ptr impl_; -}; - -// Operating system file -class ARROW_EXPORT ReadableFile : public RandomAccessFile { - public: - ~ReadableFile() override; - - /// \brief Open a local file for reading - /// \param[in] path with UTF8 encoding - /// \param[out] file ReadableFile instance - /// Open file, allocate memory (if needed) from default memory pool - static Status Open(const std::string& path, std::shared_ptr* file); - - /// \brief Open a local file for reading - /// \param[in] path with UTF8 encoding - /// \param[in] pool a MemoryPool for memory allocations - /// \param[out] file ReadableFile instance - /// Open file with one's own memory pool for memory allocations - static Status Open(const std::string& path, MemoryPool* pool, - std::shared_ptr* file); - - /// \brief Open a local file for reading - /// \param[in] fd file descriptor - /// \param[out] file ReadableFile instance - /// Open file with one's own memory pool for memory allocations - /// - /// The file descriptor becomes owned by the ReadableFile, and will be closed - /// on Close() or destruction. - static Status Open(int fd, std::shared_ptr* file); - - /// \brief Open a local file for reading - /// \param[in] fd file descriptor - /// \param[in] pool a MemoryPool for memory allocations - /// \param[out] file ReadableFile instance - /// Open file with one's own memory pool for memory allocations - /// - /// The file descriptor becomes owned by the ReadableFile, and will be closed - /// on Close() or destruction. - static Status Open(int fd, MemoryPool* pool, std::shared_ptr* file); - - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - - // Read bytes from the file. 
Thread-safe - Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - /// \brief Thread-safe implementation of ReadAt - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - void* out) override; - - /// \brief Thread-safe implementation of ReadAt - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status GetSize(int64_t* size) override; - Status Seek(int64_t position) override; - - int file_descriptor() const; - - private: - explicit ReadableFile(MemoryPool* pool); - - class ARROW_NO_EXPORT ReadableFileImpl; - std::unique_ptr impl_; -}; - -// A file interface that uses memory-mapped files for memory interactions, -// supporting zero copy reads. The same class is used for both reading and -// writing. -// -// If opening a file in a writable mode, it is not truncated first as with -// FileOutputStream -class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { - public: - ~MemoryMappedFile() override; - - /// Create new file with indicated size, return in read/write mode - static Status Create(const std::string& path, int64_t size, - std::shared_ptr* out); - - static Status Open(const std::string& path, FileMode::type mode, - std::shared_ptr* out); - - Status Close() override; - - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Seek(int64_t position) override; - - // Required by RandomAccessFile, copies memory into out. Not thread-safe - Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - - // Zero copy read, moves position pointer. Not thread-safe - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - // Zero-copy read, leaves position unchanged. Acquires a reader lock - // for the duration of slice creation (typically very short). Is thread-safe. 
- Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - // Raw copy of the memory at specified position. Thread-safe, but - // locks out other readers for the duration of memcpy. Prefer the - // zero copy method - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - void* out) override; - - bool supports_zero_copy() const override; - - /// Write data at the current position in the file. Thread-safe - Status Write(const void* data, int64_t nbytes) override; - - /// Set the size of the map to new_size. - Status Resize(int64_t new_size); - - /// Write data at a particular position in the file. Thread-safe - Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; - - // @return: the size in bytes of the memory source - Status GetSize(int64_t* size) const; - - // @return: the size in bytes of the memory source - Status GetSize(int64_t* size) override; - - int file_descriptor() const; - - private: - MemoryMappedFile(); - - Status WriteInternal(const void* data, int64_t nbytes); - - class ARROW_NO_EXPORT MemoryMap; - std::shared_ptr memory_map_; -}; - -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_FILE_H diff --git a/r/R/inst/include/arrow/io/hdfs-internal.h b/r/R/inst/include/arrow/io/hdfs-internal.h deleted file mode 100644 index 3912f2f1144..00000000000 --- a/r/R/inst/include/arrow/io/hdfs-internal.h +++ /dev/null @@ -1,224 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IO_HDFS_INTERNAL -#define ARROW_IO_HDFS_INTERNAL - -#include -#include - -#include - -#include "arrow/util/visibility.h" -#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep - -using std::size_t; - -struct hdfsBuilder; - -namespace arrow { - -class Status; - -namespace io { -namespace internal { - -// NOTE(wesm): cpplint does not like use of short and other imprecise C types -struct LibHdfsShim { -#ifndef _WIN32 - void* handle; -#else - HINSTANCE handle; -#endif - - hdfsBuilder* (*hdfsNewBuilder)(void); - void (*hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn); - void (*hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port); - void (*hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName); - void (*hdfsBuilderSetKerbTicketCachePath)(hdfsBuilder* bld, - const char* kerbTicketCachePath); - void (*hdfsBuilderSetForceNewInstance)(hdfsBuilder* bld); - hdfsFS (*hdfsBuilderConnect)(hdfsBuilder* bld); - int (*hdfsBuilderConfSetStr)(hdfsBuilder* bld, const char* key, const char* val); - - int (*hdfsDisconnect)(hdfsFS fs); - - hdfsFile (*hdfsOpenFile)(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize); // NOLINT - - int (*hdfsCloseFile)(hdfsFS fs, hdfsFile file); - int (*hdfsExists)(hdfsFS fs, const char* path); - int (*hdfsSeek)(hdfsFS fs, hdfsFile file, tOffset desiredPos); - tOffset (*hdfsTell)(hdfsFS fs, hdfsFile file); - tSize (*hdfsRead)(hdfsFS fs, hdfsFile file, void* buffer, tSize length); - tSize (*hdfsPread)(hdfsFS fs, hdfsFile file, tOffset position, 
void* buffer, - tSize length); - tSize (*hdfsWrite)(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); - int (*hdfsFlush)(hdfsFS fs, hdfsFile file); - int (*hdfsAvailable)(hdfsFS fs, hdfsFile file); - int (*hdfsCopy)(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); - int (*hdfsMove)(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); - int (*hdfsDelete)(hdfsFS fs, const char* path, int recursive); - int (*hdfsRename)(hdfsFS fs, const char* oldPath, const char* newPath); - char* (*hdfsGetWorkingDirectory)(hdfsFS fs, char* buffer, size_t bufferSize); - int (*hdfsSetWorkingDirectory)(hdfsFS fs, const char* path); - int (*hdfsCreateDirectory)(hdfsFS fs, const char* path); - int (*hdfsSetReplication)(hdfsFS fs, const char* path, int16_t replication); - hdfsFileInfo* (*hdfsListDirectory)(hdfsFS fs, const char* path, int* numEntries); - hdfsFileInfo* (*hdfsGetPathInfo)(hdfsFS fs, const char* path); - void (*hdfsFreeFileInfo)(hdfsFileInfo* hdfsFileInfo, int numEntries); - char*** (*hdfsGetHosts)(hdfsFS fs, const char* path, tOffset start, tOffset length); - void (*hdfsFreeHosts)(char*** blockHosts); - tOffset (*hdfsGetDefaultBlockSize)(hdfsFS fs); - tOffset (*hdfsGetCapacity)(hdfsFS fs); - tOffset (*hdfsGetUsed)(hdfsFS fs); - int (*hdfsChown)(hdfsFS fs, const char* path, const char* owner, const char* group); - int (*hdfsChmod)(hdfsFS fs, const char* path, short mode); // NOLINT - int (*hdfsUtime)(hdfsFS fs, const char* path, tTime mtime, tTime atime); - - void Initialize() { - this->handle = nullptr; - this->hdfsNewBuilder = nullptr; - this->hdfsBuilderSetNameNode = nullptr; - this->hdfsBuilderSetNameNodePort = nullptr; - this->hdfsBuilderSetUserName = nullptr; - this->hdfsBuilderSetKerbTicketCachePath = nullptr; - this->hdfsBuilderSetForceNewInstance = nullptr; - this->hdfsBuilderConfSetStr = nullptr; - this->hdfsBuilderConnect = nullptr; - this->hdfsDisconnect = nullptr; - this->hdfsOpenFile = nullptr; - this->hdfsCloseFile = nullptr; 
- this->hdfsExists = nullptr; - this->hdfsSeek = nullptr; - this->hdfsTell = nullptr; - this->hdfsRead = nullptr; - this->hdfsPread = nullptr; - this->hdfsWrite = nullptr; - this->hdfsFlush = nullptr; - this->hdfsAvailable = nullptr; - this->hdfsCopy = nullptr; - this->hdfsMove = nullptr; - this->hdfsDelete = nullptr; - this->hdfsRename = nullptr; - this->hdfsGetWorkingDirectory = nullptr; - this->hdfsSetWorkingDirectory = nullptr; - this->hdfsCreateDirectory = nullptr; - this->hdfsSetReplication = nullptr; - this->hdfsListDirectory = nullptr; - this->hdfsGetPathInfo = nullptr; - this->hdfsFreeFileInfo = nullptr; - this->hdfsGetHosts = nullptr; - this->hdfsFreeHosts = nullptr; - this->hdfsGetDefaultBlockSize = nullptr; - this->hdfsGetCapacity = nullptr; - this->hdfsGetUsed = nullptr; - this->hdfsChown = nullptr; - this->hdfsChmod = nullptr; - this->hdfsUtime = nullptr; - } - - hdfsBuilder* NewBuilder(void); - - void BuilderSetNameNode(hdfsBuilder* bld, const char* nn); - - void BuilderSetNameNodePort(hdfsBuilder* bld, tPort port); - - void BuilderSetUserName(hdfsBuilder* bld, const char* userName); - - void BuilderSetKerbTicketCachePath(hdfsBuilder* bld, const char* kerbTicketCachePath); - - void BuilderSetForceNewInstance(hdfsBuilder* bld); - - int BuilderConfSetStr(hdfsBuilder* bld, const char* key, const char* val); - - hdfsFS BuilderConnect(hdfsBuilder* bld); - - int Disconnect(hdfsFS fs); - - hdfsFile OpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, - short replication, tSize blocksize); // NOLINT - - int CloseFile(hdfsFS fs, hdfsFile file); - - int Exists(hdfsFS fs, const char* path); - - int Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos); - - tOffset Tell(hdfsFS fs, hdfsFile file); - - tSize Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length); - - bool HasPread(); - - tSize Pread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length); - - tSize Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length); - - 
int Flush(hdfsFS fs, hdfsFile file); - - int Available(hdfsFS fs, hdfsFile file); - - int Copy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); - - int Move(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst); - - int Delete(hdfsFS fs, const char* path, int recursive); - - int Rename(hdfsFS fs, const char* oldPath, const char* newPath); - - char* GetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize); - - int SetWorkingDirectory(hdfsFS fs, const char* path); - - int MakeDirectory(hdfsFS fs, const char* path); - - int SetReplication(hdfsFS fs, const char* path, int16_t replication); - - hdfsFileInfo* ListDirectory(hdfsFS fs, const char* path, int* numEntries); - - hdfsFileInfo* GetPathInfo(hdfsFS fs, const char* path); - - void FreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries); - - char*** GetHosts(hdfsFS fs, const char* path, tOffset start, tOffset length); - - void FreeHosts(char*** blockHosts); - - tOffset GetDefaultBlockSize(hdfsFS fs); - tOffset GetCapacity(hdfsFS fs); - - tOffset GetUsed(hdfsFS fs); - - int Chown(hdfsFS fs, const char* path, const char* owner, const char* group); - - int Chmod(hdfsFS fs, const char* path, short mode); // NOLINT - - int Utime(hdfsFS fs, const char* path, tTime mtime, tTime atime); - - Status GetRequiredSymbols(); -}; - -// TODO(wesm): Remove these exports when we are linking statically -Status ARROW_EXPORT ConnectLibHdfs(LibHdfsShim** driver); -Status ARROW_EXPORT ConnectLibHdfs3(LibHdfsShim** driver); - -} // namespace internal -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_HDFS_INTERNAL diff --git a/r/R/inst/include/arrow/io/hdfs.h b/r/R/inst/include/arrow/io/hdfs.h deleted file mode 100644 index 45a47ddedad..00000000000 --- a/r/R/inst/include/arrow/io/hdfs.h +++ /dev/null @@ -1,258 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IO_HDFS -#define ARROW_IO_HDFS - -#include -#include -#include -#include -#include - -#include "arrow/io/interfaces.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -namespace io { - -class HdfsReadableFile; -class HdfsOutputStream; - -struct HdfsPathInfo { - ObjectType::type kind; - - std::string name; - std::string owner; - std::string group; - - // Access times in UNIX timestamps (seconds) - int64_t size; - int64_t block_size; - - int32_t last_modified_time; - int32_t last_access_time; - - int16_t replication; - int16_t permissions; -}; - -enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 }; - -struct HdfsConnectionConfig { - std::string host; - int port; - std::string user; - std::string kerb_ticket; - std::unordered_map extra_conf; - HdfsDriver driver; -}; - -class ARROW_EXPORT HadoopFileSystem : public FileSystem { - public: - ~HadoopFileSystem() override; - - // Connect to an HDFS cluster given a configuration - // - // @param config (in): configuration for connecting - // @param fs (out): the created client - // @returns Status - static Status Connect(const HdfsConnectionConfig* config, - std::shared_ptr* fs); - - // Create directory and all parents - // - 
// @param path (in): absolute HDFS path - // @returns Status - Status MakeDirectory(const std::string& path) override; - - // Delete file or directory - // @param path: absolute path to data - // @param recursive: if path is a directory, delete contents as well - // @returns error status on failure - Status Delete(const std::string& path, bool recursive = false); - - Status DeleteDirectory(const std::string& path) override; - - // Disconnect from cluster - // - // @returns Status - Status Disconnect(); - - // @param path (in): absolute HDFS path - // @returns bool, true if the path exists, false if not (or on error) - bool Exists(const std::string& path); - - // @param path (in): absolute HDFS path - // @param info (out) - // @returns Status - Status GetPathInfo(const std::string& path, HdfsPathInfo* info); - - // @param nbytes (out): total capacity of the filesystem - // @returns Status - Status GetCapacity(int64_t* nbytes); - - // @param nbytes (out): total bytes used of the filesystem - // @returns Status - Status GetUsed(int64_t* nbytes); - - Status GetChildren(const std::string& path, std::vector* listing) override; - - Status ListDirectory(const std::string& path, std::vector* listing); - - /// Change - /// - /// @param path file path to change - /// @param owner pass null for no change - /// @param group pass null for no change - Status Chown(const std::string& path, const char* owner, const char* group); - - /// Change path permissions - /// - /// \param path Absolute path in file system - /// \param mode Mode bitset - /// \return Status - Status Chmod(const std::string& path, int mode); - - // Move file or directory from source path to destination path within the - // current filesystem - Status Rename(const std::string& src, const std::string& dst) override; - - Status Stat(const std::string& path, FileStatistics* stat) override; - - // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory - - // Open an HDFS file in READ mode. 
Returns error - // status if the file is not found. - // - // @param path complete file path - Status OpenReadable(const std::string& path, int32_t buffer_size, - std::shared_ptr* file); - - Status OpenReadable(const std::string& path, std::shared_ptr* file); - - // FileMode::WRITE options - // @param path complete file path - // @param buffer_size, 0 for default - // @param replication, 0 for default - // @param default_block_size, 0 for default - Status OpenWritable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file); - - Status OpenWritable(const std::string& path, bool append, - std::shared_ptr* file); - - ARROW_DEPRECATED("Use OpenWritable") - Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, - int16_t replication, int64_t default_block_size, - std::shared_ptr* file); - - ARROW_DEPRECATED("Use OpenWritable") - Status OpenWriteable(const std::string& path, bool append, - std::shared_ptr* file); - - private: - friend class HdfsReadableFile; - friend class HdfsOutputStream; - - class ARROW_NO_EXPORT HadoopFileSystemImpl; - std::unique_ptr impl_; - - HadoopFileSystem(); - ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem); -}; - -class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { - public: - ~HdfsReadableFile() override; - - Status Close() override; - - bool closed() const override; - - Status GetSize(int64_t* size) override; - - // NOTE: If you wish to read a particular range of a file in a multithreaded - // context, you may prefer to use ReadAt to avoid locking issues - Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; - - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - void* buffer) override; - - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status Seek(int64_t position) override; - Status 
Tell(int64_t* position) const override; - - void set_memory_pool(MemoryPool* pool); - - private: - explicit HdfsReadableFile(MemoryPool* pool = NULLPTR); - - class ARROW_NO_EXPORT HdfsReadableFileImpl; - std::unique_ptr impl_; - - friend class HadoopFileSystem::HadoopFileSystemImpl; - - ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); -}; - -// Naming this file OutputStream because it does not support seeking (like the -// WritableFile interface) -class ARROW_EXPORT HdfsOutputStream : public OutputStream { - public: - ~HdfsOutputStream() override; - - Status Close() override; - - bool closed() const override; - - Status Write(const void* buffer, int64_t nbytes) override; - - Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written); - - Status Flush() override; - - Status Tell(int64_t* position) const override; - - private: - class ARROW_NO_EXPORT HdfsOutputStreamImpl; - std::unique_ptr impl_; - - friend class HadoopFileSystem::HadoopFileSystemImpl; - - HdfsOutputStream(); - - ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); -}; - -Status ARROW_EXPORT HaveLibHdfs(); -Status ARROW_EXPORT HaveLibHdfs3(); - -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_HDFS diff --git a/r/R/inst/include/arrow/io/interfaces.h b/r/R/inst/include/arrow/io/interfaces.h deleted file mode 100644 index 3a5cfe3d778..00000000000 --- a/r/R/inst/include/arrow/io/interfaces.h +++ /dev/null @@ -1,206 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IO_INTERFACES_H -#define ARROW_IO_INTERFACES_H - -#include -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class Status; - -namespace io { - -struct FileMode { - enum type { READ, WRITE, READWRITE }; -}; - -struct ObjectType { - enum type { FILE, DIRECTORY }; -}; - -struct ARROW_EXPORT FileStatistics { - /// Size of file, -1 if finding length is unsupported - int64_t size; - ObjectType::type kind; -}; - -class ARROW_EXPORT FileSystem { - public: - virtual ~FileSystem() = default; - - virtual Status MakeDirectory(const std::string& path) = 0; - - virtual Status DeleteDirectory(const std::string& path) = 0; - - virtual Status GetChildren(const std::string& path, - std::vector* listing) = 0; - - virtual Status Rename(const std::string& src, const std::string& dst) = 0; - - virtual Status Stat(const std::string& path, FileStatistics* stat) = 0; -}; - -class ARROW_EXPORT FileInterface { - public: - virtual ~FileInterface() = 0; - virtual Status Close() = 0; - virtual Status Tell(int64_t* position) const = 0; - virtual bool closed() const = 0; - - FileMode::type mode() const { return mode_; } - - protected: - FileInterface() : mode_(FileMode::READ) {} - FileMode::type mode_; - void set_mode(FileMode::type mode) { mode_ = mode; } - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(FileInterface); -}; - -class ARROW_EXPORT Seekable { - public: - virtual ~Seekable() = default; - virtual Status Seek(int64_t position) = 0; 
-}; - -class ARROW_EXPORT Writable { - public: - virtual ~Writable() = default; - - virtual Status Write(const void* data, int64_t nbytes) = 0; - - /// \brief Flush buffered bytes, if any - virtual Status Flush(); - - Status Write(const std::string& data); -}; - -class ARROW_EXPORT Readable { - public: - virtual ~Readable() = default; - - virtual Status Read(int64_t nbytes, int64_t* bytes_read, void* out) = 0; - - // Does not copy if not necessary - virtual Status Read(int64_t nbytes, std::shared_ptr* out) = 0; -}; - -class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable { - protected: - OutputStream() = default; -}; - -class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Readable { - public: - /// \brief Advance or skip stream indicated number of bytes - /// \param[in] nbytes the number to move forward - /// \return Status - Status Advance(int64_t nbytes); - - /// \brief Return zero-copy string_view to upcoming bytes in the - /// stream but do not modify stream position. View becomes invalid - /// after any operation on file. If the InputStream is unbuffered, - /// returns 0-length string_view. May trigger buffering if the - /// requested size is larger than the number of buffered bytes - /// \param[in] nbytes the maximum number of bytes to see - /// \param[out] out the returned arrow::util::string_view - /// \return Status - virtual Status Peek(int64_t nbytes, util::string_view* out); - - /// \brief Return true if InputStream is capable of zero copy Buffer reads - virtual bool supports_zero_copy() const; - - protected: - InputStream() = default; -}; - -class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable { - public: - /// Necessary because we hold a std::unique_ptr - ~RandomAccessFile() override; - - virtual Status GetSize(int64_t* size) = 0; - - /// \brief Read nbytes at position, provide default implementations using - /// Read(...), but can be overridden. 
The default implementation is - /// thread-safe. It is unspecified whether this method updates the file - /// position or not. - /// - /// \param[in] position Where to read bytes from - /// \param[in] nbytes The number of bytes to read - /// \param[out] bytes_read The number of bytes read - /// \param[out] out The buffer to read bytes into - /// \return Status - virtual Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out); - - /// \brief Read nbytes at position, provide default implementations using - /// Read(...), but can be overridden. The default implementation is - /// thread-safe. It is unspecified whether this method updates the file - /// position or not. - /// - /// \param[in] position Where to read bytes from - /// \param[in] nbytes The number of bytes to read - /// \param[out] out The buffer to read bytes into. The number of bytes read can be - /// retrieved by calling Buffer::size(). - virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out); - - protected: - RandomAccessFile(); - - private: - struct ARROW_NO_EXPORT RandomAccessFileImpl; - std::unique_ptr interface_impl_; -}; - -class ARROW_EXPORT WritableFile : public OutputStream, public Seekable { - public: - virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0; - - protected: - WritableFile() = default; -}; - -class ARROW_EXPORT ReadWriteFileInterface : public RandomAccessFile, public WritableFile { - protected: - ReadWriteFileInterface() { RandomAccessFile::set_mode(FileMode::READWRITE); } -}; - -// TODO(kszucs): remove this after 0.13 -#ifndef _MSC_VER -using WriteableFile ARROW_DEPRECATED("Use WritableFile") = WritableFile; -using ReadableFileInterface ARROW_DEPRECATED("Use RandomAccessFile") = RandomAccessFile; -#else -// MSVC does not like using ARROW_DEPRECATED with using declarations -using WriteableFile = WritableFile; -using ReadableFileInterface = RandomAccessFile; -#endif - -} // namespace io -} // namespace arrow - 
-#endif // ARROW_IO_INTERFACES_H diff --git a/r/R/inst/include/arrow/io/memory.h b/r/R/inst/include/arrow/io/memory.h deleted file mode 100644 index d820d46552c..00000000000 --- a/r/R/inst/include/arrow/io/memory.h +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Public API for different memory sharing / IO mechanisms - -#pragma once - -#include -#include - -#include "arrow/buffer.h" -#include "arrow/io/interfaces.h" -#include "arrow/memory_pool.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class ResizableBuffer; -class Status; - -namespace io { - -// \brief An output stream that writes to a resizable buffer -class ARROW_EXPORT BufferOutputStream : public OutputStream { - public: - explicit BufferOutputStream(const std::shared_ptr& buffer); - - /// \brief Create in-memory output stream with indicated capacity using a - /// memory pool - /// \param[in] initial_capacity the initial allocated internal capacity of - /// the OutputStream - /// \param[in,out] pool a MemoryPool to use for allocations - /// \param[out] out the created stream - static Status Create(int64_t initial_capacity, MemoryPool* pool, - std::shared_ptr* out); - - ~BufferOutputStream() override; - - // Implement the OutputStream interface - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - Status Write(const void* data, int64_t nbytes) override; - - using OutputStream::Write; - - /// Close the stream and return the buffer - Status Finish(std::shared_ptr* result); - - /// \brief Initialize state of OutputStream with newly allocated memory and - /// set position to 0 - /// \param[in] initial_capacity the starting allocated capacity - /// \param[in,out] pool the memory pool to use for allocations - /// \return Status - Status Reset(int64_t initial_capacity = 1024, MemoryPool* pool = default_memory_pool()); - - int64_t capacity() const { return capacity_; } - - private: - BufferOutputStream(); - - // Ensures there is sufficient space available to write nbytes - Status Reserve(int64_t nbytes); - - std::shared_ptr buffer_; - bool is_open_; - int64_t capacity_; - int64_t position_; - uint8_t* mutable_data_; -}; - -// \brief A helper 
class to tracks the size of allocations -class ARROW_EXPORT MockOutputStream : public OutputStream { - public: - MockOutputStream() : extent_bytes_written_(0), is_open_(true) {} - - // Implement the OutputStream interface - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - Status Write(const void* data, int64_t nbytes) override; - - int64_t GetExtentBytesWritten() const { return extent_bytes_written_; } - - private: - int64_t extent_bytes_written_; - bool is_open_; -}; - -/// \brief Enables random writes into a fixed-size mutable buffer -class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile { - public: - /// Input buffer must be mutable, will abort if not - explicit FixedSizeBufferWriter(const std::shared_ptr& buffer); - ~FixedSizeBufferWriter() override; - - Status Close() override; - bool closed() const override; - Status Seek(int64_t position) override; - Status Tell(int64_t* position) const override; - Status Write(const void* data, int64_t nbytes) override; - Status WriteAt(int64_t position, const void* data, int64_t nbytes) override; - - void set_memcopy_threads(int num_threads); - void set_memcopy_blocksize(int64_t blocksize); - void set_memcopy_threshold(int64_t threshold); - - protected: - class FixedSizeBufferWriterImpl; - std::unique_ptr impl_; -}; - -/// \class BufferReader -/// \brief Random access zero-copy reads on an arrow::Buffer -class ARROW_EXPORT BufferReader : public RandomAccessFile { - public: - explicit BufferReader(const std::shared_ptr& buffer); - explicit BufferReader(const Buffer& buffer); - BufferReader(const uint8_t* data, int64_t size); - - /// \brief Instantiate from std::string or arrow::util::string_view. 
Does not - /// own data - explicit BufferReader(const util::string_view& data); - - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override; - // Zero copy read - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - Status Peek(int64_t nbytes, util::string_view* out) override; - - bool supports_zero_copy() const override; - - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - void* out) override; - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status GetSize(int64_t* size) override; - Status Seek(int64_t position) override; - - std::shared_ptr buffer() const { return buffer_; } - - protected: - inline Status CheckClosed() const; - - std::shared_ptr buffer_; - const uint8_t* data_; - int64_t size_; - int64_t position_; - bool is_open_; -}; - -} // namespace io -} // namespace arrow diff --git a/r/R/inst/include/arrow/io/mman.h b/r/R/inst/include/arrow/io/mman.h deleted file mode 100644 index 61254925609..00000000000 --- a/r/R/inst/include/arrow/io/mman.h +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright https://code.google.com/p/mman-win32/ -// -// Licensed under the MIT License; -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/MIT - -#ifndef _MMAN_WIN32_H -#define _MMAN_WIN32_H - -#include "arrow/util/windows_compatibility.h" - -#include -#include -#include - -#define PROT_NONE 0 -#define PROT_READ 1 -#define PROT_WRITE 2 -#define PROT_EXEC 4 - -#define MAP_FILE 0 -#define MAP_SHARED 1 -#define MAP_PRIVATE 2 -#define MAP_TYPE 0xf -#define MAP_FIXED 0x10 -#define MAP_ANONYMOUS 0x20 -#define MAP_ANON MAP_ANONYMOUS - -#define MAP_FAILED ((void*)-1) - -/* Flags for msync. 
*/ -#define MS_ASYNC 1 -#define MS_SYNC 2 -#define MS_INVALIDATE 4 - -#ifndef FILE_MAP_EXECUTE -#define FILE_MAP_EXECUTE 0x0020 -#endif - -static inline int __map_mman_error(const DWORD err, const int deferr) { - if (err == 0) return 0; - // TODO: implement - return err; -} - -static inline DWORD __map_mmap_prot_page(const int prot) { - DWORD protect = 0; - - if (prot == PROT_NONE) return protect; - - if ((prot & PROT_EXEC) != 0) { - protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; - } else { - protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY; - } - - return protect; -} - -static inline DWORD __map_mmap_prot_file(const int prot) { - DWORD desiredAccess = 0; - - if (prot == PROT_NONE) return desiredAccess; - - if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ; - if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE; - if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE; - - return desiredAccess; -} - -static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes, - off_t off) { - HANDLE fm, h; - - void* map = MAP_FAILED; - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4293) -#endif - - const DWORD dwFileOffsetLow = - (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); - const DWORD dwFileOffsetHigh = - (sizeof(off_t) <= sizeof(DWORD)) ? 
(DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); - const DWORD protect = __map_mmap_prot_page(prot); - const DWORD desiredAccess = __map_mmap_prot_file(prot); - - const size_t maxSize = off + len; - - const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFL); - const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFL); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - - errno = 0; - - if (len == 0 - /* Unsupported flag combinations */ - || (flags & MAP_FIXED) != 0 - /* Usupported protection combinations */ - || prot == PROT_EXEC) { - errno = EINVAL; - return MAP_FAILED; - } - - h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes) - : INVALID_HANDLE_VALUE; - - if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { - errno = EBADF; - return MAP_FAILED; - } - - fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); - - if (fm == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - return MAP_FAILED; - } - - map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); - - CloseHandle(fm); - - if (map == NULL) { - errno = __map_mman_error(GetLastError(), EPERM); - return MAP_FAILED; - } - - return map; -} - -static inline int munmap(void* addr, size_t len) { - if (UnmapViewOfFile(addr)) return 0; - - errno = __map_mman_error(GetLastError(), EPERM); - - return -1; -} - -static inline int mprotect(void* addr, size_t len, int prot) { - DWORD newProtect = __map_mmap_prot_page(prot); - DWORD oldProtect = 0; - - if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0; - - errno = __map_mman_error(GetLastError(), EPERM); - - return -1; -} - -static inline int msync(void* addr, size_t len, int flags) { - if (FlushViewOfFile(addr, len)) return 0; - - errno = __map_mman_error(GetLastError(), EPERM); - - return -1; -} - -static inline int mlock(const void* addr, size_t len) { - if (VirtualLock((LPVOID)addr, len)) return 0; - - errno = __map_mman_error(GetLastError(), EPERM); - - 
return -1; -} - -static inline int munlock(const void* addr, size_t len) { - if (VirtualUnlock((LPVOID)addr, len)) return 0; - - errno = __map_mman_error(GetLastError(), EPERM); - - return -1; -} - -#endif diff --git a/r/R/inst/include/arrow/io/readahead.h b/r/R/inst/include/arrow/io/readahead.h deleted file mode 100644 index 950520ba597..00000000000 --- a/r/R/inst/include/arrow/io/readahead.h +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IO_READAHEAD_H -#define ARROW_IO_READAHEAD_H - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; -class ResizableBuffer; -class Status; - -namespace io { - -class InputStream; - -namespace internal { - -struct ARROW_EXPORT ReadaheadBuffer { - std::shared_ptr buffer; - int64_t left_padding; - int64_t right_padding; -}; - -class ARROW_EXPORT ReadaheadSpooler { - public: - /// \brief EXPERIMENTAL: Create a readahead spooler wrapping the given input stream. - /// - /// The spooler launches a background thread that reads up to a given number - /// of fixed-size blocks in advance from the underlying stream. 
- /// The buffers returned by Read() will be padded at the beginning and the end - /// with the configured amount of (zeroed) bytes. - ReadaheadSpooler(MemoryPool* pool, std::shared_ptr raw, - int64_t read_size = kDefaultReadSize, int32_t readahead_queue_size = 1, - int64_t left_padding = 0, int64_t right_padding = 0); - - explicit ReadaheadSpooler(std::shared_ptr raw, - int64_t read_size = kDefaultReadSize, - int32_t readahead_queue_size = 1, int64_t left_padding = 0, - int64_t right_padding = 0); - - ~ReadaheadSpooler(); - - /// Configure zero-padding at beginning and end of buffers (default 0 bytes). - /// The buffers returned by Read() will be padded at the beginning and the end - /// with the configured amount of (zeroed) bytes. - /// Note that, as reading happens in background and in advance, changing the - /// configured values might not affect Read() results immediately. - int64_t GetLeftPadding(); - void SetLeftPadding(int64_t size); - - int64_t GetRightPadding(); - void SetRightPadding(int64_t size); - - /// \brief Close the spooler. This implicitly closes the underlying input stream. - Status Close(); - - /// \brief Read a buffer from the queue. - /// - /// If the buffer pointer in the ReadaheadBuffer is null, then EOF was - /// reached and/or the spooler was explicitly closed. - /// Otherwise, the buffer will contain at most read_size bytes in addition - /// to the configured padding (short reads are possible at the end of a file). - // How do we allow reusing the buffer in ReadaheadBuffer? perhaps by using - // a caching memory pool? 
- Status Read(ReadaheadBuffer* out); - - private: - static constexpr int64_t kDefaultReadSize = 1 << 20; // 1 MB - - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -} // namespace internal -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_READAHEAD_H diff --git a/r/R/inst/include/arrow/io/test-common.h b/r/R/inst/include/arrow/io/test-common.h deleted file mode 100644 index 75e134732e3..00000000000 --- a/r/R/inst/include/arrow/io/test-common.h +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_IO_TEST_COMMON_H -#define ARROW_IO_TEST_COMMON_H - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace io { - -class MemoryMappedFile; - -ARROW_EXPORT -void AssertFileContents(const std::string& path, const std::string& contents); - -ARROW_EXPORT bool FileExists(const std::string& path); - -ARROW_EXPORT bool FileIsClosed(int fd); - -ARROW_EXPORT -Status ZeroMemoryMap(MemoryMappedFile* file); - -class ARROW_EXPORT MemoryMapFixture { - public: - void TearDown(); - - void CreateFile(const std::string& path, int64_t size); - - Status InitMemoryMap(int64_t size, const std::string& path, - std::shared_ptr* mmap); - - void AppendFile(const std::string& path); - - private: - std::vector tmp_files_; -}; - -} // namespace io -} // namespace arrow - -#endif // ARROW_IO_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/ipc/api.h b/r/R/inst/include/arrow/ipc/api.h deleted file mode 100644 index 1895c313193..00000000000 --- a/r/R/inst/include/arrow/ipc/api.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_IPC_API_H -#define ARROW_IPC_API_H - -#include "arrow/ipc/dictionary.h" -#include "arrow/ipc/feather.h" -#include "arrow/ipc/json-simple.h" -#include "arrow/ipc/message.h" -#include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" - -#endif // ARROW_IPC_API_H diff --git a/r/R/inst/include/arrow/ipc/dictionary.h b/r/R/inst/include/arrow/ipc/dictionary.h deleted file mode 100644 index 787cd0ddd5a..00000000000 --- a/r/R/inst/include/arrow/ipc/dictionary.h +++ /dev/null @@ -1,106 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Tools for dictionaries in IPC context - -#ifndef ARROW_IPC_DICTIONARY_H -#define ARROW_IPC_DICTIONARY_H - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -class Field; -class RecordBatch; - -namespace ipc { - -using DictionaryMap = std::unordered_map>; - -/// \brief Memoization data structure for assigning id numbers to -/// dictionaries and tracking their current state through possible -/// deltas in an IPC stream -class ARROW_EXPORT DictionaryMemo { - public: - DictionaryMemo(); - DictionaryMemo(DictionaryMemo&&) = default; - DictionaryMemo& operator=(DictionaryMemo&&) = default; - - /// \brief Return current dictionary corresponding to a particular - /// id. Returns KeyError if id not found - Status GetDictionary(int64_t id, std::shared_ptr* dictionary) const; - - /// \brief Return dictionary value type corresponding to a - /// particular dictionary id. This permits multiple fields to - /// reference the same dictionary in IPC and JSON - Status GetDictionaryType(int64_t id, std::shared_ptr* type) const; - - /// \brief Return id for dictionary, computing new id if necessary - Status GetOrAssignId(const std::shared_ptr& field, int64_t* out); - - /// \brief Return id for dictionary if it exists, otherwise return - /// KeyError - Status GetId(const Field& type, int64_t* id) const; - - /// \brief Return true if dictionary for type is in this memo - bool HasDictionary(const Field& type) const; - - /// \brief Return true if we have a dictionary for the input id - bool HasDictionary(int64_t id) const; - - /// \brief Add field to the memo, return KeyError if already present - Status AddField(int64_t id, const std::shared_ptr& field); - - /// \brief Add a dictionary to the memo with a particular id. 
Returns - /// KeyError if that dictionary already exists - Status AddDictionary(int64_t id, const std::shared_ptr& dictionary); - - const DictionaryMap& id_to_dictionary() const { return id_to_dictionary_; } - - /// \brief The number of fields tracked in the memo - int num_fields() const { return static_cast(field_to_id_.size()); } - int num_dictionaries() const { return static_cast(id_to_dictionary_.size()); } - - private: - Status AddFieldInternal(int64_t id, const std::shared_ptr& field); - - // Dictionary memory addresses, to track whether a particular - // dictionary-encoded field has been seen before - std::unordered_map field_to_id_; - - // Map of dictionary id to dictionary array - DictionaryMap id_to_dictionary_; - std::unordered_map> id_to_type_; - - ARROW_DISALLOW_COPY_AND_ASSIGN(DictionaryMemo); -}; - -ARROW_EXPORT -Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo); - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_DICTIONARY_H diff --git a/r/R/inst/include/arrow/ipc/feather-internal.h b/r/R/inst/include/arrow/ipc/feather-internal.h deleted file mode 100644 index 2aa04b2db72..00000000000 --- a/r/R/inst/include/arrow/ipc/feather-internal.h +++ /dev/null @@ -1,235 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -// Public API for the "Feather" file format, originally created at -// http://github.com/wesm/feather - -#ifndef ARROW_IPC_FEATHER_INTERNAL_H -#define ARROW_IPC_FEATHER_INTERNAL_H - -#include -#include -#include -#include -#include - -#include "flatbuffers/flatbuffers.h" - -#include "arrow/buffer.h" -#include "arrow/ipc/feather.h" -#include "arrow/ipc/feather_generated.h" -#include "arrow/type.h" - -namespace arrow { -namespace ipc { -namespace feather { - -typedef std::vector> ColumnVector; -typedef flatbuffers::FlatBufferBuilder FBB; -typedef flatbuffers::Offset FBString; - -struct ARROW_EXPORT ColumnType { - enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; -}; - -struct ARROW_EXPORT ArrayMetadata { - ArrayMetadata() {} - - ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, - int64_t total_bytes) - : type(type), - offset(offset), - length(length), - null_count(null_count), - total_bytes(total_bytes) {} - - bool Equals(const ArrayMetadata& other) const { - return this->type == other.type && this->offset == other.offset && - this->length == other.length && this->null_count == other.null_count && - this->total_bytes == other.total_bytes; - } - - fbs::Type type; - int64_t offset; - int64_t length; - int64_t null_count; - int64_t total_bytes; -}; - -struct ARROW_EXPORT CategoryMetadata { - ArrayMetadata levels; - bool ordered; -}; - -struct ARROW_EXPORT TimestampMetadata { - TimeUnit::type unit; - - // A timezone name known to the Olson timezone database. 
For display purposes - // because the actual data is all UTC - std::string timezone; -}; - -struct ARROW_EXPORT TimeMetadata { - TimeUnit::type unit; -}; - -static constexpr const char* kFeatherMagicBytes = "FEA1"; -static constexpr const int kFeatherDefaultAlignment = 8; - -class ColumnBuilder; - -class ARROW_EXPORT TableBuilder { - public: - explicit TableBuilder(int64_t num_rows); - ~TableBuilder() = default; - - FBB& fbb(); - Status Finish(); - std::shared_ptr GetBuffer() const; - - std::unique_ptr AddColumn(const std::string& name); - void SetDescription(const std::string& description); - void SetNumRows(int64_t num_rows); - void add_column(const flatbuffers::Offset& col); - - private: - flatbuffers::FlatBufferBuilder fbb_; - ColumnVector columns_; - - friend class ColumnBuilder; - - bool finished_; - std::string description_; - int64_t num_rows_; -}; - -class ARROW_EXPORT TableMetadata { - public: - TableMetadata() : table_(NULLPTR) {} - ~TableMetadata() = default; - - Status Open(const std::shared_ptr& buffer) { - metadata_buffer_ = buffer; - table_ = fbs::GetCTable(buffer->data()); - - if (table_->version() < kFeatherVersion) { - std::cout << "This Feather file is old" - << " and will not be readable beyond the 0.3.0 release" << std::endl; - } - return Status::OK(); - } - - bool HasDescription() const { return table_->description() != 0; } - - std::string GetDescription() const { - if (!HasDescription()) { - return std::string(""); - } - return table_->description()->str(); - } - - int version() const { return table_->version(); } - int64_t num_rows() const { return table_->num_rows(); } - int64_t num_columns() const { return table_->columns()->size(); } - - const fbs::Column* column(int i) { return table_->columns()->Get(i); } - - private: - std::shared_ptr metadata_buffer_; - const fbs::CTable* table_; -}; - -static inline flatbuffers::Offset GetPrimitiveArray( - FBB& fbb, const ArrayMetadata& array) { - return fbs::CreatePrimitiveArray(fbb, array.type, 
fbs::Encoding_PLAIN, array.offset, - array.length, array.null_count, array.total_bytes); -} - -static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { - return static_cast(static_cast(unit)); -} - -static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { - return static_cast(static_cast(unit)); -} - -// Convert Feather enums to Flatbuffer enums - -const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { - fbs::TypeMetadata_NONE, // PRIMITIVE - fbs::TypeMetadata_CategoryMetadata, // CATEGORY - fbs::TypeMetadata_TimestampMetadata, // TIMESTAMP - fbs::TypeMetadata_DateMetadata, // DATE - fbs::TypeMetadata_TimeMetadata // TIME -}; - -static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { - return COLUMN_TYPE_ENUM_MAPPING[column_type]; -} - -static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) { - out->type = values->type(); - out->offset = values->offset(); - out->length = values->length(); - out->null_count = values->null_count(); - out->total_bytes = values->total_bytes(); -} - -class ARROW_EXPORT ColumnBuilder { - public: - ColumnBuilder(TableBuilder* parent, const std::string& name); - ~ColumnBuilder() = default; - - flatbuffers::Offset CreateColumnMetadata(); - - Status Finish(); - void SetValues(const ArrayMetadata& values); - void SetUserMetadata(const std::string& data); - void SetCategory(const ArrayMetadata& levels, bool ordered = false); - void SetTimestamp(TimeUnit::type unit); - void SetTimestamp(TimeUnit::type unit, const std::string& timezone); - void SetDate(); - void SetTime(TimeUnit::type unit); - FBB& fbb(); - - private: - TableBuilder* parent_; - - std::string name_; - ArrayMetadata values_; - std::string user_metadata_; - - // Column metadata - - // Is this a primitive type, or one of the types having metadata? 
Default is - // primitive - ColumnType::type type_; - - // Type-specific metadata union - CategoryMetadata meta_category_; - TimeMetadata meta_time_; - - TimestampMetadata meta_timestamp_; - - FBB* fbb_; -}; - -} // namespace feather -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_FEATHER_INTERNAL_H diff --git a/r/R/inst/include/arrow/ipc/feather.h b/r/R/inst/include/arrow/ipc/feather.h deleted file mode 100644 index b6bd4ff5e5b..00000000000 --- a/r/R/inst/include/arrow/ipc/feather.h +++ /dev/null @@ -1,173 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Public API for the "Feather" file format, originally created at -// http://github.com/wesm/feather - -#ifndef ARROW_IPC_FEATHER_H -#define ARROW_IPC_FEATHER_H - -#include -#include -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Column; -class Status; -class Table; - -namespace io { - -class OutputStream; -class RandomAccessFile; - -} // namespace io - -namespace ipc { -namespace feather { - -static constexpr const int kFeatherVersion = 2; - -// ---------------------------------------------------------------------- -// Metadata accessor classes - -/// \class TableReader -/// \brief An interface for reading columns from Feather files -class ARROW_EXPORT TableReader { - public: - TableReader(); - ~TableReader(); - - /// \brief Open a Feather file from a RandomAccessFile interface - /// - /// \param[in] source a RandomAccessFile instance - /// \param[out] out the table reader - static Status Open(const std::shared_ptr& source, - std::unique_ptr* out); - - /// \brief Optional table description - /// - /// This does not return a const std::string& because a string has to be - /// copied from the flatbuffer to be able to return a non-flatbuffer type - std::string GetDescription() const; - - /// \brief Return true if the table has a description field populated - bool HasDescription() const; - - /// \brief Return the version number of the Feather file - int version() const; - - /// \brief Return the number of rows in the file - int64_t num_rows() const; - - /// \brief Return the number of columns in the file - int64_t num_columns() const; - - std::string GetColumnName(int i) const; - - /// \brief Read a column from the file as an arrow::Column. 
- /// - /// \param[in] i the column index to read - /// \param[out] out the returned column - /// \return Status - /// - /// This function is zero-copy if the file source supports zero-copy reads - Status GetColumn(int i, std::shared_ptr* out); - - /// \brief Read all columns from the file as an arrow::Table. - /// - /// \param[out] out the returned table - /// \return Status - /// - /// This function is zero-copy if the file source supports zero-copy reads - Status Read(std::shared_ptr
* out); - - /// \brief Read only the specified columns from the file as an arrow::Table. - /// - /// \param[in] indices the column indices to read - /// \param[out] out the returned table - /// \return Status - /// - /// This function is zero-copy if the file source supports zero-copy reads - Status Read(const std::vector& indices, std::shared_ptr
* out); - - /// \brief Read only the specified columns from the file as an arrow::Table. - /// - /// \param[in] names the column names to read - /// \param[out] out the returned table - /// \return Status - /// - /// This function is zero-copy if the file source supports zero-copy reads - Status Read(const std::vector& names, std::shared_ptr
* out); - - private: - class ARROW_NO_EXPORT TableReaderImpl; - std::unique_ptr impl_; -}; - -/// \class TableWriter -/// \brief Interface for writing Feather files -class ARROW_EXPORT TableWriter { - public: - ~TableWriter(); - - /// \brief Create a new TableWriter that writes to an OutputStream - /// \param[in] stream an output stream - /// \param[out] out the returned table writer - /// \return Status - static Status Open(const std::shared_ptr& stream, - std::unique_ptr* out); - - /// \brief Set the description field in the file metadata - void SetDescription(const std::string& desc); - - /// \brief Set the number of rows in the file - void SetNumRows(int64_t num_rows); - - /// \brief Append a column to the file - /// - /// \param[in] name the column name - /// \param[in] values the column values as a contiguous arrow::Array - /// \return Status - Status Append(const std::string& name, const Array& values); - - /// \brief Write a table to the file - /// - /// \param[in] table the table to be written - /// \return Status - Status Write(const Table& table); - - /// \brief Finalize the file by writing the file metadata and footer - /// \return Status - Status Finalize(); - - private: - TableWriter(); - class ARROW_NO_EXPORT TableWriterImpl; - std::unique_ptr impl_; -}; - -} // namespace feather -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_FEATHER_H diff --git a/r/R/inst/include/arrow/ipc/json-integration.h b/r/R/inst/include/arrow/ipc/json-integration.h deleted file mode 100644 index 0256532a4a9..00000000000 --- a/r/R/inst/include/arrow/ipc/json-integration.h +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Implement Arrow JSON serialization format - -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class RecordBatch; -class Schema; - -namespace io { -class ReadableFile; -} // namespace io - -namespace ipc { -namespace internal { -namespace json { - -/// \class JsonWriter -/// \brief Write the JSON representation of an Arrow record batch file or stream -/// -/// This is used for integration testing -class ARROW_EXPORT JsonWriter { - public: - ~JsonWriter(); - - /// \brief Create a new JSON writer that writes to memory - /// - /// \param[in] schema the schema of record batches - /// \param[out] out the returned writer object - /// \return Status - static Status Open(const std::shared_ptr& schema, - std::unique_ptr* out); - - /// \brief Append a record batch - Status WriteRecordBatch(const RecordBatch& batch); - - /// \brief Finish the JSON payload and return as a std::string - /// - /// \param[out] result the JSON as as a std::string - /// \return Status - Status Finish(std::string* result); - - private: - explicit JsonWriter(const std::shared_ptr& schema); - - // Hide RapidJSON details from public API - class JsonWriterImpl; - std::unique_ptr impl_; -}; - -/// \class JsonReader -/// \brief Read the JSON representation of an Arrow record batch file or stream -/// -/// This is 
used for integration testing -class ARROW_EXPORT JsonReader { - public: - ~JsonReader(); - - /// \brief Create a new JSON reader - /// - /// \param[in] pool a MemoryPool to use for buffer allocations - /// \param[in] data a Buffer containing the JSON data - /// \param[out] reader the returned reader object - /// \return Status - static Status Open(MemoryPool* pool, const std::shared_ptr& data, - std::unique_ptr* reader); - - /// \brief Create a new JSON reader that uses the default memory pool - /// - /// \param[in] data a Buffer containing the JSON data - /// \param[out] reader the returned reader object - /// \return Status - static Status Open(const std::shared_ptr& data, - std::unique_ptr* reader); - - /// \brief Create a new JSON reader from a file - /// - /// \param[in] pool a MemoryPool to use for buffer allocations - /// \param[in] in_file a ReadableFile containing JSON data - /// \param[out] reader the returned reader object - /// \return Status - static Status Open(MemoryPool* pool, const std::shared_ptr& in_file, - std::unique_ptr* reader); - - /// \brief Return the schema read from the JSON - std::shared_ptr schema() const; - - /// \brief Return the number of record batches - int num_record_batches() const; - - /// \brief Read a particular record batch from the file - /// - /// \param[in] i the record batch index, does not boundscheck - /// \param[out] batch the read record batch - Status ReadRecordBatch(int i, std::shared_ptr* batch) const; - - private: - JsonReader(MemoryPool* pool, const std::shared_ptr& data); - - // Hide RapidJSON details from public API - class JsonReaderImpl; - std::unique_ptr impl_; -}; - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow diff --git a/r/R/inst/include/arrow/ipc/json-internal.h b/r/R/inst/include/arrow/ipc/json-internal.h deleted file mode 100644 index aa2e06a189d..00000000000 --- a/r/R/inst/include/arrow/ipc/json-internal.h +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to the 
Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IPC_JSON_INTERNAL_H -#define ARROW_IPC_JSON_INTERNAL_H - -#include -#include - -#include "arrow/json/rapidjson-defs.h" -#include "rapidjson/document.h" // IWYU pragma: export -#include "rapidjson/encodings.h" // IWYU pragma: export -#include "rapidjson/error/en.h" // IWYU pragma: export -#include "rapidjson/stringbuffer.h" // IWYU pragma: export -#include "rapidjson/writer.h" // IWYU pragma: export - -#include "arrow/status.h" // IWYU pragma: export -#include "arrow/type_fwd.h" // IWYU pragma: keep -#include "arrow/util/visibility.h" - -namespace rj = arrow::rapidjson; -using RjWriter = rj::Writer; -using RjArray = rj::Value::ConstArray; -using RjObject = rj::Value::ConstObject; - -#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ - if (NAME == (PARENT).MemberEnd()) { \ - return Status::Invalid("field ", TOK, " not found"); \ - } - -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - return Status::Invalid("field was not a string line ", __LINE__); \ - } - -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - return Status::Invalid("field 
was not a boolean line ", __LINE__); \ - } - -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - return Status::Invalid("field was not an int line ", __LINE__); \ - } - -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - return Status::Invalid("field was not an array line ", __LINE__); \ - } - -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - return Status::Invalid("field was not an object line ", __LINE__); \ - } - -namespace arrow { -namespace ipc { - -class DictionaryMemo; - -namespace internal { -namespace json { - -/// \brief Append integration test Schema format to rapidjson writer -ARROW_EXPORT -Status WriteSchema(const Schema& schema, DictionaryMemo* dict_memo, RjWriter* writer); - -ARROW_EXPORT -Status WriteDictionary(int64_t id, const std::shared_ptr& dictionary, - RjWriter* writer); - -ARROW_EXPORT -Status WriteRecordBatch(const RecordBatch& batch, RjWriter* writer); - -ARROW_EXPORT -Status WriteArray(const std::string& name, const Array& array, RjWriter* writer); - -ARROW_EXPORT -Status ReadSchema(const rj::Value& json_obj, MemoryPool* pool, - DictionaryMemo* dictionary_memo, std::shared_ptr* schema); - -ARROW_EXPORT -Status ReadRecordBatch(const rj::Value& json_obj, const std::shared_ptr& schema, - DictionaryMemo* dict_memo, MemoryPool* pool, - std::shared_ptr* batch); - -ARROW_EXPORT -Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, - const std::shared_ptr& type, DictionaryMemo* dict_memo, - std::shared_ptr* array); - -ARROW_EXPORT -Status ReadArray(MemoryPool* pool, const rj::Value& json_obj, const Schema& schema, - DictionaryMemo* dict_memo, std::shared_ptr* array); - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_JSON_INTERNAL_H diff --git 
a/r/R/inst/include/arrow/ipc/json-simple.h b/r/R/inst/include/arrow/ipc/json-simple.h deleted file mode 100644 index da6483ff155..00000000000 --- a/r/R/inst/include/arrow/ipc/json-simple.h +++ /dev/null @@ -1,56 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Implement a simple JSON representation format for arrays - -#ifndef ARROW_IPC_JSON_SIMPLE_H -#define ARROW_IPC_JSON_SIMPLE_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; - -namespace ipc { -namespace internal { -namespace json { - -ARROW_EXPORT -Status ArrayFromJSON(const std::shared_ptr&, const std::string& json, - std::shared_ptr* out); - -ARROW_EXPORT -Status ArrayFromJSON(const std::shared_ptr&, const util::string_view& json, - std::shared_ptr* out); - -ARROW_EXPORT -Status ArrayFromJSON(const std::shared_ptr&, const char* json, - std::shared_ptr* out); - -} // namespace json -} // namespace internal -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_JSON_SIMPLE_H diff --git a/r/R/inst/include/arrow/ipc/message.h b/r/R/inst/include/arrow/ipc/message.h deleted file mode 100644 index fcc7e778377..00000000000 --- a/r/R/inst/include/arrow/ipc/message.h +++ /dev/null @@ -1,241 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// C++ object model and user API for interprocess schema messaging - -#ifndef ARROW_IPC_MESSAGE_H -#define ARROW_IPC_MESSAGE_H - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; - -namespace io { - -class FileInterface; -class InputStream; -class OutputStream; -class RandomAccessFile; - -} // namespace io - -namespace ipc { - -enum class MetadataVersion : char { - /// 0.1.0 - V1, - - /// 0.2.0 - V2, - - /// 0.3.0 to 0.7.1 - V3, - - /// >= 0.8.0 - V4 -}; - -// ARROW-109: We set this number arbitrarily to help catch user mistakes. For -// deeply nested schemas, it is expected the user will indicate explicitly the -// maximum allowed recursion depth -constexpr int kMaxNestingDepth = 64; - -// Read interface classes. We do not fully deserialize the flatbuffers so that -// individual fields metadata can be retrieved from very large schema without -// - -/// \class Message -/// \brief An IPC message including metadata and body -class ARROW_EXPORT Message { - public: - enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH, TENSOR, SPARSE_TENSOR }; - - /// \brief Construct message, but do not validate - /// - /// Use at your own risk; Message::Open has more metadata validation - Message(const std::shared_ptr& metadata, const std::shared_ptr& body); - - ~Message(); - - /// \brief Create and validate a Message instance from two buffers - /// - /// \param[in] metadata a buffer containing the Flatbuffer metadata - /// \param[in] body a buffer containing the message body, which may be null - /// \param[out] out the created message - /// \return Status - static Status Open(const std::shared_ptr& metadata, - const std::shared_ptr& body, std::unique_ptr* out); - - /// \brief Read message body and create Message given Flatbuffer metadata - /// \param[in] metadata containing a serialized Message flatbuffer - /// \param[in] stream an InputStream - /// \param[out] 
out the created Message - /// \return Status - /// - /// \note If stream supports zero-copy, this is zero-copy - static Status ReadFrom(const std::shared_ptr& metadata, io::InputStream* stream, - std::unique_ptr* out); - - /// \brief Read message body from position in file, and create Message given - /// the Flatbuffer metadata - /// \param[in] offset the position in the file where the message body starts. - /// \param[in] metadata containing a serialized Message flatbuffer - /// \param[in] file the seekable file interface to read from - /// \param[out] out the created Message - /// \return Status - /// - /// \note If file supports zero-copy, this is zero-copy - static Status ReadFrom(const int64_t offset, const std::shared_ptr& metadata, - io::RandomAccessFile* file, std::unique_ptr* out); - - /// \brief Return true if message type and contents are equal - /// - /// \param other another message - /// \return true if contents equal - bool Equals(const Message& other) const; - - /// \brief the Message metadata - /// - /// \return buffer - std::shared_ptr metadata() const; - - /// \brief the Message body, if any - /// - /// \return buffer is null if no body - std::shared_ptr body() const; - - /// \brief The expected body length according to the metadata, for - /// verification purposes - int64_t body_length() const; - - /// \brief The Message type - Type type() const; - - /// \brief The Message metadata version - MetadataVersion metadata_version() const; - - const void* header() const; - - /// \brief Write length-prefixed metadata and body to output stream - /// - /// \param[in] file output stream to write to - /// \param[in] alignment byte alignment for metadata, usually 8 or - /// 64. 
Whether the body is padded depends on the metadata; if the body - /// buffer is smaller than the size indicated in the metadata, then extra - /// padding bytes will be written - /// \param[out] output_length the number of bytes written - /// \return Status - Status SerializeTo(io::OutputStream* file, int32_t alignment, - int64_t* output_length) const; - - /// \brief Return true if the Message metadata passes Flatbuffer validation - bool Verify() const; - - /// \brief Whether a given message type needs a body. - static bool HasBody(Type type) { return type != NONE && type != SCHEMA; } - - private: - // Hide serialization details from user API - class MessageImpl; - std::unique_ptr impl_; - - ARROW_DISALLOW_COPY_AND_ASSIGN(Message); -}; - -ARROW_EXPORT std::string FormatMessageType(Message::Type type); - -/// \brief Abstract interface for a sequence of messages -/// \since 0.5.0 -class ARROW_EXPORT MessageReader { - public: - virtual ~MessageReader() = default; - - /// \brief Create MessageReader that reads from InputStream - static std::unique_ptr Open(io::InputStream* stream); - - /// \brief Create MessageReader that reads from owned InputStream - static std::unique_ptr Open( - const std::shared_ptr& owned_stream); - - /// \brief Read next Message from the interface - /// - /// \param[out] message an arrow::ipc::Message instance - /// \return Status - virtual Status ReadNextMessage(std::unique_ptr* message) = 0; -}; - -/// \brief Read encapsulated RPC message from position in file -/// -/// Read a length-prefixed message flatbuffer starting at the indicated file -/// offset. If the message has a body with non-zero length, it will also be -/// read -/// -/// The metadata_length includes at least the length prefix and the flatbuffer -/// -/// \param[in] offset the position in the file where the message starts. 
The -/// first 4 bytes after the offset are the message length -/// \param[in] metadata_length the total number of bytes to read from file -/// \param[in] file the seekable file interface to read from -/// \param[out] message the message read -/// \return Status success or failure -ARROW_EXPORT -Status ReadMessage(const int64_t offset, const int32_t metadata_length, - io::RandomAccessFile* file, std::unique_ptr* message); - -/// \brief Advance stream to an 8-byte offset if its position is not a multiple -/// of 8 already -/// \param[in] stream an input stream -/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 -/// or 64, to ensure the body starts on a multiple of that alignment -/// \return Status -ARROW_EXPORT -Status AlignStream(io::InputStream* stream, int32_t alignment = 8); - -/// \brief Advance stream to an 8-byte offset if its position is not a multiple -/// of 8 already -/// \param[in] stream an output stream -/// \param[in] alignment the byte multiple for the metadata prefix, usually 8 -/// or 64, to ensure the body starts on a multiple of that alignment -/// \return Status -ARROW_EXPORT -Status AlignStream(io::OutputStream* stream, int32_t alignment = 8); - -/// \brief Return error Status if file position is not a multiple of the -/// indicated alignment -ARROW_EXPORT -Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8); - -/// \brief Read encapsulated RPC message (metadata and body) from InputStream -/// -/// Read length-prefixed message with as-yet unknown length. Returns null if -/// there are not enough bytes available or the message length is 0 (e.g. 
EOS -/// in a stream) -ARROW_EXPORT -Status ReadMessage(io::InputStream* stream, std::unique_ptr* message); - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_MESSAGE_H diff --git a/r/R/inst/include/arrow/ipc/metadata-internal.h b/r/R/inst/include/arrow/ipc/metadata-internal.h deleted file mode 100644 index 4563fb029d6..00000000000 --- a/r/R/inst/include/arrow/ipc/metadata-internal.h +++ /dev/null @@ -1,176 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Internal metadata serialization matters - -#ifndef ARROW_IPC_METADATA_INTERNAL_H -#define ARROW_IPC_METADATA_INTERNAL_H - -#include -#include -#include -#include -#include - -#include - -#include "arrow/buffer.h" -#include "arrow/ipc/Schema_generated.h" -#include "arrow/ipc/dictionary.h" // IYWU pragma: keep -#include "arrow/ipc/message.h" -#include "arrow/memory_pool.h" -#include "arrow/sparse_tensor.h" -#include "arrow/status.h" - -namespace arrow { - -class DataType; -class Schema; -class Tensor; -class SparseTensor; - -namespace flatbuf = org::apache::arrow::flatbuf; - -namespace io { - -class OutputStream; - -} // namespace io - -namespace ipc { - -class DictionaryMemo; - -namespace internal { - -static constexpr flatbuf::MetadataVersion kCurrentMetadataVersion = - flatbuf::MetadataVersion_V4; - -static constexpr flatbuf::MetadataVersion kMinMetadataVersion = - flatbuf::MetadataVersion_V4; - -MetadataVersion GetMetadataVersion(flatbuf::MetadataVersion version); - -static constexpr const char* kArrowMagicBytes = "ARROW1"; - -struct FieldMetadata { - int64_t length; - int64_t null_count; - int64_t offset; -}; - -struct BufferMetadata { - /// The relative offset into the memory page to the starting byte of the buffer - int64_t offset; - - /// Absolute length in bytes of the buffer - int64_t length; -}; - -struct FileBlock { - int64_t offset; - int32_t metadata_length; - int64_t body_length; -}; - -// Read interface classes. We do not fully deserialize the flatbuffers so that -// individual fields metadata can be retrieved from very large schema without -// - -// Construct a complete Schema from the message and add -// dictinory-encoded fields to a DictionaryMemo instance. 
May be -// expensive for very large schemas if you are only interested in a -// few fields -Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, - std::shared_ptr* out); - -Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type, - std::vector* shape, std::vector* strides, - std::vector* dim_names); - -// EXPERIMENTAL: Extracting metadata of a sparse tensor from the message -Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr* type, - std::vector* shape, - std::vector* dim_names, int64_t* length, - SparseTensorFormat::type* sparse_tensor_format_id); - -/// Write a serialized message metadata with a length-prefix and padding to an -/// 8-byte offset. Does not make assumptions about whether the stream is -/// aligned already -/// -/// -/// -/// \param[in] message a buffer containing the metadata to write -/// \param[in] alignment the size multiple of the total message size including -/// length prefix, metadata, and padding. Usually 8 or 64 -/// \param[in,out] file the OutputStream to write to -/// \param[out] message_length the total size of the payload written including -/// padding -/// \return Status -Status WriteMessage(const Buffer& message, int32_t alignment, io::OutputStream* file, - int32_t* message_length); - -// Serialize arrow::Schema as a Flatbuffer -// -// \param[in] schema a Schema instance -// \param[in,out] dictionary_memo class for tracking dictionaries and assigning -// dictionary ids -// \param[out] out the serialized arrow::Buffer -// \return Status outcome -Status WriteSchemaMessage(const Schema& schema, DictionaryMemo* dictionary_memo, - std::shared_ptr* out); - -Status WriteRecordBatchMessage(const int64_t length, const int64_t body_length, - const std::vector& nodes, - const std::vector& buffers, - std::shared_ptr* out); - -Status WriteTensorMessage(const Tensor& tensor, const int64_t buffer_start_offset, - std::shared_ptr* out); - -Status WriteSparseTensorMessage(const SparseTensor& 
sparse_tensor, int64_t body_length, - const std::vector& buffers, - std::shared_ptr* out); - -Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, - const std::vector& record_batches, - io::OutputStream* out); - -Status WriteDictionaryMessage(const int64_t id, const int64_t length, - const int64_t body_length, - const std::vector& nodes, - const std::vector& buffers, - std::shared_ptr* out); - -static inline Status WriteFlatbufferBuilder(flatbuffers::FlatBufferBuilder& fbb, - std::shared_ptr* out) { - int32_t size = fbb.GetSize(); - - std::shared_ptr result; - RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), size, &result)); - - uint8_t* dst = result->mutable_data(); - memcpy(dst, fbb.GetBufferPointer(), size); - *out = result; - return Status::OK(); -} - -} // namespace internal -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_METADATA_H diff --git a/r/R/inst/include/arrow/ipc/reader.h b/r/R/inst/include/arrow/ipc/reader.h deleted file mode 100644 index 34a0eefbbb5..00000000000 --- a/r/R/inst/include/arrow/ipc/reader.h +++ /dev/null @@ -1,291 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Read Arrow files and streams - -#ifndef ARROW_IPC_READER_H -#define ARROW_IPC_READER_H - -#include -#include - -#include "arrow/ipc/dictionary.h" -#include "arrow/ipc/message.h" -#include "arrow/record_batch.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class Schema; -class Status; -class Tensor; -class SparseTensor; - -namespace io { - -class InputStream; -class RandomAccessFile; - -} // namespace io - -namespace ipc { - -using RecordBatchReader = ::arrow::RecordBatchReader; - -/// \class RecordBatchStreamReader -/// \brief Synchronous batch stream reader that reads from io::InputStream -/// -/// This class reads the schema (plus any dictionaries) as the first messages -/// in the stream, followed by record batches. For more granular zero-copy -/// reads see the ReadRecordBatch functions -class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { - public: - ~RecordBatchStreamReader() override; - - /// Create batch reader from generic MessageReader. - /// This will take ownership of the given MessageReader. - /// - /// \param[in] message_reader a MessageReader implementation - /// \param[out] out the created RecordBatchReader object - /// \return Status - static Status Open(std::unique_ptr message_reader, - std::shared_ptr* out); - static Status Open(std::unique_ptr message_reader, - std::unique_ptr* out); - - /// \brief Record batch stream reader from InputStream - /// - /// \param[in] stream an input stream instance. 
Must stay alive throughout - /// lifetime of stream reader - /// \param[out] out the created RecordBatchStreamReader object - /// \return Status - static Status Open(io::InputStream* stream, std::shared_ptr* out); - - /// \brief Open stream and retain ownership of stream object - /// \param[in] stream the input stream - /// \param[out] out the batch reader - /// \return Status - static Status Open(const std::shared_ptr& stream, - std::shared_ptr* out); - - /// \brief Returns the schema read from the stream - std::shared_ptr schema() const override; - - Status ReadNext(std::shared_ptr* batch) override; - - private: - RecordBatchStreamReader(); - - class ARROW_NO_EXPORT RecordBatchStreamReaderImpl; - std::unique_ptr impl_; -}; - -/// \brief Reads the record batch file format -class ARROW_EXPORT RecordBatchFileReader { - public: - ~RecordBatchFileReader(); - - /// \brief Open a RecordBatchFileReader - /// - /// Open a file-like object that is assumed to be self-contained; i.e., the - /// end of the file interface is the end of the Arrow file. Note that there - /// can be any amount of data preceding the Arrow-formatted data, because we - /// need only locate the end of the Arrow file stream to discover the metadata - /// and then proceed to read the data into memory. - static Status Open(io::RandomAccessFile* file, - std::shared_ptr* reader); - - /// \brief Open a RecordBatchFileReader - /// If the file is embedded within some larger file or memory region, you can - /// pass the absolute memory offset to the end of the file (which contains the - /// metadata footer). 
The metadata must have been written with memory offsets - /// relative to the start of the containing file - /// - /// \param[in] file the data source - /// \param[in] footer_offset the position of the end of the Arrow file - /// \param[out] reader the returned reader - /// \return Status - static Status Open(io::RandomAccessFile* file, int64_t footer_offset, - std::shared_ptr* reader); - - /// \brief Version of Open that retains ownership of file - /// - /// \param[in] file the data source - /// \param[out] reader the returned reader - /// \return Status - static Status Open(const std::shared_ptr& file, - std::shared_ptr* reader); - - /// \brief Version of Open that retains ownership of file - /// - /// \param[in] file the data source - /// \param[in] footer_offset the position of the end of the Arrow file - /// \param[out] reader the returned reader - /// \return Status - static Status Open(const std::shared_ptr& file, - int64_t footer_offset, - std::shared_ptr* reader); - - /// \brief The schema read from the file - std::shared_ptr schema() const; - - /// \brief Returns the number of record batches in the file - int num_record_batches() const; - - /// \brief Return the metadata version from the file metadata - MetadataVersion version() const; - - /// \brief Read a particular record batch from the file. Does not copy memory - /// if the input source supports zero-copy. 
- /// - /// \param[in] i the index of the record batch to return - /// \param[out] batch the read batch - /// \return Status - Status ReadRecordBatch(int i, std::shared_ptr* batch); - - private: - RecordBatchFileReader(); - - class ARROW_NO_EXPORT RecordBatchFileReaderImpl; - std::unique_ptr impl_; -}; - -// Generic read functions; does not copy data if the input supports zero copy reads - -/// \brief Read Schema from stream serialized as a single IPC message -/// and populate any dictionary-encoded fields into a DictionaryMemo -/// -/// \param[in] stream an InputStream -/// \param[in] dictionary_memo for recording dictionary-encoded fields -/// \param[out] out the output Schema -/// \return Status -/// -/// If record batches follow the schema, it is better to use -/// RecordBatchStreamReader -ARROW_EXPORT -Status ReadSchema(io::InputStream* stream, DictionaryMemo* dictionary_memo, - std::shared_ptr* out); - -/// \brief Read Schema from encapsulated Message -/// -/// \param[in] message a message instance containing metadata -/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded -/// fields. Can be nullptr if you are sure there are no -/// dictionary-encoded fields -/// \param[out] out the resulting Schema -/// \return Status -ARROW_EXPORT -Status ReadSchema(const Message& message, DictionaryMemo* dictionary_memo, - std::shared_ptr* out); - -/// Read record batch as encapsulated IPC message with metadata size prefix and -/// header -/// -/// \param[in] schema the record batch schema -/// \param[in] dictionary_memo DictionaryMemo which has any -/// dictionaries. 
Can be nullptr if you are sure there are no -/// dictionary-encoded fields -/// \param[in] stream the file where the batch is located -/// \param[out] out the read record batch -/// \return Status -ARROW_EXPORT -Status ReadRecordBatch(const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, io::InputStream* stream, - std::shared_ptr* out); - -/// \brief Read record batch from file given metadata and schema -/// -/// \param[in] metadata a Message containing the record batch metadata -/// \param[in] schema the record batch schema -/// \param[in] dictionary_memo DictionaryMemo which has any -/// dictionaries. Can be nullptr if you are sure there are no -/// dictionary-encoded fields -/// \param[in] file a random access file -/// \param[out] out the read record batch -/// \return Status -ARROW_EXPORT -Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, io::RandomAccessFile* file, - std::shared_ptr* out); - -/// \brief Read record batch from encapsulated Message -/// -/// \param[in] message a message instance containing metadata and body -/// \param[in] schema the record batch schema -/// \param[in] dictionary_memo DictionaryMemo which has any -/// dictionaries. Can be nullptr if you are sure there are no -/// dictionary-encoded fields -/// \param[out] out the resulting RecordBatch -/// \return Status -ARROW_EXPORT -Status ReadRecordBatch(const Message& message, const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, - std::shared_ptr* out); - -/// Read record batch from file given metadata and schema -/// -/// \param[in] metadata a Message containing the record batch metadata -/// \param[in] schema the record batch schema -/// \param[in] dictionary_memo DictionaryMemo which has any -/// dictionaries. 
Can be nullptr if you are sure there are no -/// dictionary-encoded fields -/// \param[in] file a random access file -/// \param[in] max_recursion_depth the maximum permitted nesting depth -/// \param[out] out the read record batch -/// \return Status -ARROW_EXPORT -Status ReadRecordBatch(const Buffer& metadata, const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, int max_recursion_depth, - io::RandomAccessFile* file, std::shared_ptr* out); - -/// \brief Read arrow::Tensor as encapsulated IPC message in file -/// -/// \param[in] file an InputStream pointed at the start of the message -/// \param[out] out the read tensor -/// \return Status -ARROW_EXPORT -Status ReadTensor(io::InputStream* file, std::shared_ptr* out); - -/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message -/// -/// \param[in] message a Message containing the tensor metadata and body -/// \param[out] out the read tensor -/// \return Status -ARROW_EXPORT -Status ReadTensor(const Message& message, std::shared_ptr* out); - -/// \brief EXPERIMETNAL: Read arrow::SparseTensor as encapsulated IPC message in file -/// -/// \param[in] file an InputStream pointed at the start of the message -/// \param[out] out the read sparse tensor -/// \return Status -ARROW_EXPORT -Status ReadSparseTensor(io::InputStream* file, std::shared_ptr* out); - -/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message -/// -/// \param[in] message a Message containing the tensor metadata and body -/// \param[out] out the read sparse tensor -/// \return Status -ARROW_EXPORT -Status ReadSparseTensor(const Message& message, std::shared_ptr* out); - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_READER_H diff --git a/r/R/inst/include/arrow/ipc/test-common.h b/r/R/inst/include/arrow/ipc/test-common.h deleted file mode 100644 index adbc57bfe26..00000000000 --- a/r/R/inst/include/arrow/ipc/test-common.h +++ /dev/null @@ -1,134 +0,0 @@ -// Licensed to the Apache Software Foundation 
(ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IPC_TEST_COMMON_H -#define ARROW_IPC_TEST_COMMON_H - -#include -#include - -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/status.h" -#include "arrow/type.h" - -namespace arrow { -namespace ipc { -namespace test { - -// A typedef used for test parameterization -typedef Status MakeRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -void CompareArraysDetailed(int index, const Array& result, const Array& expected); - -ARROW_EXPORT -void CompareBatchColumnsDetailed(const RecordBatch& result, const RecordBatch& expected); - -ARROW_EXPORT -Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, - std::shared_ptr* out, uint32_t seed = 0); - -ARROW_EXPORT -Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, - std::shared_ptr* out); - -ARROW_EXPORT -Status MakeRandomBooleanArray(const int length, bool include_nulls, - std::shared_ptr* out); - -ARROW_EXPORT -Status MakeBooleanBatchSized(const int length, std::shared_ptr* out); - -ARROW_EXPORT -Status MakeBooleanBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeIntBatchSized(int length, std::shared_ptr* out, - uint32_t seed = 0); - 
-ARROW_EXPORT -Status MakeIntRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool, - std::shared_ptr* out); - -ARROW_EXPORT -Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); - -ARROW_EXPORT -Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeNullRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeListRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeZeroLengthRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeNonNullRecordBatch(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeDeeplyNestedList(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeStruct(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeUnion(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeDictionary(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeDictionaryFlat(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeDates(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeTimestamps(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeIntervals(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeTimes(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeFWBinary(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeDecimal(std::shared_ptr* out); - -ARROW_EXPORT -Status MakeNull(std::shared_ptr* out); - -} // namespace test -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_TEST_COMMON_H diff --git a/r/R/inst/include/arrow/ipc/util.h b/r/R/inst/include/arrow/ipc/util.h deleted file mode 100644 index 80f9f3c5102..00000000000 --- a/r/R/inst/include/arrow/ipc/util.h +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_IPC_UTIL_H -#define ARROW_IPC_UTIL_H - -#include - -#include "arrow/array.h" -#include "arrow/io/interfaces.h" -#include "arrow/status.h" - -namespace arrow { -namespace ipc { - -// Buffers are padded to 64-byte boundaries (for SIMD) -static constexpr int32_t kArrowAlignment = 64; - -// Tensors are padded to 64-byte boundaries -static constexpr int32_t kTensorAlignment = 64; - -// Align on 8-byte boundaries in IPC -static constexpr int32_t kArrowIpcAlignment = 8; - -static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; - -static inline int64_t PaddedLength(int64_t nbytes, int32_t alignment = kArrowAlignment) { - return ((nbytes + alignment - 1) / alignment) * alignment; -} - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_UTIL_H diff --git a/r/R/inst/include/arrow/ipc/writer.h b/r/R/inst/include/arrow/ipc/writer.h deleted file mode 100644 index 6bb55dbc1a5..00000000000 --- a/r/R/inst/include/arrow/ipc/writer.h +++ /dev/null @@ -1,366 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Implement Arrow streaming binary format - -#ifndef ARROW_IPC_WRITER_H -#define ARROW_IPC_WRITER_H - -#include -#include -#include - -#include "arrow/ipc/message.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Buffer; -class MemoryPool; -class RecordBatch; -class Schema; -class Status; -class Table; -class Tensor; -class SparseTensor; - -namespace io { - -class OutputStream; - -} // namespace io - -namespace ipc { - -class DictionaryMemo; - -/// \class RecordBatchWriter -/// \brief Abstract interface for writing a stream of record batches -class ARROW_EXPORT RecordBatchWriter { - public: - virtual ~RecordBatchWriter(); - - /// \brief Write a record batch to the stream - /// - /// \param[in] batch the record batch to write to the stream - /// \param[in] allow_64bit if true, allow field lengths that don't fit - /// in a signed 32-bit int - /// \return Status - virtual Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) = 0; - - /// \brief Write possibly-chunked table by creating sequence of record batches - /// \param[in] table table to write - /// \return Status - Status WriteTable(const Table& table); - - /// \brief Write Table with a particular chunksize - /// \param[in] table table to write - /// \param[in] max_chunksize maximum chunk size for table chunks - /// \return Status - Status WriteTable(const Table& table, int64_t max_chunksize); - - /// \brief Perform any logic necessary to finish the stream - /// - /// \return Status - virtual Status Close() = 0; - - /// In some cases, 
writing may require memory allocation. We use the default - /// memory pool, but provide the option to override - /// - /// \param pool the memory pool to use for required allocations - virtual void set_memory_pool(MemoryPool* pool) = 0; -}; - -/// \class RecordBatchStreamWriter -/// \brief Synchronous batch stream writer that writes the Arrow streaming -/// format -class ARROW_EXPORT RecordBatchStreamWriter : public RecordBatchWriter { - public: - ~RecordBatchStreamWriter() override; - - /// Create a new writer from stream sink and schema. User is responsible for - /// closing the actual OutputStream. - /// - /// \param[in] sink output stream to write to - /// \param[in] schema the schema of the record batches to be written - /// \param[out] out the created stream writer - /// \return Status - static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); - - /// \brief Write a record batch to the stream - /// - /// \param[in] batch the record batch to write - /// \param[in] allow_64bit allow array lengths over INT32_MAX - 1 - /// \return Status - Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; - - /// \brief Close the stream by writing a 4-byte int32 0 EOS market - /// \return Status - Status Close() override; - - void set_memory_pool(MemoryPool* pool) override; - - protected: - RecordBatchStreamWriter(); - class ARROW_NO_EXPORT RecordBatchStreamWriterImpl; - std::unique_ptr impl_; -}; - -/// \brief Creates the Arrow record batch file format -/// -/// Implements the random access file format, which structurally is a record -/// batch stream followed by a metadata footer at the end of the file. 
Magic -/// numbers are written at the start and end of the file -class ARROW_EXPORT RecordBatchFileWriter : public RecordBatchStreamWriter { - public: - ~RecordBatchFileWriter() override; - - /// Create a new writer from stream sink and schema - /// - /// \param[in] sink output stream to write to - /// \param[in] schema the schema of the record batches to be written - /// \param[out] out the created stream writer - /// \return Status - static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); - - /// \brief Write a record batch to the file - /// - /// \param[in] batch the record batch to write - /// \param[in] allow_64bit allow array lengths over INT32_MAX - 1 - /// \return Status - Status WriteRecordBatch(const RecordBatch& batch, bool allow_64bit = false) override; - - /// \brief Close the file stream by writing the file footer and magic number - /// \return Status - Status Close() override; - - private: - RecordBatchFileWriter(); - class ARROW_NO_EXPORT RecordBatchFileWriterImpl; - std::unique_ptr file_impl_; -}; - -/// \brief Low-level API for writing a record batch (without schema) to an OutputStream -/// -/// \param[in] batch the record batch to write -/// \param[in] buffer_start_offset the start offset to use in the buffer metadata, -/// generally should be 0 -/// \param[in] dst an OutputStream -/// \param[out] metadata_length the size of the length-prefixed flatbuffer -/// including padding to a 64-byte boundary -/// \param[out] body_length the size of the contiguous buffer block plus -/// \param[in] pool the memory pool to allocate memory from -/// \param[in] max_recursion_depth the maximum permitted nesting schema depth -/// \param[in] allow_64bit permit field lengths exceeding INT32_MAX. May not be -/// readable by other Arrow implementations -/// padding bytes -/// \return Status -/// -/// Write the RecordBatch (collection of equal-length Arrow arrays) to the -/// output stream in a contiguous block. 
The record batch metadata is written as -/// a flatbuffer (see format/Message.fbs -- the RecordBatch message type) -/// prefixed by its size, followed by each of the memory buffers in the batch -/// written end to end (with appropriate alignment and padding): -/// -/// \code -/// -/// \endcode -/// -/// Finally, the absolute offsets (relative to the start of the output stream) -/// to the end of the body and end of the metadata / data header (suffixed by -/// the header size) is returned in out-variables -ARROW_EXPORT -Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, - io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, MemoryPool* pool, - int max_recursion_depth = kMaxNestingDepth, - bool allow_64bit = false); - -/// \brief Serialize record batch as encapsulated IPC message in a new buffer -/// -/// \param[in] batch the record batch -/// \param[in] pool a MemoryPool to allocate memory from -/// \param[out] out the serialized message -/// \return Status -ARROW_EXPORT -Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool, - std::shared_ptr* out); - -/// \brief Write record batch to OutputStream -/// -/// \param[in] batch the record batch to write -/// \param[in] pool a MemoryPool to use for temporary allocations, if needed -/// \param[in] out the OutputStream to write the output to -/// \return Status -/// -/// If writing to pre-allocated memory, you can use -/// arrow::ipc::GetRecordBatchSize to compute how much space is required -ARROW_EXPORT -Status SerializeRecordBatch(const RecordBatch& batch, MemoryPool* pool, - io::OutputStream* out); - -/// \brief Serialize schema as encapsulated IPC message -/// -/// \param[in] schema the schema to write -/// \param[in] dictionary_memo a DictionaryMemo for recording dictionary ids -/// \param[in] pool a MemoryPool to allocate memory from -/// \param[out] out the serialized schema -/// \return Status -ARROW_EXPORT -Status SerializeSchema(const Schema& 
schema, DictionaryMemo* dictionary_memo, - MemoryPool* pool, std::shared_ptr* out); - -/// \brief Write multiple record batches to OutputStream, including schema -/// \param[in] batches a vector of batches. Must all have same schema -/// \param[out] dst an OutputStream -/// \return Status -ARROW_EXPORT -Status WriteRecordBatchStream(const std::vector>& batches, - io::OutputStream* dst); - -/// \brief Compute the number of bytes needed to write a record batch including metadata -/// -/// \param[in] batch the record batch to write -/// \param[out] size the size of the complete encapsulated message -/// \return Status -ARROW_EXPORT -Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size); - -/// \brief Compute the number of bytes needed to write a tensor including metadata -/// -/// \param[in] tensor the tenseor to write -/// \param[out] size the size of the complete encapsulated message -/// \return Status -ARROW_EXPORT -Status GetTensorSize(const Tensor& tensor, int64_t* size); - -/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory -/// allocation -/// -/// \param[in] tensor the Tensor to write -/// \param[in] pool MemoryPool to allocate space for metadata -/// \param[out] out the resulting Message -/// \return Status -ARROW_EXPORT -Status GetTensorMessage(const Tensor& tensor, MemoryPool* pool, - std::unique_ptr* out); - -/// \brief Write arrow::Tensor as a contiguous message. -/// -/// The metadata and body are written assuming 64-byte alignment. It is the -/// user's responsibility to ensure that the OutputStream has been aligned -/// to a 64-byte multiple before writing the message. 
-/// -/// The message is written out as followed: -/// \code -/// -/// \endcode -/// -/// \param[in] tensor the Tensor to write -/// \param[in] dst the OutputStream to write to -/// \param[out] metadata_length the actual metadata length, including padding -/// \param[out] body_length the acutal message body length -/// \return Status -ARROW_EXPORT -Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length); - -// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous mesasge. The metadata, -// sparse index, and body are written assuming 64-byte alignment. It is the -// user's responsibility to ensure that the OutputStream has been aligned -// to a 64-byte multiple before writing the message. -// -// \param[in] tensor the SparseTensor to write -// \param[in] dst the OutputStream to write to -// \param[out] metadata_length the actual metadata length, including padding -// \param[out] body_length the actual message body length -ARROW_EXPORT -Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst, - int32_t* metadata_length, int64_t* body_length, - MemoryPool* pool); - -namespace internal { - -// These internal APIs may change without warning or deprecation - -// Intermediate data structure with metadata header, and zero or more buffers -// for the message body. -struct IpcPayload { - Message::Type type = Message::NONE; - std::shared_ptr metadata; - std::vector> body_buffers; - int64_t body_length = 0; -}; - -class ARROW_EXPORT IpcPayloadWriter { - public: - virtual ~IpcPayloadWriter(); - - // Default implementation is a no-op - virtual Status Start(); - - virtual Status WritePayload(const IpcPayload& payload) = 0; - - virtual Status Close() = 0; -}; - -/// Create a new RecordBatchWriter from IpcPayloadWriter and schema. 
-/// -/// \param[in] sink the IpcPayloadWriter to write to -/// \param[in] schema the schema of the record batches to be written -/// \param[out] out the created RecordBatchWriter -/// \return Status -ARROW_EXPORT -Status OpenRecordBatchWriter(std::unique_ptr sink, - const std::shared_ptr& schema, - std::unique_ptr* out); - -/// \brief Compute IpcPayload for the given schema -/// \param[in] schema the Schema that is being serialized -/// \param[in,out] dictionary_memo class to populate with assigned dictionary ids -/// \param[out] out the returned vector of IpcPayloads -/// \return Status -ARROW_EXPORT -Status GetSchemaPayload(const Schema& schema, DictionaryMemo* dictionary_memo, - IpcPayload* out); - -/// \brief Compute IpcPayload for a dictionary -/// \param[in] id the dictionary id -/// \param[in] dictionary the dictionary values -/// \param[out] payload the output IpcPayload -/// \return Status -ARROW_EXPORT -Status GetDictionaryPayload(int64_t id, const std::shared_ptr& dictionary, - MemoryPool* pool, IpcPayload* payload); - -/// \brief Compute IpcPayload for the given record batch -/// \param[in] batch the RecordBatch that is being serialized -/// \param[in,out] pool for any required temporary memory allocations -/// \param[out] out the returned IpcPayload -/// \return Status -ARROW_EXPORT -Status GetRecordBatchPayload(const RecordBatch& batch, MemoryPool* pool, IpcPayload* out); - -} // namespace internal - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_WRITER_H diff --git a/r/R/inst/include/arrow/json/api.h b/r/R/inst/include/arrow/json/api.h deleted file mode 100644 index 47b56684b5a..00000000000 --- a/r/R/inst/include/arrow/json/api.h +++ /dev/null @@ -1,21 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/json/options.h" -#include "arrow/json/reader.h" diff --git a/r/R/inst/include/arrow/json/chunked-builder.h b/r/R/inst/include/arrow/json/chunked-builder.h deleted file mode 100644 index b2cfbefdf45..00000000000 --- a/r/R/inst/include/arrow/json/chunked-builder.h +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -namespace internal { -class TaskGroup; -} // namespace internal - -class Array; -class MemoryPool; -class DataType; -class Field; -class ChunkedArray; - -namespace json { - -class PromotionGraph; - -class ARROW_EXPORT ChunkedArrayBuilder { - public: - virtual ~ChunkedArrayBuilder() = default; - - /// Spawn a task that will try to convert and insert the given JSON block - virtual void Insert(int64_t block_index, - const std::shared_ptr& unconverted_field, - const std::shared_ptr& unconverted) = 0; - - /// Return the final chunked array. - /// Every chunk must be inserted before this is called! - virtual Status Finish(std::shared_ptr* out) = 0; - - /// Finish current task group and substitute a new one - virtual Status ReplaceTaskGroup( - const std::shared_ptr& task_group) = 0; - - protected: - explicit ChunkedArrayBuilder(const std::shared_ptr& task_group) - : task_group_(task_group) {} - - std::shared_ptr task_group_; -}; - -/// create a chunked builder -/// -/// if unexpected fields and promotion need to be handled, promotion_graph must be -/// non-null -ARROW_EXPORT Status MakeChunkedArrayBuilder( - const std::shared_ptr& task_group, MemoryPool* pool, - const PromotionGraph* promotion_graph, const std::shared_ptr& type, - std::unique_ptr* out); - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/chunker.h b/r/R/inst/include/arrow/json/chunker.h deleted file mode 100644 index 0f94d81afd3..00000000000 --- a/r/R/inst/include/arrow/json/chunker.h +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; - -namespace json { - -struct ParseOptions; - -/// \class Chunker -/// \brief A reusable block-based chunker for JSON data -/// -/// The chunker takes a block of JSON data and finds a suitable place -/// to cut it up without splitting an object. -class ARROW_EXPORT Chunker { - public: - virtual ~Chunker() = default; - - /// \brief Carve up a chunk in a block of data to contain only whole objects - /// \param[in] block json data to be chunked - /// \param[out] whole subrange of block containing whole json objects - /// \param[out] partial subrange of block a partial json object - virtual Status Process(const std::shared_ptr& block, - std::shared_ptr* whole, - std::shared_ptr* partial) = 0; - - /// \brief Carve the completion of a partial object out of a block - /// \param[in] partial incomplete json object - /// \param[in] block json data - /// \param[out] completion subrange of block containing the completion of partial - /// \param[out] rest subrange of block containing what completion does not cover - virtual Status ProcessWithPartial(const std::shared_ptr& partial, - const std::shared_ptr& block, - std::shared_ptr* completion, - std::shared_ptr* rest) = 0; - - static std::unique_ptr Make(const ParseOptions& options); - - 
protected: - Chunker() = default; - ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); -}; - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/converter.h b/r/R/inst/include/arrow/json/converter.h deleted file mode 100644 index 9a812dd3c3a..00000000000 --- a/r/R/inst/include/arrow/json/converter.h +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class DataType; -class Field; -class MemoryPool; - -namespace json { - -/// \brief interface for conversion of Arrays -/// -/// Converters are not required to be correct for arbitrary input- only -/// for unconverted arrays emitted by a corresponding parser. -class ARROW_EXPORT Converter { - public: - virtual ~Converter() = default; - - /// convert an array - /// on failure, this converter may be promoted to another converter which - /// *can* convert the given input. 
- virtual Status Convert(const std::shared_ptr& in, - std::shared_ptr* out) = 0; - - std::shared_ptr out_type() const { return out_type_; } - - MemoryPool* pool() { return pool_; } - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(Converter); - - Converter(MemoryPool* pool, const std::shared_ptr& out_type) - : pool_(pool), out_type_(out_type) {} - - MemoryPool* pool_; - std::shared_ptr out_type_; -}; - -/// \brief produce a single converter to the specified out_type -ARROW_EXPORT Status MakeConverter(const std::shared_ptr& out_type, - MemoryPool* pool, std::shared_ptr* out); - -class ARROW_EXPORT PromotionGraph { - public: - virtual ~PromotionGraph() = default; - - /// \brief produce a valid field which will be inferred as null - virtual std::shared_ptr Null(const std::string& name) const = 0; - - /// \brief given an unexpected field encountered during parsing, return a type to which - /// it may be convertible (may return null if none is available) - virtual std::shared_ptr Infer( - const std::shared_ptr& unexpected_field) const = 0; - - /// \brief given a type to which conversion failed, return a promoted type to which - /// conversion may succeed (may return null if none is available) - virtual std::shared_ptr Promote( - const std::shared_ptr& failed, - const std::shared_ptr& unexpected_field) const = 0; - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph); - PromotionGraph() = default; -}; - -ARROW_EXPORT const PromotionGraph* GetPromotionGraph(); - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/options.h b/r/R/inst/include/arrow/json/options.h deleted file mode 100644 index 8d27faabea2..00000000000 --- a/r/R/inst/include/arrow/json/options.h +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class DataType; -class Schema; - -namespace json { - -enum class UnexpectedFieldBehavior : char { Ignore, Error, InferType }; - -struct ARROW_EXPORT ParseOptions { - // Parsing options - - // Optional explicit schema (no type inference, ignores other fields) - std::shared_ptr explicit_schema; - - // Whether objects may be printed across multiple lines (for example pretty printed) - // NB: if false, input must end with an empty line - bool newlines_in_values = false; - - // How should parse handle fields outside the explicit_schema? 
- UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType; - - static ParseOptions Defaults(); -}; - -struct ARROW_EXPORT ReadOptions { - // Reader options - - // Whether to use the global CPU thread pool - bool use_threads = true; - // Block size we request from the IO layer; also determines the size of - // chunks when use_threads is true - int32_t block_size = 1 << 20; // 1 MB - - static ReadOptions Defaults(); -}; - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/parser.h b/r/R/inst/include/arrow/json/parser.h deleted file mode 100644 index ec12eeec370..00000000000 --- a/r/R/inst/include/arrow/json/parser.h +++ /dev/null @@ -1,96 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/json/options.h" -#include "arrow/status.h" -#include "arrow/util/key_value_metadata.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Buffer; -class MemoryPool; -class KeyValueMetadata; -class ResizableBuffer; - -namespace json { - -struct Kind { - enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; - - static const std::string& Name(Kind::type); - - static const std::shared_ptr& Tag(Kind::type); - - static Kind::type FromTag(const std::shared_ptr& tag); - - static Status ForType(const DataType& type, Kind::type* kind); -}; - -constexpr int32_t kMaxParserNumRows = 100000; - -/// \class BlockParser -/// \brief A reusable block-based parser for JSON data -/// -/// The parser takes a block of newline delimited JSON data and extracts Arrays -/// of unconverted strings which can be fed to a Converter to obtain a usable Array. -/// -/// Note that in addition to parse errors (such as malformed JSON) some conversion -/// errors are caught at parse time: -/// - A null value in non-nullable column -/// - Change in the JSON kind of a column. For example, if an explicit schema is provided -/// which stipulates that field "a" is integral, a row of {"a": "not a number"} will -/// result in an error. This also applies to fields outside an explicit schema. 
-class ARROW_EXPORT BlockParser { - public: - virtual ~BlockParser() = default; - - /// \brief Reserve storage for scalars parsed from a block of json - virtual Status ReserveScalarStorage(int64_t nbytes) = 0; - - /// \brief Parse a block of data - virtual Status Parse(const std::shared_ptr& json) = 0; - - /// \brief Extract parsed data - virtual Status Finish(std::shared_ptr* parsed) = 0; - - /// \brief Return the number of parsed rows - int32_t num_rows() const { return num_rows_; } - - static Status Make(MemoryPool* pool, const ParseOptions& options, - std::unique_ptr* out); - - static Status Make(const ParseOptions& options, std::unique_ptr* out); - - protected: - ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); - - explicit BlockParser(MemoryPool* pool) : pool_(pool) {} - - MemoryPool* pool_; - int32_t num_rows_ = 0; -}; - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/rapidjson-defs.h b/r/R/inst/include/arrow/json/rapidjson-defs.h deleted file mode 100644 index 68dd0be6386..00000000000 --- a/r/R/inst/include/arrow/json/rapidjson-defs.h +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Include this file before including any RapidJSON headers. 
- -#define RAPIDJSON_HAS_STDSTRING 1 -#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 -#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 - -// rapidjson will be defined in namespace arrow::rapidjson -#define RAPIDJSON_NAMESPACE arrow::rapidjson -#define RAPIDJSON_NAMESPACE_BEGIN \ - namespace arrow { \ - namespace rapidjson { -#define RAPIDJSON_NAMESPACE_END \ - } \ - } - -#include "arrow/util/sse-util.h" - -// enable SIMD whitespace skipping, if available -#if defined(ARROW_HAVE_SSE2) -#define RAPIDJSON_SSE2 1 -#define ARROW_RAPIDJSON_SKIP_WHITESPACE_SIMD 1 -#endif - -#if defined(ARROW_HAVE_SSE4_2) -#define RAPIDJSON_SSE42 1 -#define ARROW_RAPIDJSON_SKIP_WHITESPACE_SIMD 1 -#endif diff --git a/r/R/inst/include/arrow/json/reader.h b/r/R/inst/include/arrow/json/reader.h deleted file mode 100644 index 51a3473a04e..00000000000 --- a/r/R/inst/include/arrow/json/reader.h +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "arrow/json/options.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Table; -class RecordBatch; -class Array; -class DataType; - -namespace io { -class InputStream; -} // namespace io - -namespace json { - -class ARROW_EXPORT TableReader { - public: - virtual ~TableReader() = default; - - virtual Status Read(std::shared_ptr
* out) = 0; - - static Status Make(MemoryPool* pool, std::shared_ptr input, - const ReadOptions&, const ParseOptions&, - std::shared_ptr* out); -}; - -ARROW_EXPORT Status ParseOne(ParseOptions options, std::shared_ptr json, - std::shared_ptr* out); - -/// \brief convert an Array produced by BlockParser into an Array of out_type -ARROW_EXPORT Status Convert(const std::shared_ptr& out_type, - const std::shared_ptr& in, - std::shared_ptr* out); - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/json/test-common.h b/r/R/inst/include/arrow/json/test-common.h deleted file mode 100644 index 2905ae9556b..00000000000 --- a/r/R/inst/include/arrow/json/test-common.h +++ /dev/null @@ -1,183 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include - -#include "arrow/json/rapidjson-defs.h" -#include "rapidjson/document.h" -#include "rapidjson/prettywriter.h" -#include "rapidjson/reader.h" -#include "rapidjson/writer.h" - -#include "arrow/io/memory.h" -#include "arrow/json/converter.h" -#include "arrow/json/options.h" -#include "arrow/json/parser.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/type.h" -#include "arrow/util/string_view.h" -#include "arrow/visitor_inline.h" - -namespace arrow { -namespace json { - -namespace rj = arrow::rapidjson; - -using rj::StringBuffer; -using util::string_view; -using Writer = rj::Writer; - -inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); } - -template -inline static Status Generate(const std::shared_ptr& type, Engine& e, - Writer* writer); - -template -inline static Status Generate(const std::vector>& fields, - Engine& e, Writer* writer); - -template -inline static Status Generate(const std::shared_ptr& schm, Engine& e, - Writer* writer) { - return Generate(schm->fields(), e, writer); -} - -template -struct GenerateImpl { - Status Visit(const BooleanType&) { - return OK(writer.Bool(std::uniform_int_distribution{}(e)&1)); - } - template - Status Visit(T const&, enable_if_unsigned_integer* = nullptr) { - auto val = std::uniform_int_distribution<>{}(e); - return OK(writer.Uint64(static_cast(val))); - } - template - Status Visit(T const&, enable_if_signed_integer* = nullptr) { - auto val = std::uniform_int_distribution<>{}(e); - return OK(writer.Int64(static_cast(val))); - } - template - Status Visit(T const&, enable_if_floating_point* = nullptr) { - auto val = std::normal_distribution{0, 1 << 10}(e); - return OK(writer.Double(val)); - } - Status Visit(HalfFloatType const&) { - auto val = std::normal_distribution{0, 1 << 10}(e); - return OK(writer.Double(val)); - } - template - Status Visit(T const&, enable_if_binary* = nullptr) { - auto size = std::poisson_distribution<>{4}(e); - 
std::uniform_int_distribution gen_char(32, 127); // FIXME generate UTF8 - std::string s(size, '\0'); - for (char& ch : s) ch = static_cast(gen_char(e)); - return OK(writer.String(s.c_str())); - } - template - Status Visit( - T const& t, typename std::enable_if::value>::type* = nullptr, - typename std::enable_if::value>::type* = nullptr) { - return Status::Invalid("can't generate a value of type " + t.name()); - } - Status Visit(const ListType& t) { - auto size = std::poisson_distribution<>{4}(e); - writer.StartArray(); - for (int i = 0; i < size; ++i) RETURN_NOT_OK(Generate(t.value_type(), e, &writer)); - return OK(writer.EndArray(size)); - } - Status Visit(const StructType& t) { return Generate(t.children(), e, &writer); } - Engine& e; - rj::Writer& writer; -}; - -template -inline static Status Generate(const std::shared_ptr& type, Engine& e, - Writer* writer) { - if (std::uniform_real_distribution<>{0, 1}(e) < .2) { - // one out of 5 chance of null, anywhere - writer->Null(); - return Status::OK(); - } - GenerateImpl visitor = {e, *writer}; - return VisitTypeInline(*type, &visitor); -} - -template -inline static Status Generate(const std::vector>& fields, - Engine& e, Writer* writer) { - RETURN_NOT_OK(OK(writer->StartObject())); - for (const auto& f : fields) { - writer->Key(f->name().c_str()); - RETURN_NOT_OK(Generate(f->type(), e, writer)); - } - return OK(writer->EndObject(static_cast(fields.size()))); -} - -inline static Status MakeStream(string_view src_str, - std::shared_ptr* out) { - auto src = std::make_shared(src_str); - *out = std::make_shared(src); - return Status::OK(); -} - -// scalar values (numbers and strings) are parsed into a -// dictionary. 
This can be decoded for ease of comparison -inline static Status DecodeStringDictionary(const DictionaryArray& dict_array, - std::shared_ptr* decoded) { - const StringArray& dict = static_cast(*dict_array.dictionary()); - const Int32Array& indices = static_cast(*dict_array.indices()); - StringBuilder builder; - RETURN_NOT_OK(builder.Resize(indices.length())); - for (int64_t i = 0; i < indices.length(); ++i) { - if (indices.IsNull(i)) { - builder.UnsafeAppendNull(); - continue; - } - auto value = dict.GetView(indices.GetView(i)); - RETURN_NOT_OK(builder.ReserveData(value.size())); - builder.UnsafeAppend(value); - } - return builder.Finish(decoded); -} - -inline static Status ParseFromString(ParseOptions options, string_view src_str, - std::shared_ptr* parsed) { - auto src = std::make_shared(src_str); - std::unique_ptr parser; - RETURN_NOT_OK(BlockParser::Make(options, &parser)); - RETURN_NOT_OK(parser->Parse(src)); - return parser->Finish(parsed); -} - -static inline std::string PrettyPrint(string_view one_line) { - rj::Document document; - - // Must pass size to avoid ASAN issues. - document.Parse(one_line.data(), one_line.size()); - rj::StringBuffer sb; - rj::PrettyWriter writer(sb); - document.Accept(writer); - return sb.GetString(); -} - -} // namespace json -} // namespace arrow diff --git a/r/R/inst/include/arrow/memory_pool-test.h b/r/R/inst/include/arrow/memory_pool-test.h deleted file mode 100644 index 3eca585a1b7..00000000000 --- a/r/R/inst/include/arrow/memory_pool-test.h +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include - -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/testing/gtest_util.h" - -namespace arrow { - -class TestMemoryPoolBase : public ::testing::Test { - public: - virtual ::arrow::MemoryPool* memory_pool() = 0; - - void TestMemoryTracking() { - auto pool = memory_pool(); - - uint8_t* data; - ASSERT_OK(pool->Allocate(100, &data)); - EXPECT_EQ(static_cast(0), reinterpret_cast(data) % 64); - ASSERT_EQ(100, pool->bytes_allocated()); - - uint8_t* data2; - ASSERT_OK(pool->Allocate(27, &data2)); - EXPECT_EQ(static_cast(0), reinterpret_cast(data2) % 64); - ASSERT_EQ(127, pool->bytes_allocated()); - - pool->Free(data, 100); - ASSERT_EQ(27, pool->bytes_allocated()); - pool->Free(data2, 27); - ASSERT_EQ(0, pool->bytes_allocated()); - } - - void TestOOM() { - auto pool = memory_pool(); - - uint8_t* data; - int64_t to_alloc = std::min(std::numeric_limits::max(), - std::numeric_limits::max()); - // subtract 63 to prevent overflow after the size is aligned - to_alloc -= 63; - ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); - } - - void TestReallocate() { - auto pool = memory_pool(); - - uint8_t* data; - ASSERT_OK(pool->Allocate(10, &data)); - ASSERT_EQ(10, pool->bytes_allocated()); - data[0] = 35; - data[9] = 12; - - // Expand - ASSERT_OK(pool->Reallocate(10, 20, &data)); - ASSERT_EQ(data[9], 12); - ASSERT_EQ(20, pool->bytes_allocated()); - - // Shrink - ASSERT_OK(pool->Reallocate(20, 5, &data)); - ASSERT_EQ(data[0], 35); - ASSERT_EQ(5, pool->bytes_allocated()); - - // 
Free - pool->Free(data, 5); - ASSERT_EQ(0, pool->bytes_allocated()); - } -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/memory_pool.h b/r/R/inst/include/arrow/memory_pool.h deleted file mode 100644 index 60643c387f4..00000000000 --- a/r/R/inst/include/arrow/memory_pool.h +++ /dev/null @@ -1,155 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_MEMORY_POOL_H -#define ARROW_MEMORY_POOL_H - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -namespace internal { - -/////////////////////////////////////////////////////////////////////// -// Helper tracking memory statistics - -class MemoryPoolStats { - public: - MemoryPoolStats() : bytes_allocated_(0), max_memory_(0) {} - - int64_t max_memory() const { return max_memory_.load(); } - - int64_t bytes_allocated() const { return bytes_allocated_.load(); } - - inline void UpdateAllocatedBytes(int64_t diff) { - auto allocated = bytes_allocated_.fetch_add(diff) + diff; - // "maximum" allocated memory is ill-defined in multi-threaded code, - // so don't try to be too rigorous here - if (diff > 0 && allocated > max_memory_) { - max_memory_ = allocated; - } - } - - protected: - std::atomic bytes_allocated_; - std::atomic max_memory_; -}; - -} // namespace internal - -/// Base class for memory allocation. -/// -/// Besides tracking the number of allocated bytes, the allocator also should -/// take care of the required 64-byte alignment. -class ARROW_EXPORT MemoryPool { - public: - virtual ~MemoryPool(); - - /// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool - static std::unique_ptr CreateDefault(); - - /// Allocate a new memory region of at least size bytes. - /// - /// The allocated region shall be 64-byte aligned. - virtual Status Allocate(int64_t size, uint8_t** out) = 0; - - /// Resize an already allocated memory section. - /// - /// As by default most default allocators on a platform don't support aligned - /// reallocation, this function can involve a copy of the underlying data. - virtual Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) = 0; - - /// Free an allocated region. - /// - /// @param buffer Pointer to the start of the allocated memory region - /// @param size Allocated size located at buffer. 
An allocator implementation - /// may use this for tracking the amount of allocated bytes as well as for - /// faster deallocation if supported by its backend. - virtual void Free(uint8_t* buffer, int64_t size) = 0; - - /// The number of bytes that were allocated and not yet free'd through - /// this allocator. - virtual int64_t bytes_allocated() const = 0; - - /// Return peak memory allocation in this memory pool - /// - /// \return Maximum bytes allocated. If not known (or not implemented), - /// returns -1 - virtual int64_t max_memory() const; - - protected: - MemoryPool(); -}; - -class ARROW_EXPORT LoggingMemoryPool : public MemoryPool { - public: - explicit LoggingMemoryPool(MemoryPool* pool); - ~LoggingMemoryPool() override = default; - - Status Allocate(int64_t size, uint8_t** out) override; - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override; - - void Free(uint8_t* buffer, int64_t size) override; - - int64_t bytes_allocated() const override; - - int64_t max_memory() const override; - - private: - MemoryPool* pool_; -}; - -/// Derived class for memory allocation. -/// -/// Tracks the number of bytes and maximum memory allocated through its direct -/// calls. Actual allocation is delegated to MemoryPool class. -class ARROW_EXPORT ProxyMemoryPool : public MemoryPool { - public: - explicit ProxyMemoryPool(MemoryPool* pool); - ~ProxyMemoryPool() override; - - Status Allocate(int64_t size, uint8_t** out) override; - Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) override; - - void Free(uint8_t* buffer, int64_t size) override; - - int64_t bytes_allocated() const override; - - int64_t max_memory() const override; - - private: - class ProxyMemoryPoolImpl; - std::unique_ptr impl_; -}; - -/// Return the process-wide default memory pool. 
-ARROW_EXPORT MemoryPool* default_memory_pool(); - -#ifdef ARROW_NO_DEFAULT_MEMORY_POOL -#define ARROW_MEMORY_POOL_DEFAULT -#else -#define ARROW_MEMORY_POOL_DEFAULT = default_memory_pool() -#endif - -} // namespace arrow - -#endif // ARROW_MEMORY_POOL_H diff --git a/r/R/inst/include/arrow/pretty_print.h b/r/R/inst/include/arrow/pretty_print.h deleted file mode 100644 index 9c2708f16ee..00000000000 --- a/r/R/inst/include/arrow/pretty_print.h +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PRETTY_PRINT_H -#define ARROW_PRETTY_PRINT_H - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Column; -class ChunkedArray; -class RecordBatch; -class Schema; -class Status; -class Table; - -struct PrettyPrintOptions { - PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, - std::string null_rep_arg = "null", bool skip_new_lines_arg = false) - : indent(indent_arg), - indent_size(indent_size_arg), - window(window_arg), - null_rep(null_rep_arg), - skip_new_lines(skip_new_lines_arg) {} - - /// Number of spaces to shift entire formatted object to the right - int indent; - - /// Size of internal indents - int indent_size; - - /// Maximum number of elements to show at the beginning and at the end. - int window; - - /// String to use for representing a null value, defaults to "null" - std::string null_rep; - - /// Skip new lines between elements, defaults to false - bool skip_new_lines; -}; - -/// \brief Print human-readable representation of RecordBatch -ARROW_EXPORT -Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink); - -/// \brief Print human-readable representation of Table -ARROW_EXPORT -Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, - std::ostream* sink); - -/// \brief Print human-readable representation of Array -ARROW_EXPORT -Status PrettyPrint(const Array& arr, int indent, std::ostream* sink); - -/// \brief Print human-readable representation of Array -ARROW_EXPORT -Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, - std::ostream* sink); - -/// \brief Print human-readable representation of Array -ARROW_EXPORT -Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, - std::string* result); - -/// \brief Print human-readable representation of ChunkedArray -ARROW_EXPORT -Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, - std::ostream* sink); 
- -/// \brief Print human-readable representation of ChunkedArray -ARROW_EXPORT -Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, - std::string* result); - -/// \brief Print human-readable representation of Column -ARROW_EXPORT -Status PrettyPrint(const Column& column, const PrettyPrintOptions& options, - std::ostream* sink); - -ARROW_EXPORT -Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, - std::ostream* sink); - -ARROW_EXPORT -Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, - std::string* result); - -ARROW_EXPORT -Status DebugPrint(const Array& arr, int indent); - -} // namespace arrow - -#endif // ARROW_PRETTY_PRINT_H diff --git a/r/R/inst/include/arrow/python/api.h b/r/R/inst/include/arrow/python/api.h deleted file mode 100644 index 6bbfcbfa34b..00000000000 --- a/r/R/inst/include/arrow/python/api.h +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_API_H -#define ARROW_PYTHON_API_H - -#include "arrow/python/arrow_to_pandas.h" -#include "arrow/python/common.h" -#include "arrow/python/deserialize.h" -#include "arrow/python/helpers.h" -#include "arrow/python/inference.h" -#include "arrow/python/io.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/numpy_to_arrow.h" -#include "arrow/python/python_to_arrow.h" -#include "arrow/python/serialize.h" - -#endif // ARROW_PYTHON_API_H diff --git a/r/R/inst/include/arrow/python/arrow_to_pandas.h b/r/R/inst/include/arrow/python/arrow_to_pandas.h deleted file mode 100644 index 20bad409710..00000000000 --- a/r/R/inst/include/arrow/python/arrow_to_pandas.h +++ /dev/null @@ -1,97 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H -#define ARROW_PYTHON_ADAPTERS_PANDAS_H - -#include "arrow/python/platform.h" - -#include -#include -#include - -#include "arrow/python/visibility.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class Column; -class DataType; -class MemoryPool; -class Status; -class Table; - -namespace py { - -struct PandasOptions { - /// If true, we will convert all string columns to categoricals - bool strings_to_categorical = false; - bool zero_copy_only = false; - bool integer_object_nulls = false; - bool date_as_object = false; - bool use_threads = false; - - /// \brief If true, do not create duplicate PyObject versions of equal - /// objects. This only applies to immutable objects like strings or datetime - /// objects - bool deduplicate_objects = false; -}; - -ARROW_PYTHON_EXPORT -Status ConvertArrayToPandas(const PandasOptions& options, - const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out); - -ARROW_PYTHON_EXPORT -Status ConvertChunkedArrayToPandas(const PandasOptions& options, - const std::shared_ptr& col, - PyObject* py_ref, PyObject** out); - -ARROW_PYTHON_EXPORT -Status ConvertColumnToPandas(const PandasOptions& options, - const std::shared_ptr& col, PyObject* py_ref, - PyObject** out); - -// Convert a whole table as efficiently as possible to a pandas.DataFrame. -// -// The returned Python object is a list of tuples consisting of the exact 2D -// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. -// -// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) -ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(const PandasOptions& options, - const std::shared_ptr
& table, MemoryPool* pool, - PyObject** out); - -/// Convert a whole table as efficiently as possible to a pandas.DataFrame. -/// -/// Explicitly name columns that should be a categorical -/// This option is only used on conversions that are applied to a table. -ARROW_PYTHON_EXPORT -Status ConvertTableToPandas(const PandasOptions& options, - const std::unordered_set& categorical_columns, - const std::shared_ptr
& table, MemoryPool* pool, - PyObject** out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ADAPTERS_PANDAS_H diff --git a/r/R/inst/include/arrow/python/benchmark.h b/r/R/inst/include/arrow/python/benchmark.h deleted file mode 100644 index caaff32b365..00000000000 --- a/r/R/inst/include/arrow/python/benchmark.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_BENCHMARK_H -#define ARROW_PYTHON_BENCHMARK_H - -#include "arrow/python/platform.h" - -#include "arrow/python/visibility.h" - -namespace arrow { -namespace py { -namespace benchmark { - -// Micro-benchmark routines for use from ASV - -// Run PandasObjectIsNull() once over every object in *list* -ARROW_PYTHON_EXPORT -void Benchmark_PandasObjectIsNull(PyObject* list); - -} // namespace benchmark -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_BENCHMARK_H diff --git a/r/R/inst/include/arrow/python/common.h b/r/R/inst/include/arrow/python/common.h deleted file mode 100644 index a759d393a66..00000000000 --- a/r/R/inst/include/arrow/python/common.h +++ /dev/null @@ -1,265 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_PYTHON_COMMON_H -#define ARROW_PYTHON_COMMON_H - -#include -#include - -#include "arrow/python/config.h" - -#include "arrow/buffer.h" -#include "arrow/python/visibility.h" -#include "arrow/util/macros.h" - -namespace arrow { - -class MemoryPool; - -namespace py { - -ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError); - -// Catch a pending Python exception and return the corresponding Status. 
-// If no exception is pending, Status::OK() is returned. -inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) { - if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - return Status::OK(); - } else { - return ConvertPyError(code); - } -} - -ARROW_PYTHON_EXPORT Status PassPyError(); - -// TODO(wesm): We can just let errors pass through. To be explored later -#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError()); - -#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE)); - -// A RAII-style helper that ensures the GIL is acquired inside a lexical block. -class ARROW_PYTHON_EXPORT PyAcquireGIL { - public: - PyAcquireGIL() : acquired_gil_(false) { acquire(); } - - ~PyAcquireGIL() { release(); } - - void acquire() { - if (!acquired_gil_) { - state_ = PyGILState_Ensure(); - acquired_gil_ = true; - } - } - - // idempotent - void release() { - if (acquired_gil_) { - PyGILState_Release(state_); - acquired_gil_ = false; - } - } - - private: - bool acquired_gil_; - PyGILState_STATE state_; - ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); -}; - -// A helper to call safely into the Python interpreter from arbitrary C++ code. -// The GIL is acquired, and the current thread's error status is preserved. -template -Status SafeCallIntoPython(Function&& func) { - PyAcquireGIL lock; - PyObject* exc_type; - PyObject* exc_value; - PyObject* exc_traceback; - PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); - Status st = std::forward(func)(); - // If the return Status is a "Python error", the current Python error status - // describes the error and shouldn't be clobbered. - if (!st.IsPythonError() && exc_type != NULLPTR) { - PyErr_Restore(exc_type, exc_value, exc_traceback); - } - return st; -} - -#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2 - -// A RAII primitive that DECREFs the underlying PyObject* when it -// goes out of scope. 
-class ARROW_PYTHON_EXPORT OwnedRef { - public: - OwnedRef() : obj_(NULLPTR) {} - OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} - explicit OwnedRef(PyObject* obj) : obj_(obj) {} - - OwnedRef& operator=(OwnedRef&& other) { - obj_ = other.detach(); - return *this; - } - - ~OwnedRef() { reset(); } - - void reset(PyObject* obj) { - Py_XDECREF(obj_); - obj_ = obj; - } - - void reset() { reset(NULLPTR); } - - PyObject* detach() { - PyObject* result = obj_; - obj_ = NULLPTR; - return result; - } - - PyObject* obj() const { return obj_; } - - PyObject** ref() { return &obj_; } - - operator bool() const { return obj_ != NULLPTR; } - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef); - - PyObject* obj_; -}; - -// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope. -// This is for situations where the GIL is not always known to be held -// (e.g. if it is released in the middle of a function for performance reasons) -class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef { - public: - OwnedRefNoGIL() : OwnedRef() {} - OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {} - explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {} - - ~OwnedRefNoGIL() { - PyAcquireGIL lock; - reset(); - } -}; - -// A temporary conversion of a Python object to a bytes area. -struct PyBytesView { - const char* bytes; - Py_ssize_t size; - - PyBytesView() : bytes(NULLPTR), size(0), ref(NULLPTR) {} - - // View the given Python object as binary-like, i.e. 
bytes - Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); } - - Status FromString(PyObject* obj) { - bool ignored = false; - return FromString(obj, false, &ignored); - } - - Status FromString(PyObject* obj, bool* is_utf8) { - return FromString(obj, true, is_utf8); - } - - Status FromUnicode(PyObject* obj) { -#if PY_MAJOR_VERSION >= 3 - Py_ssize_t size; - // The utf-8 representation is cached on the unicode object - const char* data = PyUnicode_AsUTF8AndSize(obj, &size); - RETURN_IF_PYERROR(); - this->bytes = data; - this->size = size; - this->ref.reset(); -#else - PyObject* converted = PyUnicode_AsUTF8String(obj); - RETURN_IF_PYERROR(); - this->bytes = PyBytes_AS_STRING(converted); - this->size = PyBytes_GET_SIZE(converted); - this->ref.reset(converted); -#endif - return Status::OK(); - } - - protected: - PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = NULLPTR) - : bytes(b), size(s), ref(obj) {} - - // View the given Python object as string-like, i.e. 
str or (utf8) bytes - Status FromString(PyObject* obj, bool check_utf8, bool* is_utf8) { - if (PyUnicode_Check(obj)) { - *is_utf8 = true; - return FromUnicode(obj); - } else { - ARROW_RETURN_NOT_OK(FromBinary(obj, "a string or bytes object")); - if (check_utf8) { - // Check the bytes are utf8 utf-8 - OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size)); - if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { - *is_utf8 = true; - } else { - *is_utf8 = false; - PyErr_Clear(); - } - } else { - *is_utf8 = false; - } - return Status::OK(); - } - } - - Status FromBinary(PyObject* obj, const char* expected_msg) { - if (PyBytes_Check(obj)) { - this->bytes = PyBytes_AS_STRING(obj); - this->size = PyBytes_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); - } else if (PyByteArray_Check(obj)) { - this->bytes = PyByteArray_AS_STRING(obj); - this->size = PyByteArray_GET_SIZE(obj); - this->ref.reset(); - return Status::OK(); - } else { - return Status::TypeError("Expected ", expected_msg, ", got a '", - Py_TYPE(obj)->tp_name, "' object"); - } - } - - OwnedRef ref; -}; - -// Return the common PyArrow memory pool -ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool); -ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool(); - -class ARROW_PYTHON_EXPORT PyBuffer : public Buffer { - public: - /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports - /// one-dimensional byte buffers. 
- ~PyBuffer(); - - static Status FromPyObject(PyObject* obj, std::shared_ptr* out); - - private: - PyBuffer(); - Status Init(PyObject*); - - Py_buffer py_buf_; -}; - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_COMMON_H diff --git a/r/R/inst/include/arrow/python/config.h b/r/R/inst/include/arrow/python/config.h deleted file mode 100644 index 5649ffe55c2..00000000000 --- a/r/R/inst/include/arrow/python/config.h +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_CONFIG_H -#define ARROW_PYTHON_CONFIG_H - -#include "arrow/python/platform.h" - -#include "arrow/python/numpy_interop.h" -#include "arrow/python/visibility.h" - -#if PY_MAJOR_VERSION >= 3 -#define PyString_Check PyUnicode_Check -#endif - -namespace arrow { -namespace py { - -ARROW_PYTHON_EXPORT -extern PyObject* numpy_nan; - -ARROW_PYTHON_EXPORT -void set_numpy_nan(PyObject* obj); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_CONFIG_H diff --git a/r/R/inst/include/arrow/python/decimal.h b/r/R/inst/include/arrow/python/decimal.h deleted file mode 100644 index 0477be87f8f..00000000000 --- a/r/R/inst/include/arrow/python/decimal.h +++ /dev/null @@ -1,113 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_DECIMAL_H -#define ARROW_PYTHON_DECIMAL_H - -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" - -namespace arrow { - -class Decimal128; - -namespace py { - -class OwnedRef; - -// -// Python Decimal support -// - -namespace internal { - -// \brief Import the Python Decimal type -ARROW_PYTHON_EXPORT -Status ImportDecimalType(OwnedRef* decimal_type); - -// \brief Convert a Python Decimal object to a C++ string -// \param[in] python_decimal A Python decimal.Decimal instance -// \param[out] The string representation of the Python Decimal instance -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status PythonDecimalToString(PyObject* python_decimal, std::string* out); - -// \brief Convert a C++ std::string to a Python Decimal instance -// \param[in] decimal_constructor The decimal type object -// \param[in] decimal_string A decimal string -// \return An instance of decimal.Decimal -ARROW_PYTHON_EXPORT -PyObject* DecimalFromString(PyObject* decimal_constructor, - const std::string& decimal_string); - -// \brief Convert a Python decimal to an Arrow Decimal128 object -// \param[in] python_decimal A Python decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal128 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, - Decimal128* out); - -// \brief Convert a Python object to an Arrow Decimal128 object -// \param[in] python_decimal A Python int or decimal.Decimal instance -// \param[in] arrow_type An instance of arrow::DecimalType -// \param[out] out A pointer to a Decimal128 -// \return The status of the operation -ARROW_PYTHON_EXPORT -Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out); - -// \brief Check whether obj is an instance of Decimal -ARROW_PYTHON_EXPORT -bool PyDecimal_Check(PyObject* obj); - -// 
\brief Check whether obj is nan. This function will abort the program if the argument -// is not a Decimal instance -ARROW_PYTHON_EXPORT -bool PyDecimal_ISNAN(PyObject* obj); - -// \brief Helper class to track and update the precision and scale of a decimal -class ARROW_PYTHON_EXPORT DecimalMetadata { - public: - DecimalMetadata(); - DecimalMetadata(int32_t precision, int32_t scale); - - // \brief Adjust the precision and scale of a decimal type given a new precision and a - // new scale \param[in] suggested_precision A candidate precision \param[in] - // suggested_scale A candidate scale \return The status of the operation - Status Update(int32_t suggested_precision, int32_t suggested_scale); - - // \brief A convenient interface for updating the precision and scale based on a Python - // Decimal object \param object A Python Decimal object \return The status of the - // operation - Status Update(PyObject* object); - - int32_t precision() const { return precision_; } - int32_t scale() const { return scale_; } - - private: - int32_t precision_; - int32_t scale_; -}; - -} // namespace internal -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_DECIMAL_H diff --git a/r/R/inst/include/arrow/python/deserialize.h b/r/R/inst/include/arrow/python/deserialize.h deleted file mode 100644 index b9c4984a3b0..00000000000 --- a/r/R/inst/include/arrow/python/deserialize.h +++ /dev/null @@ -1,92 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_PYTHON_ARROW_TO_PYTHON_H -#define ARROW_PYTHON_ARROW_TO_PYTHON_H - -#include -#include -#include - -#include "arrow/python/serialize.h" -#include "arrow/python/visibility.h" -#include "arrow/status.h" - -namespace arrow { - -class RecordBatch; -class Tensor; - -namespace io { - -class RandomAccessFile; - -} // namespace io - -namespace py { - -/// \brief Read serialized Python sequence from file interface using Arrow IPC -/// \param[in] src a RandomAccessFile -/// \param[out] out the reconstructed data -/// \return Status -ARROW_PYTHON_EXPORT -Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); - -/// \brief Reconstruct SerializedPyObject from representation produced by -/// SerializedPyObject::GetComponents. -/// -/// \param[in] num_tensors number of tensors in the object -/// \param[in] num_ndarrays number of numpy Ndarrays in the object -/// \param[in] num_buffers number of buffers in the object -/// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 + -/// num_tensors * 2 + num_buffers in length -/// \param[out] out the reconstructed object -/// \return Status -ARROW_PYTHON_EXPORT -Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_buffers, - PyObject* data, SerializedPyObject* out); - -/// \brief Reconstruct Python object from Arrow-serialized representation -/// \param[in] context Serialization context which contains custom serialization -/// and deserialization callbacks. 
Can be any Python object with a -/// _serialize_callback method for serialization and a _deserialize_callback -/// method for deserialization. If context is None, no custom serialization -/// will be attempted. -/// \param[in] object Object to deserialize -/// \param[in] base a Python object holding the underlying data that any NumPy -/// arrays will reference, to avoid premature deallocation -/// \param[out] out The returned object -/// \return Status -/// This acquires the GIL -ARROW_PYTHON_EXPORT -Status DeserializeObject(PyObject* context, const SerializedPyObject& object, - PyObject* base, PyObject** out); - -/// \brief Reconstruct Ndarray from Arrow-serialized representation -/// \param[in] object Object to deserialize -/// \param[out] out The deserialized tensor -/// \return Status -ARROW_PYTHON_EXPORT -Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); - -ARROW_PYTHON_EXPORT -Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ARROW_TO_PYTHON_H diff --git a/r/R/inst/include/arrow/python/flight.h b/r/R/inst/include/arrow/python/flight.h deleted file mode 100644 index 432885cb764..00000000000 --- a/r/R/inst/include/arrow/python/flight.h +++ /dev/null @@ -1,207 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_FLIGHT_H -#define PYARROW_FLIGHT_H - -#include -#include -#include - -#include "arrow/flight/api.h" -#include "arrow/ipc/dictionary.h" -#include "arrow/python/common.h" -#include "arrow/python/config.h" - -namespace arrow { - -namespace py { - -namespace flight { - -/// \brief A table of function pointers for calling from C++ into -/// Python. -class ARROW_PYTHON_EXPORT PyFlightServerVtable { - public: - std::function*)> - list_flights; - std::function*)> - get_flight_info; - std::function*)> - do_get; - std::function)> - do_put; - std::function*)> - do_action; - std::function*)> - list_actions; -}; - -class ARROW_PYTHON_EXPORT PyServerAuthHandlerVtable { - public: - std::function - authenticate; - std::function is_valid; -}; - -class ARROW_PYTHON_EXPORT PyClientAuthHandlerVtable { - public: - std::function - authenticate; - std::function get_token; -}; - -/// \brief A helper to implement an auth mechanism in Python. -class ARROW_PYTHON_EXPORT PyServerAuthHandler : public arrow::flight::ServerAuthHandler { - public: - explicit PyServerAuthHandler(PyObject* handler, PyServerAuthHandlerVtable vtable); - Status Authenticate(arrow::flight::ServerAuthSender* outgoing, - arrow::flight::ServerAuthReader* incoming) override; - Status IsValid(const std::string& token, std::string* peer_identity) override; - - private: - OwnedRefNoGIL handler_; - PyServerAuthHandlerVtable vtable_; -}; - -/// \brief A helper to implement an auth mechanism in Python. 
-class ARROW_PYTHON_EXPORT PyClientAuthHandler : public arrow::flight::ClientAuthHandler { - public: - explicit PyClientAuthHandler(PyObject* handler, PyClientAuthHandlerVtable vtable); - Status Authenticate(arrow::flight::ClientAuthSender* outgoing, - arrow::flight::ClientAuthReader* incoming) override; - Status GetToken(std::string* token) override; - - private: - OwnedRefNoGIL handler_; - PyClientAuthHandlerVtable vtable_; -}; - -class ARROW_PYTHON_EXPORT PyFlightServer : public arrow::flight::FlightServerBase { - public: - explicit PyFlightServer(PyObject* server, PyFlightServerVtable vtable); - - // Like Serve(), but set up signals and invoke Python signal handlers - // if necessary. This function may return with a Python exception set. - Status ServeWithSignals(); - - Status ListFlights(const arrow::flight::ServerCallContext& context, - const arrow::flight::Criteria* criteria, - std::unique_ptr* listings) override; - Status GetFlightInfo(const arrow::flight::ServerCallContext& context, - const arrow::flight::FlightDescriptor& request, - std::unique_ptr* info) override; - Status DoGet(const arrow::flight::ServerCallContext& context, - const arrow::flight::Ticket& request, - std::unique_ptr* stream) override; - Status DoPut(const arrow::flight::ServerCallContext& context, - std::unique_ptr reader) override; - Status DoAction(const arrow::flight::ServerCallContext& context, - const arrow::flight::Action& action, - std::unique_ptr* result) override; - Status ListActions(const arrow::flight::ServerCallContext& context, - std::vector* actions) override; - - private: - OwnedRefNoGIL server_; - PyFlightServerVtable vtable_; -}; - -/// \brief A callback that obtains the next result from a Flight action. -typedef std::function*)> - PyFlightResultStreamCallback; - -/// \brief A ResultStream built around a Python callback. 
-class ARROW_PYTHON_EXPORT PyFlightResultStream : public arrow::flight::ResultStream { - public: - /// \brief Construct a FlightResultStream from a Python object and callback. - /// Must only be called while holding the GIL. - explicit PyFlightResultStream(PyObject* generator, - PyFlightResultStreamCallback callback); - Status Next(std::unique_ptr* result) override; - - private: - OwnedRefNoGIL generator_; - PyFlightResultStreamCallback callback_; -}; - -/// \brief A wrapper around a FlightDataStream that keeps alive a -/// Python object backing it. -class ARROW_PYTHON_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream { - public: - /// \brief Construct a FlightDataStream from a Python object and underlying stream. - /// Must only be called while holding the GIL. - explicit PyFlightDataStream(PyObject* data_source, - std::unique_ptr stream); - - std::shared_ptr schema() override; - Status GetSchemaPayload(arrow::flight::FlightPayload* payload) override; - Status Next(arrow::flight::FlightPayload* payload) override; - - private: - OwnedRefNoGIL data_source_; - std::unique_ptr stream_; -}; - -/// \brief A callback that obtains the next payload from a Flight result stream. -typedef std::function - PyGeneratorFlightDataStreamCallback; - -/// \brief A FlightDataStream built around a Python callback. -class ARROW_PYTHON_EXPORT PyGeneratorFlightDataStream - : public arrow::flight::FlightDataStream { - public: - /// \brief Construct a FlightDataStream from a Python object and underlying stream. - /// Must only be called while holding the GIL. 
- explicit PyGeneratorFlightDataStream(PyObject* generator, - std::shared_ptr schema, - PyGeneratorFlightDataStreamCallback callback); - std::shared_ptr schema() override; - Status GetSchemaPayload(arrow::flight::FlightPayload* payload) override; - Status Next(arrow::flight::FlightPayload* payload) override; - - private: - OwnedRefNoGIL generator_; - std::shared_ptr schema_; - ipc::DictionaryMemo dictionary_memo_; - PyGeneratorFlightDataStreamCallback callback_; -}; - -ARROW_PYTHON_EXPORT -Status CreateFlightInfo(const std::shared_ptr& schema, - const arrow::flight::FlightDescriptor& descriptor, - const std::vector& endpoints, - int64_t total_records, int64_t total_bytes, - std::unique_ptr* out); - -} // namespace flight -} // namespace py -} // namespace arrow - -#endif // PYARROW_FLIGHT_H diff --git a/r/R/inst/include/arrow/python/helpers.h b/r/R/inst/include/arrow/python/helpers.h deleted file mode 100644 index 2d44feea5ac..00000000000 --- a/r/R/inst/include/arrow/python/helpers.h +++ /dev/null @@ -1,136 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_HELPERS_H -#define ARROW_PYTHON_HELPERS_H - -#include "arrow/python/platform.h" - -#include -#include -#include -#include - -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" - -namespace arrow { - -namespace py { - -class OwnedRef; - -// \brief Get an arrow DataType instance from Arrow's Type::type enum -// \param[in] type One of the values of Arrow's Type::type enum -// \return A shared pointer to DataType -ARROW_PYTHON_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); - -// \brief Construct a np.float16 object from a npy_half value. -ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value); - -// \brief Convert a Python object to a npy_half value. -ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out); - -namespace internal { - -// \brief Import a Python module -// \param[in] module_name The name of the module -// \param[out] ref The OwnedRef containing the module PyObject* -ARROW_PYTHON_EXPORT -Status ImportModule(const std::string& module_name, OwnedRef* ref); - -// \brief Import an object from a Python module -// \param[in] module A Python module -// \param[in] name The name of the object to import -// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c -// module -ARROW_PYTHON_EXPORT -Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref); - -// \brief Check whether obj is an integer, independent of Python versions. 
-inline bool IsPyInteger(PyObject* obj) { -#if PYARROW_IS_PY2 - return PyLong_Check(obj) || PyInt_Check(obj); -#else - return PyLong_Check(obj); -#endif -} - -// \brief Use pandas missing value semantics to check if a value is null -ARROW_PYTHON_EXPORT -bool PandasObjectIsNull(PyObject* obj); - -// \brief Check whether obj is a floating-point NaN -ARROW_PYTHON_EXPORT -bool PyFloat_IsNaN(PyObject* obj); - -inline bool IsPyBinary(PyObject* obj) { - return PyBytes_Check(obj) || PyByteArray_Check(obj); -} - -// \brief Convert a Python integer into a C integer -// \param[in] obj A Python integer -// \param[out] out A pointer to a C integer to hold the result of the conversion -// \return The status of the operation -template -Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = ""); - -// \brief Convert a Python unicode string to a std::string -ARROW_PYTHON_EXPORT -Status PyUnicode_AsStdString(PyObject* obj, std::string* out); - -// \brief Convert a Python bytes object to a std::string -ARROW_PYTHON_EXPORT -std::string PyBytes_AsStdString(PyObject* obj); - -// \brief Call str() on the given object and return the result as a std::string -ARROW_PYTHON_EXPORT -Status PyObject_StdStringStr(PyObject* obj, std::string* out); - -// \brief Return the repr() of the given object (always succeeds) -ARROW_PYTHON_EXPORT -std::string PyObject_StdStringRepr(PyObject* obj); - -// \brief Cast the given size to int32_t, with error checking -inline Status CastSize(Py_ssize_t size, int32_t* out, - const char* error_msg = "Maximum size exceeded (2GB)") { - // size is assumed to be positive - if (size > std::numeric_limits::max()) { - return Status::Invalid(error_msg); - } - *out = static_cast(size); - return Status::OK(); -} - -// \brief Print the Python object's __str__ form along with the passed error -// message -ARROW_PYTHON_EXPORT -Status InvalidValue(PyObject* obj, const std::string& why); - -ARROW_PYTHON_EXPORT -Status 
IntegerScalarToDoubleSafe(PyObject* obj, double* result); -ARROW_PYTHON_EXPORT -Status IntegerScalarToFloat32Safe(PyObject* obj, float* result); - -} // namespace internal -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_HELPERS_H diff --git a/r/R/inst/include/arrow/python/inference.h b/r/R/inst/include/arrow/python/inference.h deleted file mode 100644 index 8790250f543..00000000000 --- a/r/R/inst/include/arrow/python/inference.h +++ /dev/null @@ -1,64 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for converting between CPython built-in data structures and Arrow -// data structures - -#ifndef ARROW_PYTHON_INFERENCE_H -#define ARROW_PYTHON_INFERENCE_H - -#include "arrow/python/platform.h" - -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" - -#include "arrow/python/common.h" - -namespace arrow { - -class Array; -class Status; - -namespace py { - -// These three functions take a sequence input, not arbitrary iterables -ARROW_PYTHON_EXPORT -arrow::Status InferArrowType(PyObject* obj, std::shared_ptr* out_type); - -ARROW_PYTHON_EXPORT -arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size, - std::shared_ptr* out_type); - -/// Checks whether the passed Python object is a boolean scalar -ARROW_PYTHON_EXPORT -bool IsPyBool(PyObject* obj); - -/// Checks whether the passed Python object is an integer scalar -ARROW_PYTHON_EXPORT -bool IsPyInt(PyObject* obj); - -/// Checks whether the passed Python object is a float scalar -ARROW_PYTHON_EXPORT -bool IsPyFloat(PyObject* obj); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_INFERENCE_H diff --git a/r/R/inst/include/arrow/python/init.h b/r/R/inst/include/arrow/python/init.h deleted file mode 100644 index 34d19b21fdf..00000000000 --- a/r/R/inst/include/arrow/python/init.h +++ /dev/null @@ -1,29 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_PYTHON_INIT_H -#define ARROW_PYTHON_INIT_H - -#include "arrow/python/platform.h" -#include "arrow/python/visibility.h" - -extern "C" { -ARROW_PYTHON_EXPORT -int arrow_init_numpy(); -} - -#endif // ARROW_PYTHON_INIT_H diff --git a/r/R/inst/include/arrow/python/io.h b/r/R/inst/include/arrow/python/io.h deleted file mode 100644 index d3b7c999eb8..00000000000 --- a/r/R/inst/include/arrow/python/io.h +++ /dev/null @@ -1,108 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PYARROW_IO_H -#define PYARROW_IO_H - -#include - -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" -#include "arrow/python/visibility.h" - -#include "arrow/python/config.h" - -#include "arrow/python/common.h" - -namespace arrow { - -class MemoryPool; - -namespace py { - -class ARROW_NO_EXPORT PythonFile; - -class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile { - public: - explicit PyReadableFile(PyObject* file); - ~PyReadableFile() override; - - Status Close() override; - bool closed() const override; - - Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - // Thread-safe version - Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, - void* out) override; - - // Thread-safe version - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status GetSize(int64_t* size) override; - - Status Seek(int64_t position) override; - - Status Tell(int64_t* position) const override; - - private: - std::unique_ptr file_; -}; - -class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream { - public: - explicit PyOutputStream(PyObject* file); - ~PyOutputStream() override; - - Status Close() override; - bool closed() const override; - Status Tell(int64_t* position) const override; - Status Write(const void* data, int64_t nbytes) override; - - private: - std::unique_ptr file_; - int64_t position_; -}; - -// TODO(wesm): seekable output files - -// A Buffer subclass that keeps a PyObject reference throughout its -// lifetime, such that the Python object is kept alive as long as the -// C++ buffer is still needed. -// Keeping the reference in a Python wrapper would be incorrect as -// the Python wrapper can get destroyed even though the wrapped C++ -// buffer is still alive (ARROW-2270). 
-class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer { - public: - static Status Make(const uint8_t* data, int64_t size, PyObject* base, - std::shared_ptr* out); - - private: - PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base) - : Buffer(data, size) { - Py_INCREF(base); - base_.reset(base); - } - - OwnedRefNoGIL base_; -}; - -} // namespace py -} // namespace arrow - -#endif // PYARROW_IO_H diff --git a/r/R/inst/include/arrow/python/iterators.h b/r/R/inst/include/arrow/python/iterators.h deleted file mode 100644 index 40e40aa984a..00000000000 --- a/r/R/inst/include/arrow/python/iterators.h +++ /dev/null @@ -1,157 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_PYTHON_ITERATORS_H -#define ARROW_PYTHON_ITERATORS_H - -#include - -#include "arrow/python/common.h" -#include "arrow/python/numpy-internal.h" - -namespace arrow { -namespace py { -namespace internal { - -// Visit the Python sequence, calling the given callable on each element. If -// the callable returns a non-OK status, iteration stops and the status is -// returned. 
-// -// The call signature for Visitor must be -// -// Visit(PyObject* obj, int64_t index, bool* keep_going) -// -// If keep_going is set to false, the iteration terminates -template -inline Status VisitSequenceGeneric(PyObject* obj, VisitorFunc&& func) { - // VisitorFunc may set to false to terminate iteration - bool keep_going = true; - - if (PyArray_Check(obj)) { - PyArrayObject* arr_obj = reinterpret_cast(obj); - if (PyArray_NDIM(arr_obj) != 1) { - return Status::Invalid("Only 1D arrays accepted"); - } - - if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) { - // It's an array object, we can fetch object pointers directly - const Ndarray1DIndexer objects(arr_obj); - for (int64_t i = 0; keep_going && i < objects.size(); ++i) { - RETURN_NOT_OK(func(objects[i], i, &keep_going)); - } - return Status::OK(); - } - // It's a non-object array, fall back on regular sequence access. - // (note PyArray_GETITEM() is slightly different: it returns standard - // Python types, not Numpy scalar types) - // This code path is inefficient: callers should implement dedicated - // logic for non-object arrays. 
- } - if (PySequence_Check(obj)) { - if (PyList_Check(obj) || PyTuple_Check(obj)) { - // Use fast item access - const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj); - for (Py_ssize_t i = 0; keep_going && i < size; ++i) { - PyObject* value = PySequence_Fast_GET_ITEM(obj, i); - RETURN_NOT_OK(func(value, static_cast(i), &keep_going)); - } - } else { - // Regular sequence: avoid making a potentially large copy - const Py_ssize_t size = PySequence_Size(obj); - RETURN_IF_PYERROR(); - for (Py_ssize_t i = 0; keep_going && i < size; ++i) { - OwnedRef value_ref(PySequence_ITEM(obj, i)); - RETURN_IF_PYERROR(); - RETURN_NOT_OK(func(value_ref.obj(), static_cast(i), &keep_going)); - } - } - } else { - return Status::TypeError("Object is not a sequence"); - } - return Status::OK(); -} - -// Visit sequence with no null mask -template -inline Status VisitSequence(PyObject* obj, VisitorFunc&& func) { - return VisitSequenceGeneric( - obj, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) { - return func(value, keep_going); - }); -} - -/// Visit sequence with null mask -template -inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, VisitorFunc&& func) { - if (mo == nullptr || !PyArray_Check(mo)) { - return Status::Invalid("Null mask must be NumPy array"); - } - - PyArrayObject* mask = reinterpret_cast(mo); - if (PyArray_NDIM(mask) != 1) { - return Status::Invalid("Mask must be 1D array"); - } - - const Py_ssize_t obj_size = PySequence_Size(obj); - if (PyArray_SIZE(mask) != static_cast(obj_size)) { - return Status::Invalid("Mask was a different length from sequence being converted"); - } - - const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num); - if (dtype == NPY_BOOL) { - Ndarray1DIndexer mask_values(mask); - - return VisitSequenceGeneric( - obj, [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) { - return func(value, mask_values[i], keep_going); - }); - } else { - return Status::Invalid("Mask must be boolean dtype"); - } 
-} - -// Like IterateSequence, but accepts any generic iterable (including -// non-restartable iterators, e.g. generators). -// -// The call signature for VisitorFunc must be Visit(PyObject*, bool* -// keep_going). If keep_going is set to false, the iteration terminates -template -inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) { - if (PySequence_Check(obj)) { - // Numpy arrays fall here as well - return VisitSequence(obj, std::forward(func)); - } - // Fall back on the iterator protocol - OwnedRef iter_ref(PyObject_GetIter(obj)); - PyObject* iter = iter_ref.obj(); - RETURN_IF_PYERROR(); - PyObject* value; - - bool keep_going = true; - while (keep_going && (value = PyIter_Next(iter))) { - OwnedRef value_ref(value); - RETURN_NOT_OK(func(value_ref.obj(), &keep_going)); - } - RETURN_IF_PYERROR(); // __next__() might have raised - return Status::OK(); -} - -} // namespace internal -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ITERATORS_H diff --git a/r/R/inst/include/arrow/python/numpy-internal.h b/r/R/inst/include/arrow/python/numpy-internal.h deleted file mode 100644 index 19bcde0318f..00000000000 --- a/r/R/inst/include/arrow/python/numpy-internal.h +++ /dev/null @@ -1,179 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -// Internal utilities for dealing with NumPy - -#ifndef ARROW_PYTHON_NUMPY_INTERNAL_H -#define ARROW_PYTHON_NUMPY_INTERNAL_H - -#include "arrow/python/numpy_interop.h" - -#include "arrow/status.h" - -#include "arrow/python/platform.h" - -#include -#include -#include - -namespace arrow { -namespace py { - -/// Indexing convenience for interacting with strided 1-dim ndarray objects -template -class Ndarray1DIndexer { - public: - typedef int64_t size_type; - - Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {} - - explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { - arr_ = arr; - DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays"; - Py_INCREF(arr); - data_ = reinterpret_cast(PyArray_DATA(arr)); - stride_ = PyArray_STRIDES(arr)[0]; - } - - ~Ndarray1DIndexer() { Py_XDECREF(arr_); } - - int64_t size() const { return PyArray_SIZE(arr_); } - - T* data() const { return data_; } - - bool is_strided() const { return stride_ != sizeof(T); } - - T& operator[](size_type index) { - return *reinterpret_cast(data_ + index * stride_); - } - const T& operator[](size_type index) const { - return *reinterpret_cast(data_ + index * stride_); - } - - private: - PyArrayObject* arr_; - uint8_t* data_; - int64_t stride_; -}; - -// Handling of Numpy Types by their static numbers -// (the NPY_TYPES enum and related defines) - -static inline std::string GetNumPyTypeName(int npy_type) { -#define TYPE_CASE(TYPE, NAME) \ - case NPY_##TYPE: \ - return NAME; - - switch (npy_type) { - TYPE_CASE(BOOL, "bool") - TYPE_CASE(INT8, "int8") - TYPE_CASE(INT16, "int16") - TYPE_CASE(INT32, "int32") - TYPE_CASE(INT64, "int64") -#if !NPY_INT32_IS_INT - TYPE_CASE(INT, "intc") -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_CASE(LONGLONG, "longlong") -#endif - TYPE_CASE(UINT8, "uint8") - TYPE_CASE(UINT16, "uint16") - TYPE_CASE(UINT32, "uint32") - TYPE_CASE(UINT64, 
"uint64") -#if !NPY_INT32_IS_INT - TYPE_CASE(UINT, "uintc") -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_CASE(ULONGLONG, "ulonglong") -#endif - TYPE_CASE(FLOAT16, "float16") - TYPE_CASE(FLOAT32, "float32") - TYPE_CASE(FLOAT64, "float64") - TYPE_CASE(DATETIME, "datetime64") - TYPE_CASE(OBJECT, "object") - TYPE_CASE(VOID, "void") - default: - break; - } - -#undef TYPE_CASE - std::stringstream ss; - ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName"; - return ss.str(); -} - -#define TYPE_VISIT_INLINE(TYPE) \ - case NPY_##TYPE: \ - return visitor->template Visit(arr); - -template -inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) { - switch (PyArray_TYPE(arr)) { - TYPE_VISIT_INLINE(BOOL); - TYPE_VISIT_INLINE(INT8); - TYPE_VISIT_INLINE(UINT8); - TYPE_VISIT_INLINE(INT16); - TYPE_VISIT_INLINE(UINT16); - TYPE_VISIT_INLINE(INT32); - TYPE_VISIT_INLINE(UINT32); - TYPE_VISIT_INLINE(INT64); - TYPE_VISIT_INLINE(UINT64); -#if !NPY_INT32_IS_INT - TYPE_VISIT_INLINE(INT); - TYPE_VISIT_INLINE(UINT); -#endif -#if !NPY_INT64_IS_LONG_LONG - TYPE_VISIT_INLINE(LONGLONG); - TYPE_VISIT_INLINE(ULONGLONG); -#endif - TYPE_VISIT_INLINE(FLOAT16); - TYPE_VISIT_INLINE(FLOAT32); - TYPE_VISIT_INLINE(FLOAT64); - TYPE_VISIT_INLINE(DATETIME); - TYPE_VISIT_INLINE(OBJECT); - } - return Status::NotImplemented("NumPy type not implemented: ", - GetNumPyTypeName(PyArray_TYPE(arr))); -} - -#undef TYPE_VISIT_INLINE - -namespace internal { - -inline bool PyFloatScalar_Check(PyObject* obj) { - return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating); -} - -inline bool PyIntScalar_Check(PyObject* obj) { -#if PY_MAJOR_VERSION < 3 - if (PyInt_Check(obj)) { - return true; - } -#endif - return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer); -} - -inline bool PyBoolScalar_Check(PyObject* obj) { - return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool); -} - -} // namespace internal - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_NUMPY_INTERNAL_H diff 
--git a/r/R/inst/include/arrow/python/numpy_convert.h b/r/R/inst/include/arrow/python/numpy_convert.h deleted file mode 100644 index dce5fe522d6..00000000000 --- a/r/R/inst/include/arrow/python/numpy_convert.h +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#ifndef ARROW_PYTHON_NUMPY_CONVERT_H -#define ARROW_PYTHON_NUMPY_CONVERT_H - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/buffer.h" -#include "arrow/python/visibility.h" - -namespace arrow { - -class DataType; -class MemoryPool; -class Status; -class Tensor; - -namespace py { - -class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer { - public: - explicit NumPyBuffer(PyObject* arr); - virtual ~NumPyBuffer(); - - private: - PyObject* arr_; -}; - -// Handle misbehaved types like LONGLONG and ULONGLONG -ARROW_PYTHON_EXPORT -int cast_npy_type_compat(int type_num); - -ARROW_PYTHON_EXPORT -bool is_contiguous(PyObject* array); - -ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out); -ARROW_PYTHON_EXPORT -Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out); - -Status GetTensorType(PyObject* dtype, std::shared_ptr* out); -Status GetNumPyType(const DataType& type, int* type_num); - -ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, - std::shared_ptr* out); - -ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, - PyObject* base, PyObject** out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_NUMPY_CONVERT_H diff --git a/r/R/inst/include/arrow/python/numpy_interop.h b/r/R/inst/include/arrow/python/numpy_interop.h deleted file mode 100644 index 094c3213758..00000000000 --- a/r/R/inst/include/arrow/python/numpy_interop.h +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_NUMPY_INTEROP_H -#define PYARROW_NUMPY_INTEROP_H - -#include "arrow/python/platform.h" // IWYU pragma: export - -#include // IWYU pragma: export - -// Don't use the deprecated Numpy functions -#ifdef NPY_1_7_API_VERSION -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#else -#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED -#define NPY_ARRAY_ALIGNED NPY_ALIGNED -#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE -#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY -#endif - -// This is required to be able to access the NumPy C API properly in C++ files -// other than this main one -#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API -#ifndef NUMPY_IMPORT_ARRAY -#define NO_IMPORT_ARRAY -#endif - -#include // IWYU pragma: export -#include // IWYU pragma: export -#include // IWYU pragma: export - -// A bit subtle. Numpy has 5 canonical integer types: -// (or, rather, type pairs: signed and unsigned) -// NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG -// It also has 4 fixed-width integer aliases. -// When mapping Arrow integer types to these 4 fixed-width aliases, -// we always miss one of the canonical types (even though it may -// have the same width as one of the aliases). -// Which one depends on the platform... -// On a LP64 system, NPY_INT64 maps to NPY_LONG and -// NPY_LONGLONG needs to be handled separately. 
-// On a LLP64 system, NPY_INT32 maps to NPY_LONG and -// NPY_INT needs to be handled separately. - -#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64 -#define NPY_INT64_IS_LONG_LONG 1 -#else -#define NPY_INT64_IS_LONG_LONG 0 -#endif - -#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64 -#define NPY_INT32_IS_INT 1 -#else -#define NPY_INT32_IS_INT 0 -#endif - -namespace arrow { -namespace py { - -inline int import_numpy() { -#ifdef NUMPY_IMPORT_ARRAY - import_array1(-1); - import_umath1(-1); -#endif - - return 0; -} - -// See above about the missing Numpy integer type numbers -inline int fix_numpy_type_num(int type_num) { -#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 - if (type_num == NPY_INT) return NPY_INT32; - if (type_num == NPY_UINT) return NPY_UINT32; -#endif -#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 - if (type_num == NPY_LONGLONG) return NPY_INT64; - if (type_num == NPY_ULONGLONG) return NPY_UINT64; -#endif - return type_num; -} - -} // namespace py -} // namespace arrow - -#endif // PYARROW_NUMPY_INTEROP_H diff --git a/r/R/inst/include/arrow/python/numpy_to_arrow.h b/r/R/inst/include/arrow/python/numpy_to_arrow.h deleted file mode 100644 index 4edc7669bb8..00000000000 --- a/r/R/inst/include/arrow/python/numpy_to_arrow.h +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Converting from pandas memory representation to Arrow data structures - -#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H -#define ARROW_PYTHON_NUMPY_TO_ARROW_H - -#include "arrow/python/platform.h" - -#include - -#include "arrow/compute/kernels/cast.h" -#include "arrow/python/visibility.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class DataType; -class MemoryPool; -class Status; - -namespace py { - -/// Convert NumPy arrays to Arrow. If target data type is not known, pass a -/// type with null -/// -/// \param[in] pool Memory pool for any memory allocations -/// \param[in] ao an ndarray with the array data -/// \param[in] mo an ndarray with a null mask (True is null), optional -/// \param[in] from_pandas If true, use pandas's null sentinels to determine -/// whether values are null -/// \param[in] type a specific type to cast to, may be null -/// \param[in] cast_options casting options -/// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_PYTHON_EXPORT -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - const compute::CastOptions& cast_options, - std::shared_ptr* out); - -/// Safely convert NumPy arrays to Arrow. If target data type is not known, -/// pass a type with null. 
-/// -/// \param[in] pool Memory pool for any memory allocations -/// \param[in] ao an ndarray with the array data -/// \param[in] mo an ndarray with a null mask (True is null), optional -/// \param[in] from_pandas If true, use pandas's null sentinels to determine -/// whether values are null -/// \param[in] type a specific type to cast to, may be null -/// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_PYTHON_EXPORT -Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, - const std::shared_ptr& type, - std::shared_ptr* out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_NUMPY_TO_ARROW_H diff --git a/r/R/inst/include/arrow/python/platform.h b/r/R/inst/include/arrow/python/platform.h deleted file mode 100644 index bc06df9c38c..00000000000 --- a/r/R/inst/include/arrow/python/platform.h +++ /dev/null @@ -1,34 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#ifndef ARROW_PYTHON_PLATFORM_H -#define ARROW_PYTHON_PLATFORM_H - -#include // IWYU pragma: export -#include - -// Work around C2528 error -#ifdef _MSC_VER -#if _MSC_VER >= 1900 -#undef timezone -#endif -#endif - -#endif // ARROW_PYTHON_PLATFORM_H diff --git a/r/R/inst/include/arrow/python/pyarrow.h b/r/R/inst/include/arrow/python/pyarrow.h deleted file mode 100644 index a5a39108479..00000000000 --- a/r/R/inst/include/arrow/python/pyarrow.h +++ /dev/null @@ -1,86 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_PYARROW_H -#define ARROW_PYTHON_PYARROW_H - -#include "arrow/python/platform.h" - -#include - -#include "arrow/python/visibility.h" - -namespace arrow { - -class Array; -class Buffer; -class Column; -class DataType; -class Field; -class RecordBatch; -class Schema; -class Status; -class Table; -class Tensor; - -namespace py { - -ARROW_PYTHON_EXPORT int import_pyarrow(); - -ARROW_PYTHON_EXPORT bool is_buffer(PyObject* buffer); -ARROW_PYTHON_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_buffer(const std::shared_ptr& buffer); - -ARROW_PYTHON_EXPORT bool is_data_type(PyObject* data_type); -ARROW_PYTHON_EXPORT Status unwrap_data_type(PyObject* data_type, - std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_data_type(const std::shared_ptr& type); - -ARROW_PYTHON_EXPORT bool is_field(PyObject* field); -ARROW_PYTHON_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_field(const std::shared_ptr& field); - -ARROW_PYTHON_EXPORT bool is_schema(PyObject* schema); -ARROW_PYTHON_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_schema(const std::shared_ptr& schema); - -ARROW_PYTHON_EXPORT bool is_array(PyObject* array); -ARROW_PYTHON_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_array(const std::shared_ptr& array); - -ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor); -ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr& tensor); - -ARROW_PYTHON_EXPORT bool is_column(PyObject* column); -ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr& column); - -ARROW_PYTHON_EXPORT bool is_table(PyObject* table); -ARROW_PYTHON_EXPORT Status 
unwrap_table(PyObject* table, std::shared_ptr
* out); -ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr
& table); - -ARROW_PYTHON_EXPORT bool is_record_batch(PyObject* batch); -ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch, - std::shared_ptr* out); -ARROW_PYTHON_EXPORT PyObject* wrap_record_batch( - const std::shared_ptr& batch); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_PYARROW_H diff --git a/r/R/inst/include/arrow/python/pyarrow_api.h b/r/R/inst/include/arrow/python/pyarrow_api.h deleted file mode 100644 index f6a211290e5..00000000000 --- a/r/R/inst/include/arrow/python/pyarrow_api.h +++ /dev/null @@ -1,187 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// DO NOT EDIT THIS FILE. 
Update from pyarrow/lib_api.h after pyarrow build - -/* Generated by Cython 0.29 */ - -#ifndef __PYX_HAVE_API__pyarrow__lib -#define __PYX_HAVE_API__pyarrow__lib -#include "Python.h" -#include "pyarrow_lib.h" - -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array> const &) = 0; -#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch> const &) = 0; -#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; -#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column)(std::shared_ptr< arrow::Column> const &) = 0; -#define pyarrow_wrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0; -#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0; -#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer)(std::shared_ptr< arrow::ResizableBuffer> const &) = 0; -#define pyarrow_wrap_resizable_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema> const &) = 0; -#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table> const &) = 0; -#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table -static PyObject 
*(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor> const &) = 0; -#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor -static std::shared_ptr< arrow::Array> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0; -#define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array -static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0; -#define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch -static std::shared_ptr< arrow::Buffer> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0; -#define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer -static std::shared_ptr< arrow::Column> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column)(PyObject *) = 0; -#define pyarrow_unwrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column -static std::shared_ptr< arrow::DataType> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0; -#define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type -static std::shared_ptr< arrow::Field> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0; -#define pyarrow_unwrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field -static std::shared_ptr< arrow::Schema> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema)(PyObject *) = 0; -#define pyarrow_unwrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema -static std::shared_ptr< arrow::Table> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0; -#define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table -static std::shared_ptr< arrow::Tensor> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0; -#define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0; -#define pyarrow_is_buffer 
__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0; -#define pyarrow_is_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_field)(PyObject *) = 0; -#define pyarrow_is_field __pyx_api_f_7pyarrow_3lib_pyarrow_is_field -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0; -#define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0; -#define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray> const &) = 0; -#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0; -#define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0; -#define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0; -#define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0; -#define pyarrow_is_batch __pyx_api_f_7pyarrow_3lib_pyarrow_is_batch -#if !defined(__Pyx_PyIdentifier_FromString) -#if PY_MAJOR_VERSION < 3 - #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) -#else - #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) -#endif -#endif - -#ifndef __PYX_HAVE_RT_ImportFunction -#define __PYX_HAVE_RT_ImportFunction -static int __Pyx_ImportFunction(PyObject *module, const char *funcname, void (**f)(void), const char *sig) { - PyObject *d = 0; - PyObject *cobj = 0; - union { - void (*fp)(void); - void *p; - } tmp; - d = PyObject_GetAttrString(module, (char *)"__pyx_capi__"); 
- if (!d) - goto bad; - cobj = PyDict_GetItemString(d, funcname); - if (!cobj) { - PyErr_Format(PyExc_ImportError, - "%.200s does not export expected C function %.200s", - PyModule_GetName(module), funcname); - goto bad; - } -#if PY_VERSION_HEX >= 0x02070000 - if (!PyCapsule_IsValid(cobj, sig)) { - PyErr_Format(PyExc_TypeError, - "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", - PyModule_GetName(module), funcname, sig, PyCapsule_GetName(cobj)); - goto bad; - } - tmp.p = PyCapsule_GetPointer(cobj, sig); -#else - {const char *desc, *s1, *s2; - desc = (const char *)PyCObject_GetDesc(cobj); - if (!desc) - goto bad; - s1 = desc; s2 = sig; - while (*s1 != '\0' && *s1 == *s2) { s1++; s2++; } - if (*s1 != *s2) { - PyErr_Format(PyExc_TypeError, - "C function %.200s.%.200s has wrong signature (expected %.500s, got %.500s)", - PyModule_GetName(module), funcname, sig, desc); - goto bad; - } - tmp.p = PyCObject_AsVoidPtr(cobj);} -#endif - *f = tmp.fp; - if (!(*f)) - goto bad; - Py_DECREF(d); - return 0; -bad: - Py_XDECREF(d); - return -1; -} -#endif - - -static int import_pyarrow__lib(void) { - PyObject *module = 0; - module = PyImport_ImportModule("pyarrow.lib"); - if (!module) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad; - 
if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column, "std::shared_ptr< arrow::Column> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, 
"pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void 
(**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad; - Py_DECREF(module); module = 0; - return 0; - bad: - Py_XDECREF(module); - return -1; -} - -#endif /* !__PYX_HAVE_API__pyarrow__lib */ diff --git a/r/R/inst/include/arrow/python/pyarrow_lib.h b/r/R/inst/include/arrow/python/pyarrow_lib.h deleted file mode 100644 index 4a99a073b50..00000000000 --- a/r/R/inst/include/arrow/python/pyarrow_lib.h +++ /dev/null @@ -1,81 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// DO NOT EDIT THIS FILE. 
Update from pyarrow/lib.h after pyarrow build - -/* Generated by Cython 0.29 */ - -#ifndef __PYX_HAVE__pyarrow__lib -#define __PYX_HAVE__pyarrow__lib - - -#ifndef __PYX_HAVE_API__pyarrow__lib - -#ifndef __PYX_EXTERN_C - #ifdef __cplusplus - #define __PYX_EXTERN_C extern "C" - #else - #define __PYX_EXTERN_C extern - #endif -#endif - -#ifndef DL_IMPORT - #define DL_IMPORT(_T) _T -#endif - -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_array(std::shared_ptr< arrow::Array> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_column(std::shared_ptr< arrow::Column> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_data_type(std::shared_ptr< arrow::DataType> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_field(std::shared_ptr< arrow::Field> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer(std::shared_ptr< arrow::ResizableBuffer> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_schema(std::shared_ptr< arrow::Schema> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_table(std::shared_ptr< arrow::Table> const &); -__PYX_EXTERN_C PyObject *__pyx_f_7pyarrow_3lib_pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor> const &); -__PYX_EXTERN_C std::shared_ptr< arrow::Array> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_array(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_batch(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Buffer> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_buffer(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Column> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_column(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::DataType> 
__pyx_f_7pyarrow_3lib_pyarrow_unwrap_data_type(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Field> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_field(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Schema> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_schema(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Table> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_table(PyObject *); -__PYX_EXTERN_C std::shared_ptr< arrow::Tensor> __pyx_f_7pyarrow_3lib_pyarrow_unwrap_tensor(PyObject *); -__PYX_EXTERN_C int pyarrow_is_buffer(PyObject *); -__PYX_EXTERN_C int pyarrow_is_data_type(PyObject *); -__PYX_EXTERN_C int pyarrow_is_field(PyObject *); -__PYX_EXTERN_C int pyarrow_is_schema(PyObject *); -__PYX_EXTERN_C int pyarrow_is_array(PyObject *); -__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray> const &); -__PYX_EXTERN_C int pyarrow_is_tensor(PyObject *); -__PYX_EXTERN_C int pyarrow_is_column(PyObject *); -__PYX_EXTERN_C int pyarrow_is_table(PyObject *); -__PYX_EXTERN_C int pyarrow_is_batch(PyObject *); - -#endif /* !__PYX_HAVE_API__pyarrow__lib */ - -/* WARNING: the interface of the module init function changed in CPython 3.5. */ -/* It now returns a PyModuleDef instance instead of a PyModule instance. */ - -#if PY_MAJOR_VERSION < 3 -PyMODINIT_FUNC initlib(void); -#else -PyMODINIT_FUNC PyInit_lib(void); -#endif - -#endif /* !__PYX_HAVE__pyarrow__lib */ diff --git a/r/R/inst/include/arrow/python/python_to_arrow.h b/r/R/inst/include/arrow/python/python_to_arrow.h deleted file mode 100644 index f9d97569ef4..00000000000 --- a/r/R/inst/include/arrow/python/python_to_arrow.h +++ /dev/null @@ -1,83 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between CPython built-in data structures and Arrow -// data structures - -#ifndef ARROW_PYTHON_ADAPTERS_BUILTIN_H -#define ARROW_PYTHON_ADAPTERS_BUILTIN_H - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/python/visibility.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" - -#include "arrow/python/common.h" - -namespace arrow { - -class Array; -class Status; - -namespace py { - -struct PyConversionOptions { - PyConversionOptions() : type(NULLPTR), size(-1), pool(NULLPTR), from_pandas(false) {} - - PyConversionOptions(const std::shared_ptr& type, int64_t size, - MemoryPool* pool, bool from_pandas) - : type(type), size(size), pool(default_memory_pool()), from_pandas(from_pandas) {} - - // Set to null if to be inferred - std::shared_ptr type; - - // Default is -1: infer from data - int64_t size; - - // Memory pool to use for allocations - MemoryPool* pool; - - // Default false - bool from_pandas; -}; - -/// \brief Convert sequence (list, generator, NumPy array with dtype object) of -/// Python objects. -/// \param[in] obj the sequence to convert -/// \param[in] mask a NumPy array of true/false values to indicate whether -/// values in the sequence are null (true) or not null (false). 
This parameter -/// may be null -/// \param[in] options various conversion options -/// \param[out] out a ChunkedArray containing one or more chunks -/// \return Status -ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out); - -ARROW_PYTHON_EXPORT -Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, - std::shared_ptr* out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ADAPTERS_BUILTIN_H diff --git a/r/R/inst/include/arrow/python/serialize.h b/r/R/inst/include/arrow/python/serialize.h deleted file mode 100644 index 6cdbbe5053f..00000000000 --- a/r/R/inst/include/arrow/python/serialize.h +++ /dev/null @@ -1,136 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_PYTHON_PYTHON_TO_ARROW_H -#define ARROW_PYTHON_PYTHON_TO_ARROW_H - -#include -#include - -#include "arrow/python/visibility.h" -#include "arrow/status.h" - -// Forward declaring PyObject, see -// https://mail.python.org/pipermail/python-dev/2003-August/037601.html -#ifndef PyObject_HEAD -struct _object; -typedef _object PyObject; -#endif - -namespace arrow { - -class Buffer; -class DataType; -class MemoryPool; -class RecordBatch; -class Tensor; - -namespace io { - -class OutputStream; - -} // namespace io - -namespace py { - -struct ARROW_PYTHON_EXPORT SerializedPyObject { - std::shared_ptr batch; - std::vector> tensors; - std::vector> ndarrays; - std::vector> buffers; - - /// \brief Write serialized Python object to OutputStream - /// \param[in,out] dst an OutputStream - /// \return Status - Status WriteTo(io::OutputStream* dst); - - /// \brief Convert SerializedPyObject to a dict containing the message - /// components as Buffer instances with minimal memory allocation - /// - /// { - /// 'num_tensors': N, - /// 'num_buffers': K, - /// 'data': [Buffer] - /// } - /// - /// Each tensor is written as two buffers, one for the metadata and one for - /// the body. Therefore, the number of buffers in 'data' is 2 * N + K + 1, - /// with the first buffer containing the serialized record batch containing - /// the UnionArray that describes the whole object - Status GetComponents(MemoryPool* pool, PyObject** out); -}; - -/// \brief Serialize Python sequence as a SerializedPyObject. -/// \param[in] context Serialization context which contains custom serialization -/// and deserialization callbacks. Can be any Python object with a -/// _serialize_callback method for serialization and a _deserialize_callback -/// method for deserialization. If context is None, no custom serialization -/// will be attempted. 
-/// \param[in] sequence A Python sequence object to serialize to Arrow data -/// structures -/// \param[out] out The serialized representation -/// \return Status -/// -/// Release GIL before calling -ARROW_PYTHON_EXPORT -Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); - -/// \brief Serialize an Arrow Tensor as a SerializedPyObject. -/// \param[in] tensor Tensor to be serialized -/// \param[out] out The serialized representation -/// \return Status -ARROW_PYTHON_EXPORT -Status SerializeTensor(std::shared_ptr tensor, py::SerializedPyObject* out); - -/// \brief Write the Tensor metadata header to an OutputStream. -/// \param[in] dtype DataType of the Tensor -/// \param[in] shape The shape of the tensor -/// \param[in] tensor_num_bytes The lengh of the Tensor data in bytes -/// \param[in] dst The OutputStream to write the Tensor header to -/// \return Status -ARROW_PYTHON_EXPORT -Status WriteNdarrayHeader(std::shared_ptr dtype, - const std::vector& shape, int64_t tensor_num_bytes, - io::OutputStream* dst); - -struct PythonType { - enum type { - BOOL, - INT, - PY2INT, - BYTES, - STRING, - HALF_FLOAT, - FLOAT, - DOUBLE, - DATE64, - LIST, - DICT, - TUPLE, - SET, - TENSOR, - NDARRAY, - BUFFER, - NUM_PYTHON_TYPES - }; -}; - -} // namespace py - -} // namespace arrow - -#endif // ARROW_PYTHON_PYTHON_TO_ARROW_H diff --git a/r/R/inst/include/arrow/python/type_traits.h b/r/R/inst/include/arrow/python/type_traits.h deleted file mode 100644 index bc71ec4e90b..00000000000 --- a/r/R/inst/include/arrow/python/type_traits.h +++ /dev/null @@ -1,302 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Internal header - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/python/numpy_interop.h" - -#include - -#include "arrow/builder.h" -#include "arrow/type.h" -#include "arrow/util/logging.h" - -namespace arrow { -namespace py { -namespace internal { - -// -// Type traits for Numpy -> Arrow equivalence -// -template -struct npy_traits {}; - -template <> -struct npy_traits { - typedef uint8_t value_type; - using TypeClass = BooleanType; - using BuilderClass = BooleanBuilder; - - static constexpr bool supports_nulls = false; - static inline bool isnull(uint8_t v) { return false; } -}; - -#define NPY_INT_DECL(TYPE, CapType, T) \ - template <> \ - struct npy_traits { \ - typedef T value_type; \ - using TypeClass = CapType##Type; \ - using BuilderClass = CapType##Builder; \ - \ - static constexpr bool supports_nulls = false; \ - static inline bool isnull(T v) { return false; } \ - }; - -NPY_INT_DECL(INT8, Int8, int8_t); -NPY_INT_DECL(INT16, Int16, int16_t); -NPY_INT_DECL(INT32, Int32, int32_t); -NPY_INT_DECL(INT64, Int64, int64_t); - -NPY_INT_DECL(UINT8, UInt8, uint8_t); -NPY_INT_DECL(UINT16, UInt16, uint16_t); -NPY_INT_DECL(UINT32, UInt32, uint32_t); -NPY_INT_DECL(UINT64, UInt64, uint64_t); - -#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32 -NPY_INT_DECL(INT, Int32, int32_t); -NPY_INT_DECL(UINT, UInt32, uint32_t); -#endif -#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64 -NPY_INT_DECL(LONGLONG, Int64, int64_t); -NPY_INT_DECL(ULONGLONG, UInt64, uint64_t); -#endif - -template <> -struct npy_traits { - typedef 
npy_half value_type; - using TypeClass = HalfFloatType; - using BuilderClass = HalfFloatBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(npy_half v) { return v == NPY_HALF_NAN; } -}; - -template <> -struct npy_traits { - typedef float value_type; - using TypeClass = FloatType; - using BuilderClass = FloatBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(float v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef double value_type; - using TypeClass = DoubleType; - using BuilderClass = DoubleBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(double v) { return v != v; } -}; - -template <> -struct npy_traits { - typedef int64_t value_type; - using TypeClass = TimestampType; - using BuilderClass = TimestampBuilder; - - static constexpr bool supports_nulls = true; - - static inline bool isnull(int64_t v) { - // NaT = -2**63 - // = -0x8000000000000000 - // = -9223372036854775808; - // = std::numeric_limits::min() - return v == std::numeric_limits::min(); - } -}; - -template <> -struct npy_traits { - typedef PyObject* value_type; - static constexpr bool supports_nulls = true; - - static inline bool isnull(PyObject* v) { return v == Py_None; } -}; - -// -// Type traits for Arrow -> Numpy equivalence -// Note *supports_nulls* means the equivalent Numpy type support nulls -// -template -struct arrow_traits {}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_BOOL; - static constexpr bool supports_nulls = false; - typedef typename npy_traits::value_type T; -}; - -#define INT_DECL(TYPE) \ - template <> \ - struct arrow_traits { \ - static constexpr int npy_type = NPY_##TYPE; \ - static constexpr bool supports_nulls = false; \ - static constexpr double na_value = NAN; \ - typedef typename npy_traits::value_type T; \ - }; - -INT_DECL(INT8); -INT_DECL(INT16); -INT_DECL(INT32); -INT_DECL(INT64); -INT_DECL(UINT8); 
-INT_DECL(UINT16); -INT_DECL(UINT32); -INT_DECL(UINT64); - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT16; - static constexpr bool supports_nulls = true; - static constexpr uint16_t na_value = NPY_HALF_NAN; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT32; - static constexpr bool supports_nulls = true; - static constexpr float na_value = NAN; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_FLOAT64; - static constexpr bool supports_nulls = true; - static constexpr double na_value = NAN; - typedef typename npy_traits::value_type T; -}; - -static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); - -constexpr int64_t kNanosecondsInDay = 86400000000000LL; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_DATETIME; - static constexpr int64_t npy_shift = 1; - - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - // Data stores as FR_D day unit - static constexpr int npy_type = NPY_DATETIME; - static constexpr int64_t npy_shift = 1; - - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; - - static constexpr int64_t na_value = kPandasTimestampNull; - static inline bool isnull(int64_t v) { return npy_traits::isnull(v); } -}; - -template <> -struct arrow_traits { - // Data stores as FR_D day unit - static constexpr int npy_type = NPY_DATETIME; - - // There are 1000 * 60 * 60 * 24 = 86400000ms in a day - static constexpr int64_t npy_shift = 86400000; - - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; - - static constexpr int64_t na_value = kPandasTimestampNull; - static inline bool isnull(int64_t v) { return 
npy_traits::isnull(v); } -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - static constexpr int64_t na_value = kPandasTimestampNull; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; - typedef typename npy_traits::value_type T; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; -}; - -template <> -struct arrow_traits { - static constexpr int npy_type = NPY_OBJECT; - static constexpr bool supports_nulls = true; -}; - -static inline int NumPyTypeSize(int npy_type) { - npy_type = fix_numpy_type_num(npy_type); - - switch (npy_type) { - case NPY_BOOL: - case NPY_INT8: - case NPY_UINT8: - return 1; - case NPY_INT16: - case NPY_UINT16: - return 2; - case NPY_INT32: - case NPY_UINT32: - return 4; - case NPY_INT64: - case NPY_UINT64: - return 8; - case NPY_FLOAT16: - return 2; - case NPY_FLOAT32: - return 4; - case NPY_FLOAT64: - return 8; - case NPY_DATETIME: - return 8; - case NPY_OBJECT: - return sizeof(void*); - default: - DCHECK(false) << "unhandled numpy type"; - break; - } - return -1; -} - -} // namespace internal -} // namespace py -} // namespace arrow diff --git a/r/R/inst/include/arrow/python/util/datetime.h b/r/R/inst/include/arrow/python/util/datetime.h deleted file mode 100644 index a6e9c87f4e2..00000000000 --- a/r/R/inst/include/arrow/python/util/datetime.h +++ /dev/null @@ -1,308 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_UTIL_DATETIME_H -#define PYARROW_UTIL_DATETIME_H - -#include - -#include -#include "arrow/python/platform.h" -#include "arrow/status.h" -#include "arrow/util/logging.h" - -namespace arrow { -namespace py { - -// The following code is adapted from -// https://github.com/numpy/numpy/blob/master/numpy/core/src/multiarray/datetime.c - -// Days per month, regular year and leap year -static int64_t _days_per_month_table[2][12] = { - {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - -static bool is_leapyear(int64_t year) { - return (year & 0x3) == 0 && // year % 4 == 0 - ((year % 100) != 0 || (year % 400) == 0); -} - -// Calculates the days offset from the 1970 epoch. -static int64_t get_days_from_date(int64_t date_year, int64_t date_month, - int64_t date_day) { - int64_t i, month; - int64_t year, days = 0; - int64_t* month_lengths; - - year = date_year - 1970; - days = year * 365; - - // Adjust for leap years - if (days >= 0) { - // 1968 is the closest leap year before 1970. - // Exclude the current year, so add 1. 
- year += 1; - // Add one day for each 4 years - days += year / 4; - // 1900 is the closest previous year divisible by 100 - year += 68; - // Subtract one day for each 100 years - days -= year / 100; - // 1600 is the closest previous year divisible by 400 - year += 300; - // Add one day for each 400 years - days += year / 400; - } else { - // 1972 is the closest later year after 1970. - // Include the current year, so subtract 2. - year -= 2; - // Subtract one day for each 4 years - days += year / 4; - // 2000 is the closest later year divisible by 100 - year -= 28; - // Add one day for each 100 years - days -= year / 100; - // 2000 is also the closest later year divisible by 400 - // Subtract one day for each 400 years - days += year / 400; - } - - month_lengths = _days_per_month_table[is_leapyear(date_year)]; - month = date_month - 1; - - // Add the months - for (i = 0; i < month; ++i) { - days += month_lengths[i]; - } - - // Add the days - days += date_day - 1; - - return days; -} - -// Modifies '*days_' to be the day offset within the year, -// and returns the year. 
-static int64_t days_to_yearsdays(int64_t* days_) { - const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1); - // Adjust so it's relative to the year 2000 (divisible by 400) - int64_t days = (*days_) - (365 * 30 + 7); - int64_t year; - - // Break down the 400 year cycle to get the year and day within the year - if (days >= 0) { - year = 400 * (days / days_per_400years); - days = days % days_per_400years; - } else { - year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); - days = days % days_per_400years; - if (days < 0) { - days += days_per_400years; - } - } - - // Work out the year/day within the 400 year cycle - if (days >= 366) { - year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); - days = (days - 1) % (100 * 365 + 25 - 1); - if (days >= 365) { - year += 4 * ((days + 1) / (4 * 365 + 1)); - days = (days + 1) % (4 * 365 + 1); - if (days >= 366) { - year += (days - 1) / 365; - days = (days - 1) % 365; - } - } - } - - *days_ = days; - return year + 2000; -} - -// Extracts the month and year and day number from a number of days -static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month, - int64_t* date_day) { - int64_t *month_lengths, i; - - *date_year = days_to_yearsdays(&days); - month_lengths = _days_per_month_table[is_leapyear(*date_year)]; - - for (i = 0; i < 12; ++i) { - if (days < month_lengths[i]) { - *date_month = i + 1; - *date_day = days + 1; - return; - } else { - days -= month_lengths[i]; - } - } - - // Should never get here - return; -} - -static inline int64_t PyTime_to_us(PyObject* pytime) { - return (static_cast(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL + - static_cast(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL + - static_cast(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL + - PyDateTime_TIME_GET_MICROSECOND(pytime)); -} - -static inline int64_t PyTime_to_s(PyObject* pytime) { - return PyTime_to_us(pytime) / 1000000; -} - -static inline int64_t PyTime_to_ms(PyObject* pytime) { - return 
PyTime_to_us(pytime) / 1000; -} - -static inline int64_t PyTime_to_ns(PyObject* pytime) { - return PyTime_to_us(pytime) * 1000; -} - -// Splitting time quantities, for example splitting total seconds into -// minutes and remaining seconds. After we run -// int64_t remaining = split_time(total, quotient, &next) -// we have -// total = next * quotient + remaining. Handles negative values by propagating -// them: If total is negative, next will be negative and remaining will -// always be non-negative. -static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) { - int64_t r = total % quotient; - if (r < 0) { - *next = total / quotient - 1; - return r + quotient; - } else { - *next = total / quotient; - return r; - } -} - -static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit, - int64_t* hour, int64_t* minute, int64_t* second, - int64_t* microsecond) { - switch (unit) { - case TimeUnit::NANO: - if (val % 1000 != 0) { - return Status::Invalid("Value ", val, " has non-zero nanoseconds"); - } - val /= 1000; - // fall through - case TimeUnit::MICRO: - *microsecond = split_time(val, 1000000LL, &val); - *second = split_time(val, 60, &val); - *minute = split_time(val, 60, hour); - break; - case TimeUnit::MILLI: - *microsecond = split_time(val, 1000, &val) * 1000; - // fall through - case TimeUnit::SECOND: - *second = split_time(val, 60, &val); - *minute = split_time(val, 60, hour); - break; - default: - break; - } - return Status::OK(); -} - -static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year, - int64_t* month, int64_t* day) { - switch (unit) { - case DateUnit::MILLI: - val /= 86400000LL; // fall through - case DateUnit::DAY: - get_date_from_days(val, year, month, day); - default: - break; - } - return Status::OK(); -} - -static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit, - PyObject** out) { - int64_t hour = 0, minute = 0, second = 0, microsecond = 0; - 
RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); - *out = PyTime_FromTime(static_cast(hour), static_cast(minute), - static_cast(second), static_cast(microsecond)); - return Status::OK(); -} - -static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) { - int64_t year = 0, month = 0, day = 0; - RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day)); - *out = PyDate_FromDate(static_cast(year), static_cast(month), - static_cast(day)); - return Status::OK(); -} - -static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, - PyObject** out) { - int64_t hour = 0, minute = 0, second = 0, microsecond = 0; - RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, µsecond)); - int64_t total_days = 0; - hour = split_time(hour, 24, &total_days); - int64_t year = 0, month = 0, day = 0; - get_date_from_days(total_days, &year, &month, &day); - *out = PyDateTime_FromDateAndTime( - static_cast(year), static_cast(month), static_cast(day), - static_cast(hour), static_cast(minute), - static_cast(second), static_cast(microsecond)); - return Status::OK(); -} - -static inline int64_t PyDate_to_days(PyDateTime_Date* pydate) { - return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), - PyDateTime_GET_DAY(pydate)); -} - -static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) { - int64_t total_seconds = 0; - total_seconds += PyDateTime_DATE_GET_SECOND(pydate); - total_seconds += PyDateTime_DATE_GET_MINUTE(pydate) * 60; - total_seconds += PyDateTime_DATE_GET_HOUR(pydate) * 3600; - int64_t days = - get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate), - PyDateTime_GET_DAY(pydate)); - total_seconds += days * 24 * 3600; - return total_seconds * 1000; -} - -static inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) { - return PyDate_to_ms(reinterpret_cast(pydatetime)) / 1000LL; -} - -static inline int64_t 
PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) { - int64_t date_ms = PyDate_to_ms(reinterpret_cast(pydatetime)); - int ms = PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000; - return date_ms + ms; -} - -static inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) { - int64_t ms = PyDate_to_ms(reinterpret_cast(pydatetime)); - int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); - return ms * 1000 + us; -} - -static inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) { - return PyDateTime_to_us(pydatetime) * 1000; -} - -} // namespace py -} // namespace arrow - -#endif // PYARROW_UTIL_DATETIME_H diff --git a/r/R/inst/include/arrow/python/visibility.h b/r/R/inst/include/arrow/python/visibility.h deleted file mode 100644 index c0b343c70e9..00000000000 --- a/r/R/inst/include/arrow/python/visibility.h +++ /dev/null @@ -1,39 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#if defined(_WIN32) || defined(__CYGWIN__) // Windows -#if defined(_MSC_VER) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef ARROW_STATIC -#define ARROW_PYTHON_EXPORT -#elif defined(ARROW_PYTHON_EXPORTING) -#define ARROW_PYTHON_EXPORT __declspec(dllexport) -#else -#define ARROW_PYTHON_EXPORT __declspec(dllimport) -#endif - -#else // Not Windows -#ifndef ARROW_PYTHON_EXPORT -#define ARROW_PYTHON_EXPORT __attribute__((visibility("default"))) -#endif -#endif // Non-Windows diff --git a/r/R/inst/include/arrow/record_batch.h b/r/R/inst/include/arrow/record_batch.h deleted file mode 100644 index f80d4ed7683..00000000000 --- a/r/R/inst/include/arrow/record_batch.h +++ /dev/null @@ -1,190 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_RECORD_BATCH_H -#define ARROW_RECORD_BATCH_H - -#include -#include -#include -#include - -#include "arrow/type_fwd.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -/// \class RecordBatch -/// \brief Collection of equal-length arrays matching a particular Schema -/// -/// A record batch is table-like data structure that is semantically a sequence -/// of fields, each a contiguous Arrow array -class ARROW_EXPORT RecordBatch { - public: - virtual ~RecordBatch() = default; - - /// \param[in] schema The record batch schema - /// \param[in] num_rows length of fields in the record batch. Each array - /// should have the same length as num_rows - /// \param[in] columns the record batch fields as vector of arrays - static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); - - /// \brief Move-based constructor for a vector of Array instances - static std::shared_ptr Make(const std::shared_ptr& schema, - int64_t num_rows, - std::vector>&& columns); - - /// \brief Construct record batch from vector of internal data structures - /// \since 0.5.0 - /// - /// This class is only provided with an rvalue-reference for the input data, - /// and is intended for internal use, or advanced users. - /// - /// \param schema the record batch schema - /// \param num_rows the number of semantic rows in the record batch. 
This - /// should be equal to the length of each field - /// \param columns the data for the batch's columns - static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, - std::vector>&& columns); - - /// \brief Construct record batch by copying vector of array data - /// \since 0.5.0 - static std::shared_ptr Make( - const std::shared_ptr& schema, int64_t num_rows, - const std::vector>& columns); - - /// \brief Determine if two record batches are exactly equal - /// \return true if batches are equal - bool Equals(const RecordBatch& other) const; - - /// \brief Determine if two record batches are approximately equal - bool ApproxEquals(const RecordBatch& other) const; - - // \return the table's schema - /// \return true if batches are equal - std::shared_ptr schema() const { return schema_; } - - /// \brief Retrieve an array from the record batch - /// \param[in] i field index, does not boundscheck - /// \return an Array object - virtual std::shared_ptr column(int i) const = 0; - - /// \brief Retrieve an array from the record batch - /// \param[in] name field name - /// \return an Array or null if no field was found - std::shared_ptr GetColumnByName(const std::string& name) const; - - /// \brief Retrieve an array's internaldata from the record batch - /// \param[in] i field index, does not boundscheck - /// \return an internal ArrayData object - virtual std::shared_ptr column_data(int i) const = 0; - - /// \brief Add column to the record batch, producing a new RecordBatch - /// - /// \param[in] i field index, which will be boundschecked - /// \param[in] field field to be added - /// \param[in] column column to be added - /// \param[out] out record batch with column added - virtual Status AddColumn(int i, const std::shared_ptr& field, - const std::shared_ptr& column, - std::shared_ptr* out) const = 0; - - /// \brief Add new nullable column to the record batch, producing a new - /// RecordBatch. 
- /// - /// For non-nullable columns, use the Field-based version of this method. - /// - /// \param[in] i field index, which will be boundschecked - /// \param[in] field_name name of field to be added - /// \param[in] column column to be added - /// \param[out] out record batch with column added - virtual Status AddColumn(int i, const std::string& field_name, - const std::shared_ptr& column, - std::shared_ptr* out) const; - - /// \brief Remove column from the record batch, producing a new RecordBatch - /// - /// \param[in] i field index, does boundscheck - /// \param[out] out record batch with column removed - virtual Status RemoveColumn(int i, std::shared_ptr* out) const = 0; - - virtual std::shared_ptr ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const = 0; - - /// \brief Name in i-th column - const std::string& column_name(int i) const; - - /// \return the number of columns in the table - int num_columns() const; - - /// \return the number of rows (the corresponding length of each column) - int64_t num_rows() const { return num_rows_; } - - /// \brief Slice each of the arrays in the record batch - /// \param[in] offset the starting offset to slice, through end of batch - /// \return new record batch - virtual std::shared_ptr Slice(int64_t offset) const; - - /// \brief Slice each of the arrays in the record batch - /// \param[in] offset the starting offset to slice - /// \param[in] length the number of elements to slice from offset - /// \return new record batch - virtual std::shared_ptr Slice(int64_t offset, int64_t length) const = 0; - - /// \brief Check for schema or length inconsistencies - /// \return Status - virtual Status Validate() const; - - protected: - RecordBatch(const std::shared_ptr& schema, int64_t num_rows); - - std::shared_ptr schema_; - int64_t num_rows_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch); -}; - -/// \brief Abstract interface for reading stream of record batches -class ARROW_EXPORT RecordBatchReader { - 
public: - virtual ~RecordBatchReader(); - - /// \return the shared schema of the record batches in the stream - virtual std::shared_ptr schema() const = 0; - - /// \brief Read the next record batch in the stream. Return null for batch - /// when reaching end of stream - /// - /// \param[out] batch the next loaded batch, null at end of stream - /// \return Status - virtual Status ReadNext(std::shared_ptr* batch) = 0; - - /// \brief Consume entire stream as a vector of record batches - Status ReadAll(std::vector>* batches); - - /// \brief Read all batches and concatenate as arrow::Table - Status ReadAll(std::shared_ptr
* table); -}; - -} // namespace arrow - -#endif // ARROW_RECORD_BATCH_H diff --git a/r/R/inst/include/arrow/scalar.h b/r/R/inst/include/arrow/scalar.h deleted file mode 100644 index 51b5e71c345..00000000000 --- a/r/R/inst/include/arrow/scalar.h +++ /dev/null @@ -1,199 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Object model for scalar (non-Array) values. 
Not intended for use with large -// amounts of data -// -// NOTE: This API is experimental as of the 0.13 version and subject to change -// without deprecation warnings - -#pragma once - -#include -#include - -#include "arrow/type.h" -#include "arrow/type_fwd.h" -#include "arrow/type_traits.h" -#include "arrow/util/decimal.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; - -/// \brief Base class for scalar values, representing a single value occupying -/// an array "slot" -struct ARROW_EXPORT Scalar { - virtual ~Scalar() = default; - - /// \brief The type of the scalar value - std::shared_ptr type; - - /// \brief Whether the value is valid (not null) or not - bool is_valid; - - bool Equals(const Scalar& other) const; - bool Equals(const std::shared_ptr& other) const { - if (other) return Equals(*other); - return false; - } - - protected: - Scalar(const std::shared_ptr& type, bool is_valid) - : type(type), is_valid(is_valid) {} -}; - -/// \brief A scalar value for NullType. 
Never valid -struct ARROW_EXPORT NullScalar : public Scalar { - public: - NullScalar() : Scalar{null(), false} {} -}; - -namespace internal { - -struct ARROW_EXPORT PrimitiveScalar : public Scalar { - using Scalar::Scalar; -}; - -} // namespace internal - -struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar { - bool value; - explicit BooleanScalar(bool value, bool is_valid = true) - : internal::PrimitiveScalar{boolean(), is_valid}, value(value) {} -}; - -template -struct NumericScalar : public internal::PrimitiveScalar { - using T = typename Type::c_type; - T value; - - explicit NumericScalar(T value, bool is_valid = true) - : NumericScalar(value, TypeTraits::type_singleton(), is_valid) {} - - protected: - explicit NumericScalar(T value, const std::shared_ptr& type, bool is_valid) - : internal::PrimitiveScalar{type, is_valid}, value(value) {} -}; - -struct ARROW_EXPORT BinaryScalar : public Scalar { - std::shared_ptr value; - explicit BinaryScalar(const std::shared_ptr& value, bool is_valid = true) - : BinaryScalar(value, binary(), is_valid) {} - - protected: - BinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid = true) - : Scalar{type, is_valid}, value(value) {} -}; - -struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar { - FixedSizeBinaryScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid = true); -}; - -struct ARROW_EXPORT StringScalar : public BinaryScalar { - explicit StringScalar(const std::shared_ptr& value, bool is_valid = true) - : BinaryScalar(value, utf8(), is_valid) {} -}; - -class ARROW_EXPORT Date32Scalar : public NumericScalar { - public: - using NumericScalar::NumericScalar; -}; - -class ARROW_EXPORT Date64Scalar : public NumericScalar { - public: - using NumericScalar::NumericScalar; -}; - -class ARROW_EXPORT Time32Scalar : public internal::PrimitiveScalar { - public: - int32_t value; - Time32Scalar(int32_t value, const std::shared_ptr& type, - bool 
is_valid = true); -}; - -class ARROW_EXPORT Time64Scalar : public internal::PrimitiveScalar { - public: - int64_t value; - Time64Scalar(int64_t value, const std::shared_ptr& type, - bool is_valid = true); -}; - -class ARROW_EXPORT TimestampScalar : public internal::PrimitiveScalar { - public: - int64_t value; - TimestampScalar(int64_t value, const std::shared_ptr& type, - bool is_valid = true); -}; - -class ARROW_EXPORT DurationScalar : public internal::PrimitiveScalar { - public: - int64_t value; - DurationScalar(int64_t value, const std::shared_ptr& type, - bool is_valid = true); -}; - -class ARROW_EXPORT MonthIntervalScalar : public internal::PrimitiveScalar { - public: - int32_t value; - MonthIntervalScalar(int32_t value, const std::shared_ptr& type, - bool is_valid = true); -}; - -class ARROW_EXPORT DayTimeIntervalScalar : public Scalar { - public: - DayTimeIntervalType::DayMilliseconds value; - DayTimeIntervalScalar(DayTimeIntervalType::DayMilliseconds value, - const std::shared_ptr& type, bool is_valid = true); -}; - -struct ARROW_EXPORT Decimal128Scalar : public Scalar { - Decimal128 value; - Decimal128Scalar(const Decimal128& value, const std::shared_ptr& type, - bool is_valid = true); -}; - -struct ARROW_EXPORT ListScalar : public Scalar { - std::shared_ptr value; - - ListScalar(const std::shared_ptr& value, const std::shared_ptr& type, - bool is_valid = true); - - explicit ListScalar(const std::shared_ptr& value, bool is_valid = true); -}; - -struct ARROW_EXPORT FixedSizeListScalar : public Scalar { - std::shared_ptr value; - - FixedSizeListScalar(const std::shared_ptr& value, - const std::shared_ptr& type, bool is_valid = true); - - explicit FixedSizeListScalar(const std::shared_ptr& value, bool is_valid = true); -}; - -struct ARROW_EXPORT StructScalar : public Scalar { - std::vector> value; -}; - -class ARROW_EXPORT UnionScalar : public Scalar {}; -class ARROW_EXPORT DictionaryScalar : public Scalar {}; -class ARROW_EXPORT ExtensionScalar : public 
Scalar {}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/sparse_tensor.h b/r/R/inst/include/arrow/sparse_tensor.h deleted file mode 100644 index e622245d633..00000000000 --- a/r/R/inst/include/arrow/sparse_tensor.h +++ /dev/null @@ -1,259 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_SPARSE_TENSOR_H -#define ARROW_SPARSE_TENSOR_H - -#include -#include -#include - -#include "arrow/tensor.h" - -namespace arrow { - -// ---------------------------------------------------------------------- -// SparseIndex class - -struct SparseTensorFormat { - /// EXPERIMENTAL: The index format type of SparseTensor - enum type { COO, CSR }; -}; - -/// \brief EXPERIMENTAL: The base class for the index of a sparse tensor -/// -/// SparseIndex describes where the non-zero elements are within a SparseTensor. -/// -/// There are several ways to represent this. The format_id is used to -/// distinguish what kind of representation is used. Each possible value of -/// format_id must have only one corresponding concrete subclass of SparseIndex. 
-class ARROW_EXPORT SparseIndex { - public: - explicit SparseIndex(SparseTensorFormat::type format_id, int64_t non_zero_length) - : format_id_(format_id), non_zero_length_(non_zero_length) {} - - virtual ~SparseIndex() = default; - - /// \brief Return the identifier of the format type - SparseTensorFormat::type format_id() const { return format_id_; } - - /// \brief Return the number of non zero values in the sparse tensor related - /// to this sparse index - int64_t non_zero_length() const { return non_zero_length_; } - - /// \brief Return the string representation of the sparse index - virtual std::string ToString() const = 0; - - protected: - SparseTensorFormat::type format_id_; - int64_t non_zero_length_; -}; - -namespace internal { -template -class SparseIndexBase : public SparseIndex { - public: - explicit SparseIndexBase(int64_t non_zero_length) - : SparseIndex(SparseIndexType::format_id, non_zero_length) {} -}; -} // namespace internal - -// ---------------------------------------------------------------------- -// SparseCOOIndex class - -/// \brief EXPERIMENTAL: The index data for a COO sparse tensor -/// -/// A COO sparse index manages the location of its non-zero values by their -/// coordinates. 
-class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase { - public: - using CoordsTensor = NumericTensor; - - static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO; - - // Constructor with a column-major NumericTensor - explicit SparseCOOIndex(const std::shared_ptr& coords); - - /// \brief Return a tensor that has the coordinates of the non-zero values - const std::shared_ptr& indices() const { return coords_; } - - /// \brief Return a string representation of the sparse index - std::string ToString() const override; - - /// \brief Return whether the COO indices are equal - bool Equals(const SparseCOOIndex& other) const { - return indices()->Equals(*other.indices()); - } - - protected: - std::shared_ptr coords_; -}; - -// ---------------------------------------------------------------------- -// SparseCSRIndex class - -/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix -/// -/// A CSR sparse index manages the location of its non-zero values by two -/// vectors. -/// -/// The first vector, called indptr, represents the range of the rows; the i-th -/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector. -/// So the length of an indptr vector is the number of rows + 1. -/// -/// The other vector, called indices, represents the column indices of the -/// corresponding non-zero values. So the length of an indices vector is same -/// as the number of non-zero-values. 
-class ARROW_EXPORT SparseCSRIndex : public internal::SparseIndexBase { - public: - using IndexTensor = NumericTensor; - - static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR; - - // Constructor with two index vectors - explicit SparseCSRIndex(const std::shared_ptr& indptr, - const std::shared_ptr& indices); - - /// \brief Return a 1D tensor of indptr vector - const std::shared_ptr& indptr() const { return indptr_; } - - /// \brief Return a 1D tensor of indices vector - const std::shared_ptr& indices() const { return indices_; } - - /// \brief Return a string representation of the sparse index - std::string ToString() const override; - - /// \brief Return whether the CSR indices are equal - bool Equals(const SparseCSRIndex& other) const { - return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices()); - } - - protected: - std::shared_ptr indptr_; - std::shared_ptr indices_; -}; - -// ---------------------------------------------------------------------- -// SparseTensor class - -/// \brief EXPERIMENTAL: The base class of sparse tensor container -class ARROW_EXPORT SparseTensor { - public: - virtual ~SparseTensor() = default; - - SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); } - - /// \brief Return a value type of the sparse tensor - std::shared_ptr type() const { return type_; } - - /// \brief Return a buffer that contains the value vector of the sparse tensor - std::shared_ptr data() const { return data_; } - - /// \brief Return an immutable raw data pointer - const uint8_t* raw_data() const { return data_->data(); } - - /// \brief Return a mutable raw data pointer - uint8_t* raw_mutable_data() const { return data_->mutable_data(); } - - /// \brief Return a shape vector of the sparse tensor - const std::vector& shape() const { return shape_; } - - /// \brief Return a sparse index of the sparse tensor - const std::shared_ptr& sparse_index() const { return sparse_index_; } - - /// \brief 
Return a number of dimensions of the sparse tensor - int ndim() const { return static_cast(shape_.size()); } - - /// \brief Return a vector of dimension names - const std::vector& dim_names() const { return dim_names_; } - - /// \brief Return the name of the i-th dimension - const std::string& dim_name(int i) const; - - /// \brief Total number of value cells in the sparse tensor - int64_t size() const; - - /// \brief Return true if the underlying data buffer is mutable - bool is_mutable() const { return data_->is_mutable(); } - - /// \brief Total number of non-zero cells in the sparse tensor - int64_t non_zero_length() const { - return sparse_index_ ? sparse_index_->non_zero_length() : 0; - } - - /// \brief Return whether sparse tensors are equal - bool Equals(const SparseTensor& other) const; - - protected: - // Constructor with all attributes - SparseTensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, - const std::shared_ptr& sparse_index, - const std::vector& dim_names); - - std::shared_ptr type_; - std::shared_ptr data_; - std::vector shape_; - std::shared_ptr sparse_index_; - - // These names are optional - std::vector dim_names_; -}; - -// ---------------------------------------------------------------------- -// SparseTensorImpl class - -/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index -/// type -template -class ARROW_EXPORT SparseTensorImpl : public SparseTensor { - public: - virtual ~SparseTensorImpl() = default; - - // Constructor with all attributes - SparseTensorImpl(const std::shared_ptr& sparse_index, - const std::shared_ptr& type, - const std::shared_ptr& data, const std::vector& shape, - const std::vector& dim_names) - : SparseTensor(type, data, shape, sparse_index, dim_names) {} - - // Constructor for empty sparse tensor - SparseTensorImpl(const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names = {}); - - // Constructor with a dense 
numeric tensor - template - explicit SparseTensorImpl(const NumericTensor& tensor); - - // Constructor with a dense tensor - explicit SparseTensorImpl(const Tensor& tensor); - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl); -}; - -/// \brief EXPERIMENTAL: Type alias for COO sparse tensor -using SparseTensorCOO = SparseTensorImpl; - -/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix -using SparseTensorCSR = SparseTensorImpl; -using SparseMatrixCSR = SparseTensorImpl; - -} // namespace arrow - -#endif // ARROW_SPARSE_TENSOR_H diff --git a/r/R/inst/include/arrow/status.h b/r/R/inst/include/arrow/status.h deleted file mode 100644 index 790d9b71d23..00000000000 --- a/r/R/inst/include/arrow/status.h +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Status encapsulates the result of an operation. It may indicate success, -// or it may indicate an error with an associated error message. -// -// Multiple threads can invoke const methods on a Status without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same Status must use -// external synchronization. - -// Adapted from Apache Kudu, TensorFlow - -#ifndef ARROW_STATUS_H_ -#define ARROW_STATUS_H_ - -#include -#include -#include -#include - -#ifdef ARROW_EXTRA_ERROR_CONTEXT -#include -#endif - -#include "arrow/util/macros.h" -#include "arrow/util/string_builder.h" -#include "arrow/util/visibility.h" - -#ifdef ARROW_EXTRA_ERROR_CONTEXT - -/// \brief Return with given status if condition is met. 
-#define ARROW_RETURN_IF_(condition, status, expr) \ - do { \ - if (ARROW_PREDICT_FALSE(condition)) { \ - ::arrow::Status _s = (status); \ - std::stringstream ss; \ - ss << _s.message() << "\n" << __FILE__ << ":" << __LINE__ << " code: " << expr; \ - return ::arrow::Status(_s.code(), ss.str()); \ - } \ - } while (0) - -#else - -#define ARROW_RETURN_IF_(condition, status, _) \ - do { \ - if (ARROW_PREDICT_FALSE(condition)) { \ - return (status); \ - } \ - } while (0) - -#endif // ARROW_EXTRA_ERROR_CONTEXT - -#define ARROW_RETURN_IF(condition, status) \ - ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status)) - -/// \brief Propagate any non-successful Status to the caller -#define ARROW_RETURN_NOT_OK(status) \ - do { \ - ::arrow::Status __s = (status); \ - ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \ - } while (false) - -#define RETURN_NOT_OK_ELSE(s, else_) \ - do { \ - ::arrow::Status _s = (s); \ - if (!_s.ok()) { \ - else_; \ - return _s; \ - } \ - } while (false) - -// This is an internal-use macro and should not be used in public headers. -#ifndef RETURN_NOT_OK -#define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s) -#endif - -namespace arrow { - -enum class StatusCode : char { - OK = 0, - OutOfMemory = 1, - KeyError = 2, - TypeError = 3, - Invalid = 4, - IOError = 5, - CapacityError = 6, - IndexError = 7, - UnknownError = 9, - NotImplemented = 10, - SerializationError = 11, - PythonError = 12, - RError = 13, - PlasmaObjectExists = 20, - PlasmaObjectNonexistent = 21, - PlasmaStoreFull = 22, - PlasmaObjectAlreadySealed = 23, - StillExecuting = 24, - // Gandiva range of errors - CodeGenError = 40, - ExpressionValidationError = 41, - ExecutionError = 42 -}; - -#if defined(__clang__) -// Only clang supports warn_unused_result as a type annotation. -class ARROW_MUST_USE_RESULT ARROW_EXPORT Status; -#endif - -/// \brief Status outcome object (success or error) -/// -/// The Status object is an object holding the outcome of an operation. 
-/// The outcome is represented as a StatusCode, either success -/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values). -/// -/// Additionally, if an error occurred, a specific error message is generally -/// attached. -class ARROW_EXPORT Status { - public: - // Create a success status. - Status() noexcept : state_(NULLPTR) {} - ~Status() noexcept { - // ARROW-2400: On certain compilers, splitting off the slow path improves - // performance significantly. - if (ARROW_PREDICT_FALSE(state_ != NULL)) { - DeleteState(); - } - } - - Status(StatusCode code, const std::string& msg); - - // Copy the specified status. - inline Status(const Status& s); - inline Status& operator=(const Status& s); - - // Move the specified status. - inline Status(Status&& s) noexcept; - inline Status& operator=(Status&& s) noexcept; - - // AND the statuses. - inline Status operator&(const Status& s) const noexcept; - inline Status operator&(Status&& s) const noexcept; - inline Status& operator&=(const Status& s) noexcept; - inline Status& operator&=(Status&& s) noexcept; - - /// Return a success status - static Status OK() { return Status(); } - - /// Return a success status with a specific message - template - static Status OK(Args&&... args) { - return Status(StatusCode::OK, util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status for out-of-memory conditions - template - static Status OutOfMemory(Args&&... args) { - return Status(StatusCode::OutOfMemory, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status for failed key lookups (e.g. column name in a table) - template - static Status KeyError(Args&&... args) { - return Status(StatusCode::KeyError, util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status for type errors (such as mismatching data types) - template - static Status TypeError(Args&&... 
args) { - return Status(StatusCode::TypeError, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status for unknown errors - template - static Status UnknownError(Args&&... args) { - return Status(StatusCode::UnknownError, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status when an operation or a combination of operation and - /// data types is unimplemented - template - static Status NotImplemented(Args&&... args) { - return Status(StatusCode::NotImplemented, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status for invalid data (for example a string that fails parsing) - template - static Status Invalid(Args&&... args) { - return Status(StatusCode::Invalid, util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status when an index is out of bounds - template - static Status IndexError(Args&&... args) { - return Status(StatusCode::IndexError, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status when a container's capacity would exceed its limits - template - static Status CapacityError(Args&&... args) { - return Status(StatusCode::CapacityError, - util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status when some IO-related operation failed - template - static Status IOError(Args&&... args) { - return Status(StatusCode::IOError, util::StringBuilder(std::forward(args)...)); - } - - /// Return an error status when some (de)serialization operation failed - template - static Status SerializationError(Args&&... args) { - return Status(StatusCode::SerializationError, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status RError(Args&&... args) { - return Status(StatusCode::RError, util::StringBuilder(std::forward(args)...)); - } - - template - static Status PlasmaObjectExists(Args&&... 
args) { - return Status(StatusCode::PlasmaObjectExists, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status PlasmaObjectNonexistent(Args&&... args) { - return Status(StatusCode::PlasmaObjectNonexistent, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status PlasmaObjectAlreadySealed(Args&&... args) { - return Status(StatusCode::PlasmaObjectAlreadySealed, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status PlasmaStoreFull(Args&&... args) { - return Status(StatusCode::PlasmaStoreFull, - util::StringBuilder(std::forward(args)...)); - } - - static Status StillExecuting() { return Status(StatusCode::StillExecuting, ""); } - - template - static Status CodeGenError(Args&&... args) { - return Status(StatusCode::CodeGenError, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status ExpressionValidationError(Args&&... args) { - return Status(StatusCode::ExpressionValidationError, - util::StringBuilder(std::forward(args)...)); - } - - template - static Status ExecutionError(Args&&... args) { - return Status(StatusCode::ExecutionError, - util::StringBuilder(std::forward(args)...)); - } - - /// Return true iff the status indicates success. - bool ok() const { return (state_ == NULLPTR); } - - /// Return true iff the status indicates an out-of-memory error. - bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } - /// Return true iff the status indicates a key lookup error. - bool IsKeyError() const { return code() == StatusCode::KeyError; } - /// Return true iff the status indicates invalid data. - bool IsInvalid() const { return code() == StatusCode::Invalid; } - /// Return true iff the status indicates an IO-related failure. - bool IsIOError() const { return code() == StatusCode::IOError; } - /// Return true iff the status indicates a container reaching capacity limits. 
- bool IsCapacityError() const { return code() == StatusCode::CapacityError; } - /// Return true iff the status indicates an out of bounds index. - bool IsIndexError() const { return code() == StatusCode::IndexError; } - /// Return true iff the status indicates a type error. - bool IsTypeError() const { return code() == StatusCode::TypeError; } - /// Return true iff the status indicates an unknown error. - bool IsUnknownError() const { return code() == StatusCode::UnknownError; } - /// Return true iff the status indicates an unimplemented operation. - bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } - /// Return true iff the status indicates a (de)serialization failure - bool IsSerializationError() const { return code() == StatusCode::SerializationError; } - /// Return true iff the status indicates a R-originated error. - bool IsRError() const { return code() == StatusCode::RError; } - /// Return true iff the status indicates a Python-originated error. - bool IsPythonError() const { return code() == StatusCode::PythonError; } - /// Return true iff the status indicates an already existing Plasma object. - bool IsPlasmaObjectExists() const { return code() == StatusCode::PlasmaObjectExists; } - /// Return true iff the status indicates a non-existent Plasma object. - bool IsPlasmaObjectNonexistent() const { - return code() == StatusCode::PlasmaObjectNonexistent; - } - /// Return true iff the status indicates an already sealed Plasma object. - bool IsPlasmaObjectAlreadySealed() const { - return code() == StatusCode::PlasmaObjectAlreadySealed; - } - /// Return true iff the status indicates the Plasma store reached its capacity limit. 
- bool IsPlasmaStoreFull() const { return code() == StatusCode::PlasmaStoreFull; } - - bool IsStillExecuting() const { return code() == StatusCode::StillExecuting; } - - bool IsCodeGenError() const { return code() == StatusCode::CodeGenError; } - - bool IsExpressionValidationError() const { - return code() == StatusCode::ExpressionValidationError; - } - - bool IsExecutionError() const { return code() == StatusCode::ExecutionError; } - - /// \brief Return a string representation of this status suitable for printing. - /// - /// The string "OK" is returned for success. - std::string ToString() const; - - /// \brief Return a string representation of the status code, without the message - /// text or POSIX code information. - std::string CodeAsString() const; - - /// \brief Return the StatusCode value attached to this status. - StatusCode code() const { return ok() ? StatusCode::OK : state_->code; } - - /// \brief Return the specific error message attached to this status. - std::string message() const { return ok() ? "" : state_->msg; } - - [[noreturn]] void Abort() const; - [[noreturn]] void Abort(const std::string& message) const; - - private: - struct State { - StatusCode code; - std::string msg; - }; - // OK status has a `NULL` state_. Otherwise, `state_` points to - // a `State` structure containing the error code and message(s) - State* state_; - - void DeleteState() { - delete state_; - state_ = NULLPTR; - } - void CopyFrom(const Status& s); - inline void MoveFrom(Status& s); -}; - -static inline std::ostream& operator<<(std::ostream& os, const Status& x) { - os << x.ToString(); - return os; -} - -void Status::MoveFrom(Status& s) { - delete state_; - state_ = s.state_; - s.state_ = NULLPTR; -} - -Status::Status(const Status& s) - : state_((s.state_ == NULLPTR) ? 
NULLPTR : new State(*s.state_)) {} - -Status& Status::operator=(const Status& s) { - // The following condition catches both aliasing (when this == &s), - // and the common case where both s and *this are ok. - if (state_ != s.state_) { - CopyFrom(s); - } - return *this; -} - -Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULLPTR; } - -Status& Status::operator=(Status&& s) noexcept { - MoveFrom(s); - return *this; -} - -/// \cond FALSE -// (note: emits warnings on Doxygen < 1.8.15, -// see https://github.com/doxygen/doxygen/issues/6295) -Status Status::operator&(const Status& s) const noexcept { - if (ok()) { - return s; - } else { - return *this; - } -} - -Status Status::operator&(Status&& s) const noexcept { - if (ok()) { - return std::move(s); - } else { - return *this; - } -} - -Status& Status::operator&=(const Status& s) noexcept { - if (ok() && !s.ok()) { - CopyFrom(s); - } - return *this; -} - -Status& Status::operator&=(Status&& s) noexcept { - if (ok() && !s.ok()) { - MoveFrom(s); - } - return *this; -} -/// \endcond - -} // namespace arrow - -#endif // ARROW_STATUS_H_ diff --git a/r/R/inst/include/arrow/stl.h b/r/R/inst/include/arrow/stl.h deleted file mode 100644 index d641e39955b..00000000000 --- a/r/R/inst/include/arrow/stl.h +++ /dev/null @@ -1,373 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_STL_H -#define ARROW_STL_H - -#include -#include -#include -#include - -#include "arrow/builder.h" -#include "arrow/compute/api.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" - -namespace arrow { - -class Schema; - -namespace stl { - -/// Traits meta class to map standard C/C++ types to equivalent Arrow types. -template -struct ConversionTraits {}; - -#define ARROW_STL_CONVERSION(c_type, ArrowType_) \ - template <> \ - struct ConversionTraits : public CTypeTraits { \ - static Status AppendRow(typename TypeTraits::BuilderType& builder, \ - c_type cell) { \ - return builder.Append(cell); \ - } \ - static c_type GetEntry(const typename TypeTraits::ArrayType& array, \ - size_t j) { \ - return array.Value(j); \ - } \ - constexpr static bool nullable = false; \ - }; - -ARROW_STL_CONVERSION(bool, BooleanType) -ARROW_STL_CONVERSION(int8_t, Int8Type) -ARROW_STL_CONVERSION(int16_t, Int16Type) -ARROW_STL_CONVERSION(int32_t, Int32Type) -ARROW_STL_CONVERSION(int64_t, Int64Type) -ARROW_STL_CONVERSION(uint8_t, UInt8Type) -ARROW_STL_CONVERSION(uint16_t, UInt16Type) -ARROW_STL_CONVERSION(uint32_t, UInt32Type) -ARROW_STL_CONVERSION(uint64_t, UInt64Type) -ARROW_STL_CONVERSION(float, FloatType) -ARROW_STL_CONVERSION(double, DoubleType) - -template <> -struct ConversionTraits : public CTypeTraits { - static Status AppendRow(StringBuilder& builder, const std::string& cell) { - return builder.Append(cell); - } - static std::string GetEntry(const StringArray& array, size_t j) { - 
return array.GetString(j); - } - constexpr static bool nullable = false; -}; - -template -struct ConversionTraits> - : public CTypeTraits> { - static Status AppendRow(ListBuilder& builder, std::vector cell) { - using ElementBuilderType = typename TypeTraits< - typename ConversionTraits::ArrowType>::BuilderType; - ARROW_RETURN_NOT_OK(builder.Append()); - ElementBuilderType& value_builder = - ::arrow::internal::checked_cast(*builder.value_builder()); - for (auto const& value : cell) { - ARROW_RETURN_NOT_OK( - ConversionTraits::AppendRow(value_builder, value)); - } - return Status::OK(); - } - - static std::vector GetEntry(const ListArray& array, size_t j) { - using ElementArrayType = typename TypeTraits< - typename ConversionTraits::ArrowType>::ArrayType; - - const ElementArrayType& value_array = - ::arrow::internal::checked_cast(*array.values()); - - std::vector vec(array.value_length(j)); - for (int64_t i = 0; i < array.value_length(j); i++) { - vec[i] = ConversionTraits::GetEntry(value_array, - array.value_offset(j) + i); - } - return vec; - } - - constexpr static bool nullable = false; -}; - -/// Build an arrow::Schema based upon the types defined in a std::tuple-like structure. -/// -/// While the type information is available at compile-time, we still need to add the -/// column names at runtime, thus these methods are not constexpr. -template ::value> -struct SchemaFromTuple { - using Element = typename std::tuple_element::type; - - // Implementations that take a vector-like object for the column names. - - /// Recursively build a vector of arrow::Field from the defined types. - /// - /// In most cases MakeSchema is the better entrypoint for the Schema creation. 
- static std::vector> MakeSchemaRecursion( - const std::vector& names) { - std::vector> ret = - SchemaFromTuple::MakeSchemaRecursion(names); - std::shared_ptr type = CTypeTraits::type_singleton(); - ret.push_back(field(names[N - 1], type, false /* nullable */)); - return ret; - } - - /// Build a Schema from the types of the tuple-like structure passed in as template - /// parameter assign the column names at runtime. - /// - /// An example usage of this API can look like the following: - /// - /// \code{.cpp} - /// using TupleType = std::tuple>; - /// std::shared_ptr schema = - /// SchemaFromTuple::MakeSchema({"int_column", "list_of_strings_column"}); - /// \endcode - static std::shared_ptr MakeSchema(const std::vector& names) { - return std::make_shared(MakeSchemaRecursion(names)); - } - - // Implementations that take a tuple-like object for the column names. - - /// Recursively build a vector of arrow::Field from the defined types. - /// - /// In most cases MakeSchema is the better entrypoint for the Schema creation. - template - static std::vector> MakeSchemaRecursionT( - const NamesTuple& names) { - using std::get; - - std::vector> ret = - SchemaFromTuple::MakeSchemaRecursionT(names); - std::shared_ptr type = ConversionTraits::type_singleton(); - ret.push_back(field(get(names), type, ConversionTraits::nullable)); - return ret; - } - - /// Build a Schema from the types of the tuple-like structure passed in as template - /// parameter assign the column names at runtime. 
- /// - /// An example usage of this API can look like the following: - /// - /// \code{.cpp} - /// using TupleType = std::tuple>; - /// std::shared_ptr schema = - /// SchemaFromTuple::MakeSchema({"int_column", "list_of_strings_column"}); - /// \endcode - template - static std::shared_ptr MakeSchema(const NamesTuple& names) { - return std::make_shared(MakeSchemaRecursionT(names)); - } -}; - -template -struct SchemaFromTuple { - static std::vector> MakeSchemaRecursion( - const std::vector& names) { - std::vector> ret; - ret.reserve(names.size()); - return ret; - } - - template - static std::vector> MakeSchemaRecursionT( - const NamesTuple& names) { - std::vector> ret; - ret.reserve(std::tuple_size::value); - return ret; - } -}; - -namespace internal { -template ::value> -struct CreateBuildersRecursive { - static Status Make(MemoryPool* pool, - std::vector>* builders) { - using Element = typename std::tuple_element::type; - std::shared_ptr type = ConversionTraits::type_singleton(); - ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builders->at(N - 1))); - - return CreateBuildersRecursive::Make(pool, builders); - } -}; - -template -struct CreateBuildersRecursive { - static Status Make(MemoryPool*, std::vector>*) { - return Status::OK(); - } -}; - -template ::value> -struct RowIterator { - static Status Append(const std::vector>& builders, - const Tuple& row) { - using std::get; - using Element = typename std::tuple_element::type; - using BuilderType = - typename TypeTraits::ArrowType>::BuilderType; - - BuilderType& builder = - ::arrow::internal::checked_cast(*builders[N - 1]); - ARROW_RETURN_NOT_OK(ConversionTraits::AppendRow(builder, get(row))); - - return RowIterator::Append(builders, row); - } -}; - -template -struct RowIterator { - static Status Append(const std::vector>& builders, - const Tuple& row) { - return Status::OK(); - } -}; - -template ::value> -struct EnsureColumnTypes { - static Status Cast(const Table& table, std::shared_ptr
* table_owner, - const compute::CastOptions& cast_options, - compute::FunctionContext* ctx, - std::reference_wrapper* result) { - using Element = typename std::tuple_element::type; - std::shared_ptr expected_type = ConversionTraits::type_singleton(); - - if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) { - compute::Datum casted; - ARROW_RETURN_NOT_OK(compute::Cast(ctx, compute::Datum(table.column(N - 1)->data()), - expected_type, cast_options, &casted)); - std::shared_ptr new_column = std::make_shared( - table.schema()->field(N - 1)->WithType(expected_type), casted.chunked_array()); - ARROW_RETURN_NOT_OK(table.SetColumn(N - 1, new_column, table_owner)); - *result = **table_owner; - } - - return EnsureColumnTypes::Cast(result->get(), table_owner, cast_options, - ctx, result); - } -}; - -template -struct EnsureColumnTypes { - static Status Cast(const Table& table, std::shared_ptr
* table_ownder, - const compute::CastOptions& cast_options, - compute::FunctionContext* ctx, - std::reference_wrapper* result) { - return Status::OK(); - } -}; - -template ::value> -struct TupleSetter { - static void Fill(const Table& table, Range* rows) { - using std::get; - using Element = typename std::tuple_element::type; - using ArrayType = - typename TypeTraits::ArrowType>::ArrayType; - - auto iter = rows->begin(); - const ChunkedArray& chunked_array = *table.column(N - 1)->data(); - for (int i = 0; i < chunked_array.num_chunks(); i++) { - const ArrayType& array = - ::arrow::internal::checked_cast(*chunked_array.chunk(i)); - for (int64_t j = 0; j < array.length(); j++) { - get(*iter++) = ConversionTraits::GetEntry(array, j); - } - } - - return TupleSetter::Fill(table, rows); - } -}; - -template -struct TupleSetter { - static void Fill(const Table& table, Range* rows) {} -}; - -} // namespace internal - -template -Status TableFromTupleRange(MemoryPool* pool, const Range& rows, - const std::vector& names, - std::shared_ptr
* table) { - using row_type = typename std::iterator_traits::value_type; - constexpr std::size_t n_columns = std::tuple_size::value; - - std::shared_ptr schema = SchemaFromTuple::MakeSchema(names); - - std::vector> builders(n_columns); - ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive::Make(pool, &builders)); - - for (auto const& row : rows) { - ARROW_RETURN_NOT_OK(internal::RowIterator::Append(builders, row)); - } - - std::vector> arrays; - for (auto const& builder : builders) { - std::shared_ptr array; - ARROW_RETURN_NOT_OK(builder->Finish(&array)); - arrays.emplace_back(array); - } - - *table = Table::Make(schema, arrays); - - return Status::OK(); -} - -template -Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_options, - compute::FunctionContext* ctx, Range* rows) { - using row_type = typename std::decay::type; - constexpr std::size_t n_columns = std::tuple_size::value; - - if (table.schema()->num_fields() != n_columns) { - std::stringstream ss; - ss << "Number of columns in the table does not match the width of the target: "; - ss << table.schema()->num_fields() << " != " << n_columns; - return Status::Invalid(ss.str()); - } - - // TODO: Use std::size with C++17 - if (rows->size() != static_cast(table.num_rows())) { - std::stringstream ss; - ss << "Number of rows in the table does not match the size of the target: "; - ss << table.num_rows() << " != " << rows->size(); - return Status::Invalid(ss.str()); - } - - // Check that all columns have the correct type, otherwise cast them. - std::shared_ptr
table_owner; - std::reference_wrapper current_table(table); - - ARROW_RETURN_NOT_OK(internal::EnsureColumnTypes::Cast( - table, &table_owner, cast_options, ctx, ¤t_table)); - - internal::TupleSetter::Fill(current_table.get(), rows); - - return Status::OK(); -} - -} // namespace stl -} // namespace arrow - -#endif // ARROW_STL_H diff --git a/r/R/inst/include/arrow/table.h b/r/R/inst/include/arrow/table.h deleted file mode 100644 index 8016371d808..00000000000 --- a/r/R/inst/include/arrow/table.h +++ /dev/null @@ -1,377 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TABLE_H -#define ARROW_TABLE_H - -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/type.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; -class Status; - -/// \class ChunkedArray -/// \brief A data structure managing a list of primitive Arrow arrays logically -/// as one large array -class ARROW_EXPORT ChunkedArray { - public: - /// \brief Construct a chunked array from a vector of arrays - /// - /// The vector should be non-empty and all its elements should have the same - /// data type. 
- explicit ChunkedArray(const ArrayVector& chunks); - - /// \brief Construct a chunked array from a single Array - explicit ChunkedArray(const std::shared_ptr& chunk) - : ChunkedArray(ArrayVector({chunk})) {} - - /// \brief Construct a chunked array from a vector of arrays and a data type - /// - /// As the data type is passed explicitly, the vector may be empty. - ChunkedArray(const ArrayVector& chunks, const std::shared_ptr& type); - - /// \return the total length of the chunked array; computed on construction - int64_t length() const { return length_; } - - /// \return the total number of nulls among all chunks - int64_t null_count() const { return null_count_; } - - int num_chunks() const { return static_cast(chunks_.size()); } - - /// \return chunk a particular chunk from the chunked array - std::shared_ptr chunk(int i) const { return chunks_[i]; } - - const ArrayVector& chunks() const { return chunks_; } - - /// \brief Construct a zero-copy slice of the chunked array with the - /// indicated offset and length - /// - /// \param[in] offset the position of the first element in the constructed - /// slice - /// \param[in] length the length of the slice. If there are not enough - /// elements in the chunked array, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr - std::shared_ptr Slice(int64_t offset, int64_t length) const; - - /// \brief Slice from offset until end of the chunked array - std::shared_ptr Slice(int64_t offset) const; - - /// \brief Flatten this chunked array as a vector of chunked arrays, one - /// for each struct field - /// - /// \param[in] pool The pool for buffer allocations, if any - /// \param[out] out The resulting vector of arrays - Status Flatten(MemoryPool* pool, std::vector>* out) const; - - std::shared_ptr type() const { return type_; } - - /// \brief Determine if two chunked arrays are equal. - /// - /// Two chunked arrays can be equal only if they have equal datatypes. 
- /// However, they may be equal even if they have different chunkings. - bool Equals(const ChunkedArray& other) const; - /// \brief Determine if two chunked arrays are equal. - bool Equals(const std::shared_ptr& other) const; - - protected: - ArrayVector chunks_; - int64_t length_; - int64_t null_count_; - std::shared_ptr type_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); -}; - -/// \class Column -/// \brief An immutable column data structure consisting of a field (type -/// metadata) and a chunked data array -class ARROW_EXPORT Column { - public: - /// \brief Construct a column from a vector of arrays - /// - /// The array chunks' datatype must match the field's datatype. - Column(const std::shared_ptr& field, const ArrayVector& chunks); - /// \brief Construct a column from a chunked array - /// - /// The chunked array's datatype must match the field's datatype. - Column(const std::shared_ptr& field, const std::shared_ptr& data); - /// \brief Construct a column from a single array - /// - /// The array's datatype must match the field's datatype. - Column(const std::shared_ptr& field, const std::shared_ptr& data); - - /// \brief Construct a column from a name and an array - /// - /// A field with the given name and the array's datatype is automatically created. - Column(const std::string& name, const std::shared_ptr& data); - /// \brief Construct a column from a name and a chunked array - /// - /// A field with the given name and the array's datatype is automatically created. 
- Column(const std::string& name, const std::shared_ptr& data); - - int64_t length() const { return data_->length(); } - - int64_t null_count() const { return data_->null_count(); } - - std::shared_ptr field() const { return field_; } - - /// \brief The column name - /// \return the column's name in the passed metadata - const std::string& name() const { return field_->name(); } - - /// \brief The column type - /// \return the column's type according to the metadata - std::shared_ptr type() const { return field_->type(); } - - /// \brief The column data as a chunked array - /// \return the column's data as a chunked logical array - std::shared_ptr data() const { return data_; } - - /// \brief Construct a zero-copy slice of the column with the indicated - /// offset and length - /// - /// \param[in] offset the position of the first element in the constructed - /// slice - /// \param[in] length the length of the slice. If there are not enough - /// elements in the column, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr - std::shared_ptr Slice(int64_t offset, int64_t length) const { - return std::make_shared(field_, data_->Slice(offset, length)); - } - - /// \brief Slice from offset until end of the column - std::shared_ptr Slice(int64_t offset) const { - return std::make_shared(field_, data_->Slice(offset)); - } - - /// \brief Flatten this column as a vector of columns - /// - /// \param[in] pool The pool for buffer allocations, if any - /// \param[out] out The resulting vector of arrays - Status Flatten(MemoryPool* pool, std::vector>* out) const; - - /// \brief Determine if two columns are equal. - /// - /// Two columns can be equal only if they have equal datatypes. - /// However, they may be equal even if they have different chunkings. - bool Equals(const Column& other) const; - /// \brief Determine if the two columns are equal. 
- bool Equals(const std::shared_ptr& other) const; - - /// \brief Verify that the column's array data is consistent with the passed - /// field's metadata - Status ValidateData(); - - protected: - std::shared_ptr field_; - std::shared_ptr data_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Column); -}; - -/// \class Table -/// \brief Logical table as sequence of chunked arrays -class ARROW_EXPORT Table { - public: - virtual ~Table() = default; - - /// \brief Construct a Table from schema and columns - /// If columns is zero-length, the table's number of rows is zero - /// \param schema The table schema (column types) - /// \param columns The table's columns - /// \param num_rows number of rows in table, -1 (default) to infer from columns - static std::shared_ptr
Make(const std::shared_ptr& schema, - const std::vector>& columns, - int64_t num_rows = -1); - - /// \brief Construct a Table from columns, schema is assembled from column fields - /// If columns is zero-length, the table's number of rows is zero - /// \param columns The table's columns - /// \param num_rows number of rows in table, -1 (default) to infer from columns - static std::shared_ptr
Make(const std::vector>& columns, - int64_t num_rows = -1); - - /// \brief Construct a Table from schema and arrays - /// \param schema The table schema (column types) - /// \param arrays The table's columns as arrays - /// \param num_rows number of rows in table, -1 (default) to infer from columns - static std::shared_ptr
Make(const std::shared_ptr& schema, - const std::vector>& arrays, - int64_t num_rows = -1); - - /// \brief Construct a Table from RecordBatches, using schema supplied by the first - /// RecordBatch. - /// - /// \param[in] batches a std::vector of record batches - /// \param[out] table the returned table - /// \return Status Returns Status::Invalid if there is some problem - static Status FromRecordBatches( - const std::vector>& batches, - std::shared_ptr
* table); - - /// \brief Construct a Table from RecordBatches, using supplied schema. There may be - /// zero record batches - /// - /// \param[in] schema the arrow::Schema for each batch - /// \param[in] batches a std::vector of record batches - /// \param[out] table the returned table - /// \return Status - static Status FromRecordBatches( - const std::shared_ptr& schema, - const std::vector>& batches, - std::shared_ptr
* table); - - /// \brief Construct a Table from a chunked StructArray. One column will be produced - /// for each field of the StructArray. - /// - /// \param[in] array a chunked StructArray - /// \param[out] table the returned table - /// \return Status - static Status FromChunkedStructArray(const std::shared_ptr& array, - std::shared_ptr
* table); - - /// Return the table schema - std::shared_ptr schema() const { return schema_; } - - /// Return a column by index - virtual std::shared_ptr column(int i) const = 0; - - /// \brief Construct a zero-copy slice of the table with the - /// indicated offset and length - /// - /// \param[in] offset the index of the first row in the constructed - /// slice - /// \param[in] length the number of rows of the slice. If there are not enough - /// rows in the table, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr
- virtual std::shared_ptr
Slice(int64_t offset, int64_t length) const = 0; - - /// \brief Slice from first row at offset until end of the table - std::shared_ptr
Slice(int64_t offset) const { return Slice(offset, num_rows_); } - - /// \brief Return a column by name - /// \param[in] name field name - /// \return an Array or null if no field was found - std::shared_ptr GetColumnByName(const std::string& name) const { - auto i = schema_->GetFieldIndex(name); - return i == -1 ? NULLPTR : column(i); - } - - /// \brief Remove column from the table, producing a new Table - virtual Status RemoveColumn(int i, std::shared_ptr
* out) const = 0; - - /// \brief Add column to the table, producing a new Table - virtual Status AddColumn(int i, const std::shared_ptr& column, - std::shared_ptr
* out) const = 0; - - /// \brief Replace a column in the table, producing a new Table - virtual Status SetColumn(int i, const std::shared_ptr& column, - std::shared_ptr
* out) const = 0; - - /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) - /// \since 0.5.0 - /// - /// \param[in] metadata new KeyValueMetadata - /// \return new Table - virtual std::shared_ptr
ReplaceSchemaMetadata( - const std::shared_ptr& metadata) const = 0; - - /// \brief Flatten the table, producing a new Table. Any column with a - /// struct type will be flattened into multiple columns - /// - /// \param[in] pool The pool for buffer allocations, if any - /// \param[out] out The returned table - virtual Status Flatten(MemoryPool* pool, std::shared_ptr
* out) const = 0; - - /// \brief Perform any checks to validate the input arguments - virtual Status Validate() const = 0; - - /// \brief Return the number of columns in the table - int num_columns() const { return schema_->num_fields(); } - - /// \brief Return the number of rows (equal to each column's logical length) - int64_t num_rows() const { return num_rows_; } - - /// \brief Determine if tables are equal - /// - /// Two tables can be equal only if they have equal schemas. - /// However, they may be equal even if they have different chunkings. - bool Equals(const Table& other) const; - - protected: - Table(); - - std::shared_ptr schema_; - int64_t num_rows_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Table); -}; - -/// \brief Compute a stream of record batches from a (possibly chunked) Table -/// -/// The conversion is zero-copy: each record batch is a view over a slice -/// of the table's columns. -class ARROW_EXPORT TableBatchReader : public RecordBatchReader { - public: - ~TableBatchReader() override; - - /// \brief Construct a TableBatchReader for the given table - explicit TableBatchReader(const Table& table); - - std::shared_ptr schema() const override; - - Status ReadNext(std::shared_ptr* out) override; - - /// \brief Set the desired maximum chunk size of record batches - /// - /// The actual chunk size of each record batch may be smaller, depending - /// on actual chunking characteristics of each table column. - void set_chunksize(int64_t chunksize); - - private: - class TableBatchReaderImpl; - std::unique_ptr impl_; -}; - -/// \brief Construct table from multiple input tables. -/// -/// The tables are concatenated vertically. Therefore, all tables should -/// have the same schema. Each column in the output table is the result -/// of concatenating the corresponding columns in all input tables. -ARROW_EXPORT -Status ConcatenateTables(const std::vector>& tables, - std::shared_ptr
* table); - -} // namespace arrow - -#endif // ARROW_TABLE_H diff --git a/r/R/inst/include/arrow/table_builder.h b/r/R/inst/include/arrow/table_builder.h deleted file mode 100644 index 8e7dfc1e5b3..00000000000 --- a/r/R/inst/include/arrow/table_builder.h +++ /dev/null @@ -1,113 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TABLE_BUILDER_H -#define ARROW_TABLE_BUILDER_H - -#include -#include -#include - -#include "arrow/builder.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; -class RecordBatch; - -/// \class RecordBatchBuilder -/// \brief Helper class for creating record batches iteratively given a known -/// schema -class ARROW_EXPORT RecordBatchBuilder { - public: - /// \brief Create an initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] builder the created builder instance - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - std::unique_ptr* builder); - - /// \brief Create an initialize a RecordBatchBuilder - /// \param[in] schema The schema for the record batch - /// \param[in] pool A MemoryPool to use for allocations - /// \param[in] initial_capacity The initial capacity for the builders - /// \param[in] builder the created builder instance - static Status Make(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity, - std::unique_ptr* builder); - - /// \brief Get base pointer to field builder - /// \param i the field index - /// \return pointer to ArrayBuilder - ArrayBuilder* GetField(int i) { return raw_field_builders_[i]; } - - /// \brief Return field builder casted to indicated specific builder type - /// \param i the field index - /// \return pointer to template type - template - T* GetFieldAs(int i) { - return internal::checked_cast(raw_field_builders_[i]); - } - - /// \brief Finish current batch and optionally reset - /// \param[in] reset_builders the resulting RecordBatch - /// \param[out] batch the resulting RecordBatch - /// \return Status - Status Flush(bool reset_builders, std::shared_ptr* batch); - - /// \brief Finish current batch and reset - /// 
\param[out] batch the resulting RecordBatch - /// \return Status - Status Flush(std::shared_ptr* batch); - - /// \brief Set the initial capacity for new builders - void SetInitialCapacity(int64_t capacity); - - /// \brief The initial capacity for builders - int64_t initial_capacity() const { return initial_capacity_; } - - /// \brief The number of fields in the schema - int num_fields() const { return schema_->num_fields(); } - - /// \brief The number of fields in the schema - std::shared_ptr schema() const { return schema_; } - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchBuilder); - - RecordBatchBuilder(const std::shared_ptr& schema, MemoryPool* pool, - int64_t initial_capacity); - - Status CreateBuilders(); - Status InitBuilders(); - - std::shared_ptr schema_; - int64_t initial_capacity_; - MemoryPool* pool_; - - std::vector> field_builders_; - std::vector raw_field_builders_; -}; - -} // namespace arrow - -#endif // ARROW_TABLE_BUILDER_H diff --git a/r/R/inst/include/arrow/tensor.h b/r/R/inst/include/arrow/tensor.h deleted file mode 100644 index 317150234e3..00000000000 --- a/r/R/inst/include/arrow/tensor.h +++ /dev/null @@ -1,167 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TENSOR_H -#define ARROW_TENSOR_H - -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -static inline bool is_tensor_supported(Type::type type_id) { - switch (type_id) { - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::UINT32: - case Type::INT32: - case Type::UINT64: - case Type::INT64: - case Type::HALF_FLOAT: - case Type::FLOAT: - case Type::DOUBLE: - return true; - default: - break; - } - return false; -} - -template -class SparseTensorImpl; - -class ARROW_EXPORT Tensor { - public: - virtual ~Tensor() = default; - - /// Constructor with no dimension names or strides, data assumed to be row-major - Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape); - - /// Constructor with non-negative strides - Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides); - - /// Constructor with non-negative strides and dimension names - Tensor(const std::shared_ptr& type, const std::shared_ptr& data, - const std::vector& shape, const std::vector& strides, - const std::vector& dim_names); - - std::shared_ptr type() const { return type_; } - std::shared_ptr data() const { return data_; } - - const uint8_t* raw_data() const { return data_->data(); } - uint8_t* raw_mutable_data() { return data_->mutable_data(); } - - const std::vector& shape() const { return shape_; } - const std::vector& strides() const { return strides_; } - - int ndim() const { return static_cast(shape_.size()); } - - const std::vector& dim_names() const { return dim_names_; } - const std::string& dim_name(int i) const; - - /// Total number of value cells in the tensor - int64_t size() const; - - /// Return true if the underlying data buffer is mutable - bool is_mutable() const { return 
data_->is_mutable(); } - - /// Either row major or column major - bool is_contiguous() const; - - /// AKA "C order" - bool is_row_major() const; - - /// AKA "Fortran order" - bool is_column_major() const; - - Type::type type_id() const; - - bool Equals(const Tensor& other) const; - - /// Compute the number of non-zero values in the tensor - Status CountNonZero(int64_t* result) const; - - protected: - Tensor() {} - - std::shared_ptr type_; - std::shared_ptr data_; - std::vector shape_; - std::vector strides_; - - /// These names are optional - std::vector dim_names_; - - template - friend class SparseTensorImpl; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor); -}; - -template -class NumericTensor : public Tensor { - public: - using TypeClass = TYPE; - using value_type = typename TypeClass::c_type; - - /// Constructor with non-negative strides and dimension names - NumericTensor(const std::shared_ptr& data, const std::vector& shape, - const std::vector& strides, - const std::vector& dim_names) - : Tensor(TypeTraits::type_singleton(), data, shape, strides, dim_names) {} - - /// Constructor with no dimension names or strides, data assumed to be row-major - NumericTensor(const std::shared_ptr& data, const std::vector& shape) - : NumericTensor(data, shape, {}, {}) {} - - /// Constructor with non-negative strides - NumericTensor(const std::shared_ptr& data, const std::vector& shape, - const std::vector& strides) - : NumericTensor(data, shape, strides, {}) {} - - const value_type& Value(const std::vector& index) const { - int64_t offset = CalculateValueOffset(index); - const value_type* ptr = reinterpret_cast(raw_data() + offset); - return *ptr; - } - - protected: - int64_t CalculateValueOffset(const std::vector& index) const { - int64_t offset = 0; - for (size_t i = 0; i < index.size(); ++i) { - offset += index[i] * strides_[i]; - } - return offset; - } -}; - -} // namespace arrow - -#endif // ARROW_TENSOR_H diff --git a/r/R/inst/include/arrow/testing/gtest_common.h 
b/r/R/inst/include/arrow/testing/gtest_common.h deleted file mode 100644 index d0221de4b49..00000000000 --- a/r/R/inst/include/arrow/testing/gtest_common.h +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TEST_COMMON_H -#define ARROW_TEST_COMMON_H - -#include -#include -#include -#include -#include - -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" -#include "arrow/memory_pool.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/util.h" - -namespace arrow { - -class TestBase : public ::testing::Test { - public: - void SetUp() { - pool_ = default_memory_pool(); - random_seed_ = 0; - } - - std::shared_ptr MakeRandomNullBitmap(int64_t length, int64_t null_count) { - const int64_t null_nbytes = BitUtil::BytesForBits(length); - - std::shared_ptr null_bitmap; - ARROW_EXPECT_OK(AllocateBuffer(pool_, null_nbytes, &null_bitmap)); - memset(null_bitmap->mutable_data(), 255, null_nbytes); - for (int64_t i = 0; i < null_count; i++) { - BitUtil::ClearBit(null_bitmap->mutable_data(), i * (length / null_count)); - } - return null_bitmap; - } - - template - inline std::shared_ptr MakeRandomArray(int64_t length, int64_t null_count 
= 0); - - protected: - uint32_t random_seed_; - MemoryPool* pool_; -}; - -template -std::shared_ptr TestBase::MakeRandomArray(int64_t length, int64_t null_count) { - const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type); - std::shared_ptr data; - ARROW_EXPECT_OK(AllocateBuffer(pool_, data_nbytes, &data)); - - // Fill with random data - random_bytes(data_nbytes, random_seed_++, data->mutable_data()); - std::shared_ptr null_bitmap = MakeRandomNullBitmap(length, null_count); - - return std::make_shared(length, data, null_bitmap, null_count); -} - -template <> -inline std::shared_ptr TestBase::MakeRandomArray(int64_t length, - int64_t null_count) { - return std::make_shared(length); -} - -template <> -inline std::shared_ptr TestBase::MakeRandomArray( - int64_t length, int64_t null_count) { - const int byte_width = 10; - std::shared_ptr null_bitmap = MakeRandomNullBitmap(length, null_count); - std::shared_ptr data; - ARROW_EXPECT_OK(AllocateBuffer(pool_, byte_width * length, &data)); - - ::arrow::random_bytes(data->size(), 0, data->mutable_data()); - return std::make_shared(fixed_size_binary(byte_width), length, - data, null_bitmap, null_count); -} - -template <> -inline std::shared_ptr TestBase::MakeRandomArray(int64_t length, - int64_t null_count) { - std::vector valid_bytes(length, 1); - for (int64_t i = 0; i < null_count; i++) { - valid_bytes[i * 2] = 0; - } - BinaryBuilder builder(pool_); - - const int kBufferSize = 10; - uint8_t buffer[kBufferSize]; - for (int64_t i = 0; i < length; i++) { - if (!valid_bytes[i]) { - ARROW_EXPECT_OK(builder.AppendNull()); - } else { - ::arrow::random_bytes(kBufferSize, static_cast(i), buffer); - ARROW_EXPECT_OK(builder.Append(buffer, kBufferSize)); - } - } - - std::shared_ptr out; - ARROW_EXPECT_OK(builder.Finish(&out)); - return out; -} - -class TestBuilder : public ::testing::Test { - public: - void SetUp() { pool_ = default_memory_pool(); } - - protected: - MemoryPool* pool_; - std::shared_ptr type_; -}; - 
-} // namespace arrow - -#endif // ARROW_TEST_COMMON_H_ diff --git a/r/R/inst/include/arrow/testing/gtest_util.h b/r/R/inst/include/arrow/testing/gtest_util.h deleted file mode 100644 index c44bb17653b..00000000000 --- a/r/R/inst/include/arrow/testing/gtest_util.h +++ /dev/null @@ -1,302 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "arrow/buffer.h" -#include "arrow/builder.h" -#include "arrow/status.h" -#include "arrow/type_fwd.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -#define ASSERT_RAISES(ENUM, expr) \ - do { \ - ::arrow::Status _st = (expr); \ - if (!_st.Is##ENUM()) { \ - FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ - ENUM) ", but got " \ - << _st.ToString(); \ - } \ - } while (false) - -#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr) \ - do { \ - ::arrow::Status _st = (expr); \ - if (!_st.Is##ENUM()) { \ - FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \ - ENUM) ", but got " \ - << _st.ToString(); \ - } \ - ASSERT_EQ((message), _st.ToString()); \ - } while (false) - -#define ASSERT_OK(expr) \ - do { \ - ::arrow::Status _st = (expr); \ - if (!_st.ok()) { \ - FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \ - } \ - } while (false) - -#define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) - -#define ARROW_EXPECT_OK(expr) \ - do { \ - ::arrow::Status _st = (expr); \ - EXPECT_TRUE(_st.ok()); \ - } while (false) - -#define ABORT_NOT_OK(s) \ - do { \ - ::arrow::Status _st = (s); \ - if (ARROW_PREDICT_FALSE(!_st.ok())) { \ - _st.Abort(); \ - } \ - } while (false); - -namespace arrow { - -// ---------------------------------------------------------------------- -// Useful testing::Types declarations - -typedef ::testing::Types - NumericArrowTypes; - -class Array; -class ChunkedArray; -class Column; -class RecordBatch; -class Table; - -namespace compute { -struct Datum; -} - -using Datum = compute::Datum; - -using ArrayVector = std::vector>; - -#define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) -#define 
ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) - -ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); -ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected, - const RecordBatch& actual); -ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, - const ChunkedArray& actual); -ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& actual, - const ArrayVector& expected); -ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, - const std::vector& expected); -ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const std::string& expected); -ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const Buffer& expected); -ARROW_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs); - -ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual, - bool same_chunk_layout = true); - -ARROW_EXPORT void AssertDatumsEqual(const Datum& expected, const Datum& actual); - -template -void AssertNumericDataEqual(const C_TYPE* raw_data, - const std::vector& expected_values) { - for (auto expected : expected_values) { - ASSERT_EQ(expected, *raw_data); - ++raw_data; - } -} - -ARROW_EXPORT void CompareBatch(const RecordBatch& left, const RecordBatch& right, - bool compare_metadata = true); - -// Check if the padding of the buffers of the array is zero. -// Also cause valgrind warnings if the padding bytes are uninitialized. -ARROW_EXPORT void AssertZeroPadded(const Array& array); - -// Check if the valid buffer bytes are initialized -// and cause valgrind warnings otherwise. 
-ARROW_EXPORT void TestInitialized(const Array& array); - -template -void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr* out) { - ASSERT_OK(builder->Finish(out)); - AssertZeroPadded(**out); - TestInitialized(**out); -} - -#define DECL_T() typedef typename TestFixture::T T; - -#define DECL_TYPE() typedef typename TestFixture::Type Type; - -// ArrayFromJSON: construct an Array from a simple JSON representation - -ARROW_EXPORT -std::shared_ptr ArrayFromJSON(const std::shared_ptr&, - const std::string& json); - -// ArrayFromVector: construct an Array from vectors of C values - -template -void ArrayFromVector(const std::shared_ptr& type, - const std::vector& is_valid, const std::vector& values, - std::shared_ptr* out) { - DCHECK_EQ(TYPE::type_id, type->id()) - << "template parameter and concrete DataType instance don't agree"; - - std::unique_ptr builder_ptr; - ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); - // Get the concrete builder class to access its Append() specializations - auto& builder = dynamic_cast::BuilderType&>(*builder_ptr); - - for (size_t i = 0; i < values.size(); ++i) { - if (is_valid[i]) { - ASSERT_OK(builder.Append(values[i])); - } else { - ASSERT_OK(builder.AppendNull()); - } - } - ASSERT_OK(builder.Finish(out)); -} - -template -void ArrayFromVector(const std::shared_ptr& type, - const std::vector& values, std::shared_ptr* out) { - DCHECK_EQ(TYPE::type_id, type->id()) - << "template parameter and concrete DataType instance don't agree"; - - std::unique_ptr builder_ptr; - ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); - // Get the concrete builder class to access its Append() specializations - auto& builder = dynamic_cast::BuilderType&>(*builder_ptr); - - for (size_t i = 0; i < values.size(); ++i) { - ASSERT_OK(builder.Append(values[i])); - } - ASSERT_OK(builder.Finish(out)); -} - -// Overloads without a DataType argument, for parameterless types - -template -void ArrayFromVector(const 
std::vector& is_valid, const std::vector& values, - std::shared_ptr* out) { - auto type = TypeTraits::type_singleton(); - ArrayFromVector(type, is_valid, values, out); -} - -template -void ArrayFromVector(const std::vector& values, std::shared_ptr* out) { - auto type = TypeTraits::type_singleton(); - ArrayFromVector(type, values, out); -} - -// ChunkedArrayFromVector: construct a ChunkedArray from vectors of C values - -template -void ChunkedArrayFromVector(const std::shared_ptr& type, - const std::vector>& is_valid, - const std::vector>& values, - std::shared_ptr* out) { - ArrayVector chunks; - DCHECK_EQ(is_valid.size(), values.size()); - for (size_t i = 0; i < values.size(); ++i) { - std::shared_ptr array; - ArrayFromVector(type, is_valid[i], values[i], &array); - chunks.push_back(array); - } - *out = std::make_shared(chunks); -} - -template -void ChunkedArrayFromVector(const std::shared_ptr& type, - const std::vector>& values, - std::shared_ptr* out) { - ArrayVector chunks; - for (size_t i = 0; i < values.size(); ++i) { - std::shared_ptr array; - ArrayFromVector(type, values[i], &array); - chunks.push_back(array); - } - *out = std::make_shared(chunks); -} - -// Overloads without a DataType argument, for parameterless types - -template -void ChunkedArrayFromVector(const std::vector>& is_valid, - const std::vector>& values, - std::shared_ptr* out) { - auto type = TypeTraits::type_singleton(); - ChunkedArrayFromVector(type, is_valid, values, out); -} - -template -void ChunkedArrayFromVector(const std::vector>& values, - std::shared_ptr* out) { - auto type = TypeTraits::type_singleton(); - ChunkedArrayFromVector(type, values, out); -} - -template -static inline Status GetBitmapFromVector(const std::vector& is_valid, - std::shared_ptr* result) { - size_t length = is_valid.size(); - - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateEmptyBitmap(length, &buffer)); - - uint8_t* bitmap = buffer->mutable_data(); - for (size_t i = 0; i < static_cast(length); ++i) { - if 
(is_valid[i]) { - BitUtil::SetBit(bitmap, i); - } - } - - *result = buffer; - return Status::OK(); -} - -template -inline void BitmapFromVector(const std::vector& is_valid, - std::shared_ptr* out) { - ASSERT_OK(GetBitmapFromVector(is_valid, out)); -} - -template -void AssertSortedEquals(std::vector u, std::vector v) { - std::sort(u.begin(), u.end()); - std::sort(v.begin(), v.end()); - ASSERT_EQ(u, v); -} - -} // namespace arrow diff --git a/r/R/inst/include/arrow/testing/random.h b/r/R/inst/include/arrow/testing/random.h deleted file mode 100644 index 6b188fd573b..00000000000 --- a/r/R/inst/include/arrow/testing/random.h +++ /dev/null @@ -1,272 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/type.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; - -namespace random { - -using SeedType = std::random_device::result_type; -constexpr SeedType kSeedMax = std::numeric_limits::max(); - -class ARROW_EXPORT RandomArrayGenerator { - public: - explicit RandomArrayGenerator(SeedType seed) - : seed_distribution_(static_cast(1), kSeedMax), seed_rng_(seed) {} - - /// \brief Generates a random BooleanArray - /// - /// \param[in] size the size of the array to generate - /// \param[in] probability the estimated number of active bits - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Boolean(int64_t size, double probability, - double null_probability); - - /// \brief Generates a random UInt8Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr UInt8(int64_t size, uint8_t min, uint8_t max, - double null_probability); - - /// \brief Generates a random Int8Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Int8(int64_t size, int8_t min, int8_t max, - double null_probability); - - /// \brief Generates a random UInt16Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] 
null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr UInt16(int64_t size, uint16_t min, uint16_t max, - double null_probability); - - /// \brief Generates a random Int16Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Int16(int64_t size, int16_t min, int16_t max, - double null_probability); - - /// \brief Generates a random UInt32Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr UInt32(int64_t size, uint32_t min, uint32_t max, - double null_probability); - - /// \brief Generates a random Int32Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Int32(int64_t size, int32_t min, int32_t max, - double null_probability); - - /// \brief Generates a random UInt64Array - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr UInt64(int64_t size, uint64_t min, uint64_t max, - double null_probability); - - /// \brief Generates a random Int64Array - /// - /// 
\param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Int64(int64_t size, int64_t min, int64_t max, - double null_probability); - - /// \brief Generates a random FloatArray - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Float32(int64_t size, float min, float max, - double null_probability); - - /// \brief Generates a random DoubleArray - /// - /// \param[in] size the size of the array to generate - /// \param[in] min the lower bound of the uniform distribution - /// \param[in] max the upper bound of the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr Float64(int64_t size, double min, double max, - double null_probability); - - template - std::shared_ptr Numeric(int64_t size, CType min, CType max, - double null_probability) { - switch (ArrowType::type_id) { - case Type::UINT8: - return UInt8(size, static_cast(min), static_cast(max), - null_probability); - case Type::INT8: - return Int8(size, static_cast(min), static_cast(max), - null_probability); - case Type::UINT16: - return UInt16(size, static_cast(min), static_cast(max), - null_probability); - case Type::INT16: - return Int16(size, static_cast(min), static_cast(max), - null_probability); - case Type::UINT32: - return UInt32(size, static_cast(min), static_cast(max), - null_probability); - case Type::INT32: - return Int32(size, static_cast(min), static_cast(max), - null_probability); - case Type::UINT64: 
- return UInt64(size, static_cast(min), static_cast(max), - null_probability); - case Type::INT64: - return Int64(size, static_cast(min), static_cast(max), - null_probability); - case Type::FLOAT: - return Float32(size, static_cast(min), static_cast(max), - null_probability); - case Type::DOUBLE: - return Float64(size, static_cast(min), static_cast(max), - null_probability); - default: - return nullptr; - } - } - - /// \brief Generates a random StringArray - /// - /// \param[in] size the size of the array to generate - /// \param[in] min_length the lower bound of the string length - /// determined by the uniform distribution - /// \param[in] max_length the upper bound of the string length - /// determined by the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr String(int64_t size, int32_t min_length, - int32_t max_length, double null_probability); - - /// \brief Generates a random StringArray with repeated values - /// - /// \param[in] size the size of the array to generate - /// \param[in] unique the number of unique string values used - /// to populate the array - /// \param[in] min_length the lower bound of the string length - /// determined by the uniform distribution - /// \param[in] max_length the upper bound of the string length - /// determined by the uniform distribution - /// \param[in] null_probability the probability of a row being null - /// - /// \return a generated Array - std::shared_ptr StringWithRepeats(int64_t size, int64_t unique, - int32_t min_length, int32_t max_length, - double null_probability); - - private: - SeedType seed() { return seed_distribution_(seed_rng_); } - - std::uniform_int_distribution seed_distribution_; - std::default_random_engine seed_rng_; -}; - -} // namespace random - -// -// Assorted functions -// - -template -void randint(int64_t N, T lower, T upper, std::vector* out) { - const int random_seed = 0; - 
std::default_random_engine gen(random_seed); - std::uniform_int_distribution d(lower, upper); - out->resize(N, static_cast(0)); - std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); -} - -template -void random_real(int64_t n, uint32_t seed, T min_value, T max_value, - std::vector* out) { - std::default_random_engine gen(seed); - std::uniform_real_distribution d(min_value, max_value); - out->resize(n, static_cast(0)); - std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); -} - -template -void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) { - assert(out || (n == 0)); - std::default_random_engine gen(seed); - std::uniform_int_distribution d(min_value, max_value); - std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); -} - -} // namespace arrow diff --git a/r/R/inst/include/arrow/testing/util.h b/r/R/inst/include/arrow/testing/util.h deleted file mode 100644 index d12f57e3b7e..00000000000 --- a/r/R/inst/include/arrow/testing/util.h +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/record_batch.h" -#include "arrow/status.h" -#include "arrow/type_fwd.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class Column; -class MemoryPool; -class RecordBatch; -class Table; - -using ArrayVector = std::vector>; - -template -Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, - std::shared_ptr* result) { - int64_t nbytes = static_cast(values.size()) * sizeof(T); - - std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool, nbytes, &buffer)); - auto immutable_data = reinterpret_cast(values.data()); - std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data()); - memset(buffer->mutable_data() + nbytes, 0, - static_cast(buffer->capacity() - nbytes)); - - *result = buffer; - return Status::OK(); -} - -// Sets approximately pct_null of the first n bytes in null_bytes to zero -// and the rest to non-zero (true) values. 
-ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes); -ARROW_EXPORT void random_is_valid(int64_t n, double pct_null, std::vector* is_valid, - int random_seed = 0); -ARROW_EXPORT void random_bytes(int64_t n, uint32_t seed, uint8_t* out); -ARROW_EXPORT int32_t DecimalSize(int32_t precision); -ARROW_EXPORT void random_decimals(int64_t n, uint32_t seed, int32_t precision, - uint8_t* out); -ARROW_EXPORT void random_ascii(int64_t n, uint32_t seed, uint8_t* out); -ARROW_EXPORT int64_t CountNulls(const std::vector& valid_bytes); - -ARROW_EXPORT Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool, - std::shared_ptr* out, - uint32_t seed = 0); - -ARROW_EXPORT uint64_t random_seed(); - -template -Status MakeArray(const std::vector& valid_bytes, const std::vector& values, - int64_t size, Builder* builder, std::shared_ptr* out) { - // Append the first 1000 - for (int64_t i = 0; i < size; ++i) { - if (valid_bytes[i] > 0) { - RETURN_NOT_OK(builder->Append(values[i])); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return builder->Finish(out); -} - -#define DECL_T() typedef typename TestFixture::T T; - -#define DECL_TYPE() typedef typename TestFixture::Type Type; - -// ---------------------------------------------------------------------- -// A RecordBatchReader for serving a sequence of in-memory record batches - -class BatchIterator : public RecordBatchReader { - public: - BatchIterator(const std::shared_ptr& schema, - const std::vector>& batches) - : schema_(schema), batches_(batches), position_(0) {} - - std::shared_ptr schema() const override { return schema_; } - - Status ReadNext(std::shared_ptr* out) override { - if (position_ >= batches_.size()) { - *out = nullptr; - } else { - *out = batches_[position_++]; - } - return Status::OK(); - } - - private: - std::shared_ptr schema_; - std::vector> batches_; - size_t position_; -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/type.h 
b/r/R/inst/include/arrow/type.h deleted file mode 100644 index b5eef6ffc28..00000000000 --- a/r/R/inst/include/arrow/type.h +++ /dev/null @@ -1,1104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPE_H -#define ARROW_TYPE_H - -#include -#include -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/type_fwd.h" // IWYU pragma: export -#include "arrow/util/checked_cast.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" -#include "arrow/visitor.h" // IWYU pragma: keep - -namespace arrow { - -class Array; -class Field; -class MemoryPool; - -struct Type { - /// \brief Main data type enumeration - /// - /// This enumeration provides a quick way to interrogate the category - /// of a DataType instance. 
- enum type { - /// A NULL type having no physical storage - NA, - - /// Boolean as 1 bit, LSB bit-packed ordering - BOOL, - - /// Unsigned 8-bit little-endian integer - UINT8, - - /// Signed 8-bit little-endian integer - INT8, - - /// Unsigned 16-bit little-endian integer - UINT16, - - /// Signed 16-bit little-endian integer - INT16, - - /// Unsigned 32-bit little-endian integer - UINT32, - - /// Signed 32-bit little-endian integer - INT32, - - /// Unsigned 64-bit little-endian integer - UINT64, - - /// Signed 64-bit little-endian integer - INT64, - - /// 2-byte floating point value - HALF_FLOAT, - - /// 4-byte floating point value - FLOAT, - - /// 8-byte floating point value - DOUBLE, - - /// UTF8 variable-length string as List - STRING, - - /// Variable-length bytes (no guarantee of UTF8-ness) - BINARY, - - /// Fixed-size binary. Each value occupies the same number of bytes - FIXED_SIZE_BINARY, - - /// int32_t days since the UNIX epoch - DATE32, - - /// int64_t milliseconds since the UNIX epoch - DATE64, - - /// Exact timestamp encoded with int64 since UNIX epoch - /// Default unit millisecond - TIMESTAMP, - - /// Time as signed 32-bit integer, representing either seconds or - /// milliseconds since midnight - TIME32, - - /// Time as signed 64-bit integer, representing either microseconds or - /// nanoseconds since midnight - TIME64, - - /// YEAR_MONTH or DAY_TIME interval in SQL style - INTERVAL, - - /// Precision- and scale-based decimal type. Storage type depends on the - /// parameters. - DECIMAL, - - /// A list of some logical data type - LIST, - - /// Struct of logical types - STRUCT, - - /// Unions of logical types - UNION, - - /// Dictionary-encoded type, also called "categorical" or "factor" - /// in other programming languages. 
Holds the dictionary value - /// type but not the dictionary itself, which is part of the - /// ArrayData struct - DICTIONARY, - - /// Map, a repeated struct logical type - MAP, - - /// Custom data type, implemented by user - EXTENSION, - - /// Fixed size list of some logical type - FIXED_SIZE_LIST, - - /// Measure of elapsed time in either seconds, milliseconds, microseconds - /// or nanoseconds. - DURATION - }; -}; - -/// \brief Base class for all data types -/// -/// Data types in this library are all *logical*. They can be expressed as -/// either a primitive physical type (bytes or bits of some fixed size), a -/// nested type consisting of other data types, or another data type (e.g. a -/// timestamp encoded as an int64). -/// -/// Simple datatypes may be entirely described by their Type::type id, but -/// complex datatypes are usually parametric. -class ARROW_EXPORT DataType { - public: - explicit DataType(Type::type id) : id_(id) {} - virtual ~DataType(); - - /// \brief Return whether the types are equal - /// - /// Types that are logically convertible from one to another (e.g. List - /// and Binary) are NOT equal. 
- bool Equals(const DataType& other, bool check_metadata = true) const; - - /// \brief Return whether the types are equal - bool Equals(const std::shared_ptr& other) const; - - std::shared_ptr child(int i) const { return children_[i]; } - - const std::vector>& children() const { return children_; } - - int num_children() const { return static_cast(children_.size()); } - - Status Accept(TypeVisitor* visitor) const; - - /// \brief A string representation of the type, including any children - virtual std::string ToString() const = 0; - - /// \brief A string name of the type, omitting any child fields - /// - /// \note Experimental API - /// \since 0.7.0 - virtual std::string name() const = 0; - - /// \brief Return the type category - Type::type id() const { return id_; } - - protected: - Type::type id_; - std::vector> children_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); -}; - -std::ostream& operator<<(std::ostream& os, const DataType& type); - -/// \brief Base class for all fixed-width data types -class ARROW_EXPORT FixedWidthType : public DataType { - public: - using DataType::DataType; - - virtual int bit_width() const = 0; -}; - -/// \brief Base class for all data types representing primitive values -class ARROW_EXPORT PrimitiveCType : public FixedWidthType { - public: - using FixedWidthType::FixedWidthType; -}; - -/// \brief Base class for all numeric data types -class ARROW_EXPORT NumberType : public PrimitiveCType { - public: - using PrimitiveCType::PrimitiveCType; -}; - -/// \brief Base class for all integral data types -class ARROW_EXPORT IntegerType : public NumberType { - public: - using NumberType::NumberType; - virtual bool is_signed() const = 0; -}; - -/// \brief Base class for all floating-point data types -class ARROW_EXPORT FloatingPointType : public NumberType { - public: - using NumberType::NumberType; - enum Precision { HALF, SINGLE, DOUBLE }; - virtual Precision precision() const = 0; -}; - -/// \brief Base class for all parametric 
data types -class ParametricType {}; - -class ARROW_EXPORT NestedType : public DataType, public ParametricType { - public: - using DataType::DataType; -}; - -class NoExtraMeta {}; - -/// \brief The combination of a field name and data type, with optional metadata -/// -/// Fields are used to describe the individual constituents of a -/// nested DataType or a Schema. -/// -/// A field's metadata is represented by a KeyValueMetadata instance, -/// which holds arbitrary key-value pairs. -class ARROW_EXPORT Field { - public: - Field(const std::string& name, const std::shared_ptr& type, - bool nullable = true, - const std::shared_ptr& metadata = NULLPTR) - : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} - - /// \brief Return the field's attached metadata - std::shared_ptr metadata() const { return metadata_; } - - /// \brief Return whether the field has non-empty metadata - bool HasMetadata() const; - - /// \brief Return a copy of this field with the given metadata attached to it - std::shared_ptr AddMetadata( - const std::shared_ptr& metadata) const; - /// \brief Return a copy of this field without any metadata attached to it - std::shared_ptr RemoveMetadata() const; - - /// \brief Return a copy of this field with the replaced type. 
- std::shared_ptr WithType(const std::shared_ptr& type) const; - - std::vector> Flatten() const; - - bool Equals(const Field& other, bool check_metadata = true) const; - bool Equals(const std::shared_ptr& other, bool check_metadata = true) const; - - /// \brief Return a string representation ot the field - std::string ToString() const; - - /// \brief Return the field name - const std::string& name() const { return name_; } - /// \brief Return the field data type - std::shared_ptr type() const { return type_; } - /// \brief Return whether the field is nullable - bool nullable() const { return nullable_; } - - std::shared_ptr Copy() const; - - private: - // Field name - std::string name_; - - // The field's data type - std::shared_ptr type_; - - // Fields can be nullable - bool nullable_; - - // The field's metadata, if any - std::shared_ptr metadata_; - - ARROW_DISALLOW_COPY_AND_ASSIGN(Field); -}; - -namespace detail { - -template -class ARROW_EXPORT CTypeImpl : public BASE { - public: - using c_type = C_TYPE; - static constexpr Type::type type_id = TYPE_ID; - - CTypeImpl() : BASE(TYPE_ID) {} - - int bit_width() const override { return static_cast(sizeof(C_TYPE) * CHAR_BIT); } - - std::string ToString() const override { return this->name(); } -}; - -template -class IntegerTypeImpl : public detail::CTypeImpl { - bool is_signed() const override { return std::is_signed::value; } -}; - -} // namespace detail - -/// Concrete type class for always-null data -class ARROW_EXPORT NullType : public DataType, public NoExtraMeta { - public: - static constexpr Type::type type_id = Type::NA; - - NullType() : DataType(Type::NA) {} - - std::string ToString() const override; - - std::string name() const override { return "null"; } -}; - -/// Concrete type class for boolean data -class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { - public: - static constexpr Type::type type_id = Type::BOOL; - - BooleanType() : FixedWidthType(Type::BOOL) {} - - std::string 
ToString() const override; - - int bit_width() const override { return 1; } - std::string name() const override { return "bool"; } -}; - -/// Concrete type class for unsigned 8-bit integer data -class ARROW_EXPORT UInt8Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "uint8"; } -}; - -/// Concrete type class for signed 8-bit integer data -class ARROW_EXPORT Int8Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "int8"; } -}; - -/// Concrete type class for unsigned 16-bit integer data -class ARROW_EXPORT UInt16Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "uint16"; } -}; - -/// Concrete type class for signed 16-bit integer data -class ARROW_EXPORT Int16Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "int16"; } -}; - -/// Concrete type class for unsigned 32-bit integer data -class ARROW_EXPORT UInt32Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "uint32"; } -}; - -/// Concrete type class for signed 32-bit integer data -class ARROW_EXPORT Int32Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "int32"; } -}; - -/// Concrete type class for unsigned 64-bit integer data -class ARROW_EXPORT UInt64Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "uint64"; } -}; - -/// Concrete type class for signed 64-bit integer data -class ARROW_EXPORT Int64Type - : public detail::IntegerTypeImpl { - public: - std::string name() const override { return "int64"; } -}; - -/// Concrete type class for 16-bit floating-point data -class ARROW_EXPORT HalfFloatType - : public detail::CTypeImpl { - public: - Precision precision() const override; - std::string name() const override { return "halffloat"; } -}; - -/// Concrete type class for 32-bit floating-point data 
(C "float") -class ARROW_EXPORT FloatType - : public detail::CTypeImpl { - public: - Precision precision() const override; - std::string name() const override { return "float"; } -}; - -/// Concrete type class for 64-bit floating-point data (C "double") -class ARROW_EXPORT DoubleType - : public detail::CTypeImpl { - public: - Precision precision() const override; - std::string name() const override { return "double"; } -}; - -/// \brief Concrete type class for list data -/// -/// List data is nested data where each value is a variable number of -/// child items. Lists can be recursively nested, for example -/// list(list(int32)). -class ARROW_EXPORT ListType : public NestedType { - public: - static constexpr Type::type type_id = Type::LIST; - - // List can contain any other logical value type - explicit ListType(const std::shared_ptr& value_type) - : ListType(std::make_shared("item", value_type)) {} - - explicit ListType(const std::shared_ptr& value_field) : NestedType(Type::LIST) { - children_ = {value_field}; - } - - std::shared_ptr value_field() const { return children_[0]; } - - std::shared_ptr value_type() const { return children_[0]->type(); } - - std::string ToString() const override; - - std::string name() const override { return "list"; } -}; - -/// \brief Concrete type class for fixed size list data -class ARROW_EXPORT FixedSizeListType : public NestedType { - public: - static constexpr Type::type type_id = Type::FIXED_SIZE_LIST; - - // List can contain any other logical value type - explicit FixedSizeListType(const std::shared_ptr& value_type, - int32_t list_size) - : FixedSizeListType(std::make_shared("item", value_type), list_size) {} - - explicit FixedSizeListType(const std::shared_ptr& value_field, int32_t list_size) - : NestedType(Type::FIXED_SIZE_LIST), list_size_(list_size) { - children_ = {value_field}; - } - - std::shared_ptr value_field() const { return children_[0]; } - - std::shared_ptr value_type() const { return children_[0]->type(); } - - 
std::string ToString() const override; - - std::string name() const override { return "fixed_size_list"; } - - int32_t list_size() const { return list_size_; } - - protected: - int32_t list_size_; -}; - -/// \brief Concrete type class for variable-size binary data -class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { - public: - static constexpr Type::type type_id = Type::BINARY; - - BinaryType() : BinaryType(Type::BINARY) {} - - std::string ToString() const override; - std::string name() const override { return "binary"; } - - protected: - // Allow subclasses to change the logical type. - explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} -}; - -/// \brief Concrete type class for fixed-size binary data -class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public ParametricType { - public: - static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY; - - explicit FixedSizeBinaryType(int32_t byte_width) - : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} - explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id) - : FixedWidthType(override_type_id), byte_width_(byte_width) {} - - std::string ToString() const override; - std::string name() const override { return "fixed_size_binary"; } - - int32_t byte_width() const { return byte_width_; } - int bit_width() const override; - - protected: - int32_t byte_width_; -}; - -/// \brief Concrete type class for variable-size string data, utf8-encoded -class ARROW_EXPORT StringType : public BinaryType { - public: - static constexpr Type::type type_id = Type::STRING; - - StringType() : BinaryType(Type::STRING) {} - - std::string ToString() const override; - std::string name() const override { return "utf8"; } -}; - -/// \brief Concrete type class for struct data -class ARROW_EXPORT StructType : public NestedType { - public: - static constexpr Type::type type_id = Type::STRUCT; - - explicit StructType(const std::vector>& fields); - - 
~StructType() override; - - std::string ToString() const override; - std::string name() const override { return "struct"; } - - /// Returns null if name not found - std::shared_ptr GetFieldByName(const std::string& name) const; - - /// Return all fields having this name - std::vector> GetAllFieldsByName(const std::string& name) const; - - /// Returns -1 if name not found or if there are multiple fields having the - /// same name - int GetFieldIndex(const std::string& name) const; - - /// Return the indices of all fields having this name - std::vector GetAllFieldIndices(const std::string& name) const; - - ARROW_DEPRECATED("Use GetFieldByName") - std::shared_ptr GetChildByName(const std::string& name) const; - - ARROW_DEPRECATED("Use GetFieldIndex") - int GetChildIndex(const std::string& name) const; - - private: - class Impl; - std::unique_ptr impl_; -}; - -/// \brief Base type class for (fixed-size) decimal data -class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { - public: - explicit DecimalType(int32_t byte_width, int32_t precision, int32_t scale) - : FixedSizeBinaryType(byte_width, Type::DECIMAL), - precision_(precision), - scale_(scale) {} - - int32_t precision() const { return precision_; } - int32_t scale() const { return scale_; } - - protected: - int32_t precision_; - int32_t scale_; -}; - -/// \brief Concrete type class for 128-bit decimal data -class ARROW_EXPORT Decimal128Type : public DecimalType { - public: - static constexpr Type::type type_id = Type::DECIMAL; - - explicit Decimal128Type(int32_t precision, int32_t scale); - - std::string ToString() const override; - std::string name() const override { return "decimal"; } -}; - -struct UnionMode { - enum type { SPARSE, DENSE }; -}; - -/// \brief Concrete type class for union data -class ARROW_EXPORT UnionType : public NestedType { - public: - static constexpr Type::type type_id = Type::UNION; - - UnionType(const std::vector>& fields, - const std::vector& type_codes, - UnionMode::type mode = 
UnionMode::SPARSE); - - std::string ToString() const override; - std::string name() const override { return "union"; } - - const std::vector& type_codes() const { return type_codes_; } - - UnionMode::type mode() const { return mode_; } - - private: - UnionMode::type mode_; - - // The type id used in the data to indicate each data type in the union. For - // example, the first type in the union might be denoted by the id 5 (instead - // of 0). - std::vector type_codes_; -}; - -// ---------------------------------------------------------------------- -// Date and time types - -enum class DateUnit : char { DAY = 0, MILLI = 1 }; - -/// \brief Base type for all date and time types -class ARROW_EXPORT TemporalType : public FixedWidthType { - public: - using FixedWidthType::FixedWidthType; -}; - -/// \brief Base type class for date data -class ARROW_EXPORT DateType : public TemporalType { - public: - virtual DateUnit unit() const = 0; - - protected: - explicit DateType(Type::type type_id); -}; - -/// Concrete type class for 32-bit date data (as number of days since UNIX epoch) -class ARROW_EXPORT Date32Type : public DateType { - public: - static constexpr Type::type type_id = Type::DATE32; - static constexpr DateUnit UNIT = DateUnit::DAY; - - using c_type = int32_t; - - Date32Type(); - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - std::string ToString() const override; - - std::string name() const override { return "date32"; } - DateUnit unit() const override { return UNIT; } -}; - -/// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) -class ARROW_EXPORT Date64Type : public DateType { - public: - static constexpr Type::type type_id = Type::DATE64; - static constexpr DateUnit UNIT = DateUnit::MILLI; - - using c_type = int64_t; - - Date64Type(); - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - std::string ToString() const override; - - std::string name() 
const override { return "date64"; } - DateUnit unit() const override { return UNIT; } -}; - -struct TimeUnit { - /// The unit for a time or timestamp DataType - enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; -}; - -std::ostream& operator<<(std::ostream& os, TimeUnit::type unit); - -/// Base type class for time data -class ARROW_EXPORT TimeType : public TemporalType, public ParametricType { - public: - TimeUnit::type unit() const { return unit_; } - - protected: - TimeType(Type::type type_id, TimeUnit::type unit); - TimeUnit::type unit_; -}; - -class ARROW_EXPORT Time32Type : public TimeType { - public: - static constexpr Type::type type_id = Type::TIME32; - using c_type = int32_t; - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI); - - std::string ToString() const override; - - std::string name() const override { return "time32"; } -}; - -class ARROW_EXPORT Time64Type : public TimeType { - public: - static constexpr Type::type type_id = Type::TIME64; - using c_type = int64_t; - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO); - - std::string ToString() const override; - - std::string name() const override { return "time64"; } -}; - -class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType { - public: - using Unit = TimeUnit; - - typedef int64_t c_type; - static constexpr Type::type type_id = Type::TIMESTAMP; - - int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } - - explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI) - : TemporalType(Type::TIMESTAMP), unit_(unit) {} - - explicit TimestampType(TimeUnit::type unit, const std::string& timezone) - : TemporalType(Type::TIMESTAMP), unit_(unit), timezone_(timezone) {} - - std::string ToString() const override; - std::string name() const override { return 
"timestamp"; } - - TimeUnit::type unit() const { return unit_; } - const std::string& timezone() const { return timezone_; } - - private: - TimeUnit::type unit_; - std::string timezone_; -}; - -// Base class for the different kinds of intervals. -class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { - public: - enum type { MONTHS, DAY_TIME }; - IntervalType() : TemporalType(Type::INTERVAL) {} - - virtual type interval_type() const = 0; - virtual ~IntervalType() = default; -}; - -/// \brief Represents a some number of months. -/// -/// Type representing a number of months. Corresponeds to YearMonth type -/// in Schema.fbs (Years are defined as 12 months). -class ARROW_EXPORT MonthIntervalType : public IntervalType { - public: - using c_type = int32_t; - static constexpr Type::type type_id = Type::INTERVAL; - - IntervalType::type interval_type() const override { return IntervalType::MONTHS; } - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - MonthIntervalType() : IntervalType() {} - - std::string ToString() const override { return name(); } - std::string name() const override { return "month_interval"; } -}; - -/// \brief Represents a number of days and milliseconds (fraction of day). 
-class ARROW_EXPORT DayTimeIntervalType : public IntervalType { - public: - struct DayMilliseconds { - int32_t days; - int32_t milliseconds; - bool operator==(DayMilliseconds other) { - return this->days == other.days && this->milliseconds == other.milliseconds; - } - bool operator!=(DayMilliseconds other) { return !(*this == other); } - }; - using c_type = DayMilliseconds; - static_assert(sizeof(DayMilliseconds) == 8, - "DayMilliseconds struct assumed to be of size 8 bytes"); - static constexpr Type::type type_id = Type::INTERVAL; - IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; } - - DayTimeIntervalType() : IntervalType() {} - - int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - - std::string ToString() const override { return name(); } - std::string name() const override { return "day_time_interval"; } -}; - -// \brief Represents an amount of elapsed time without any relation to a calendar -// artifact. -class ARROW_EXPORT DurationType : public TemporalType, public ParametricType { - public: - using Unit = TimeUnit; - - static constexpr Type::type type_id = Type::DURATION; - using c_type = int64_t; - - int bit_width() const override { return static_cast(sizeof(int64_t) * CHAR_BIT); } - - explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI) - : TemporalType(Type::DURATION), unit_(unit) {} - - std::string ToString() const override; - std::string name() const override { return "duration"; } - - TimeUnit::type unit() const { return unit_; } - - private: - TimeUnit::type unit_; -}; - -// ---------------------------------------------------------------------- -// Dictionary type (for representing categorical or dictionary-encoded -// in memory) - -/// \brief Dictionary-encoded value type with data-dependent -/// dictionary -class ARROW_EXPORT DictionaryType : public FixedWidthType { - public: - static constexpr Type::type type_id = Type::DICTIONARY; - - DictionaryType(const std::shared_ptr& 
index_type, - const std::shared_ptr& value_type, bool ordered = false); - - std::string ToString() const override; - std::string name() const override { return "dictionary"; } - - int bit_width() const override; - - std::shared_ptr index_type() const { return index_type_; } - std::shared_ptr value_type() const { return value_type_; } - - bool ordered() const { return ordered_; } - - /// \brief Unify dictionaries types - /// - /// Compute a resulting dictionary that will allow the union of values - /// of all input dictionary types. The input types must all have the - /// same value type. - /// \param[in] pool Memory pool to allocate dictionary values from - /// \param[in] types A sequence of input dictionary types - /// \param[in] dictionaries A sequence of input dictionaries - /// corresponding to each type - /// \param[out] out_type The unified dictionary type - /// \param[out] out_dictionary The unified dictionary - /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, - /// one per input type. Each integer vector represents the transposition - /// of input type indices into unified type indices. - // XXX Should we return something special (an empty transpose map?) when - // the transposition is the identity function? 
- static Status Unify(MemoryPool* pool, const std::vector& types, - const std::vector& dictionaries, - std::shared_ptr* out_type, - std::shared_ptr* out_dictionary, - std::vector>* out_transpose_maps = NULLPTR); - - protected: - // Must be an integer type (not currently checked) - std::shared_ptr index_type_; - std::shared_ptr value_type_; - bool ordered_; -}; - -// ---------------------------------------------------------------------- -// Schema - -/// \class Schema -/// \brief Sequence of arrow::Field objects describing the columns of a record -/// batch or table data structure -class ARROW_EXPORT Schema { - public: - explicit Schema(const std::vector>& fields, - const std::shared_ptr& metadata = NULLPTR); - - explicit Schema(std::vector>&& fields, - const std::shared_ptr& metadata = NULLPTR); - - Schema(const Schema&); - - virtual ~Schema(); - - /// Returns true if all of the schema fields are equal - bool Equals(const Schema& other, bool check_metadata = true) const; - - /// \brief Return the number of fields (columns) in the schema - int num_fields() const; - - /// Return the ith schema element. 
Does not boundscheck - std::shared_ptr field(int i) const; - - const std::vector>& fields() const; - - std::vector field_names() const; - - /// Returns null if name not found - std::shared_ptr GetFieldByName(const std::string& name) const; - - /// Return all fields having this name - std::vector> GetAllFieldsByName(const std::string& name) const; - - /// Returns -1 if name not found - int GetFieldIndex(const std::string& name) const; - - /// Return the indices of all fields having this name - std::vector GetAllFieldIndices(const std::string& name) const; - - /// \brief The custom key-value metadata, if any - /// - /// \return metadata may be null - std::shared_ptr metadata() const; - - /// \brief Render a string representation of the schema suitable for debugging - std::string ToString() const; - - Status AddField(int i, const std::shared_ptr& field, - std::shared_ptr* out) const; - Status RemoveField(int i, std::shared_ptr* out) const; - Status SetField(int i, const std::shared_ptr& field, - std::shared_ptr* out) const; - - /// \brief Replace key-value metadata with new metadata - /// - /// \param[in] metadata new KeyValueMetadata - /// \return new Schema - std::shared_ptr AddMetadata( - const std::shared_ptr& metadata) const; - - /// \brief Return copy of Schema without the KeyValueMetadata - std::shared_ptr RemoveMetadata() const; - - /// \brief Indicates that Schema has non-empty KevValueMetadata - bool HasMetadata() const; - - private: - class Impl; - std::unique_ptr impl_; -}; - -// ---------------------------------------------------------------------- -// Parametric factory functions -// Other factory functions are in type_fwd.h - -/// \addtogroup type-factories -/// @{ - -/// \brief Create a FixedSizeBinaryType instance. 
-ARROW_EXPORT -std::shared_ptr fixed_size_binary(int32_t byte_width); - -/// \brief Create a Decimal128Type instance -ARROW_EXPORT -std::shared_ptr decimal(int32_t precision, int32_t scale); - -/// \brief Create a ListType instance from its child Field type -ARROW_EXPORT -std::shared_ptr list(const std::shared_ptr& value_type); - -/// \brief Create a ListType instance from its child DataType -ARROW_EXPORT -std::shared_ptr list(const std::shared_ptr& value_type); - -/// \brief Create a FixedSizeListType instance from its child Field type -ARROW_EXPORT -std::shared_ptr fixed_size_list(const std::shared_ptr& value_type, - int32_t list_size); - -/// \brief Create a FixedSizeListType instance from its child DataType -ARROW_EXPORT -std::shared_ptr fixed_size_list(const std::shared_ptr& value_type, - int32_t list_size); -/// \brief Return an Duration instance (naming use _type to avoid namespace conflict with -/// built in time clases). -std::shared_ptr ARROW_EXPORT duration(TimeUnit::type unit); - -/// \brief Return an DayTimeIntervalType instance -std::shared_ptr ARROW_EXPORT day_time_interval(); - -/// \brief Return an MonthIntervalType instance -std::shared_ptr ARROW_EXPORT month_interval(); - -/// \brief Create a TimestampType instance from its unit -ARROW_EXPORT -std::shared_ptr timestamp(TimeUnit::type unit); - -/// \brief Create a TimestampType instance from its unit and timezone -ARROW_EXPORT -std::shared_ptr timestamp(TimeUnit::type unit, const std::string& timezone); - -/// \brief Create a 32-bit time type instance -/// -/// Unit can be either SECOND or MILLI -std::shared_ptr ARROW_EXPORT time32(TimeUnit::type unit); - -/// \brief Create a 64-bit time type instance -/// -/// Unit can be either MICRO or NANO -std::shared_ptr ARROW_EXPORT time64(TimeUnit::type unit); - -/// \brief Create a StructType instance -std::shared_ptr ARROW_EXPORT -struct_(const std::vector>& fields); - -/// \brief Create a UnionType instance -std::shared_ptr ARROW_EXPORT -union_(const 
std::vector>& child_fields, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); - -/// \brief Create a UnionType instance -std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - const std::vector& field_names, - const std::vector& type_codes, UnionMode::type mode = UnionMode::SPARSE); - -/// \brief Create a UnionType instance -inline std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - const std::vector& field_names, - UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, field_names, {}, mode); -} - -/// \brief Create a UnionType instance -inline std::shared_ptr ARROW_EXPORT -union_(const std::vector>& children, - UnionMode::type mode = UnionMode::SPARSE) { - return union_(children, {}, {}, mode); -} - -/// \brief Create a DictionaryType instance -/// \param[in] index_type the type of the dictionary indices (must be -/// a signed integer) -/// \param[in] dict_type the type of the values in the variable dictionary -/// \param[in] ordered true if the order of the dictionary values has -/// semantic meaning and should be preserved where possible -ARROW_EXPORT -std::shared_ptr dictionary(const std::shared_ptr& index_type, - const std::shared_ptr& dict_type, - bool ordered = false); - -/// @} - -/// \defgroup schema-factories Factory functions for fields and schemas -/// -/// Factory functions for fields and schemas -/// @{ - -/// \brief Create a Field instance -/// -/// \param name the field name -/// \param type the field value type -/// \param nullable whether the values are nullable, default true -/// \param metadata any custom key-value metadata, default null -std::shared_ptr ARROW_EXPORT field( - const std::string& name, const std::shared_ptr& type, bool nullable = true, - const std::shared_ptr& metadata = NULLPTR); - -/// \brief Create a Schema instance -/// -/// \param fields the schema's fields -/// \param metadata any custom key-value metadata, default null -/// \return schema shared_ptr to 
Schema -ARROW_EXPORT -std::shared_ptr schema( - const std::vector>& fields, - const std::shared_ptr& metadata = NULLPTR); - -/// \brief Create a Schema instance -/// -/// \param fields the schema's fields (rvalue reference) -/// \param metadata any custom key-value metadata, default null -/// \return schema shared_ptr to Schema -ARROW_EXPORT -std::shared_ptr schema( - std::vector>&& fields, - const std::shared_ptr& metadata = NULLPTR); - -/// @} - -} // namespace arrow - -#endif // ARROW_TYPE_H diff --git a/r/R/inst/include/arrow/type_fwd.h b/r/R/inst/include/arrow/type_fwd.h deleted file mode 100644 index 040ccf2ffb4..00000000000 --- a/r/R/inst/include/arrow/type_fwd.h +++ /dev/null @@ -1,225 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPE_FWD_H -#define ARROW_TYPE_FWD_H - -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -class DataType; -class KeyValueMetadata; -class Array; -struct ArrayData; -class ArrayBuilder; -class Field; -class Tensor; - -class ChunkedArray; -class Column; -class RecordBatch; -class Table; - -class Buffer; -class MemoryPool; -class RecordBatch; -class Schema; - -class DictionaryType; -class DictionaryArray; -class DictionaryScalar; - -class NullType; -class NullArray; -class NullBuilder; -struct NullScalar; - -class BooleanType; -class BooleanArray; -class BooleanBuilder; -struct BooleanScalar; - -class BinaryType; -class BinaryArray; -class BinaryBuilder; -struct BinaryScalar; - -class FixedSizeBinaryType; -class FixedSizeBinaryArray; -class FixedSizeBinaryBuilder; -struct FixedSizeBinaryScalar; - -class StringType; -class StringArray; -class StringBuilder; -struct StringScalar; - -class ListType; -class ListArray; -class ListBuilder; -struct ListScalar; - -class FixedSizeListType; -class FixedSizeListArray; -class FixedSizeListBuilder; -struct FixedSizeListScalar; - -class StructType; -class StructArray; -class StructBuilder; -struct StructScalar; - -class Decimal128Type; -class Decimal128Array; -class Decimal128Builder; -struct Decimal128Scalar; - -class UnionType; -class UnionArray; -class UnionScalar; - -template -class NumericArray; - -template -class NumericBuilder; - -template -class NumericTensor; - -template -struct NumericScalar; - -#define _NUMERIC_TYPE_DECL(KLASS) \ - class KLASS##Type; \ - using KLASS##Array = NumericArray; \ - using KLASS##Builder = NumericBuilder; \ - using KLASS##Scalar = NumericScalar; \ - using KLASS##Tensor = NumericTensor; - -_NUMERIC_TYPE_DECL(Int8) -_NUMERIC_TYPE_DECL(Int16) -_NUMERIC_TYPE_DECL(Int32) -_NUMERIC_TYPE_DECL(Int64) -_NUMERIC_TYPE_DECL(UInt8) -_NUMERIC_TYPE_DECL(UInt16) -_NUMERIC_TYPE_DECL(UInt32) -_NUMERIC_TYPE_DECL(UInt64) -_NUMERIC_TYPE_DECL(HalfFloat) 
-_NUMERIC_TYPE_DECL(Float) -_NUMERIC_TYPE_DECL(Double) - -#undef _NUMERIC_TYPE_DECL - -class Date64Type; -using Date64Array = NumericArray; -using Date64Builder = NumericBuilder; -class Date64Scalar; - -class Date32Type; -using Date32Array = NumericArray; -using Date32Builder = NumericBuilder; -class Date32Scalar; - -class Time32Type; -using Time32Array = NumericArray; -using Time32Builder = NumericBuilder; -class Time32Scalar; - -class Time64Type; -using Time64Array = NumericArray; -using Time64Builder = NumericBuilder; -class Time64Scalar; - -class TimestampType; -using TimestampArray = NumericArray; -using TimestampBuilder = NumericBuilder; -class TimestampScalar; - -class MonthIntervalType; -using MonthIntervalArray = NumericArray; -using MonthIntervalBuilder = NumericBuilder; -class MonthIntervalScalar; - -class DayTimeIntervalType; -class DayTimeIntervalArray; -class DayTimeIntervalBuilder; -class DayTimeIntervalScalar; - -class DurationType; -using DurationArray = NumericArray; -using DurationBuilder = NumericBuilder; -class DurationScalar; - -class ExtensionType; -class ExtensionArray; -class ExtensionScalar; - -// ---------------------------------------------------------------------- -// (parameter-free) Factory functions -// Other factory functions are in type.h - -/// \defgroup type-factories Factory functions for creating data types -/// -/// Factory functions for creating data types -/// @{ - -/// \brief Return a NullType instance -std::shared_ptr ARROW_EXPORT null(); -/// \brief Return a BooleanType instance -std::shared_ptr ARROW_EXPORT boolean(); -/// \brief Return a Int8Type instance -std::shared_ptr ARROW_EXPORT int8(); -/// \brief Return a Int16Type instance -std::shared_ptr ARROW_EXPORT int16(); -/// \brief Return a Int32Type instance -std::shared_ptr ARROW_EXPORT int32(); -/// \brief Return a Int64Type instance -std::shared_ptr ARROW_EXPORT int64(); -/// \brief Return a UInt8Type instance -std::shared_ptr ARROW_EXPORT uint8(); -/// \brief 
Return a UInt16Type instance -std::shared_ptr ARROW_EXPORT uint16(); -/// \brief Return a UInt32Type instance -std::shared_ptr ARROW_EXPORT uint32(); -/// \brief Return a UInt64Type instance -std::shared_ptr ARROW_EXPORT uint64(); -/// \brief Return a HalfFloatType instance -std::shared_ptr ARROW_EXPORT float16(); -/// \brief Return a FloatType instance -std::shared_ptr ARROW_EXPORT float32(); -/// \brief Return a DoubleType instance -std::shared_ptr ARROW_EXPORT float64(); -/// \brief Return a StringType instance -std::shared_ptr ARROW_EXPORT utf8(); -/// \brief Return a BinaryType instance -std::shared_ptr ARROW_EXPORT binary(); -/// \brief Return a Date32Type instance -std::shared_ptr ARROW_EXPORT date32(); -/// \brief Return a Date64Type instance -std::shared_ptr ARROW_EXPORT date64(); - -/// @} - -} // namespace arrow - -#endif // ARROW_TYPE_FWD_H diff --git a/r/R/inst/include/arrow/type_traits.h b/r/R/inst/include/arrow/type_traits.h deleted file mode 100644 index 49c8ff86486..00000000000 --- a/r/R/inst/include/arrow/type_traits.h +++ /dev/null @@ -1,590 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPE_TRAITS_H -#define ARROW_TYPE_TRAITS_H - -#include -#include -#include -#include - -#include "arrow/type_fwd.h" -#include "arrow/util/bit-util.h" - -namespace arrow { - -// -// Per-type type traits -// - -template -struct TypeTraits {}; - -template -struct CTypeTraits {}; - -template <> -struct TypeTraits { - using ArrayType = NullArray; - using BuilderType = NullBuilder; - using ScalarType = NullScalar; - - static constexpr int64_t bytes_required(int64_t) { return 0; } - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return null(); } -}; - -template <> -struct TypeTraits { - using ArrayType = BooleanArray; - using BuilderType = BooleanBuilder; - using ScalarType = BooleanScalar; - using CType = bool; - - static constexpr int64_t bytes_required(int64_t elements) { - return BitUtil::BytesForBits(elements); - } - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return boolean(); } -}; - -template <> -struct CTypeTraits : public TypeTraits { - using ArrowType = BooleanType; -}; - -#define PRIMITIVE_TYPE_TRAITS_DEF_(CType_, ArrowType_, ArrowArrayType, ArrowBuilderType, \ - ArrowScalarType, ArrowTensorType, SingletonFn) \ - template <> \ - struct TypeTraits { \ - using ArrayType = ArrowArrayType; \ - using BuilderType = ArrowBuilderType; \ - using ScalarType = ArrowScalarType; \ - using TensorType = ArrowTensorType; \ - using CType = ArrowType_::c_type; \ - static constexpr int64_t bytes_required(int64_t elements) { \ - return elements * static_cast(sizeof(CType)); \ - } \ - constexpr static bool is_parameter_free = true; \ - static inline std::shared_ptr type_singleton() { return SingletonFn(); } \ - }; \ - \ - template <> \ - struct CTypeTraits : public TypeTraits { \ - using ArrowType = ArrowType_; \ - }; - -#define PRIMITIVE_TYPE_TRAITS_DEF(CType, ArrowShort, SingletonFn) \ - PRIMITIVE_TYPE_TRAITS_DEF_( \ - CType, ARROW_CONCAT(ArrowShort, Type), 
ARROW_CONCAT(ArrowShort, Array), \ - ARROW_CONCAT(ArrowShort, Builder), ARROW_CONCAT(ArrowShort, Scalar), \ - ARROW_CONCAT(ArrowShort, Tensor), SingletonFn) - -PRIMITIVE_TYPE_TRAITS_DEF(uint8_t, UInt8, uint8) -PRIMITIVE_TYPE_TRAITS_DEF(int8_t, Int8, int8) -PRIMITIVE_TYPE_TRAITS_DEF(uint16_t, UInt16, uint16) -PRIMITIVE_TYPE_TRAITS_DEF(int16_t, Int16, int16) -PRIMITIVE_TYPE_TRAITS_DEF(uint32_t, UInt32, uint32) -PRIMITIVE_TYPE_TRAITS_DEF(int32_t, Int32, int32) -PRIMITIVE_TYPE_TRAITS_DEF(uint64_t, UInt64, uint64) -PRIMITIVE_TYPE_TRAITS_DEF(int64_t, Int64, int64) -PRIMITIVE_TYPE_TRAITS_DEF(float, Float, float32) -PRIMITIVE_TYPE_TRAITS_DEF(double, Double, float64) - -#undef PRIMITIVE_TYPE_TRAITS_DEF -#undef PRIMITIVE_TYPE_TRAITS_DEF_ - -template <> -struct TypeTraits { - using ArrayType = Date64Array; - using BuilderType = Date64Builder; - using ScalarType = Date64Scalar; - using CType = Date64Type::c_type; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int64_t)); - } - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return date64(); } -}; - -template <> -struct TypeTraits { - using ArrayType = Date32Array; - using BuilderType = Date32Builder; - using ScalarType = Date32Scalar; - using CType = Date32Type::c_type; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int32_t)); - } - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return date32(); } -}; - -template <> -struct TypeTraits { - using ArrayType = TimestampArray; - using BuilderType = TimestampBuilder; - using ScalarType = TimestampScalar; - using CType = TimestampType::c_type; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int64_t)); - } - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = 
DurationArray; - using BuilderType = DurationBuilder; - using ScalarType = DurationScalar; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int64_t)); - } - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = DayTimeIntervalArray; - using BuilderType = DayTimeIntervalBuilder; - using ScalarType = DayTimeIntervalScalar; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(DayTimeIntervalType::DayMilliseconds)); - } - constexpr static bool is_parameter_free = true; -}; - -template <> -struct TypeTraits { - using ArrayType = MonthIntervalArray; - using BuilderType = MonthIntervalBuilder; - using ScalarType = MonthIntervalScalar; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int32_t)); - } - constexpr static bool is_parameter_free = true; -}; - -template <> -struct TypeTraits { - using ArrayType = Time32Array; - using BuilderType = Time32Builder; - using ScalarType = Time32Scalar; - using CType = Time32Type::c_type; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int32_t)); - } - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = Time64Array; - using BuilderType = Time64Builder; - using ScalarType = Time64Scalar; - using CType = Time64Type::c_type; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(int64_t)); - } - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = HalfFloatArray; - using BuilderType = HalfFloatBuilder; - using ScalarType = HalfFloatScalar; - using TensorType = HalfFloatTensor; - - static constexpr int64_t bytes_required(int64_t elements) { - return elements * static_cast(sizeof(uint16_t)); - } - constexpr static bool 
is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return float16(); } -}; - -template <> -struct TypeTraits { - using ArrayType = Decimal128Array; - using BuilderType = Decimal128Builder; - using ScalarType = Decimal128Scalar; - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = BinaryArray; - using BuilderType = BinaryBuilder; - using ScalarType = BinaryScalar; - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return binary(); } -}; - -template <> -struct TypeTraits { - using ArrayType = FixedSizeBinaryArray; - using BuilderType = FixedSizeBinaryBuilder; - using ScalarType = FixedSizeBinaryScalar; - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = StringArray; - using BuilderType = StringBuilder; - using ScalarType = StringScalar; - constexpr static bool is_parameter_free = true; - static inline std::shared_ptr type_singleton() { return utf8(); } -}; - -template <> -struct CTypeTraits : public TypeTraits { - using ArrowType = StringType; -}; - -template <> -struct CTypeTraits : public TypeTraits { - using ArrowType = StringType; -}; - -template <> -struct TypeTraits { - using ArrayType = ListArray; - using BuilderType = ListBuilder; - using ScalarType = ListScalar; - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = FixedSizeListArray; - using BuilderType = FixedSizeListBuilder; - using ScalarType = FixedSizeListScalar; - constexpr static bool is_parameter_free = false; -}; - -template -struct CTypeTraits> : public TypeTraits { - using ArrowType = ListType; - - static inline std::shared_ptr type_singleton() { - return list(CTypeTraits::type_singleton()); - } -}; - -template <> -struct TypeTraits { - using ArrayType = StructArray; - using BuilderType = StructBuilder; - using ScalarType = StructScalar; - constexpr 
static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = UnionArray; - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = DictionaryArray; - using ScalarType = DictionaryScalar; - constexpr static bool is_parameter_free = false; -}; - -template <> -struct TypeTraits { - using ArrayType = ExtensionArray; - constexpr static bool is_parameter_free = false; -}; - -// -// Useful type predicates -// - -template -using is_number_type = std::is_base_of; - -template -using is_integer_type = std::is_base_of; - -template -using is_floating_type = std::is_base_of; - -template -using is_temporal_type = std::is_base_of; - -template -struct has_c_type { - static constexpr bool value = - (std::is_base_of::value || std::is_base_of::value || - std::is_base_of::value || std::is_base_of::value || - std::is_base_of::value || - std::is_base_of::value); -}; - -template -struct is_8bit_int { - static constexpr bool value = - (std::is_same::value || std::is_same::value); -}; - -template -using enable_if_8bit_int = typename std::enable_if::value, R>::type; - -template -using enable_if_primitive_ctype = - typename std::enable_if::value, R>::type; - -template -using enable_if_integer = typename std::enable_if::value, R>::type; - -template -using is_signed_integer = - std::integral_constant::value && - std::is_signed::value>; - -template -using enable_if_signed_integer = - typename std::enable_if::value, R>::type; - -template -using enable_if_unsigned_integer = typename std::enable_if< - is_integer_type::value && std::is_unsigned::value, R>::type; - -template -using enable_if_floating_point = - typename std::enable_if::value, R>::type; - -template -using is_date = std::is_base_of; - -template -using enable_if_date = typename std::enable_if::value, R>::type; - -template -using is_time = std::is_base_of; - -template -using enable_if_time = typename std::enable_if::value, R>::type; - -template 
-using is_timestamp = std::is_base_of; - -template -using enable_if_timestamp = typename std::enable_if::value, R>::type; - -template -using enable_if_has_c_type = typename std::enable_if::value, R>::type; - -template -using enable_if_null = typename std::enable_if::value, R>::type; - -template -using enable_if_binary = - typename std::enable_if::value, R>::type; - -template -using enable_if_boolean = - typename std::enable_if::value, R>::type; - -template -using enable_if_binary_like = - typename std::enable_if::value || - std::is_base_of::value, - R>::type; - -template -using enable_if_fixed_size_binary = - typename std::enable_if::value, R>::type; - -template -using enable_if_list = - typename std::enable_if::value, R>::type; - -template -using enable_if_fixed_size_list = - typename std::enable_if::value, R>::type; - -template -using enable_if_number = typename std::enable_if::value, R>::type; - -namespace detail { - -// Not all type classes have a c_type -template -struct as_void { - using type = void; -}; - -// The partial specialization will match if T has the ATTR_NAME member -#define GET_ATTR(ATTR_NAME, DEFAULT) \ - template \ - struct GetAttr_##ATTR_NAME { \ - using type = DEFAULT; \ - }; \ - \ - template \ - struct GetAttr_##ATTR_NAME::type> { \ - using type = typename T::ATTR_NAME; \ - }; - -GET_ATTR(c_type, void) -GET_ATTR(TypeClass, void) - -#undef GET_ATTR - -} // namespace detail - -#define PRIMITIVE_TRAITS(T) \ - using TypeClass = \ - typename std::conditional::value, T, \ - typename detail::GetAttr_TypeClass::type>::type; \ - using c_type = typename detail::GetAttr_c_type::type - -template -struct IsUnsignedInt { - PRIMITIVE_TRAITS(T); - static constexpr bool value = - std::is_integral::value && std::is_unsigned::value; -}; - -template -struct IsSignedInt { - PRIMITIVE_TRAITS(T); - static constexpr bool value = - std::is_integral::value && std::is_signed::value; -}; - -template -struct IsInteger { - PRIMITIVE_TRAITS(T); - static constexpr bool 
value = std::is_integral::value; -}; - -template -struct IsFloatingPoint { - PRIMITIVE_TRAITS(T); - static constexpr bool value = std::is_floating_point::value; -}; - -template -struct IsNumeric { - PRIMITIVE_TRAITS(T); - static constexpr bool value = std::is_arithmetic::value; -}; - -static inline bool is_integer(Type::type type_id) { - switch (type_id) { - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::UINT32: - case Type::INT32: - case Type::UINT64: - case Type::INT64: - return true; - default: - break; - } - return false; -} - -static inline bool is_floating(Type::type type_id) { - switch (type_id) { - case Type::HALF_FLOAT: - case Type::FLOAT: - case Type::DOUBLE: - return true; - default: - break; - } - return false; -} - -static inline bool is_primitive(Type::type type_id) { - switch (type_id) { - case Type::NA: - case Type::BOOL: - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::UINT32: - case Type::INT32: - case Type::UINT64: - case Type::INT64: - case Type::HALF_FLOAT: - case Type::FLOAT: - case Type::DOUBLE: - case Type::DATE32: - case Type::DATE64: - case Type::TIME32: - case Type::TIME64: - case Type::TIMESTAMP: - case Type::INTERVAL: - return true; - default: - break; - } - return false; -} - -static inline bool is_binary_like(Type::type type_id) { - switch (type_id) { - case Type::BINARY: - case Type::STRING: - return true; - default: - break; - } - return false; -} - -static inline bool is_dictionary(Type::type type_id) { - return type_id == Type::DICTIONARY; -} - -static inline bool is_fixed_size_binary(Type::type type_id) { - switch (type_id) { - case Type::DECIMAL: - case Type::FIXED_SIZE_BINARY: - return true; - default: - break; - } - return false; -} - -static inline bool is_fixed_width(Type::type type_id) { - return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id); -} - -} // namespace arrow - -#endif // ARROW_TYPE_TRAITS_H 
diff --git a/r/R/inst/include/arrow/util/basic_decimal.h b/r/R/inst/include/arrow/util/basic_decimal.h deleted file mode 100644 index 2e5857c3012..00000000000 --- a/r/R/inst/include/arrow/util/basic_decimal.h +++ /dev/null @@ -1,175 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -enum class DecimalStatus { - kSuccess, - kDivideByZero, - kOverflow, - kRescaleDataLoss, -}; - -/// Represents a signed 128-bit integer in two's complement. -/// -/// This class is also compiled into LLVM IR - so, it should not have cpp references like -/// streams and boost. -class ARROW_EXPORT BasicDecimal128 { - public: - /// \brief Create a BasicDecimal128 from the two's complement representation. - constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept - : low_bits_(low), high_bits_(high) {} - - /// \brief Empty constructor creates a BasicDecimal128 with a value of 0. - constexpr BasicDecimal128() noexcept : BasicDecimal128(0, 0) {} - - /// \brief Convert any integer value into a BasicDecimal128. 
- template ::value, T>::type> - constexpr BasicDecimal128(T value) noexcept - : BasicDecimal128(static_cast(value) >= 0 ? 0 : -1, - static_cast(value)) {} - - /// \brief Create a BasicDecimal128 from an array of bytes. Bytes are assumed to be in - /// little-endian byte order. - explicit BasicDecimal128(const uint8_t* bytes); - - /// \brief Negate the current value (in-place) - BasicDecimal128& Negate(); - - /// \brief Absolute value (in-place) - BasicDecimal128& Abs(); - - /// \brief Absolute value - static BasicDecimal128 Abs(const BasicDecimal128& left); - - /// \brief Add a number to this one. The result is truncated to 128 bits. - BasicDecimal128& operator+=(const BasicDecimal128& right); - - /// \brief Subtract a number from this one. The result is truncated to 128 bits. - BasicDecimal128& operator-=(const BasicDecimal128& right); - - /// \brief Multiply this number by another number. The result is truncated to 128 bits. - BasicDecimal128& operator*=(const BasicDecimal128& right); - - /// Divide this number by right and return the result. - /// - /// This operation is not destructive. - /// The answer rounds to zero. Signs work like: - /// 21 / 5 -> 4, 1 - /// -21 / 5 -> -4, -1 - /// 21 / -5 -> -4, 1 - /// -21 / -5 -> 4, -1 - /// \param[in] divisor the number to divide by - /// \param[out] result the quotient - /// \param[out] remainder the remainder after the division - DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result, - BasicDecimal128* remainder) const; - - /// \brief In-place division. - BasicDecimal128& operator/=(const BasicDecimal128& right); - - /// \brief Bitwise "or" between two BasicDecimal128. - BasicDecimal128& operator|=(const BasicDecimal128& right); - - /// \brief Bitwise "and" between two BasicDecimal128. - BasicDecimal128& operator&=(const BasicDecimal128& right); - - /// \brief Shift left by the given number of bits. 
- BasicDecimal128& operator<<=(uint32_t bits); - - /// \brief Shift right by the given number of bits. Negative values will - BasicDecimal128& operator>>=(uint32_t bits); - - /// \brief Get the high bits of the two's complement representation of the number. - inline int64_t high_bits() const { return high_bits_; } - - /// \brief Get the low bits of the two's complement representation of the number. - inline uint64_t low_bits() const { return low_bits_; } - - /// \brief Return the raw bytes of the value in little-endian byte order. - std::array ToBytes() const; - void ToBytes(uint8_t* out) const; - - /// \brief seperate the integer and fractional parts for the given scale. - void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole, - BasicDecimal128* fraction) const; - - /// \brief Scale multiplier for given scale value. - static const BasicDecimal128& GetScaleMultiplier(int32_t scale); - - /// \brief Convert BasicDecimal128 from one scale to another - DecimalStatus Rescale(int32_t original_scale, int32_t new_scale, - BasicDecimal128* out) const; - - /// \brief Scale up. - BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const; - - /// \brief Scale down. - /// - If 'round' is true, the right-most digits are dropped and the result value is - /// rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits - /// (>= 10^reduce_by / 2). - /// - If 'round' is false, the right-most digits are simply dropped. - BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const; - - // returns 1 for positive and zero decimal values, -1 for negative decimal values. - inline int64_t Sign() const { return 1 | (high_bits_ >> 63); } - - /// \brief count the number of leading binary zeroes. - int32_t CountLeadingBinaryZeros() const; - - /// \brief Get the maximum valid unscaled decimal value. 
- static const BasicDecimal128& GetMaxValue(); - - private: - uint64_t low_bits_; - int64_t high_bits_; -}; - -ARROW_EXPORT bool operator==(const BasicDecimal128& left, const BasicDecimal128& right); -ARROW_EXPORT bool operator!=(const BasicDecimal128& left, const BasicDecimal128& right); -ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right); -ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right); -ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right); -ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right); - -ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand); -ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand); -ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left, - const BasicDecimal128& right); -ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left, - const BasicDecimal128& right); -ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left, - const BasicDecimal128& right); -ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left, - const BasicDecimal128& right); -ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left, - const BasicDecimal128& right); - -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/bit-stream-utils.h b/r/R/inst/include/arrow/util/bit-stream-utils.h deleted file mode 100644 index ad86ee87c9f..00000000000 --- a/r/R/inst/include/arrow/util/bit-stream-utils.h +++ /dev/null @@ -1,416 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala (incubating) as of 2016-01-29 - -#ifndef ARROW_UTIL_BIT_STREAM_UTILS_H -#define ARROW_UTIL_BIT_STREAM_UTILS_H - -#include -#include -#include - -#include "arrow/util/bit-util.h" -#include "arrow/util/bpacking.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" - -namespace arrow { -namespace BitUtil { - -/// Utility class to write bit/byte streams. This class can write data to either be -/// bit packed or byte aligned (and a single stream that has a mix of both). -/// This class does not allocate memory. -class BitWriter { - public: - /// buffer: buffer to write bits to. Buffer should be preallocated with - /// 'buffer_len' bytes. - BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { - Clear(); - } - - void Clear() { - buffered_values_ = 0; - byte_offset_ = 0; - bit_offset_ = 0; - } - - /// The number of current bytes written, including the current byte (i.e. may include a - /// fraction of a byte). Includes buffered values. - int bytes_written() const { - return byte_offset_ + static_cast(BitUtil::BytesForBits(bit_offset_)); - } - uint8_t* buffer() const { return buffer_; } - int buffer_len() const { return max_bytes_; } - - /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - /// packed. Returns false if there was not enough space. num_bits must be <= 32. - bool PutValue(uint64_t v, int num_bits); - - /// Writes v to the next aligned byte using num_bytes. If T is larger than - /// num_bytes, the extra high-order bytes will be ignored. 
Returns false if - /// there was not enough space. - template - bool PutAligned(T v, int num_bytes); - - /// Write a Vlq encoded int to the buffer. Returns false if there was not enough - /// room. The value is written byte aligned. - /// For more details on vlq: - /// en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); - - // Writes an int zigzag encoded. - bool PutZigZagVlqInt(int32_t v); - - /// Get a pointer to the next aligned byte and advance the underlying buffer - /// by num_bytes. - /// Returns NULL if there was not enough space. - uint8_t* GetNextBytePtr(int num_bytes = 1); - - /// Flushes all buffered values to the buffer. Call this when done writing to - /// the buffer. If 'align' is true, buffered_values_ is reset and any future - /// writes will be written to the next byte boundary. - void Flush(bool align = false); - - private: - uint8_t* buffer_; - int max_bytes_; - - /// Bit-packed values are initially written to this variable before being memcpy'd to - /// buffer_. This is faster than writing values byte by byte directly to buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -/// Utility class to read bit/byte stream. This class can read bits or bytes -/// that are either byte aligned or not. It also has utilities to read multiple -/// bytes in one read (e.g. encoded int). -class BitReader { - public: - /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. 
- BitReader(const uint8_t* buffer, int buffer_len) - : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) { - int num_bytes = std::min(8, max_bytes_ - byte_offset_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); - } - - BitReader() - : buffer_(NULL), - max_bytes_(0), - buffered_values_(0), - byte_offset_(0), - bit_offset_(0) {} - - void Reset(const uint8_t* buffer, int buffer_len) { - buffer_ = buffer; - max_bytes_ = buffer_len; - byte_offset_ = 0; - bit_offset_ = 0; - int num_bytes = std::min(8, max_bytes_ - byte_offset_); - memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); - } - - /// Gets the next value from the buffer. Returns true if 'v' could be read or false if - /// there are not enough bytes left. num_bits must be <= 32. - template - bool GetValue(int num_bits, T* v); - - /// Get a number of values from the buffer. Return the number of values actually read. - template - int GetBatch(int num_bits, T* v, int batch_size); - - /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T - /// needs to be a little-endian native type and big enough to store - /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will - /// be advanced to the start of the next byte before 'v' is read. Returns - /// false if there are not enough bytes left. - template - bool GetAligned(int num_bytes, T* v); - - /// Reads a vlq encoded int from the stream. The encoded int must start at - /// the beginning of a byte. Return false if there were not enough bytes in - /// the buffer. - bool GetVlqInt(int32_t* v); - - // Reads a zigzag encoded int `into` v. - bool GetZigZagVlqInt(int32_t* v); - - /// Returns the number of bytes left in the stream, not including the current - /// byte (i.e., there may be an additional fraction of a byte). 
- int bytes_left() { - return max_bytes_ - - (byte_offset_ + static_cast(BitUtil::BytesForBits(bit_offset_))); - } - - /// Maximum byte length of a vlq encoded int - static const int MAX_VLQ_BYTE_LEN = 5; - - private: - const uint8_t* buffer_; - int max_bytes_; - - /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is - /// faster than reading values byte by byte directly from buffer_. - uint64_t buffered_values_; - - int byte_offset_; // Offset in buffer_ - int bit_offset_; // Offset in buffered_values_ -}; - -inline bool BitWriter::PutValue(uint64_t v, int num_bits) { - // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases) - DCHECK_LE(num_bits, 32); - DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; - - if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) - return false; - - buffered_values_ |= v << bit_offset_; - bit_offset_ += num_bits; - - if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) { - // Flush buffered_values_ and write out bits of v that did not fit - memcpy(buffer_ + byte_offset_, &buffered_values_, 8); - buffered_values_ = 0; - byte_offset_ += 8; - bit_offset_ -= 64; - buffered_values_ = v >> (num_bits - bit_offset_); - } - DCHECK_LT(bit_offset_, 64); - return true; -} - -inline void BitWriter::Flush(bool align) { - int num_bytes = static_cast(BitUtil::BytesForBits(bit_offset_)); - DCHECK_LE(byte_offset_ + num_bytes, max_bytes_); - memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes); - - if (align) { - buffered_values_ = 0; - byte_offset_ += num_bytes; - bit_offset_ = 0; - } -} - -inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { - Flush(/* align */ true); - DCHECK_LE(byte_offset_, max_bytes_); - if (byte_offset_ + num_bytes > max_bytes_) return NULL; - uint8_t* ptr = buffer_ + byte_offset_; - byte_offset_ += num_bytes; - return ptr; -} - -template -inline bool BitWriter::PutAligned(T val, int num_bytes) { - uint8_t* 
ptr = GetNextBytePtr(num_bytes); - if (ptr == NULL) return false; - memcpy(ptr, &val, num_bytes); - return true; -} - -inline bool BitWriter::PutVlqInt(uint32_t v) { - bool result = true; - while ((v & 0xFFFFFF80) != 0L) { - result &= PutAligned(static_cast((v & 0x7F) | 0x80), 1); - v >>= 7; - } - result &= PutAligned(static_cast(v & 0x7F), 1); - return result; -} - -namespace detail { - -template -inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer, - int* bit_offset, int* byte_offset, uint64_t* buffered_values) { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800) -#endif - *v = static_cast(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> - *bit_offset); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - *bit_offset += num_bits; - if (*bit_offset >= 64) { - *byte_offset += 8; - *bit_offset -= 64; - - int bytes_remaining = max_bytes - *byte_offset; - if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(buffered_values, buffer + *byte_offset, 8); - } else { - memcpy(buffered_values, buffer + *byte_offset, bytes_remaining); - } -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800 4805) -#endif - // Read bits of v that crossed into new buffered_values_ - *v = *v | static_cast(BitUtil::TrailingBits(*buffered_values, *bit_offset) - << (num_bits - *bit_offset)); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - DCHECK_LE(*bit_offset, 64); - } -} - -} // namespace detail - -template -inline bool BitReader::GetValue(int num_bits, T* v) { - return GetBatch(num_bits, v, 1) == 1; -} - -template -inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { - DCHECK(buffer_ != NULL); - // TODO: revisit this limit if necessary - DCHECK_LE(num_bits, 32); - DCHECK_LE(num_bits, static_cast(sizeof(T) * 8)); - - int bit_offset = bit_offset_; - int byte_offset = byte_offset_; - uint64_t buffered_values = buffered_values_; - int max_bytes = max_bytes_; - const uint8_t* buffer = buffer_; - - 
uint64_t needed_bits = num_bits * batch_size; - uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset; - if (remaining_bits < needed_bits) { - batch_size = static_cast(remaining_bits) / num_bits; - } - - int i = 0; - if (ARROW_PREDICT_FALSE(bit_offset != 0)) { - for (; i < batch_size && bit_offset != 0; ++i) { - detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, - &buffered_values); - } - } - - if (sizeof(T) == 4) { - int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); - i += num_unpacked; - byte_offset += num_unpacked * num_bits / 8; - } else { - const int buffer_size = 1024; - uint32_t unpack_buffer[buffer_size]; - while (i < batch_size) { - int unpack_size = std::min(buffer_size, batch_size - i); - int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); - if (num_unpacked == 0) { - break; - } - for (int k = 0; k < num_unpacked; ++k) { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4800) -#endif - v[i + k] = static_cast(unpack_buffer[k]); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - } - i += num_unpacked; - byte_offset += num_unpacked * num_bits / 8; - } - } - - int bytes_remaining = max_bytes - byte_offset; - if (bytes_remaining >= 8) { - memcpy(&buffered_values, buffer + byte_offset, 8); - } else { - memcpy(&buffered_values, buffer + byte_offset, bytes_remaining); - } - - for (; i < batch_size; ++i) { - detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, - &buffered_values); - } - - bit_offset_ = bit_offset; - byte_offset_ = byte_offset; - buffered_values_ = buffered_values; - - return batch_size; -} - -template -inline bool BitReader::GetAligned(int num_bytes, T* v) { - DCHECK_LE(num_bytes, static_cast(sizeof(T))); - int bytes_read = static_cast(BitUtil::BytesForBits(bit_offset_)); - if (ARROW_PREDICT_FALSE(byte_offset_ 
+ bytes_read + num_bytes > max_bytes_)) - return false; - - // Advance byte_offset to next unread byte and read num_bytes - byte_offset_ += bytes_read; - memcpy(v, buffer_ + byte_offset_, num_bytes); - byte_offset_ += num_bytes; - - // Reset buffered_values_ - bit_offset_ = 0; - int bytes_remaining = max_bytes_ - byte_offset_; - if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) { - memcpy(&buffered_values_, buffer_ + byte_offset_, 8); - } else { - memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); - } - return true; -} - -inline bool BitReader::GetVlqInt(int32_t* v) { - *v = 0; - int shift = 0; - int num_bytes = 0; - uint8_t byte = 0; - do { - if (!GetAligned(1, &byte)) return false; - *v |= (byte & 0x7F) << shift; - shift += 7; - DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); - } while ((byte & 0x80) != 0); - return true; -} - -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - // Note negative left shift is undefined - uint32_t u = (static_cast(v) << 1) ^ (v >> 31); - return PutVlqInt(u); -} - -inline bool BitReader::GetZigZagVlqInt(int32_t* v) { - int32_t u_signed; - if (!GetVlqInt(&u_signed)) return false; - uint32_t u = static_cast(u_signed); - *reinterpret_cast(v) = (u >> 1) ^ -(static_cast(u & 1)); - return true; -} - -} // namespace BitUtil -} // namespace arrow - -#endif // ARROW_UTIL_BIT_STREAM_UTILS_H diff --git a/r/R/inst/include/arrow/util/bit-util.h b/r/R/inst/include/arrow/util/bit-util.h deleted file mode 100644 index b7de112b85c..00000000000 --- a/r/R/inst/include/arrow/util/bit-util.h +++ /dev/null @@ -1,855 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_BIT_UTIL_H -#define ARROW_UTIL_BIT_UTIL_H - -#ifdef _WIN32 -#define ARROW_LITTLE_ENDIAN 1 -#else -#ifdef __APPLE__ -#include -#else -#include -#endif -# -#ifndef __BYTE_ORDER__ -#error "__BYTE_ORDER__ not defined" -#endif -# -#ifndef __ORDER_LITTLE_ENDIAN__ -#error "__ORDER_LITTLE_ENDIAN__ not defined" -#endif -# -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define ARROW_LITTLE_ENDIAN 1 -#else -#define ARROW_LITTLE_ENDIAN 0 -#endif -#endif - -#if defined(_MSC_VER) -#include -#pragma intrinsic(_BitScanReverse) -#pragma intrinsic(_BitScanForward) -#define ARROW_BYTE_SWAP64 _byteswap_uint64 -#define ARROW_BYTE_SWAP32 _byteswap_ulong -#else -#define ARROW_BYTE_SWAP64 __builtin_bswap64 -#define ARROW_BYTE_SWAP32 __builtin_bswap32 -#endif - -#include -#include -#include -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -namespace detail { - -template -typename std::make_unsigned::type as_unsigned(Integer x) { - return static_cast::type>(x); -} - -} // namespace detail - -namespace BitUtil { - -// The number of set bits in a given unsigned byte value, pre-computed -// -// Generated with the following Python code -// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};' -// popcounts = [str(bin(i).count('1')) for i in range(0, 256)] -// print(output.format(', '.join(popcounts))) -static constexpr uint8_t kBytePopcount[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 
3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, - 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, - 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, - 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, - 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - -// -// Bit-related computations on integer values -// - -// Returns the ceil of value/divisor -constexpr int64_t CeilDiv(int64_t value, int64_t divisor) { - return value / divisor + (value % divisor != 0); -} - -constexpr int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; } - -// Returns the smallest power of two that contains v. If v is already a -// power of two, it is returned as is. 
-static inline int64_t NextPower2(int64_t n) { - // Taken from - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n |= n >> 32; - n++; - return n; -} - -constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; } - -constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; } - -// Returns 'value' rounded up to the nearest multiple of 'factor' -constexpr int64_t RoundUp(int64_t value, int64_t factor) { - return (value + (factor - 1)) / factor * factor; -} - -// Returns 'value' rounded down to the nearest multiple of 'factor' -constexpr int64_t RoundDown(int64_t value, int64_t factor) { - return (value / factor) * factor; -} - -// Returns 'value' rounded up to the nearest multiple of 'factor' when factor -// is a power of two. -// The result is undefined on overflow, i.e. if `value > 2**64 - factor`, -// since we cannot return the correct result which would be 2**64. -constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) { - // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); - return (value + (factor - 1)) & ~(factor - 1); -} - -constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); } - -constexpr int64_t RoundUpToMultipleOf64(int64_t num) { - return RoundUpToPowerOf2(num, 64); -} - -// Returns the number of bytes covering a sliced bitmap. Find the length -// rounded to cover full bytes on both extremities. -// -// The following example represents a slice (offset=10, length=9) -// -// 0 8 16 24 -// |-------|-------|------| -// [ ] (slice) -// [ ] (same slice aligned to bytes bounds, length=16) -// -// The covering bytes is the length (in bytes) of this new aligned slice. -constexpr int64_t CoveringBytes(int64_t offset, int64_t length) { - return (BitUtil::RoundUp(length + offset, 8) - BitUtil::RoundDown(offset, 8)) / 8; -} - -// Returns the 'num_bits' least-significant bits of 'v'. 
-static inline uint64_t TrailingBits(uint64_t v, int num_bits) { - if (ARROW_PREDICT_FALSE(num_bits == 0)) return 0; - if (ARROW_PREDICT_FALSE(num_bits >= 64)) return v; - int n = 64 - num_bits; - return (v << n) >> n; -} - -/// \brief Count the number of leading zeros in an unsigned integer. -static inline int CountLeadingZeros(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 32; - return static_cast(__builtin_clz(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanReverse(&index, static_cast(value))) { // NOLINT - return 31 - static_cast(index); - } else { - return 32; - } -#else - int bitpos = 0; - while (value != 0) { - value >>= 1; - ++bitpos; - } - return 32 - bitpos; -#endif -} - -static inline int CountLeadingZeros(uint64_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 64; - return static_cast(__builtin_clzll(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanReverse64(&index, value)) { // NOLINT - return 63 - static_cast(index); - } else { - return 64; - } -#else - int bitpos = 0; - while (value != 0) { - value >>= 1; - ++bitpos; - } - return 64 - bitpos; -#endif -} - -static inline int CountTrailingZeros(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 32; - return static_cast(__builtin_ctzl(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanForward(&index, value)) { - return static_cast(index); - } else { - return 32; - } -#else - int bitpos = 0; - if (value) { - while (value & 1 == 0) { - value >>= 1; - ++bitpos; - } - } else { - bitpos = 32; - } - return bitpos; -#endif -} - -static inline int CountTrailingZeros(uint64_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 64; - return static_cast(__builtin_ctzll(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanForward64(&index, value)) { - return 
static_cast(index); - } else { - return 64; - } -#else - int bitpos = 0; - if (value) { - while (value & 1 == 0) { - value >>= 1; - ++bitpos; - } - } else { - bitpos = 64; - } - return bitpos; -#endif -} - -// Returns the minimum number of bits needed to represent an unsigned value -static inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); } - -// Returns ceil(log2(x)). -static inline int Log2(uint64_t x) { - // DCHECK_GT(x, 0); - return NumRequiredBits(x - 1); -} - -// -// Byte-swap 16-bit, 32-bit and 64-bit values -// - -// Swap the byte order (i.e. endianess) -static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } -static inline uint64_t ByteSwap(uint64_t value) { - return static_cast(ARROW_BYTE_SWAP64(value)); -} -static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); } -static inline uint32_t ByteSwap(uint32_t value) { - return static_cast(ARROW_BYTE_SWAP32(value)); -} -static inline int16_t ByteSwap(int16_t value) { - constexpr auto m = static_cast(0xff); - return static_cast(((value >> 8) & m) | ((value & m) << 8)); -} -static inline uint16_t ByteSwap(uint16_t value) { - return static_cast(ByteSwap(static_cast(value))); -} - -// Write the swapped bytes into dst. Src and dst cannot overlap. -static inline void ByteSwap(void* dst, const void* src, int len) { - switch (len) { - case 1: - *reinterpret_cast(dst) = *reinterpret_cast(src); - return; - case 2: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 4: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 8: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - default: - break; - } - - auto d = reinterpret_cast(dst); - auto s = reinterpret_cast(src); - for (int i = 0; i < len; ++i) { - d[i] = s[len - i - 1]; - } -} - -// Convert to little/big endian format from the machine's native endian format. 
-#if ARROW_LITTLE_ENDIAN -template > -static inline T ToBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T ToLittleEndian(T value) { - return value; -} -#else -template > -static inline T ToBigEndian(T value) { - return value; -} - -template > -static inline T ToLittleEndian(T value) { - return ByteSwap(value); -} -#endif - -// Convert from big/little endian format to the machine's native endian format. -#if ARROW_LITTLE_ENDIAN -template > -static inline T FromBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T FromLittleEndian(T value) { - return value; -} -#else -template > -static inline T FromBigEndian(T value) { - return value; -} - -template > -static inline T FromLittleEndian(T value) { - return ByteSwap(value); -} -#endif - -// -// Utilities for reading and writing individual bits by their index -// in a memory area. -// - -// Bitmask selecting the k-th bit in a byte -static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; - -// the bitwise complement version of kBitmask -static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127}; - -// Bitmask selecting the (k - 1) preceding bits in a byte -static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; -static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63, 127}; - -// the bitwise complement version of kPrecedingBitmask -static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; - -static inline bool GetBit(const uint8_t* bits, uint64_t i) { - return (bits[i >> 3] >> (i & 0x07)) & 1; -} - -static inline void ClearBit(uint8_t* bits, int64_t i) { - bits[i / 8] &= kFlippedBitmask[i % 8]; -} - -static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; } - -static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) { - // https://graphics.stanford.edu/~seander/bithacks.html - // "Conditionally set or 
clear bits without branching" - // NOTE: this seems to confuse Valgrind as it reads from potentially - // uninitialized memory - bits[i / 8] ^= static_cast(-static_cast(bit_is_set) ^ bits[i / 8]) & - kBitmask[i % 8]; -} - -/// \brief set or clear a range of bits quickly -static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, - bool bits_are_set) { - if (length == 0) return; - - const auto i_begin = start_offset; - const auto i_end = start_offset + length; - const uint8_t fill_byte = static_cast(-static_cast(bits_are_set)); - - const auto bytes_begin = i_begin / 8; - const auto bytes_end = i_end / 8 + 1; - - const auto first_byte_mask = kPrecedingBitmask[i_begin % 8]; - const auto last_byte_mask = kTrailingBitmask[i_end % 8]; - - if (bytes_end == bytes_begin + 1) { - // set bits within a single byte - const auto only_byte_mask = - i_end % 8 == 0 ? first_byte_mask - : static_cast(first_byte_mask | last_byte_mask); - bits[bytes_begin] &= only_byte_mask; - bits[bytes_begin] |= static_cast(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte - bits[bytes_begin] &= first_byte_mask; - bits[bytes_begin] |= static_cast(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { - // set/clear whole bytes - std::memset(bits + bytes_begin + 1, fill_byte, - static_cast(bytes_end - bytes_begin - 2)); - } - - if (i_end % 8 == 0) return; - - // set/clear leading bits of last byte - bits[bytes_end - 1] &= last_byte_mask; - bits[bytes_end - 1] |= static_cast(fill_byte & ~last_byte_mask); -} - -/// \brief Convert vector of bytes to bitmap buffer -ARROW_EXPORT -Status BytesToBits(const std::vector&, MemoryPool*, std::shared_ptr*); - -} // namespace BitUtil - -namespace internal { - -class BitmapReader { - public: - BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) - : bitmap_(bitmap), position_(0), length_(length) { - current_byte_ = 0; - byte_offset_ = start_offset / 8; - bit_offset_ = 
start_offset % 8; - if (length > 0) { - current_byte_ = bitmap[byte_offset_]; - } - } - - bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; } - - bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; } - - void Next() { - ++bit_offset_; - ++position_; - if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) { - bit_offset_ = 0; - ++byte_offset_; - if (ARROW_PREDICT_TRUE(position_ < length_)) { - current_byte_ = bitmap_[byte_offset_]; - } - } - } - - private: - const uint8_t* bitmap_; - int64_t position_; - int64_t length_; - - uint8_t current_byte_; - int64_t byte_offset_; - int64_t bit_offset_; -}; - -class BitmapWriter { - // A sequential bitwise writer that preserves surrounding bit values. - - public: - BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length) - : bitmap_(bitmap), position_(0), length_(length) { - byte_offset_ = start_offset / 8; - bit_mask_ = BitUtil::kBitmask[start_offset % 8]; - if (length > 0) { - current_byte_ = bitmap[byte_offset_]; - } else { - current_byte_ = 0; - } - } - - void Set() { current_byte_ |= bit_mask_; } - - void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; } - - void Next() { - bit_mask_ = static_cast(bit_mask_ << 1); - ++position_; - if (bit_mask_ == 0) { - // Finished this byte, need advancing - bit_mask_ = 0x01; - bitmap_[byte_offset_++] = current_byte_; - if (ARROW_PREDICT_TRUE(position_ < length_)) { - current_byte_ = bitmap_[byte_offset_]; - } - } - } - - void Finish() { - // Store current byte if we didn't went past bitmap storage - if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { - bitmap_[byte_offset_] = current_byte_; - } - } - - int64_t position() const { return position_; } - - private: - uint8_t* bitmap_; - int64_t position_; - int64_t length_; - - uint8_t current_byte_; - uint8_t bit_mask_; - int64_t byte_offset_; -}; - -class FirstTimeBitmapWriter { - // Like BitmapWriter, but any bit values *following* the bits written - // might be clobbered. 
It is hence faster than BitmapWriter, and can - // also avoid false positives with Valgrind. - - public: - FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length) - : bitmap_(bitmap), position_(0), length_(length) { - current_byte_ = 0; - byte_offset_ = start_offset / 8; - bit_mask_ = BitUtil::kBitmask[start_offset % 8]; - if (length > 0) { - current_byte_ = bitmap[byte_offset_] & BitUtil::kPrecedingBitmask[start_offset % 8]; - } else { - current_byte_ = 0; - } - } - - void Set() { current_byte_ |= bit_mask_; } - - void Clear() {} - - void Next() { - bit_mask_ = static_cast(bit_mask_ << 1); - ++position_; - if (bit_mask_ == 0) { - // Finished this byte, need advancing - bit_mask_ = 0x01; - bitmap_[byte_offset_++] = current_byte_; - current_byte_ = 0; - } - } - - void Finish() { - // Store current byte if we didn't went past bitmap storage - if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { - bitmap_[byte_offset_] = current_byte_; - } - } - - int64_t position() const { return position_; } - - private: - uint8_t* bitmap_; - int64_t position_; - int64_t length_; - - uint8_t current_byte_; - uint8_t bit_mask_; - int64_t byte_offset_; -}; - -// A std::generate() like function to write sequential bits into a bitmap area. -// Bits preceding the bitmap area are preserved, bits following the bitmap -// area may be clobbered. - -template -void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) { - if (length == 0) { - return; - } - uint8_t* cur = bitmap + start_offset / 8; - uint8_t bit_mask = BitUtil::kBitmask[start_offset % 8]; - uint8_t current_byte = *cur & BitUtil::kPrecedingBitmask[start_offset % 8]; - - for (int64_t index = 0; index < length; ++index) { - const bool bit = g(); - current_byte = bit ? 
(current_byte | bit_mask) : current_byte; - bit_mask = static_cast(bit_mask << 1); - if (bit_mask == 0) { - bit_mask = 1; - *cur++ = current_byte; - current_byte = 0; - } - } - if (bit_mask != 1) { - *cur++ = current_byte; - } -} - -// Like GenerateBits(), but unrolls its main loop for higher performance. - -template -void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length, - Generator&& g) { - if (length == 0) { - return; - } - uint8_t current_byte; - uint8_t* cur = bitmap + start_offset / 8; - const uint64_t start_bit_offset = start_offset % 8; - uint8_t bit_mask = BitUtil::kBitmask[start_bit_offset]; - int64_t remaining = length; - - if (bit_mask != 0x01) { - current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset]; - while (bit_mask != 0 && remaining > 0) { - current_byte = g() ? (current_byte | bit_mask) : current_byte; - bit_mask = static_cast(bit_mask << 1); - --remaining; - } - *cur++ = current_byte; - } - - int64_t remaining_bytes = remaining / 8; - while (remaining_bytes-- > 0) { - current_byte = 0; - current_byte = g() ? current_byte | 0x01 : current_byte; - current_byte = g() ? current_byte | 0x02 : current_byte; - current_byte = g() ? current_byte | 0x04 : current_byte; - current_byte = g() ? current_byte | 0x08 : current_byte; - current_byte = g() ? current_byte | 0x10 : current_byte; - current_byte = g() ? current_byte | 0x20 : current_byte; - current_byte = g() ? current_byte | 0x40 : current_byte; - current_byte = g() ? current_byte | 0x80 : current_byte; - *cur++ = current_byte; - } - - int64_t remaining_bits = remaining % 8; - if (remaining_bits) { - current_byte = 0; - bit_mask = 0x01; - while (remaining_bits-- > 0) { - current_byte = g() ? 
(current_byte | bit_mask) : current_byte; - bit_mask = static_cast(bit_mask << 1); - } - *cur++ = current_byte; - } -} - -// ---------------------------------------------------------------------- -// Bitmap utilities - -/// Copy a bit range of an existing bitmap -/// -/// \param[in] pool memory pool to allocate memory from -/// \param[in] bitmap source data -/// \param[in] offset bit offset into the source data -/// \param[in] length number of bits to copy -/// \param[out] out the resulting copy -/// -/// \return Status message -ARROW_EXPORT -Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, int64_t length, - std::shared_ptr* out); - -/// Copy a bit range of an existing bitmap into an existing bitmap -/// -/// \param[in] bitmap source data -/// \param[in] offset bit offset into the source data -/// \param[in] length number of bits to copy -/// \param[in] dest_offset bit offset into the destination -/// \param[in] restore_trailing_bits don't clobber bits outside the destination range -/// \param[out] dest the destination buffer, must have at least space for -/// (offset + length) bits -ARROW_EXPORT -void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset, bool restore_trailing_bits = true); - -/// Invert a bit range of an existing bitmap into an existing bitmap -/// -/// \param[in] bitmap source data -/// \param[in] offset bit offset into the source data -/// \param[in] length number of bits to copy -/// \param[in] dest_offset bit offset into the destination -/// \param[out] dest the destination buffer, must have at least space for -/// (offset + length) bits -ARROW_EXPORT -void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset); - -/// Invert a bit range of an existing bitmap -/// -/// \param[in] pool memory pool to allocate memory from -/// \param[in] bitmap source data -/// \param[in] offset bit offset into the source data -/// 
\param[in] length number of bits to copy -/// \param[out] out the resulting copy -/// -/// \return Status message -ARROW_EXPORT -Status InvertBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, - int64_t length, std::shared_ptr* out); - -/// Compute the number of 1's in the given data array -/// -/// \param[in] data a packed LSB-ordered bitmap as a byte array -/// \param[in] bit_offset a bitwise offset into the bitmap -/// \param[in] length the number of bits to inspect in the bitmap relative to -/// the offset -/// -/// \return The number of set (1) bits in the range -ARROW_EXPORT -int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length); - -ARROW_EXPORT -bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t bit_length); - -/// \brief Do a "bitmap and" on right and left buffers starting at -/// their respective bit-offsets for the given bit-length and put -/// the results in out_buffer starting at the given bit-offset. -/// -/// out_buffer will be allocated and initialized to zeros using pool before -/// the operation. -ARROW_EXPORT -Status BitmapAnd(MemoryPool* pool, const uint8_t* left, int64_t left_offset, - const uint8_t* right, int64_t right_offset, int64_t length, - int64_t out_offset, std::shared_ptr* out_buffer); - -/// \brief Do a "bitmap and" on right and left buffers starting at -/// their respective bit-offsets for the given bit-length and put -/// the results in out starting at the given bit-offset. -ARROW_EXPORT -void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out); - -/// \brief Do a "bitmap or" for the given bit length on right and left buffers -/// starting at their respective bit-offsets and put the results in out_buffer -/// starting at the given bit-offset. 
-/// -/// out_buffer will be allocated and initialized to zeros using pool before -/// the operation. -ARROW_EXPORT -Status BitmapOr(MemoryPool* pool, const uint8_t* left, int64_t left_offset, - const uint8_t* right, int64_t right_offset, int64_t length, - int64_t out_offset, std::shared_ptr* out_buffer); - -/// \brief Do a "bitmap or" for the given bit length on right and left buffers -/// starting at their respective bit-offsets and put the results in out -/// starting at the given bit-offset. -ARROW_EXPORT -void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out); - -/// \brief Do a "bitmap xor" for the given bit-length on right and left -/// buffers starting at their respective bit-offsets and put the results in -/// out_buffer starting at the given bit offset. -/// -/// out_buffer will be allocated and initialized to zeros using pool before -/// the operation. -ARROW_EXPORT -Status BitmapXor(MemoryPool* pool, const uint8_t* left, int64_t left_offset, - const uint8_t* right, int64_t right_offset, int64_t length, - int64_t out_offset, std::shared_ptr* out_buffer); - -/// \brief Do a "bitmap xor" for the given bit-length on right and left -/// buffers starting at their respective bit-offsets and put the results in -/// out starting at the given bit offset. -ARROW_EXPORT -void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out); - -/// \brief Store a stack of bitsets efficiently. The top bitset may be -/// accessed and its bits may be modified, but it may not be resized. 
-class BitsetStack { - public: - using reference = typename std::vector::reference; - - /// \brief push a bitset onto the stack - /// \param size number of bits in the next bitset - /// \param value initial value for bits in the pushed bitset - void Push(int size, bool value) { - offsets_.push_back(bit_count()); - bits_.resize(bit_count() + size, value); - } - - /// \brief number of bits in the bitset at the top of the stack - int TopSize() const { - if (offsets_.size() == 0) return 0; - return bit_count() - offsets_.back(); - } - - /// \brief pop a bitset off the stack - void Pop() { - bits_.resize(offsets_.back()); - offsets_.pop_back(); - } - - /// \brief get the value of a bit in the top bitset - /// \param i index of the bit to access - bool operator[](int i) const { return bits_[offsets_.back() + i]; } - - /// \brief get a mutable reference to a bit in the top bitset - /// \param i index of the bit to access - reference operator[](int i) { return bits_[offsets_.back() + i]; } - - private: - int bit_count() const { return static_cast(bits_.size()); } - std::vector bits_; - std::vector offsets_; -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_BIT_UTIL_H diff --git a/r/R/inst/include/arrow/util/bpacking.h b/r/R/inst/include/arrow/util/bpacking.h deleted file mode 100644 index 14258cff6e4..00000000000 --- a/r/R/inst/include/arrow/util/bpacking.h +++ /dev/null @@ -1,3308 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file was modified from its original version for inclusion in parquet-cpp. -// Original source: -// https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp -// The original copyright notice follows. - -// This code is released under the -// Apache License Version 2.0 http://www.apache.org/licenses/. -// (c) Daniel Lemire 2013 - -#ifndef ARROW_UTIL_BPACKING_H -#define ARROW_UTIL_BPACKING_H - -#include "arrow/util/logging.h" - -namespace arrow { -namespace internal { - -inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) & 1; - out++; - *out = ((*in) >> 1) & 1; - out++; - *out = ((*in) >> 2) & 1; - out++; - *out = ((*in) >> 3) & 1; - out++; - *out = ((*in) >> 4) & 1; - out++; - *out = ((*in) >> 5) & 1; - out++; - *out = ((*in) >> 6) & 1; - out++; - *out = ((*in) >> 7) & 1; - out++; - *out = ((*in) >> 8) & 1; - out++; - *out = ((*in) >> 9) & 1; - out++; - *out = ((*in) >> 10) & 1; - out++; - *out = ((*in) >> 11) & 1; - out++; - *out = ((*in) >> 12) & 1; - out++; - *out = ((*in) >> 13) & 1; - out++; - *out = ((*in) >> 14) & 1; - out++; - *out = ((*in) >> 15) & 1; - out++; - *out = ((*in) >> 16) & 1; - out++; - *out = ((*in) >> 17) & 1; - out++; - *out = ((*in) >> 18) & 1; - out++; - *out = ((*in) >> 19) & 1; - out++; - *out = ((*in) >> 20) & 1; - out++; - *out = ((*in) >> 21) & 1; - out++; - *out = ((*in) >> 22) & 1; - out++; - *out = ((*in) >> 23) & 1; - out++; - *out = ((*in) >> 24) & 1; - out++; - *out = ((*in) >> 25) & 1; - out++; - *out = ((*in) >> 
26) & 1; - out++; - *out = ((*in) >> 27) & 1; - out++; - *out = ((*in) >> 28) & 1; - out++; - *out = ((*in) >> 29) & 1; - out++; - *out = ((*in) >> 30) & 1; - out++; - *out = ((*in) >> 31); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 2); - out++; - *out = ((*in) >> 2) % (1U << 2); - out++; - *out = ((*in) >> 4) % (1U << 2); - out++; - *out = ((*in) >> 6) % (1U << 2); - out++; - *out = ((*in) >> 8) % (1U << 2); - out++; - *out = ((*in) >> 10) % (1U << 2); - out++; - *out = ((*in) >> 12) % (1U << 2); - out++; - *out = ((*in) >> 14) % (1U << 2); - out++; - *out = ((*in) >> 16) % (1U << 2); - out++; - *out = ((*in) >> 18) % (1U << 2); - out++; - *out = ((*in) >> 20) % (1U << 2); - out++; - *out = ((*in) >> 22) % (1U << 2); - out++; - *out = ((*in) >> 24) % (1U << 2); - out++; - *out = ((*in) >> 26) % (1U << 2); - out++; - *out = ((*in) >> 28) % (1U << 2); - out++; - *out = ((*in) >> 30); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 2); - out++; - *out = ((*in) >> 2) % (1U << 2); - out++; - *out = ((*in) >> 4) % (1U << 2); - out++; - *out = ((*in) >> 6) % (1U << 2); - out++; - *out = ((*in) >> 8) % (1U << 2); - out++; - *out = ((*in) >> 10) % (1U << 2); - out++; - *out = ((*in) >> 12) % (1U << 2); - out++; - *out = ((*in) >> 14) % (1U << 2); - out++; - *out = ((*in) >> 16) % (1U << 2); - out++; - *out = ((*in) >> 18) % (1U << 2); - out++; - *out = ((*in) >> 20) % (1U << 2); - out++; - *out = ((*in) >> 22) % (1U << 2); - out++; - *out = ((*in) >> 24) % (1U << 2); - out++; - *out = ((*in) >> 26) % (1U << 2); - out++; - *out = ((*in) >> 28) % (1U << 2); - out++; - *out = ((*in) >> 30); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack3_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 3); - out++; - *out = ((*in) >> 3) % (1U << 3); - out++; - *out = ((*in) >> 6) % (1U << 3); - out++; - *out = ((*in) >> 9) % (1U << 3); - out++; - *out = 
((*in) >> 12) % (1U << 3); - out++; - *out = ((*in) >> 15) % (1U << 3); - out++; - *out = ((*in) >> 18) % (1U << 3); - out++; - *out = ((*in) >> 21) % (1U << 3); - out++; - *out = ((*in) >> 24) % (1U << 3); - out++; - *out = ((*in) >> 27) % (1U << 3); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 1)) << (3 - 1); - out++; - *out = ((*in) >> 1) % (1U << 3); - out++; - *out = ((*in) >> 4) % (1U << 3); - out++; - *out = ((*in) >> 7) % (1U << 3); - out++; - *out = ((*in) >> 10) % (1U << 3); - out++; - *out = ((*in) >> 13) % (1U << 3); - out++; - *out = ((*in) >> 16) % (1U << 3); - out++; - *out = ((*in) >> 19) % (1U << 3); - out++; - *out = ((*in) >> 22) % (1U << 3); - out++; - *out = ((*in) >> 25) % (1U << 3); - out++; - *out = ((*in) >> 28) % (1U << 3); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 2)) << (3 - 2); - out++; - *out = ((*in) >> 2) % (1U << 3); - out++; - *out = ((*in) >> 5) % (1U << 3); - out++; - *out = ((*in) >> 8) % (1U << 3); - out++; - *out = ((*in) >> 11) % (1U << 3); - out++; - *out = ((*in) >> 14) % (1U << 3); - out++; - *out = ((*in) >> 17) % (1U << 3); - out++; - *out = ((*in) >> 20) % (1U << 3); - out++; - *out = ((*in) >> 23) % (1U << 3); - out++; - *out = ((*in) >> 26) % (1U << 3); - out++; - *out = ((*in) >> 29); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack4_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) 
>> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 4); - out++; - *out = ((*in) >> 4) % (1U << 4); - out++; - *out = ((*in) >> 8) % (1U << 4); - out++; - *out = ((*in) >> 12) % (1U << 4); - out++; - *out = ((*in) >> 16) % (1U << 4); - out++; - *out = ((*in) >> 20) % (1U << 4); - out++; - *out = ((*in) >> 24) % (1U << 4); - out++; - *out = ((*in) >> 28); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack5_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 5); - out++; - *out = ((*in) >> 5) % (1U << 5); - out++; - *out = ((*in) >> 10) % (1U << 5); - out++; - *out = ((*in) >> 15) % (1U << 5); - out++; - *out = ((*in) >> 20) % (1U << 5); - out++; - *out = ((*in) >> 25) % (1U << 5); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 3)) << (5 - 3); - out++; - *out = ((*in) >> 3) % (1U << 5); - out++; - *out = ((*in) >> 8) % (1U << 5); - out++; - *out = ((*in) >> 13) % (1U << 5); - out++; - *out = ((*in) >> 18) % (1U << 5); - out++; - *out = ((*in) >> 23) % (1U << 5); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 1)) << (5 - 1); - out++; - *out = ((*in) >> 1) % (1U << 5); - out++; - *out = ((*in) >> 6) % (1U << 5); - out++; - *out = ((*in) >> 11) % (1U << 5); - out++; - *out = ((*in) >> 16) % (1U << 5); - out++; - *out = ((*in) >> 21) % (1U << 5); - out++; - *out = ((*in) >> 26) % (1U << 5); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 4)) << (5 - 4); - out++; - *out = ((*in) >> 4) % (1U << 5); - out++; - *out = ((*in) >> 9) % (1U << 5); 
- out++; - *out = ((*in) >> 14) % (1U << 5); - out++; - *out = ((*in) >> 19) % (1U << 5); - out++; - *out = ((*in) >> 24) % (1U << 5); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 2)) << (5 - 2); - out++; - *out = ((*in) >> 2) % (1U << 5); - out++; - *out = ((*in) >> 7) % (1U << 5); - out++; - *out = ((*in) >> 12) % (1U << 5); - out++; - *out = ((*in) >> 17) % (1U << 5); - out++; - *out = ((*in) >> 22) % (1U << 5); - out++; - *out = ((*in) >> 27); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack6_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 6); - out++; - *out = ((*in) >> 6) % (1U << 6); - out++; - *out = ((*in) >> 12) % (1U << 6); - out++; - *out = ((*in) >> 18) % (1U << 6); - out++; - *out = ((*in) >> 24) % (1U << 6); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6); - out++; - *out = ((*in) >> 10) % (1U << 6); - out++; - *out = ((*in) >> 16) % (1U << 6); - out++; - *out = ((*in) >> 22) % (1U << 6); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6); - out++; - *out = ((*in) >> 8) % (1U << 6); - out++; - *out = ((*in) >> 14) % (1U << 6); - out++; - *out = ((*in) >> 20) % (1U << 6); - out++; - *out = ((*in) >> 26); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 6); - out++; - *out = ((*in) >> 6) % (1U << 6); - out++; - *out = ((*in) >> 12) % (1U << 6); - out++; - *out = ((*in) >> 18) % (1U << 6); - out++; - *out = ((*in) >> 24) % (1U << 6); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 4)) << (6 - 4); - out++; - *out = ((*in) >> 4) % (1U << 6); - out++; - *out = ((*in) >> 10) % (1U << 6); - out++; - *out = ((*in) >> 16) % (1U << 6); - out++; - *out = ((*in) >> 22) % (1U << 6); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 2)) << (6 - 2); - out++; - *out = ((*in) >> 2) % (1U << 6); - out++; - *out = ((*in) 
>> 8) % (1U << 6); - out++; - *out = ((*in) >> 14) % (1U << 6); - out++; - *out = ((*in) >> 20) % (1U << 6); - out++; - *out = ((*in) >> 26); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack7_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 7); - out++; - *out = ((*in) >> 7) % (1U << 7); - out++; - *out = ((*in) >> 14) % (1U << 7); - out++; - *out = ((*in) >> 21) % (1U << 7); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 3)) << (7 - 3); - out++; - *out = ((*in) >> 3) % (1U << 7); - out++; - *out = ((*in) >> 10) % (1U << 7); - out++; - *out = ((*in) >> 17) % (1U << 7); - out++; - *out = ((*in) >> 24) % (1U << 7); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 6)) << (7 - 6); - out++; - *out = ((*in) >> 6) % (1U << 7); - out++; - *out = ((*in) >> 13) % (1U << 7); - out++; - *out = ((*in) >> 20) % (1U << 7); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 2)) << (7 - 2); - out++; - *out = ((*in) >> 2) % (1U << 7); - out++; - *out = ((*in) >> 9) % (1U << 7); - out++; - *out = ((*in) >> 16) % (1U << 7); - out++; - *out = ((*in) >> 23) % (1U << 7); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 5)) << (7 - 5); - out++; - *out = ((*in) >> 5) % (1U << 7); - out++; - *out = ((*in) >> 12) % (1U << 7); - out++; - *out = ((*in) >> 19) % (1U << 7); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 1)) << (7 - 1); - out++; - *out = ((*in) >> 1) % (1U << 7); - out++; - *out = ((*in) >> 8) % (1U << 7); - out++; - *out = ((*in) >> 15) % (1U << 7); - out++; - *out = ((*in) >> 22) % (1U << 7); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 4)) << (7 - 4); - out++; - *out = ((*in) >> 4) % (1U << 7); - out++; - *out = ((*in) >> 11) % (1U << 7); - out++; - *out = ((*in) >> 18) % (1U << 7); - out++; - *out = ((*in) >> 25); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack8_32(const uint32_t* in, uint32_t* out) { - 
*out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 8); - out++; - *out = ((*in) >> 8) % (1U << 8); - out++; - *out = ((*in) >> 16) % (1U << 8); - out++; - *out = ((*in) >> 24); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack9_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 9); - out++; - *out = ((*in) >> 9) % (1U << 9); - out++; - *out = ((*in) >> 18) % (1U << 9); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 4)) << (9 - 4); - out++; - *out = ((*in) >> 4) % (1U << 9); - out++; - *out = ((*in) >> 13) % (1U << 9); - out++; - *out = ((*in) >> 22) % (1U << 9); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 8)) << (9 - 8); - out++; - *out = ((*in) >> 8) % (1U << 9); - out++; - *out = ((*in) >> 17) % (1U << 9); - 
out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 3)) << (9 - 3); - out++; - *out = ((*in) >> 3) % (1U << 9); - out++; - *out = ((*in) >> 12) % (1U << 9); - out++; - *out = ((*in) >> 21) % (1U << 9); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 7)) << (9 - 7); - out++; - *out = ((*in) >> 7) % (1U << 9); - out++; - *out = ((*in) >> 16) % (1U << 9); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 2)) << (9 - 2); - out++; - *out = ((*in) >> 2) % (1U << 9); - out++; - *out = ((*in) >> 11) % (1U << 9); - out++; - *out = ((*in) >> 20) % (1U << 9); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 6)) << (9 - 6); - out++; - *out = ((*in) >> 6) % (1U << 9); - out++; - *out = ((*in) >> 15) % (1U << 9); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 1)) << (9 - 1); - out++; - *out = ((*in) >> 1) % (1U << 9); - out++; - *out = ((*in) >> 10) % (1U << 9); - out++; - *out = ((*in) >> 19) % (1U << 9); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 5)) << (9 - 5); - out++; - *out = ((*in) >> 5) % (1U << 9); - out++; - *out = ((*in) >> 14) % (1U << 9); - out++; - *out = ((*in) >> 23); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack10_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 10); - out++; - *out = ((*in) >> 10) % (1U << 10); - out++; - *out = ((*in) >> 20) % (1U << 10); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10); - out++; - *out = ((*in) >> 18) % (1U << 10); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10); - out++; - *out = ((*in) >> 16) % (1U << 10); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10); - out++; - *out = ((*in) >> 14) % (1U << 10); - out++; - *out = ((*in) >> 24); - 
++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10); - out++; - *out = ((*in) >> 12) % (1U << 10); - out++; - *out = ((*in) >> 22); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 10); - out++; - *out = ((*in) >> 10) % (1U << 10); - out++; - *out = ((*in) >> 20) % (1U << 10); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 8)) << (10 - 8); - out++; - *out = ((*in) >> 8) % (1U << 10); - out++; - *out = ((*in) >> 18) % (1U << 10); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 6)) << (10 - 6); - out++; - *out = ((*in) >> 6) % (1U << 10); - out++; - *out = ((*in) >> 16) % (1U << 10); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 4)) << (10 - 4); - out++; - *out = ((*in) >> 4) % (1U << 10); - out++; - *out = ((*in) >> 14) % (1U << 10); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 2)) << (10 - 2); - out++; - *out = ((*in) >> 2) % (1U << 10); - out++; - *out = ((*in) >> 12) % (1U << 10); - out++; - *out = ((*in) >> 22); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack11_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 11); - out++; - *out = ((*in) >> 11) % (1U << 11); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 1)) << (11 - 1); - out++; - *out = ((*in) >> 1) % (1U << 11); - out++; - *out = ((*in) >> 12) % (1U << 11); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 2)) << (11 - 2); - out++; - *out = ((*in) >> 2) % (1U << 11); - out++; - *out = ((*in) >> 13) % (1U << 11); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 3)) << (11 - 3); - out++; - *out = ((*in) >> 3) % (1U << 11); - out++; - *out = ((*in) >> 14) % (1U << 11); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 4)) << (11 - 4); - out++; - *out = ((*in) >> 4) % (1U << 11); - out++; - *out = ((*in) >> 15) % (1U << 11); - out++; - *out = ((*in) >> 26); - ++in; - *out |= 
((*in) % (1U << 5)) << (11 - 5); - out++; - *out = ((*in) >> 5) % (1U << 11); - out++; - *out = ((*in) >> 16) % (1U << 11); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 6)) << (11 - 6); - out++; - *out = ((*in) >> 6) % (1U << 11); - out++; - *out = ((*in) >> 17) % (1U << 11); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 7)) << (11 - 7); - out++; - *out = ((*in) >> 7) % (1U << 11); - out++; - *out = ((*in) >> 18) % (1U << 11); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 8)) << (11 - 8); - out++; - *out = ((*in) >> 8) % (1U << 11); - out++; - *out = ((*in) >> 19) % (1U << 11); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 9)) << (11 - 9); - out++; - *out = ((*in) >> 9) % (1U << 11); - out++; - *out = ((*in) >> 20) % (1U << 11); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 10)) << (11 - 10); - out++; - *out = ((*in) >> 10) % (1U << 11); - out++; - *out = ((*in) >> 21); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack12_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U 
<< 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 12); - out++; - *out = ((*in) >> 12) % (1U << 12); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 4)) << (12 - 4); - out++; - *out = ((*in) >> 4) % (1U << 12); - out++; - *out = ((*in) >> 16) % (1U << 12); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 8)) << (12 - 8); - out++; - *out = ((*in) >> 8) % (1U << 12); - out++; - *out = ((*in) >> 20); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack13_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 13); - out++; - *out = ((*in) >> 13) % (1U << 13); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 7)) << (13 - 7); - out++; - *out = ((*in) >> 7) % (1U << 13); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 1)) << (13 - 1); - out++; - *out = ((*in) >> 1) % (1U << 13); - out++; - *out = ((*in) >> 14) % (1U << 13); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 8)) << (13 - 8); - out++; - *out = ((*in) >> 8) % (1U << 13); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 2)) << (13 - 2); - out++; - *out = ((*in) >> 2) % (1U << 13); - out++; - *out = ((*in) >> 15) % (1U << 13); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 9)) << (13 - 9); - out++; - *out = ((*in) >> 9) % (1U << 13); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 3)) << (13 - 3); - out++; - *out = ((*in) >> 3) % (1U << 13); - out++; - *out = ((*in) >> 16) % (1U << 13); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 
10)) << (13 - 10); - out++; - *out = ((*in) >> 10) % (1U << 13); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 4)) << (13 - 4); - out++; - *out = ((*in) >> 4) % (1U << 13); - out++; - *out = ((*in) >> 17) % (1U << 13); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 11)) << (13 - 11); - out++; - *out = ((*in) >> 11) % (1U << 13); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 5)) << (13 - 5); - out++; - *out = ((*in) >> 5) % (1U << 13); - out++; - *out = ((*in) >> 18) % (1U << 13); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 12)) << (13 - 12); - out++; - *out = ((*in) >> 12) % (1U << 13); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 6)) << (13 - 6); - out++; - *out = ((*in) >> 6) % (1U << 13); - out++; - *out = ((*in) >> 19); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack14_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 14); - out++; - *out = ((*in) >> 14) % (1U << 14); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14); - out++; - *out = ((*in) >> 16) % (1U << 14); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 8)) << (14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14); - out++; - *out = ((*in) >> 18); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 14); - out++; - *out = ((*in) >> 14) % (1U << 14); - out++; 
- *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 10)) << (14 - 10); - out++; - *out = ((*in) >> 10) % (1U << 14); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 6)) << (14 - 6); - out++; - *out = ((*in) >> 6) % (1U << 14); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 2)) << (14 - 2); - out++; - *out = ((*in) >> 2) % (1U << 14); - out++; - *out = ((*in) >> 16) % (1U << 14); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 12)) << (14 - 12); - out++; - *out = ((*in) >> 12) % (1U << 14); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 8)) << (14 - 8); - out++; - *out = ((*in) >> 8) % (1U << 14); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 4)) << (14 - 4); - out++; - *out = ((*in) >> 4) % (1U << 14); - out++; - *out = ((*in) >> 18); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack15_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 15); - out++; - *out = ((*in) >> 15) % (1U << 15); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 13)) << (15 - 13); - out++; - *out = ((*in) >> 13) % (1U << 15); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 11)) << (15 - 11); - out++; - *out = ((*in) >> 11) % (1U << 15); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 9)) << (15 - 9); - out++; - *out = ((*in) >> 9) % (1U << 15); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 7)) << (15 - 7); - out++; - *out = ((*in) >> 7) % (1U << 15); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 5)) << (15 - 5); - out++; - *out = ((*in) >> 5) % (1U << 15); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 3)) << (15 - 3); - out++; - *out = ((*in) >> 3) % (1U << 15); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 1)) << (15 - 1); - out++; - *out = ((*in) >> 1) % (1U << 15); - out++; - *out = ((*in) >> 16) % (1U << 15); - 
out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 14)) << (15 - 14); - out++; - *out = ((*in) >> 14) % (1U << 15); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 12)) << (15 - 12); - out++; - *out = ((*in) >> 12) % (1U << 15); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 10)) << (15 - 10); - out++; - *out = ((*in) >> 10) % (1U << 15); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 8)) << (15 - 8); - out++; - *out = ((*in) >> 8) % (1U << 15); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 6)) << (15 - 6); - out++; - *out = ((*in) >> 6) % (1U << 15); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 4)) << (15 - 4); - out++; - *out = ((*in) >> 4) % (1U << 15); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 2)) << (15 - 2); - out++; - *out = ((*in) >> 2) % (1U << 15); - out++; - *out = ((*in) >> 17); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack16_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; 
- *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 16); - out++; - *out = ((*in) >> 16); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack17_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 2)) << (17 - 2); - out++; - *out = ((*in) >> 2) % (1U << 17); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 4)) << (17 - 4); - out++; - *out = ((*in) >> 4) % (1U << 17); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 6)) << (17 - 6); - out++; - *out = ((*in) >> 6) % (1U << 17); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 8)) << (17 - 8); - out++; - *out = ((*in) >> 8) % (1U << 17); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 10)) << (17 - 10); - out++; - *out = ((*in) >> 10) % (1U << 17); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 12)) << (17 - 12); - out++; - *out = ((*in) >> 12) % (1U << 17); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 14)) << (17 - 14); - out++; - *out = ((*in) >> 14) % (1U << 17); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 16)) << (17 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 1)) << (17 - 1); - out++; - *out = ((*in) >> 1) % (1U << 17); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 3)) << (17 - 3); - out++; - *out = ((*in) >> 3) % (1U << 17); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 5)) << (17 - 5); - out++; - *out = ((*in) >> 5) % (1U << 17); - out++; - *out = ((*in) >> 22); - ++in; - *out |= 
((*in) % (1U << 7)) << (17 - 7); - out++; - *out = ((*in) >> 7) % (1U << 17); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 9)) << (17 - 9); - out++; - *out = ((*in) >> 9) % (1U << 17); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 11)) << (17 - 11); - out++; - *out = ((*in) >> 11) % (1U << 17); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 13)) << (17 - 13); - out++; - *out = ((*in) >> 13) % (1U << 17); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 15)) << (17 - 15); - out++; - *out = ((*in) >> 15); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack18_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - *out = ((*in) >> 8) % (1U << 18); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 4)) << (18 - 4); - out++; - *out = ((*in) >> 4) % (1U << 18); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 8)) << (18 - 8); - out++; - 
*out = ((*in) >> 8) % (1U << 18); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 12)) << (18 - 12); - out++; - *out = ((*in) >> 12) % (1U << 18); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 16)) << (18 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 2)) << (18 - 2); - out++; - *out = ((*in) >> 2) % (1U << 18); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 6)) << (18 - 6); - out++; - *out = ((*in) >> 6) % (1U << 18); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 10)) << (18 - 10); - out++; - *out = ((*in) >> 10) % (1U << 18); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 14)) << (18 - 14); - out++; - *out = ((*in) >> 14); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack19_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 6)) << (19 - 6); - out++; - *out = ((*in) >> 6) % (1U << 19); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 12)) << (19 - 12); - out++; - *out = ((*in) >> 12) % (1U << 19); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 18)) << (19 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 5)) << (19 - 5); - out++; - *out = ((*in) >> 5) % (1U << 19); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 11)) << (19 - 11); - out++; - *out = ((*in) >> 11) % (1U << 19); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 17)) << (19 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 4)) << (19 - 4); - out++; - *out = ((*in) >> 4) % (1U << 19); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 10)) << (19 - 10); - out++; - *out = ((*in) >> 10) % (1U << 19); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 16)) << (19 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= 
((*in) % (1U << 3)) << (19 - 3); - out++; - *out = ((*in) >> 3) % (1U << 19); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 9)) << (19 - 9); - out++; - *out = ((*in) >> 9) % (1U << 19); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 15)) << (19 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 2)) << (19 - 2); - out++; - *out = ((*in) >> 2) % (1U << 19); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 8)) << (19 - 8); - out++; - *out = ((*in) >> 8) % (1U << 19); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 14)) << (19 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 1)) << (19 - 1); - out++; - *out = ((*in) >> 1) % (1U << 19); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 7)) << (19 - 7); - out++; - *out = ((*in) >> 7) % (1U << 19); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 13)) << (19 - 13); - out++; - *out = ((*in) >> 13); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack20_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) 
>> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 8)) << (20 - 8); - out++; - *out = ((*in) >> 8) % (1U << 20); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 16)) << (20 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 4)) << (20 - 4); - out++; - *out = ((*in) >> 4) % (1U << 20); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 12)) << (20 - 12); - out++; - *out = ((*in) >> 12); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack21_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 10)) << (21 - 10); - out++; - *out = ((*in) >> 10) % (1U << 21); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 20)) << (21 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 9)) << (21 - 9); - out++; - *out = ((*in) >> 9) % (1U << 21); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 19)) << (21 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 8)) << (21 - 8); - out++; - *out = ((*in) >> 8) % (1U << 21); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 18)) << (21 - 18); - out++; - *out = ((*in) >> 18); - 
++in; - *out |= ((*in) % (1U << 7)) << (21 - 7); - out++; - *out = ((*in) >> 7) % (1U << 21); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 17)) << (21 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 6)) << (21 - 6); - out++; - *out = ((*in) >> 6) % (1U << 21); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 16)) << (21 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 5)) << (21 - 5); - out++; - *out = ((*in) >> 5) % (1U << 21); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 15)) << (21 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 4)) << (21 - 4); - out++; - *out = ((*in) >> 4) % (1U << 21); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 14)) << (21 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 3)) << (21 - 3); - out++; - *out = ((*in) >> 3) % (1U << 21); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 13)) << (21 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 2)) << (21 - 2); - out++; - *out = ((*in) >> 2) % (1U << 21); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 12)) << (21 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 1)) << (21 - 1); - out++; - *out = ((*in) >> 1) % (1U << 21); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 11)) << (21 - 11); - out++; - *out = ((*in) >> 11); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack22_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 12)) << (22 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out 
|= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 12)) << (22 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 2)) << (22 - 2); - out++; - *out = ((*in) >> 2) % (1U << 22); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 14)) << (22 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 4)) << (22 - 4); - out++; - *out = ((*in) >> 4) % (1U << 22); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 16)) << (22 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 6)) << (22 - 6); - out++; - *out = ((*in) >> 6) % (1U << 22); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 18)) << (22 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 8)) << (22 - 8); - out++; - *out = ((*in) >> 8) % (1U << 22); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 20)) << (22 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 10)) << (22 - 10); - out++; - *out = ((*in) >> 10); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack23_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 23); - out++; - *out = 
((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 14)) << (23 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 5)) << (23 - 5); - out++; - *out = ((*in) >> 5) % (1U << 23); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 19)) << (23 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 10)) << (23 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 1)) << (23 - 1); - out++; - *out = ((*in) >> 1) % (1U << 23); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 15)) << (23 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 6)) << (23 - 6); - out++; - *out = ((*in) >> 6) % (1U << 23); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 20)) << (23 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 11)) << (23 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 2)) << (23 - 2); - out++; - *out = ((*in) >> 2) % (1U << 23); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 16)) << (23 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 7)) << (23 - 7); - out++; - *out = ((*in) >> 7) % (1U << 23); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 21)) << (23 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 12)) << (23 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 3)) << (23 - 3); - out++; - *out = ((*in) >> 3) % (1U << 23); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 17)) << (23 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 8)) << (23 - 8); - out++; - *out = ((*in) >> 8) % (1U << 23); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 22)) << (23 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 13)) << (23 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 4)) << (23 - 
4); - out++; - *out = ((*in) >> 4) % (1U << 23); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 18)) << (23 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 9)) << (23 - 9); - out++; - *out = ((*in) >> 9); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack24_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) 
<< (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 16)) << (24 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 8)) << (24 - 8); - out++; - *out = ((*in) >> 8); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack25_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 18)) << (25 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 11)) << (25 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 4)) << (25 - 4); - out++; - *out = ((*in) >> 4) % (1U << 25); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 22)) << (25 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 15)) << (25 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 8)) << (25 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 1)) << (25 - 1); - out++; - *out = ((*in) >> 1) % (1U << 25); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 19)) << (25 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 12)) << (25 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 5)) << (25 - 5); - out++; - *out = ((*in) >> 5) % (1U << 25); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 23)) << (25 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 16)) << (25 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 9)) << (25 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 2)) << (25 - 2); - out++; - *out = ((*in) >> 2) % (1U << 25); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 20)) << (25 - 20); - out++; - 
*out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 13)) << (25 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 6)) << (25 - 6); - out++; - *out = ((*in) >> 6) % (1U << 25); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 24)) << (25 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 17)) << (25 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 10)) << (25 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 3)) << (25 - 3); - out++; - *out = ((*in) >> 3) % (1U << 25); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 21)) << (25 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 14)) << (25 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 7)) << (25 - 7); - out++; - *out = ((*in) >> 7); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack26_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 10)) << (26 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 
18); - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 20)) << (26 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 14)) << (26 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 8)) << (26 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 2)) << (26 - 2); - out++; - *out = ((*in) >> 2) % (1U << 26); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 22)) << (26 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 16)) << (26 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 10)) << (26 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 4)) << (26 - 4); - out++; - *out = ((*in) >> 4) % (1U << 26); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 24)) << (26 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 18)) << (26 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 12)) << (26 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 6)) << (26 - 6); - out++; - *out = ((*in) >> 6); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack27_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 22)) << (27 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 17)) << (27 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 12)) << (27 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 7)) << (27 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U << 2)) << (27 - 2); - out++; - *out = ((*in) >> 2) % (1U << 27); - out++; - 
*out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 24)) << (27 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 19)) << (27 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 14)) << (27 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 9)) << (27 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 4)) << (27 - 4); - out++; - *out = ((*in) >> 4) % (1U << 27); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 26)) << (27 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 21)) << (27 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 16)) << (27 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 11)) << (27 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 6)) << (27 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 1)) << (27 - 1); - out++; - *out = ((*in) >> 1) % (1U << 27); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 23)) << (27 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 18)) << (27 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 13)) << (27 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 8)) << (27 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 3)) << (27 - 3); - out++; - *out = ((*in) >> 3) % (1U << 27); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 25)) << (27 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 20)) << (27 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 15)) << (27 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 10)) << (27 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 5)) << (27 - 5); - out++; - *out = ((*in) >> 5); - ++in; - out++; - - return in; -} - -inline const 
uint32_t* unpack28_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 24)) << (28 - 24); - out++; - *out = ((*in) >> 
24); - ++in; - *out |= ((*in) % (1U << 20)) << (28 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 16)) << (28 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 12)) << (28 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 8)) << (28 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 4)) << (28 - 4); - out++; - *out = ((*in) >> 4); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack29_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 29); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 26)) << (29 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 23)) << (29 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 20)) << (29 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 17)) << (29 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 14)) << (29 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 11)) << (29 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 8)) << (29 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 5)) << (29 - 5); - out++; - *out = ((*in) >> 5); - ++in; - *out |= ((*in) % (1U << 2)) << (29 - 2); - out++; - *out = ((*in) >> 2) % (1U << 29); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 28)) << (29 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 25)) << (29 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 22)) << (29 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 19)) << (29 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 16)) << (29 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 13)) << (29 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 10)) << (29 - 10); - out++; - 
*out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 7)) << (29 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U << 4)) << (29 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 1)) << (29 - 1); - out++; - *out = ((*in) >> 1) % (1U << 29); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 27)) << (29 - 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 24)) << (29 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 21)) << (29 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 18)) << (29 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 15)) << (29 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 12)) << (29 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 9)) << (29 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 6)) << (29 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 3)) << (29 - 3); - out++; - *out = ((*in) >> 3); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack30_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 30); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 
12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2); - ++in; - out++; - *out = ((*in) >> 0) % (1U << 30); - out++; - *out = ((*in) >> 30); - ++in; - *out |= ((*in) % (1U << 28)) << (30 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 26)) << (30 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 24)) << (30 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 22)) << (30 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 20)) << (30 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 18)) << (30 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 16)) << (30 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 14)) << (30 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 12)) << (30 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 10)) << (30 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 8)) << (30 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 6)) << (30 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 4)) << (30 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 2)) << (30 - 2); - out++; - *out = ((*in) >> 2); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack31_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0) % (1U << 31); - out++; - *out = ((*in) >> 31); - ++in; - *out |= ((*in) % (1U << 30)) << (31 - 30); - out++; - *out = ((*in) >> 30); 
- ++in; - *out |= ((*in) % (1U << 29)) << (31 - 29); - out++; - *out = ((*in) >> 29); - ++in; - *out |= ((*in) % (1U << 28)) << (31 - 28); - out++; - *out = ((*in) >> 28); - ++in; - *out |= ((*in) % (1U << 27)) << (31 - 27); - out++; - *out = ((*in) >> 27); - ++in; - *out |= ((*in) % (1U << 26)) << (31 - 26); - out++; - *out = ((*in) >> 26); - ++in; - *out |= ((*in) % (1U << 25)) << (31 - 25); - out++; - *out = ((*in) >> 25); - ++in; - *out |= ((*in) % (1U << 24)) << (31 - 24); - out++; - *out = ((*in) >> 24); - ++in; - *out |= ((*in) % (1U << 23)) << (31 - 23); - out++; - *out = ((*in) >> 23); - ++in; - *out |= ((*in) % (1U << 22)) << (31 - 22); - out++; - *out = ((*in) >> 22); - ++in; - *out |= ((*in) % (1U << 21)) << (31 - 21); - out++; - *out = ((*in) >> 21); - ++in; - *out |= ((*in) % (1U << 20)) << (31 - 20); - out++; - *out = ((*in) >> 20); - ++in; - *out |= ((*in) % (1U << 19)) << (31 - 19); - out++; - *out = ((*in) >> 19); - ++in; - *out |= ((*in) % (1U << 18)) << (31 - 18); - out++; - *out = ((*in) >> 18); - ++in; - *out |= ((*in) % (1U << 17)) << (31 - 17); - out++; - *out = ((*in) >> 17); - ++in; - *out |= ((*in) % (1U << 16)) << (31 - 16); - out++; - *out = ((*in) >> 16); - ++in; - *out |= ((*in) % (1U << 15)) << (31 - 15); - out++; - *out = ((*in) >> 15); - ++in; - *out |= ((*in) % (1U << 14)) << (31 - 14); - out++; - *out = ((*in) >> 14); - ++in; - *out |= ((*in) % (1U << 13)) << (31 - 13); - out++; - *out = ((*in) >> 13); - ++in; - *out |= ((*in) % (1U << 12)) << (31 - 12); - out++; - *out = ((*in) >> 12); - ++in; - *out |= ((*in) % (1U << 11)) << (31 - 11); - out++; - *out = ((*in) >> 11); - ++in; - *out |= ((*in) % (1U << 10)) << (31 - 10); - out++; - *out = ((*in) >> 10); - ++in; - *out |= ((*in) % (1U << 9)) << (31 - 9); - out++; - *out = ((*in) >> 9); - ++in; - *out |= ((*in) % (1U << 8)) << (31 - 8); - out++; - *out = ((*in) >> 8); - ++in; - *out |= ((*in) % (1U << 7)) << (31 - 7); - out++; - *out = ((*in) >> 7); - ++in; - *out |= ((*in) % (1U 
<< 6)) << (31 - 6); - out++; - *out = ((*in) >> 6); - ++in; - *out |= ((*in) % (1U << 5)) << (31 - 5); - out++; - *out = ((*in) >> 5); - ++in; - *out |= ((*in) % (1U << 4)) << (31 - 4); - out++; - *out = ((*in) >> 4); - ++in; - *out |= ((*in) % (1U << 3)) << (31 - 3); - out++; - *out = ((*in) >> 3); - ++in; - *out |= ((*in) % (1U << 2)) << (31 - 2); - out++; - *out = ((*in) >> 2); - ++in; - *out |= ((*in) % (1U << 1)) << (31 - 1); - out++; - *out = ((*in) >> 1); - ++in; - out++; - - return in; -} - -inline const uint32_t* unpack32_32(const uint32_t* in, uint32_t* out) { - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - *out = ((*in) >> 0); - ++in; - out++; - - return in; -} - -inline const uint32_t* nullunpacker32(const uint32_t* in, uint32_t* out) { - for (int k = 0; k < 32; ++k) { - out[k] = 0; 
- } - return in; -} - -inline int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - batch_size = batch_size / 32 * 32; - int num_loops = batch_size / 32; - - switch (num_bits) { - case 0: - for (int i = 0; i < num_loops; ++i) in = nullunpacker32(in, out + i * 32); - break; - case 1: - for (int i = 0; i < num_loops; ++i) in = unpack1_32(in, out + i * 32); - break; - case 2: - for (int i = 0; i < num_loops; ++i) in = unpack2_32(in, out + i * 32); - break; - case 3: - for (int i = 0; i < num_loops; ++i) in = unpack3_32(in, out + i * 32); - break; - case 4: - for (int i = 0; i < num_loops; ++i) in = unpack4_32(in, out + i * 32); - break; - case 5: - for (int i = 0; i < num_loops; ++i) in = unpack5_32(in, out + i * 32); - break; - case 6: - for (int i = 0; i < num_loops; ++i) in = unpack6_32(in, out + i * 32); - break; - case 7: - for (int i = 0; i < num_loops; ++i) in = unpack7_32(in, out + i * 32); - break; - case 8: - for (int i = 0; i < num_loops; ++i) in = unpack8_32(in, out + i * 32); - break; - case 9: - for (int i = 0; i < num_loops; ++i) in = unpack9_32(in, out + i * 32); - break; - case 10: - for (int i = 0; i < num_loops; ++i) in = unpack10_32(in, out + i * 32); - break; - case 11: - for (int i = 0; i < num_loops; ++i) in = unpack11_32(in, out + i * 32); - break; - case 12: - for (int i = 0; i < num_loops; ++i) in = unpack12_32(in, out + i * 32); - break; - case 13: - for (int i = 0; i < num_loops; ++i) in = unpack13_32(in, out + i * 32); - break; - case 14: - for (int i = 0; i < num_loops; ++i) in = unpack14_32(in, out + i * 32); - break; - case 15: - for (int i = 0; i < num_loops; ++i) in = unpack15_32(in, out + i * 32); - break; - case 16: - for (int i = 0; i < num_loops; ++i) in = unpack16_32(in, out + i * 32); - break; - case 17: - for (int i = 0; i < num_loops; ++i) in = unpack17_32(in, out + i * 32); - break; - case 18: - for (int i = 0; i < num_loops; ++i) in = unpack18_32(in, out + i * 32); - break; - case 19: - for (int i 
= 0; i < num_loops; ++i) in = unpack19_32(in, out + i * 32); - break; - case 20: - for (int i = 0; i < num_loops; ++i) in = unpack20_32(in, out + i * 32); - break; - case 21: - for (int i = 0; i < num_loops; ++i) in = unpack21_32(in, out + i * 32); - break; - case 22: - for (int i = 0; i < num_loops; ++i) in = unpack22_32(in, out + i * 32); - break; - case 23: - for (int i = 0; i < num_loops; ++i) in = unpack23_32(in, out + i * 32); - break; - case 24: - for (int i = 0; i < num_loops; ++i) in = unpack24_32(in, out + i * 32); - break; - case 25: - for (int i = 0; i < num_loops; ++i) in = unpack25_32(in, out + i * 32); - break; - case 26: - for (int i = 0; i < num_loops; ++i) in = unpack26_32(in, out + i * 32); - break; - case 27: - for (int i = 0; i < num_loops; ++i) in = unpack27_32(in, out + i * 32); - break; - case 28: - for (int i = 0; i < num_loops; ++i) in = unpack28_32(in, out + i * 32); - break; - case 29: - for (int i = 0; i < num_loops; ++i) in = unpack29_32(in, out + i * 32); - break; - case 30: - for (int i = 0; i < num_loops; ++i) in = unpack30_32(in, out + i * 32); - break; - case 31: - for (int i = 0; i < num_loops; ++i) in = unpack31_32(in, out + i * 32); - break; - case 32: - for (int i = 0; i < num_loops; ++i) in = unpack32_32(in, out + i * 32); - break; - default: - DCHECK(false) << "Unsupported num_bits"; - } - - return batch_size; -} - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_BPACKING_H diff --git a/r/R/inst/include/arrow/util/checked_cast.h b/r/R/inst/include/arrow/util/checked_cast.h deleted file mode 100644 index 718f1057343..00000000000 --- a/r/R/inst/include/arrow/util/checked_cast.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_CAST_H -#define ARROW_CAST_H - -#include -#include - -namespace arrow { -namespace internal { - -template -inline OutputType checked_cast(InputType&& value) { - static_assert(std::is_class::type>::type>::value, - "checked_cast input type must be a class"); - static_assert(std::is_class::type>::type>::value, - "checked_cast output type must be a class"); -#ifdef NDEBUG - return static_cast(value); -#else - return dynamic_cast(value); -#endif -} - -template -std::shared_ptr checked_pointer_cast(const std::shared_ptr& r) noexcept { -#ifndef NDEBUG - return std::static_pointer_cast(r); -#else - return std::dynamic_pointer_cast(r); -#endif -} - -} // namespace internal -} // namespace arrow - -#endif // ARROW_CAST_H diff --git a/r/R/inst/include/arrow/util/compiler-util.h b/r/R/inst/include/arrow/util/compiler-util.h deleted file mode 100644 index 820a9b0c11b..00000000000 --- a/r/R/inst/include/arrow/util/compiler-util.h +++ /dev/null @@ -1,25 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Deprecated header, here for backwards compatibility in parquet-cpp - -#ifndef ARROW_UTIL_COMPILER_UTIL_H -#define ARROW_UTIL_COMPILER_UTIL_H - -#include "arrow/util/macros.h" - -#endif // ARROW_UTIL_COMPILER_UTIL_H diff --git a/r/R/inst/include/arrow/util/compression.h b/r/R/inst/include/arrow/util/compression.h deleted file mode 100644 index 43174f4dba4..00000000000 --- a/r/R/inst/include/arrow/util/compression.h +++ /dev/null @@ -1,153 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_COMPRESSION_H -#define ARROW_UTIL_COMPRESSION_H - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -struct Compression { - enum type { UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO, BZ2 }; -}; - -namespace util { - -/// \brief Streaming compressor interface -/// -class ARROW_EXPORT Compressor { - public: - virtual ~Compressor(); - - /// \brief Compress some input. - /// - /// If bytes_read is 0 on return, then a larger output buffer should be supplied. - virtual Status Compress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output, int64_t* bytes_read, - int64_t* bytes_written) = 0; - - /// \brief Flush part of the compressed output. - /// - /// If should_retry is true on return, Flush() should be called again - /// with a larger buffer. - virtual Status Flush(int64_t output_len, uint8_t* output, int64_t* bytes_written, - bool* should_retry) = 0; - - /// \brief End compressing, doing whatever is necessary to end the stream. - /// - /// If should_retry is true on return, End() should be called again - /// with a larger buffer. Otherwise, the Compressor should not be used anymore. - /// - /// End() implies Flush(). - virtual Status End(int64_t output_len, uint8_t* output, int64_t* bytes_written, - bool* should_retry) = 0; - - // XXX add methods for buffer size heuristics? -}; - -/// \brief Streaming decompressor interface -/// -class ARROW_EXPORT Decompressor { - public: - virtual ~Decompressor(); - - /// \brief Decompress some input. - /// - /// If need_more_output is true on return, a larger output buffer needs - /// to be supplied. - /// XXX is need_more_output necessary? (Brotli?) - virtual Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, - uint8_t* output, int64_t* bytes_read, int64_t* bytes_written, - bool* need_more_output) = 0; - - /// \brief Return whether the compressed stream is finished. - /// - /// This is a heuristic. 
If true is returned, then it is guaranteed - /// that the stream is finished. If false is returned, however, it may - /// simply be that the underlying library isn't able to provide the information. - virtual bool IsFinished() = 0; - - // XXX add methods for buffer size heuristics? -}; - -class ARROW_EXPORT Codec { - public: - virtual ~Codec(); - - static Status Create(Compression::type codec, std::unique_ptr* out); - - /// \brief One-shot decompression function - /// - /// output_buffer_len must be correct and therefore be obtained in advance. - /// - /// \note One-shot decompression is not always compatible with streaming - /// compression. Depending on the codec (e.g. LZ4), different formats may - /// be used. - virtual Status Decompress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer) = 0; - - /// \brief One-shot decompression function that also returns the - /// actual decompressed size. - /// - /// \param[in] input_len the number of bytes of compressed data. - /// \param[in] input the compressed data. - /// \param[in] output_buffer_len the number of bytes of buffer for - /// decompressed data. - /// \param[in] output_buffer the buffer for decompressed data. - /// \param[out] output_len the actual decompressed size. - /// - /// \note One-shot decompression is not always compatible with streaming - /// compression. Depending on the codec (e.g. LZ4), different formats may - /// be used. - virtual Status Decompress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, - int64_t* output_len) = 0; - - /// \brief One-shot compression function - /// - /// output_buffer_len must first have been computed using MaxCompressedLen(). - /// - /// \note One-shot compression is not always compatible with streaming - /// decompression. Depending on the codec (e.g. LZ4), different formats may - /// be used. 
- virtual Status Compress(int64_t input_len, const uint8_t* input, - int64_t output_buffer_len, uint8_t* output_buffer, - int64_t* output_len) = 0; - - virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0; - - // XXX Should be able to choose compression level, or presets? ("fast", etc.) - - /// \brief Create a streaming compressor instance - virtual Status MakeCompressor(std::shared_ptr* out) = 0; - - /// \brief Create a streaming decompressor instance - virtual Status MakeDecompressor(std::shared_ptr* out) = 0; - - virtual const char* name() const = 0; -}; - -} // namespace util -} // namespace arrow - -#endif diff --git a/r/R/inst/include/arrow/util/compression_brotli.h b/r/R/inst/include/arrow/util/compression_brotli.h deleted file mode 100644 index 59f97cda6b9..00000000000 --- a/r/R/inst/include/arrow/util/compression_brotli.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_COMPRESSION_BROTLI_H -#define ARROW_UTIL_COMPRESSION_BROTLI_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -// Brotli codec. 
-class ARROW_EXPORT BrotliCodec : public Codec { - public: - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override { return "brotli"; } -}; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_BROTLI_H diff --git a/r/R/inst/include/arrow/util/compression_bz2.h b/r/R/inst/include/arrow/util/compression_bz2.h deleted file mode 100644 index 21461588255..00000000000 --- a/r/R/inst/include/arrow/util/compression_bz2.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_COMPRESSION_BZ2_H -#define ARROW_UTIL_COMPRESSION_BZ2_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -// BZ2 codec. -class ARROW_EXPORT BZ2Codec : public Codec { - public: - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override { return "bz2"; } -}; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_BZ2_H diff --git a/r/R/inst/include/arrow/util/compression_lz4.h b/r/R/inst/include/arrow/util/compression_lz4.h deleted file mode 100644 index 4d06f03c2c4..00000000000 --- a/r/R/inst/include/arrow/util/compression_lz4.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_COMPRESSION_LZ4_H -#define ARROW_UTIL_COMPRESSION_LZ4_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -// Lz4 codec. -class ARROW_EXPORT Lz4Codec : public Codec { - public: - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override { return "lz4"; } -}; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_LZ4_H diff --git a/r/R/inst/include/arrow/util/compression_snappy.h b/r/R/inst/include/arrow/util/compression_snappy.h deleted file mode 100644 index 7029400ab2e..00000000000 --- a/r/R/inst/include/arrow/util/compression_snappy.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_COMPRESSION_SNAPPY_H -#define ARROW_UTIL_COMPRESSION_SNAPPY_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -class ARROW_EXPORT SnappyCodec : public Codec { - public: - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override { return "snappy"; } -}; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_SNAPPY_H diff --git a/r/R/inst/include/arrow/util/compression_zlib.h b/r/R/inst/include/arrow/util/compression_zlib.h deleted file mode 100644 index 9a5feaa290c..00000000000 --- a/r/R/inst/include/arrow/util/compression_zlib.h +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_COMPRESSION_ZLIB_H -#define ARROW_UTIL_COMPRESSION_ZLIB_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -// GZip codec. -class ARROW_EXPORT GZipCodec : public Codec { - public: - /// Compression formats supported by the zlib library - enum Format { - ZLIB, - DEFLATE, - GZIP, - }; - - explicit GZipCodec(Format format = GZIP); - ~GZipCodec() override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override; - - private: - // The gzip compressor is stateful - class GZipCodecImpl; - std::unique_ptr impl_; -}; - -} // 
namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_ZLIB_H diff --git a/r/R/inst/include/arrow/util/compression_zstd.h b/r/R/inst/include/arrow/util/compression_zstd.h deleted file mode 100644 index 8b05d8c80a9..00000000000 --- a/r/R/inst/include/arrow/util/compression_zstd.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_COMPRESSION_ZSTD_H -#define ARROW_UTIL_COMPRESSION_ZSTD_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/compression.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -// ZSTD codec. 
-class ARROW_EXPORT ZSTDCodec : public Codec { - public: - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer) override; - - Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, - uint8_t* output_buffer, int64_t* output_len) override; - - int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; - - Status MakeCompressor(std::shared_ptr* out) override; - - Status MakeDecompressor(std::shared_ptr* out) override; - - const char* name() const override { return "zstd"; } -}; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_COMPRESSION_ZSTD_H diff --git a/r/R/inst/include/arrow/util/cpu-info.h b/r/R/inst/include/arrow/util/cpu-info.h deleted file mode 100644 index 714d7ac5bc5..00000000000 --- a/r/R/inst/include/arrow/util/cpu-info.h +++ /dev/null @@ -1,101 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala (incubating) as of 2016-01-29. 
Pared down to a minimal -// set of functions needed for Apache Arrow / Apache parquet-cpp - -#ifndef ARROW_UTIL_CPU_INFO_H -#define ARROW_UTIL_CPU_INFO_H - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -/// CpuInfo is an interface to query for cpu information at runtime. The caller can -/// ask for the sizes of the caches and what hardware features are supported. -/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and -/// /sys/devices) -class ARROW_EXPORT CpuInfo { - public: - static constexpr int64_t SSSE3 = (1 << 1); - static constexpr int64_t SSE4_1 = (1 << 2); - static constexpr int64_t SSE4_2 = (1 << 3); - static constexpr int64_t POPCNT = (1 << 4); - - /// Cache enums for L1 (data), L2 and L3 - enum CacheLevel { - L1_CACHE = 0, - L2_CACHE = 1, - L3_CACHE = 2, - }; - - static CpuInfo* GetInstance(); - - /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error - /// and terminate. - void VerifyCpuRequirements(); - - /// Returns all the flags for this cpu - int64_t hardware_flags(); - - /// Returns whether of not the cpu supports this flag - bool IsSupported(int64_t flag) const { return (hardware_flags_ & flag) != 0; } - - /// \brief The processor supports SSE4.2 and the Arrow libraries are built - /// with support for it - bool CanUseSSE4_2() const; - - /// Toggle a hardware feature on and off. It is not valid to turn on a feature - /// that the underlying hardware cannot support. This is useful for testing. - void EnableFeature(int64_t flag, bool enable); - - /// Returns the size of the cache in KB at this cache level - int64_t CacheSize(CacheLevel level); - - /// Returns the number of cpu cycles per millisecond - int64_t cycles_per_ms(); - - /// Returns the number of cores (including hyper-threaded) on this machine. - int num_cores(); - - /// Returns the model name of the cpu (e.g. 
Intel i7-2600) - std::string model_name(); - - private: - CpuInfo(); - - void Init(); - - /// Inits CPU cache size variables with default values - void SetDefaultCacheSize(); - - int64_t hardware_flags_; - int64_t original_hardware_flags_; - int64_t cache_sizes_[L3_CACHE + 1]; - int64_t cycles_per_ms_; - int num_cores_; - std::string model_name_; -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_CPU_INFO_H diff --git a/r/R/inst/include/arrow/util/decimal.h b/r/R/inst/include/arrow/util/decimal.h deleted file mode 100644 index 3a576d085aa..00000000000 --- a/r/R/inst/include/arrow/util/decimal.h +++ /dev/null @@ -1,133 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/basic_decimal.h" -#include "arrow/util/string_view.h" - -namespace arrow { - -/// Represents a signed 128-bit integer in two's complement. -/// Calculations wrap around and overflow is ignored. -/// -/// For a discussion of the algorithms, look at Knuth's volume 2, -/// Semi-numerical Algorithms section 4.3.1. 
-/// -/// Adapted from the Apache ORC C++ implementation -/// -/// The implementation is split into two parts : -/// -/// 1. BasicDecimal128 -/// - can be safely compiled to IR without references to libstdc++. -/// 2. Decimal128 -/// - has additional functionality on top of BasicDecimal128 to deal with -/// strings and streams. -class ARROW_EXPORT Decimal128 : public BasicDecimal128 { - public: - /// \cond FALSE - // (need to avoid a duplicate definition in Sphinx) - using BasicDecimal128::BasicDecimal128; - /// \endcond - - /// \brief constructor creates a Decimal128 from a BasicDecimal128. - constexpr Decimal128(const BasicDecimal128& value) noexcept : BasicDecimal128(value) {} - - /// \brief Parse the number from a base 10 string representation. - explicit Decimal128(const std::string& value); - - /// \brief Empty constructor creates a Decimal128 with a value of 0. - // This is required on some older compilers. - constexpr Decimal128() noexcept : BasicDecimal128() {} - - /// Divide this number by right and return the result. - /// - /// This operation is not destructive. - /// The answer rounds to zero. Signs work like: - /// 21 / 5 -> 4, 1 - /// -21 / 5 -> -4, -1 - /// 21 / -5 -> -4, 1 - /// -21 / -5 -> 4, -1 - /// \param[in] divisor the number to divide by - /// \param[out] result the quotient - /// \param[out] remainder the remainder after the division - Status Divide(const Decimal128& divisor, Decimal128* result, - Decimal128* remainder) const { - auto dstatus = BasicDecimal128::Divide(divisor, result, remainder); - return ToArrowStatus(dstatus); - } - - /// \brief Convert the Decimal128 value to a base 10 decimal string with the given - /// scale. - std::string ToString(int32_t scale) const; - - /// \brief Convert the value to an integer string - std::string ToIntegerString() const; - - /// \brief Cast this value to an int64_t. 
- explicit operator int64_t() const; - - /// \brief Convert a decimal string to a Decimal128 value, optionally including - /// precision and scale if they're passed in and not null. - static Status FromString(const util::string_view& s, Decimal128* out, - int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); - static Status FromString(const std::string& s, Decimal128* out, - int32_t* precision = NULLPTR, int32_t* scale = NULLPTR); - static Status FromString(const char* s, Decimal128* out, int32_t* precision = NULLPTR, - int32_t* scale = NULLPTR); - - /// \brief Convert from a big-endian byte representation. The length must be - /// between 1 and 16. - /// \return error status if the length is an invalid value - static Status FromBigEndian(const uint8_t* data, int32_t length, Decimal128* out); - - /// \brief Convert Decimal128 from one scale to another - Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const { - auto dstatus = BasicDecimal128::Rescale(original_scale, new_scale, out); - return ToArrowStatus(dstatus); - } - - /// \brief Convert to a signed integer - template > - Status ToInteger(T* out) const { - constexpr auto min_value = std::numeric_limits::min(); - constexpr auto max_value = std::numeric_limits::max(); - const auto& self = *this; - if (self < min_value || self > max_value) { - return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T), - " byte integer"); - } - *out = static_cast(low_bits()); - return Status::OK(); - } - - friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os, - const Decimal128& decimal); - - private: - /// Converts internal error code to Status - Status ToArrowStatus(DecimalStatus dstatus) const; -}; - -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/hash-util.h b/r/R/inst/include/arrow/util/hash-util.h deleted file mode 100644 index 7aed3c171dc..00000000000 --- a/r/R/inst/include/arrow/util/hash-util.h +++ /dev/null @@ -1,310 +0,0 @@ -// Licensed to the Apache 
Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala (incubating) as of 2016-02-22 - -#ifndef ARROW_UTIL_HASH_UTIL_H -#define ARROW_UTIL_HASH_UTIL_H - -#include -#include - -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/neon-util.h" -#include "arrow/util/sse-util.h" - -static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) { - DCHECK(false) << "Hardware CRC support is not enabled"; - return 0; -} - -#ifdef ARROW_HAVE_SSE4_2 -#define HW_crc32_u8 SSE4_crc32_u8 -#define HW_crc32_u16 SSE4_crc32_u16 -#define HW_crc32_u32 SSE4_crc32_u32 -#define HW_crc32_u64 SSE4_crc32_u64 -#elif defined(ARROW_HAVE_ARM_CRC) -#define HW_crc32_u8 ARMCE_crc32_u8 -#define HW_crc32_u16 ARMCE_crc32_u16 -#define HW_crc32_u32 ARMCE_crc32_u32 -#define HW_crc32_u64 ARMCE_crc32_u64 -#endif - 
-namespace arrow { - -/// Utility class to compute hash values. -class HashUtil { - public: -#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC) - static constexpr bool have_hardware_crc32 = true; -#else - static constexpr bool have_hardware_crc32 = false; -#endif - - /// Compute the Crc32 hash for data using SSE4/ArmCRC instructions. The input hash - /// parameter is the current hash/seed value. - /// This should only be called if SSE/ArmCRC is supported. - /// This is ~4x faster than Fnv/Boost Hash. - /// TODO: crc32 hashes with different seeds do not result in different hash functions. - /// The resulting hashes are correlated. - static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) { - const uint8_t* p = reinterpret_cast(data); - const uint8_t* end = p + nbytes; - -#if ARROW_BITNESS >= 64 - while (p <= end - 8) { - hash = HW_crc32_u64(hash, *reinterpret_cast(p)); - p += 8; - } -#endif - - while (p <= end - 4) { - hash = HW_crc32_u32(hash, *reinterpret_cast(p)); - p += 4; - } - while (p < end) { - hash = HW_crc32_u8(hash, *p); - ++p; - } - - // The lower half of the CRC hash has has poor uniformity, so swap the halves - // for anyone who only uses the first several bits of the hash. - hash = (hash << 16) | (hash >> 16); - return hash; - } - - /// A variant of CRC32 hashing that computes two independent running CRCs - /// over interleaved halves of the input, giving out a 64-bit integer. - /// The result's quality should be improved by a finalization step. - /// - /// In addition to producing more bits of output, this should be twice - /// faster than CrcHash on CPUs that can overlap several independent - /// CRC computations. 
- static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) { - const uint8_t* p = reinterpret_cast(data); - - uint32_t h1 = static_cast(hash >> 32); - uint32_t h2 = static_cast(hash); - -#if ARROW_BITNESS >= 64 - while (nbytes >= 16) { - h1 = HW_crc32_u64(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u64(h2, *reinterpret_cast(p + 8)); - nbytes -= 16; - p += 16; - } - if (nbytes >= 8) { - h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); - nbytes -= 8; - p += 8; - } -#else - while (nbytes >= 8) { - h1 = HW_crc32_u32(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u32(h2, *reinterpret_cast(p + 4)); - nbytes -= 8; - p += 8; - } -#endif - - if (nbytes >= 4) { - h1 = HW_crc32_u16(h1, *reinterpret_cast(p)); - h2 = HW_crc32_u16(h2, *reinterpret_cast(p + 2)); - nbytes -= 4; - p += 4; - } - switch (nbytes) { - case 3: - h1 = HW_crc32_u8(h1, p[2]); - // fallthrough - case 2: - h2 = HW_crc32_u8(h2, p[1]); - // fallthrough - case 1: - h1 = HW_crc32_u8(h1, p[0]); - // fallthrough - case 0: - break; - default: - assert(0); - } - - // A finalization step is recommended to mix up the result's bits - return (static_cast(h1) << 32) + h2; - } - - static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995; - static const int MURMUR_R = 47; - - /// Murmur2 hash implementation returning 64-bit hashes. 
- static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) { - uint64_t h = seed ^ (len * MURMUR_PRIME); - - const uint64_t* data = reinterpret_cast(input); - const uint64_t* end = data + (len / sizeof(uint64_t)); - - while (data != end) { - uint64_t k = *data++; - k *= MURMUR_PRIME; - k ^= k >> MURMUR_R; - k *= MURMUR_PRIME; - h ^= k; - h *= MURMUR_PRIME; - } - - const uint8_t* data2 = reinterpret_cast(data); - switch (len & 7) { - case 7: - h ^= uint64_t(data2[6]) << 48; - case 6: - h ^= uint64_t(data2[5]) << 40; - case 5: - h ^= uint64_t(data2[4]) << 32; - case 4: - h ^= uint64_t(data2[3]) << 24; - case 3: - h ^= uint64_t(data2[2]) << 16; - case 2: - h ^= uint64_t(data2[1]) << 8; - case 1: - h ^= uint64_t(data2[0]); - h *= MURMUR_PRIME; - } - - h ^= h >> MURMUR_R; - h *= MURMUR_PRIME; - h ^= h >> MURMUR_R; - return h; - } - - /// default values recommended by http://isthe.com/chongo/tech/comp/fnv/ - static const uint32_t FNV_PRIME = 0x01000193; // 16777619 - static const uint32_t FNV_SEED = 0x811C9DC5; // 2166136261 - static const uint64_t FNV64_PRIME = 1099511628211UL; - static const uint64_t FNV64_SEED = 14695981039346656037UL; - - /// Implementation of the Fowler-Noll-Vo hash function. This is not as performant - /// as boost's hash on int types (2x slower) but has bit entropy. - /// For ints, boost just returns the value of the int which can be pathological. - /// For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 - /// is taken on the hash, all values will collide to the same bucket. - /// For string values, Fnv is slightly faster than boost. - /// IMPORTANT: FNV hash suffers from poor diffusion of the least significant bit, - /// which can lead to poor results when input bytes are duplicated. - /// See FnvHash64to32() for how this can be mitigated. 
- static uint64_t FnvHash64(const void* data, int32_t bytes, uint64_t hash) { - const uint8_t* ptr = reinterpret_cast(data); - while (bytes--) { - hash = (*ptr ^ hash) * FNV64_PRIME; - ++ptr; - } - return hash; - } - - /// Return a 32-bit hash computed by invoking FNV-64 and folding the result to 32-bits. - /// This technique is recommended instead of FNV-32 since the LSB of an FNV hash is the - /// XOR of the LSBs of its input bytes, leading to poor results for duplicate inputs. - /// The input seed 'hash' is duplicated so the top half of the seed is not all zero. - /// Data length must be at least 1 byte: zero-length data should be handled separately, - /// for example using CombineHash with a unique constant value to avoid returning the - /// hash argument. Zero-length data gives terrible results: the initial hash value is - /// xored with itself cancelling all bits. - static uint32_t FnvHash64to32(const void* data, int32_t bytes, uint32_t hash) { - // IMPALA-2270: this function should never be used for zero-byte inputs. - DCHECK_GT(bytes, 0); - uint64_t hash_u64 = hash | (static_cast(hash) << 32); - hash_u64 = FnvHash64(data, bytes, hash_u64); - return static_cast((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF)); - } - - // Hash template - template - static inline int Hash(const void* data, int32_t bytes, uint32_t seed); - - /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio). - static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9; - - /// Combine hashes 'value' and 'seed' to get a new hash value. Similar to - /// boost::hash_combine(), but for uint32_t. This function should be used with a - /// constant first argument to update the hash value for zero-length values such as - /// NULL, boolean, and empty strings. 
- static inline uint32_t HashCombine32(uint32_t value, uint32_t seed) { - return seed ^ (HASH_COMBINE_SEED + value + (seed << 6) + (seed >> 2)); - } - - // Get 32 more bits of randomness from a 32-bit hash: - static inline uint32_t Rehash32to32(const uint32_t hash) { - // Constants generated by uuidgen(1) with the -r flag - static const uint64_t m = 0x7850f11ec6d14889ull, a = 0x6773610597ca4c63ull; - // This is strongly universal hashing following Dietzfelbinger's "Universal hashing - // and k-wise independent random variables via integer arithmetic without primes". As - // such, for any two distinct uint32_t's hash1 and hash2, the probability (over the - // randomness of the constants) that any subset of bit positions of - // Rehash32to32(hash1) is equal to the same subset of bit positions - // Rehash32to32(hash2) is minimal. - return static_cast((static_cast(hash) * m + a) >> 32); - } - - static inline uint64_t Rehash32to64(const uint32_t hash) { - static const uint64_t m1 = 0x47b6137a44974d91ull, m2 = 0x8824ad5ba2b7289cull, - a1 = 0x705495c62df1424aull, a2 = 0x9efc49475c6bfb31ull; - const uint64_t hash1 = (static_cast(hash) * m1 + a1) >> 32; - const uint64_t hash2 = (static_cast(hash) * m2 + a2) >> 32; - return hash1 | (hash2 << 32); - } -}; - -// HW Hash -template <> -inline int HashUtil::Hash(const void* data, int32_t bytes, uint32_t seed) { -#ifdef ARROW_HAVE_ARM_CRC - // Need run time check for Arm - // if not support, fall back to Murmur - if (!crc32c_runtime_check()) - return static_cast(HashUtil::MurmurHash2_64(data, bytes, seed)); - else -#endif - // Double CRC - return static_cast(HashUtil::DoubleCrcHash(data, bytes, seed)); -} - -// Murmur Hash -template <> -inline int HashUtil::Hash(const void* data, int32_t bytes, uint32_t seed) { - return static_cast(HashUtil::MurmurHash2_64(data, bytes, seed)); -} - -} // namespace arrow - -#endif // ARROW_UTIL_HASH_UTIL_H diff --git a/r/R/inst/include/arrow/util/hashing.h b/r/R/inst/include/arrow/util/hashing.h 
deleted file mode 100644 index 27301585fc6..00000000000 --- a/r/R/inst/include/arrow/util/hashing.h +++ /dev/null @@ -1,807 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Private header, not to be exported - -#ifndef ARROW_UTIL_HASHING_H -#define ARROW_UTIL_HASHING_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/hash-util.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" - -namespace arrow { -namespace internal { - -// XXX would it help to have a 32-bit hash value on large datasets? -typedef uint64_t hash_t; - -// Notes about the choice of a hash function. -// - xxHash64 is extremely fast on large enough data -// - for small- to medium-sized data, there are better choices -// (see comprehensive benchmarks results at -// https://aras-p.info/blog/2016/08/09/More-Hash-Function-Tests/) -// - for very small fixed-size data (<= 16 bytes, e.g. 
Decimal128), it is -// beneficial to define specialized hash functions -// - while xxHash and others have good statistical properties, we can relax those -// a bit if it helps performance (especially if the hash table implementation -// has a good collision resolution strategy) - -template -inline hash_t ComputeStringHash(const void* data, int64_t length); - -template -struct ScalarHelperBase { - static bool CompareScalars(Scalar u, Scalar v) { return u == v; } - - static hash_t ComputeHash(const Scalar& value) { - // Generic hash computation for scalars. Simply apply the string hash - // to the bit representation of the value. - - // XXX in the case of FP values, we'd like equal values to have the same hash, - // even if they have different bit representations... - return ComputeStringHash(&value, sizeof(value)); - } -}; - -template -struct ScalarHelper : public ScalarHelperBase {}; - -template -struct ScalarHelper::value>::type> - : public ScalarHelperBase { - // ScalarHelper specialization for integers - - static hash_t ComputeHash(const Scalar& value) { - // Faster hash computation for integers. - - // Two of xxhash's prime multipliers (which are chosen for their - // bit dispersion properties) - static constexpr uint64_t multipliers[] = {11400714785074694791ULL, - 14029467366897019727ULL}; - - // Multiplying by the prime number mixes the low bits into the high bits, - // then byte-swapping (which is a single CPU instruction) allows the - // combined high and low bits to participate in the initial hash table index. 
- auto h = static_cast(value); - return BitUtil::ByteSwap(multipliers[AlgNum] * h); - } -}; - -template -struct ScalarHelper< - Scalar, AlgNum, - typename std::enable_if::value>::type> - : public ScalarHelperBase { - // ScalarHelper specialization for util::string_view - - static hash_t ComputeHash(const util::string_view& value) { - return ComputeStringHash(value.data(), static_cast(value.size())); - } -}; - -template -struct ScalarHelper::value>::type> - : public ScalarHelperBase { - // ScalarHelper specialization for reals - - static bool CompareScalars(Scalar u, Scalar v) { - if (std::isnan(u)) { - // XXX should we do a bit-precise comparison? - return std::isnan(v); - } - return u == v; - } -}; - -template -hash_t ComputeStringHash(const void* data, int64_t length) { - if (ARROW_PREDICT_TRUE(length <= 16)) { - // Specialize for small hash strings, as they are quite common as - // hash table keys. - auto p = reinterpret_cast(data); - auto n = static_cast(length); - if (n <= 8) { - if (n <= 3) { - if (n == 0) { - return 1U; - } - uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1]; - return ScalarHelper::ComputeHash(x); - } - // 4 <= length <= 8 - // We can read the string as two overlapping 32-bit ints, apply - // different hash functions to each of them in parallel, then XOR - // the results - uint32_t x, y; - hash_t hx, hy; - // XXX those are unaligned accesses. Should we have a facility for that? - x = *reinterpret_cast(p + n - 4); - y = *reinterpret_cast(p); - hx = ScalarHelper::ComputeHash(x); - hy = ScalarHelper::ComputeHash(y); - return n ^ hx ^ hy; - } - // 8 <= length <= 16 - // Apply the same principle as above - uint64_t x, y; - hash_t hx, hy; - x = *reinterpret_cast(p + n - 8); - y = *reinterpret_cast(p); - hx = ScalarHelper::ComputeHash(x); - hy = ScalarHelper::ComputeHash(y); - return n ^ hx ^ hy; - } - - if (HashUtil::have_hardware_crc32) { - // DoubleCrcHash is faster that Murmur2. 
- auto h = HashUtil::DoubleCrcHash(data, static_cast(length), AlgNum); - return ScalarHelper::ComputeHash(h); - } else { - // Fall back on 64-bit Murmur2 for longer strings. - // It has decent speed for medium-sized strings. There may be faster - // hashes on long strings such as xxHash, but that may not matter much - // for the typical length distribution of hash keys. - return HashUtil::MurmurHash2_64(data, static_cast(length), AlgNum); - } -} - -// XXX add a HashEq struct with both hash and compare functions? - -// ---------------------------------------------------------------------- -// An open-addressing insert-only hash table (no deletes) - -template -class HashTable { - public: - struct Entry { - hash_t h; - Payload payload; - }; - - explicit HashTable(uint64_t capacity) { - // Presize for at least 8 elements - capacity = std::max(capacity, static_cast(8U)); - size_ = BitUtil::NextPower2(capacity * 4U); - size_mask_ = size_ - 1; - n_filled_ = 0; - // This will zero out hash entries, marking them empty - entries_.resize(size_); - } - - // Lookup with non-linear probing - // cmp_func should have signature bool(const Payload*). - // Return a (Entry*, found) pair. 
- template - std::pair Lookup(hash_t h, CmpFunc&& cmp_func) { - auto p = Lookup(h, entries_.data(), size_mask_, - std::forward(cmp_func)); - return {&entries_[p.first], p.second}; - } - - template - std::pair Lookup(hash_t h, CmpFunc&& cmp_func) const { - auto p = Lookup(h, entries_.data(), size_mask_, - std::forward(cmp_func)); - return {&entries_[p.first], p.second}; - } - - void Insert(Entry* entry, hash_t h, const Payload& payload) { - assert(entry->h == 0); - entry->h = FixHash(h); - entry->payload = payload; - ++n_filled_; - if (NeedUpsizing()) { - // Resizing is expensive, avoid doing it too often - Upsize(size_ * 4); - } - } - - uint64_t size() const { return n_filled_; } - - // Visit all non-empty entries in the table - // The visit_func should have signature void(const Entry*) - template - void VisitEntries(VisitFunc&& visit_func) const { - for (const auto& entry : entries_) { - if (entry.h != 0U) { - visit_func(&entry); - } - } - } - - protected: - // NoCompare is for when the value is known not to exist in the table - enum CompareKind { DoCompare, NoCompare }; - - // The workhorse lookup function - template - std::pair Lookup(hash_t h, const Entry* entries, uint64_t size_mask, - CmpFunc&& cmp_func) const { - static constexpr uint8_t perturb_shift = 5; - - uint64_t index, perturb; - const Entry* entry; - - h = FixHash(h); - index = h & size_mask; - perturb = (h >> perturb_shift) + 1U; - - while (true) { - entry = &entries[index]; - if (CompareEntry(h, entry, std::forward(cmp_func))) { - // Found - return {index, true}; - } - if (entry->h == 0U) { - // Empty slot - return {index, false}; - } - - // Perturbation logic inspired from CPython's set / dict object. - // The goal is that all 64 bits of the unmasked hash value eventually - // participate in the probing sequence, to minimize clustering. 
- index = (index + perturb) & size_mask; - perturb = (perturb >> perturb_shift) + 1U; - } - } - - template - bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const { - if (CKind == NoCompare) { - return false; - } else { - return entry->h == h && cmp_func(&entry->payload); - } - } - - bool NeedUpsizing() const { - // Keep the load factor <= 1/2 - return n_filled_ * 2U >= size_; - } - - void Upsize(uint64_t new_size) { - assert(new_size > size_); - uint64_t new_mask = new_size - 1; - assert((new_size & new_mask) == 0); // it's a power of two - - std::vector new_entries(new_size); - for (auto& entry : entries_) { - hash_t h = entry.h; - if (h != 0) { - // Dummy compare function (will not be called) - auto cmp_func = [](const Payload*) { return false; }; - // Non-empty slot, move into new - auto p = Lookup(h, new_entries.data(), new_mask, cmp_func); - assert(!p.second); // shouldn't have found a matching entry - Entry* new_entry = &new_entries[p.first]; - new_entry->h = h; - new_entry->payload = entry.payload; - } - } - std::swap(entries_, new_entries); - size_ = new_size; - size_mask_ = new_mask; - } - - hash_t FixHash(hash_t h) const { - // 0 is used to indicate empty entries - return (h == 0U) ? 42U : h; - } - - uint64_t size_; - uint64_t size_mask_; - uint64_t n_filled_; - std::vector entries_; -}; - -// XXX typedef memo_index_t int32_t ? - -// ---------------------------------------------------------------------- -// A base class for memoization table. - -class MemoTable { - public: - virtual ~MemoTable() = default; - - virtual int32_t size() const = 0; -}; - -// ---------------------------------------------------------------------- -// A memoization table for memory-cheap scalar values. - -// The memoization table remembers and allows to look up the insertion -// index for each key. 
- -template class HashTableTemplateType = HashTable> -class ScalarMemoTable : public MemoTable { - public: - explicit ScalarMemoTable(int64_t entries = 0) - : hash_table_(static_cast(entries)) {} - - int32_t Get(const Scalar& value) const { - auto cmp_func = [value](const Payload* payload) -> bool { - return ScalarHelper::CompareScalars(payload->value, value); - }; - hash_t h = ComputeHash(value); - auto p = hash_table_.Lookup(h, cmp_func); - if (p.second) { - return p.first->payload.memo_index; - } else { - return -1; - } - } - - template - int32_t GetOrInsert(const Scalar& value, Func1&& on_found, Func2&& on_not_found) { - auto cmp_func = [value](const Payload* payload) -> bool { - return ScalarHelper::CompareScalars(value, payload->value); - }; - hash_t h = ComputeHash(value); - auto p = hash_table_.Lookup(h, cmp_func); - int32_t memo_index; - if (p.second) { - memo_index = p.first->payload.memo_index; - on_found(memo_index); - } else { - memo_index = size(); - hash_table_.Insert(p.first, h, {value, memo_index}); - on_not_found(memo_index); - } - return memo_index; - } - - int32_t GetOrInsert(const Scalar& value) { - return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); - } - - // The number of entries in the memo table - // (which is also 1 + the largest memo index) - int32_t size() const override { return static_cast(hash_table_.size()); } - - // Copy values starting from index `start` into `out_data` - void CopyValues(int32_t start, Scalar* out_data) const { - hash_table_.VisitEntries([=](const HashTableEntry* entry) { - int32_t index = entry->payload.memo_index - start; - if (index >= 0) { - out_data[index] = entry->payload.value; - } - }); - } - - void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } - - protected: - struct Payload { - Scalar value; - int32_t memo_index; - }; - - using HashTableType = HashTableTemplateType; - using HashTableEntry = typename HashTableType::Entry; - HashTableType hash_table_; - - hash_t 
ComputeHash(const Scalar& value) const { - return ScalarHelper::ComputeHash(value); - } -}; - -// ---------------------------------------------------------------------- -// A memoization table for small scalar values, using direct indexing - -template -struct SmallScalarTraits {}; - -template <> -struct SmallScalarTraits { - static constexpr int32_t cardinality = 2; - - static uint32_t AsIndex(bool value) { return value ? 1 : 0; } -}; - -template -struct SmallScalarTraits::value>::type> { - using Unsigned = typename std::make_unsigned::type; - - static constexpr int32_t cardinality = 1U + std::numeric_limits::max(); - - static uint32_t AsIndex(Scalar value) { return static_cast(value); } -}; - -template class HashTableTemplateType = HashTable> -class SmallScalarMemoTable : public MemoTable { - public: - explicit SmallScalarMemoTable(int64_t entries = 0) { - std::fill(value_to_index_, value_to_index_ + cardinality, -1); - index_to_value_.reserve(cardinality); - } - - int32_t Get(const Scalar value) const { - auto value_index = AsIndex(value); - return value_to_index_[value_index]; - } - - template - int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { - auto value_index = AsIndex(value); - auto memo_index = value_to_index_[value_index]; - if (memo_index < 0) { - memo_index = static_cast(index_to_value_.size()); - index_to_value_.push_back(value); - value_to_index_[value_index] = memo_index; - assert(memo_index < cardinality); - on_not_found(memo_index); - } else { - on_found(memo_index); - } - return memo_index; - } - - int32_t GetOrInsert(const Scalar value) { - return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); - } - - // The number of entries in the memo table - // (which is also 1 + the largest memo index) - int32_t size() const override { return static_cast(index_to_value_.size()); } - - // Copy values starting from index `start` into `out_data` - void CopyValues(int32_t start, Scalar* out_data) const { - 
DCHECK_GE(start, 0); - DCHECK_LE(static_cast(start), index_to_value_.size()); - int64_t offset = start * static_cast(sizeof(Scalar)); - memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar)); - } - - void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } - - const std::vector& values() const { return index_to_value_; } - - protected: - static constexpr auto cardinality = SmallScalarTraits::cardinality; - static_assert(cardinality <= 256, "cardinality too large for direct-addressed table"); - - uint32_t AsIndex(Scalar value) const { - return SmallScalarTraits::AsIndex(value); - } - - int32_t value_to_index_[cardinality]; - std::vector index_to_value_; -}; - -// ---------------------------------------------------------------------- -// A memoization table for variable-sized binary data. - -class BinaryMemoTable : public MemoTable { - public: - explicit BinaryMemoTable(int64_t entries = 0, int64_t values_size = -1) - : hash_table_(static_cast(entries)) { - offsets_.reserve(entries + 1); - offsets_.push_back(0); - if (values_size == -1) { - values_.reserve(entries * 4); // A conservative heuristic - } else { - values_.reserve(values_size); - } - } - - int32_t Get(const void* data, int32_t length) const { - hash_t h = ComputeStringHash<0>(data, length); - auto p = Lookup(h, data, length); - if (p.second) { - return p.first->payload.memo_index; - } else { - return -1; - } - } - - int32_t Get(const std::string& value) const { - return Get(value.data(), static_cast(value.length())); - } - - int32_t Get(const util::string_view& value) const { - return Get(value.data(), static_cast(value.length())); - } - - template - int32_t GetOrInsert(const void* data, int32_t length, Func1&& on_found, - Func2&& on_not_found) { - hash_t h = ComputeStringHash<0>(data, length); - auto p = Lookup(h, data, length); - int32_t memo_index; - if (p.second) { - memo_index = p.first->payload.memo_index; - on_found(memo_index); - } else { - memo_index = 
size(); - // Insert offset - auto offset = static_cast(values_.size()); - assert(offsets_.size() == static_cast(memo_index + 1)); - assert(offsets_[memo_index] == offset); - offsets_.push_back(offset + length); - // Insert string value - values_.append(static_cast(data), length); - // Insert hash entry - hash_table_.Insert(const_cast(p.first), h, {memo_index}); - - on_not_found(memo_index); - } - return memo_index; - } - - template - int32_t GetOrInsert(const util::string_view& value, Func1&& on_found, - Func2&& on_not_found) { - return GetOrInsert(value.data(), static_cast(value.length()), - std::forward(on_found), std::forward(on_not_found)); - } - - int32_t GetOrInsert(const void* data, int32_t length) { - return GetOrInsert(data, length, [](int32_t i) {}, [](int32_t i) {}); - } - - int32_t GetOrInsert(const util::string_view& value) { - return GetOrInsert(value.data(), static_cast(value.length())); - } - - int32_t GetOrInsert(const std::string& value) { - return GetOrInsert(value.data(), static_cast(value.length())); - } - - // The number of entries in the memo table - // (which is also 1 + the largest memo index) - int32_t size() const override { return static_cast(hash_table_.size()); } - - int32_t values_size() const { return static_cast(values_.size()); } - - const uint8_t* values_data() const { - return reinterpret_cast(values_.data()); - } - - // Copy (n + 1) offsets starting from index `start` into `out_data` - template - void CopyOffsets(int32_t start, Offset* out_data) const { - auto delta = offsets_[start]; - for (uint32_t i = start; i < offsets_.size(); ++i) { - auto adjusted_offset = offsets_[i] - delta; - auto cast_offset = static_cast(adjusted_offset); - assert(static_cast(cast_offset) == adjusted_offset); // avoid truncation - *out_data++ = cast_offset; - } - } - - template - void CopyOffsets(Offset* out_data) const { - CopyOffsets(0, out_data); - } - - // Copy values starting from index `start` into `out_data` - void CopyValues(int32_t start, 
uint8_t* out_data) const { - CopyValues(start, -1, out_data); - } - - // Same as above, but check output size in debug mode - void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const { - int32_t offset = offsets_[start]; - auto length = values_.size() - static_cast(offset); - if (out_size != -1) { - assert(static_cast(length) == out_size); - } - memcpy(out_data, values_.data() + offset, length); - } - - void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); } - - void CopyValues(int64_t out_size, uint8_t* out_data) const { - CopyValues(0, out_size, out_data); - } - - // Visit the stored values in insertion order. - // The visitor function should have the signature `void(util::string_view)` - // or `void(const util::string_view&)`. - template - void VisitValues(int32_t start, VisitFunc&& visit) const { - for (uint32_t i = start; i < offsets_.size() - 1; ++i) { - visit( - util::string_view(values_.data() + offsets_[i], offsets_[i + 1] - offsets_[i])); - } - } - - protected: - struct Payload { - int32_t memo_index; - }; - - using HashTableType = HashTable; - using HashTableEntry = typename HashTable::Entry; - HashTableType hash_table_; - - std::vector offsets_; - std::string values_; - - std::pair Lookup(hash_t h, const void* data, - int32_t length) const { - auto cmp_func = [=](const Payload* payload) { - int32_t start, stop; - start = offsets_[payload->memo_index]; - stop = offsets_[payload->memo_index + 1]; - return length == stop - start && memcmp(data, values_.data() + start, length) == 0; - }; - return hash_table_.Lookup(h, cmp_func); - } -}; - -template -struct HashTraits {}; - -template <> -struct HashTraits { - using MemoTableType = SmallScalarMemoTable; -}; - -template -struct HashTraits> { - using c_type = typename T::c_type; - using MemoTableType = SmallScalarMemoTable; -}; - -template -struct HashTraits< - T, typename std::enable_if::value && !is_8bit_int::value>::type> { - using c_type = typename T::c_type; - using 
MemoTableType = ScalarMemoTable; -}; - -template -struct HashTraits> { - using MemoTableType = BinaryMemoTable; -}; - -template -struct HashTraits> { - using MemoTableType = BinaryMemoTable; -}; - -template -struct DictionaryTraits {}; - -template <> -struct DictionaryTraits { - using T = BooleanType; - using MemoTableType = typename HashTraits::MemoTableType; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - BooleanBuilder builder(pool); - const auto& bool_values = memo_table.values(); - auto it = bool_values.begin() + start_offset; - for (; it != bool_values.end(); ++it) { - RETURN_NOT_OK(builder.Append(*it)); - } - return builder.FinishInternal(out); - } -}; - -template -struct DictionaryTraits> { - using c_type = typename T::c_type; - using MemoTableType = typename HashTraits::MemoTableType; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_buffer; - auto dict_length = static_cast(memo_table.size()) - start_offset; - // This makes a copy, but we assume a dictionary array is usually small - // compared to the size of the dictionary-using array. 
- // (also, copying the dictionary values is cheap compared to the cost - // of building the memo table) - RETURN_NOT_OK( - AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); - memo_table.CopyValues(static_cast(start_offset), - reinterpret_cast(dict_buffer->mutable_data())); - *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); - return Status::OK(); - } -}; - -template -struct DictionaryTraits> { - using MemoTableType = typename HashTraits::MemoTableType; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - std::shared_ptr dict_offsets; - std::shared_ptr dict_data; - - // Create the offsets buffer - auto dict_length = static_cast(memo_table.size() - start_offset); - RETURN_NOT_OK(AllocateBuffer( - pool, TypeTraits::bytes_required(dict_length + 1), &dict_offsets)); - auto raw_offsets = reinterpret_cast(dict_offsets->mutable_data()); - memo_table.CopyOffsets(static_cast(start_offset), raw_offsets); - - // Create the data buffer - DCHECK_EQ(raw_offsets[0], 0); - RETURN_NOT_OK(AllocateBuffer(pool, raw_offsets[dict_length], &dict_data)); - memo_table.CopyValues(static_cast(start_offset), dict_data->size(), - dict_data->mutable_data()); - - *out = ArrayData::Make(type, dict_length, {nullptr, dict_offsets, dict_data}, - 0 /* null_count */); - return Status::OK(); - } -}; - -template -struct DictionaryTraits> { - using MemoTableType = typename HashTraits::MemoTableType; - - static Status GetDictionaryArrayData(MemoryPool* pool, - const std::shared_ptr& type, - const MemoTableType& memo_table, - int64_t start_offset, - std::shared_ptr* out) { - const T& concrete_type = internal::checked_cast(*type); - std::shared_ptr dict_data; - - // Create the data buffer - auto dict_length = static_cast(memo_table.size() - start_offset); - auto data_length = dict_length * concrete_type.byte_width(); - 
RETURN_NOT_OK(AllocateBuffer(pool, data_length, &dict_data)); - memo_table.CopyValues(static_cast(start_offset), data_length, - dict_data->mutable_data()); - - *out = ArrayData::Make(type, dict_length, {nullptr, dict_data}, 0 /* null_count */); - return Status::OK(); - } -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_HASHING_H diff --git a/r/R/inst/include/arrow/util/int-util.h b/r/R/inst/include/arrow/util/int-util.h deleted file mode 100644 index d3ae09f75cf..00000000000 --- a/r/R/inst/include/arrow/util/int-util.h +++ /dev/null @@ -1,89 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_INT_UTIL_H -#define ARROW_UTIL_INT_UTIL_H - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -ARROW_EXPORT -uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width = 1); - -ARROW_EXPORT -uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes, - int64_t length, uint8_t min_width = 1); - -ARROW_EXPORT -uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width = 1); - -ARROW_EXPORT -uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length, - uint8_t min_width = 1); - -ARROW_EXPORT -void DowncastInts(const int64_t* source, int8_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastInts(const int64_t* source, int16_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastInts(const int64_t* source, int32_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastInts(const int64_t* source, int64_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length); - -ARROW_EXPORT -void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length); - -template -ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length, - const int32_t* transpose_map); - -/// Signed addition with well-defined behaviour on overflow (as unsigned) -template -SignedInt SafeSignedAdd(SignedInt u, SignedInt v) { - using UnsignedInt = typename std::make_unsigned::type; - return static_cast(static_cast(u) + - static_cast(v)); -} - -/// Signed left shift with well-defined behaviour on negative numbers or overflow -template -SignedInt SafeLeftShift(SignedInt u, Shift shift) { - using UnsignedInt = typename std::make_unsigned::type; - return 
static_cast(static_cast(u) << shift); -} - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_INT_UTIL_H diff --git a/r/R/inst/include/arrow/util/io-util.h b/r/R/inst/include/arrow/util/io-util.h deleted file mode 100644 index 2b48a5c4833..00000000000 --- a/r/R/inst/include/arrow/util/io-util.h +++ /dev/null @@ -1,263 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_IO_UTIL_H -#define ARROW_UTIL_IO_UTIL_H - -#ifndef _WIN32 -#define ARROW_HAVE_SIGACTION 1 -#endif - -#include -#include - -#if ARROW_HAVE_SIGACTION -#include // Needed for struct sigaction -#endif - -#include "arrow/io/interfaces.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" - -// The Windows API defines DeleteFile as a macro resolving to either -// DeleteFileA or DeleteFileW. Need to undo it. -#if defined(_WIN32) && defined(DeleteFile) -#undef DeleteFile -#endif - -namespace arrow { - -class Buffer; - -namespace io { - -// Output stream that just writes to stdout. 
-class ARROW_EXPORT StdoutStream : public OutputStream { - public: - StdoutStream(); - ~StdoutStream() override {} - - Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Write(const void* data, int64_t nbytes) override; - - private: - int64_t pos_; -}; - -// Output stream that just writes to stderr. -class ARROW_EXPORT StderrStream : public OutputStream { - public: - StderrStream(); - ~StderrStream() override {} - - Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Write(const void* data, int64_t nbytes) override; - - private: - int64_t pos_; -}; - -// Input stream that just reads from stdin. -class ARROW_EXPORT StdinStream : public InputStream { - public: - StdinStream(); - ~StdinStream() override {} - - Status Close() override; - bool closed() const override; - - Status Tell(int64_t* position) const override; - - Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - - Status Read(int64_t nbytes, std::shared_ptr* out) override; - - private: - int64_t pos_; -}; - -} // namespace io - -namespace internal { - -// NOTE: 8-bit path strings on Windows are encoded using UTF-8. -// Using MBCS would fail encoding some paths. - -#if defined(_WIN32) -using NativePathString = std::wstring; -#else -using NativePathString = std::string; -#endif - -class ARROW_EXPORT PlatformFilename { - public: - ~PlatformFilename(); - PlatformFilename(); - PlatformFilename(const PlatformFilename&); - PlatformFilename(PlatformFilename&&); - PlatformFilename& operator=(const PlatformFilename&); - PlatformFilename& operator=(PlatformFilename&&); - explicit PlatformFilename(const NativePathString& path); - - const NativePathString& ToNative() const; - std::string ToString() const; - - // These functions can fail for character encoding reasons. 
- static Status FromString(const std::string& file_name, PlatformFilename* out); - Status Join(const std::string& child_name, PlatformFilename* out) const; - - private: - struct Impl; - std::unique_ptr impl_; - - explicit PlatformFilename(const Impl& impl); - explicit PlatformFilename(Impl&& impl); - - // Those functions need access to the embedded path object - friend ARROW_EXPORT Status CreateDir(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status CreateDirTree(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status DeleteDirTree(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status DeleteFile(const PlatformFilename&, bool*); - friend ARROW_EXPORT Status FileExists(const PlatformFilename&, bool*); -}; - -ARROW_EXPORT -Status CreateDir(const PlatformFilename& dir_path, bool* created = NULLPTR); -ARROW_EXPORT -Status CreateDirTree(const PlatformFilename& dir_path, bool* created = NULLPTR); -ARROW_EXPORT -Status DeleteDirTree(const PlatformFilename& dir_path, bool* deleted = NULLPTR); -ARROW_EXPORT -Status DeleteFile(const PlatformFilename& file_path, bool* deleted = NULLPTR); -ARROW_EXPORT -Status FileExists(const PlatformFilename& path, bool* out); - -ARROW_EXPORT -Status FileNameFromString(const std::string& file_name, PlatformFilename* out); - -ARROW_EXPORT -Status FileOpenReadable(const PlatformFilename& file_name, int* fd); -ARROW_EXPORT -Status FileOpenWritable(const PlatformFilename& file_name, bool write_only, bool truncate, - bool append, int* fd); - -ARROW_EXPORT -Status FileRead(int fd, uint8_t* buffer, const int64_t nbytes, int64_t* bytes_read); -ARROW_EXPORT -Status FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes, - int64_t* bytes_read); -ARROW_EXPORT -Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes); -ARROW_EXPORT -Status FileTruncate(int fd, const int64_t size); - -ARROW_EXPORT -Status FileTell(int fd, int64_t* pos); -ARROW_EXPORT -Status FileSeek(int fd, int64_t pos); -ARROW_EXPORT 
-Status FileSeek(int fd, int64_t pos, int whence); -ARROW_EXPORT -Status FileGetSize(int fd, int64_t* size); - -ARROW_EXPORT -Status FileClose(int fd); - -ARROW_EXPORT -Status CreatePipe(int fd[2]); - -ARROW_EXPORT -Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, - void** new_addr); - -ARROW_EXPORT -Status GetEnvVar(const char* name, std::string* out); -ARROW_EXPORT -Status GetEnvVar(const std::string& name, std::string* out); -ARROW_EXPORT -Status SetEnvVar(const char* name, const char* value); -ARROW_EXPORT -Status SetEnvVar(const std::string& name, const std::string& value); -ARROW_EXPORT -Status DelEnvVar(const char* name); -ARROW_EXPORT -Status DelEnvVar(const std::string& name); - -ARROW_EXPORT -std::string ErrnoMessage(int errnum); -#if _WIN32 -ARROW_EXPORT -std::string WinErrorMessage(int errnum); -#endif - -class ARROW_EXPORT TemporaryDir { - public: - ~TemporaryDir(); - - const PlatformFilename& path() { return path_; } - - static Status Make(const std::string& prefix, std::unique_ptr* out); - - private: - PlatformFilename path_; - - explicit TemporaryDir(PlatformFilename&&); -}; - -class ARROW_EXPORT SignalHandler { - public: - typedef void (*Callback)(int); - - SignalHandler(); - explicit SignalHandler(Callback cb); -#if ARROW_HAVE_SIGACTION - explicit SignalHandler(const struct sigaction& sa); -#endif - - Callback callback() const; -#if ARROW_HAVE_SIGACTION - const struct sigaction& action() const; -#endif - - protected: -#if ARROW_HAVE_SIGACTION - // Storing the full sigaction allows to restore the entire signal handling - // configuration. 
- struct sigaction sa_; -#else - Callback cb_; -#endif -}; - -ARROW_EXPORT -Status GetSignalHandler(int signum, SignalHandler* out); -ARROW_EXPORT -Status SetSignalHandler(int signum, SignalHandler handler, - SignalHandler* old_handler = NULLPTR); - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_IO_UTIL_H diff --git a/r/R/inst/include/arrow/util/key_value_metadata.h b/r/R/inst/include/arrow/util/key_value_metadata.h deleted file mode 100644 index 2820c98200d..00000000000 --- a/r/R/inst/include/arrow/util/key_value_metadata.h +++ /dev/null @@ -1,81 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_KEY_VALUE_METADATA_H -#define ARROW_UTIL_KEY_VALUE_METADATA_H - -#include -#include -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -/// \brief A container for key-value pair type metadata. 
Not thread-safe -class ARROW_EXPORT KeyValueMetadata { - public: - KeyValueMetadata(); - KeyValueMetadata(const std::vector& keys, - const std::vector& values); - explicit KeyValueMetadata(const std::unordered_map& map); - virtual ~KeyValueMetadata() = default; - - void ToUnorderedMap(std::unordered_map* out) const; - - void Append(const std::string& key, const std::string& value); - - void reserve(int64_t n); - int64_t size() const; - - const std::string& key(int64_t i) const; - const std::string& value(int64_t i) const; - - /// \brief Perform linear search for key, returning -1 if not found - int FindKey(const std::string& key) const; - - std::shared_ptr Copy() const; - - bool Equals(const KeyValueMetadata& other) const; - std::string ToString() const; - - private: - std::vector keys_; - std::vector values_; - - ARROW_DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata); -}; - -/// \brief Create a KeyValueMetadata instance -/// -/// \param pairs key-value mapping -std::shared_ptr ARROW_EXPORT -key_value_metadata(const std::unordered_map& pairs); - -/// \brief Create a KeyValueMetadata instance -/// -/// \param keys sequence of metadata keys -/// \param values sequence of corresponding metadata values -std::shared_ptr ARROW_EXPORT key_value_metadata( - const std::vector& keys, const std::vector& values); - -} // namespace arrow - -#endif // ARROW_UTIL_KEY_VALUE_METADATA_H diff --git a/r/R/inst/include/arrow/util/lazy.h b/r/R/inst/include/arrow/util/lazy.h deleted file mode 100644 index de32b5f22af..00000000000 --- a/r/R/inst/include/arrow/util/lazy.h +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_LAZY_H -#define ARROW_UTIL_LAZY_H - -#include -#include - -namespace arrow { -namespace internal { - -/// Create a range from a callable which takes a single index parameter -/// and returns the value of iterator on each call and a length. -/// Only iterators obtained from the same range should be compared, the -/// behaviour generally similar to other STL containers. -template -class LazyRange { - private: - // callable which generates the values - // has to be defined at the beginning of the class for type deduction - const Generator gen_; - // the length of the range - int64_t length_; -#ifdef _MSC_VER - // workaround to VS2010 not supporting decltype properly - // see https://stackoverflow.com/questions/21782846/decltype-for-class-member-function - static Generator gen_static_; -#endif - - public: -#ifdef _MSC_VER - using return_type = decltype(gen_static_(0)); -#else - using return_type = decltype(gen_(0)); -#endif - - /// Construct a new range from a callable and length - LazyRange(Generator gen, int64_t length) : gen_(gen), length_(length) {} - - // Class of the dependent iterator, created implicitly by begin and end - class RangeIter { - public: - using difference_type = int64_t; - using value_type = return_type; - using reference = const value_type&; - using pointer = const value_type*; - using iterator_category = std::forward_iterator_tag; - -#ifdef _MSC_VER - // msvc complains about unchecked iterators, - // see https://stackoverflow.com/questions/21655496/error-c4996-checked-iterators - using _Unchecked_type = 
typename LazyRange::RangeIter; -#endif - - RangeIter(const LazyRange& range, int64_t index) - : range_(range), index_(index) {} - - const return_type operator*() { return range_.gen_(index_); } - - RangeIter operator+(difference_type length) { - return RangeIter(range_, index_ + length); - } - - // pre-increment - RangeIter& operator++() { - ++index_; - return *this; - } - - // post-increment - RangeIter operator++(int) { - auto copy = RangeIter(*this); - ++index_; - return copy; - } - - bool operator==(const typename LazyRange::RangeIter& other) const { - return this->index_ == other.index_ && &this->range_ == &other.range_; - } - - bool operator!=(const typename LazyRange::RangeIter& other) const { - return this->index_ != other.index_ || &this->range_ != &other.range_; - } - - int64_t operator-(const typename LazyRange::RangeIter& other) { - return this->index_ - other.index_; - } - - private: - // parent range reference - const LazyRange& range_; - // current index - int64_t index_; - }; - - friend class RangeIter; - - // Create a new begin const iterator - RangeIter begin() { return RangeIter(*this, 0); } - - // Create a new end const iterator - RangeIter end() { return RangeIter(*this, length_); } -}; - -/// Helper function to create a lazy range from a callable (e.g. lambda) and length -template -LazyRange MakeLazyRange(Generator&& gen, int64_t length) { - return LazyRange(std::forward(gen), length); -} - -} // namespace internal -} // namespace arrow -#endif diff --git a/r/R/inst/include/arrow/util/logging.h b/r/R/inst/include/arrow/util/logging.h deleted file mode 100644 index 999aca6fd7c..00000000000 --- a/r/R/inst/include/arrow/util/logging.h +++ /dev/null @@ -1,244 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_LOGGING_H -#define ARROW_UTIL_LOGGING_H - -#ifdef GANDIVA_IR - -// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to -// streams or stdc++. So, making the DCHECK calls void in that case. - -#define ARROW_IGNORE_EXPR(expr) ((void)(expr)) - -#define DCHECK(condition) ARROW_IGNORE_EXPR(condition) -#define DCHECK_OK(status) ARROW_IGNORE_EXPR(status) -#define DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1) -#define DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1) -#define DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1) -#define DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1) -#define DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1) -#define DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1) - -#else // !GANDIVA_IR - -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -enum class ArrowLogLevel : int { - ARROW_DEBUG = -1, - ARROW_INFO = 0, - ARROW_WARNING = 1, - ARROW_ERROR = 2, - ARROW_FATAL = 3 -}; - -#define ARROW_LOG_INTERNAL(level) ::arrow::util::ArrowLog(__FILE__, __LINE__, level) -#define ARROW_LOG(level) ARROW_LOG_INTERNAL(::arrow::util::ArrowLogLevel::ARROW_##level) - -#define ARROW_IGNORE_EXPR(expr) ((void)(expr)) - -#define ARROW_CHECK(condition) \ - (condition) ? 
ARROW_IGNORE_EXPR(0) \ - : ::arrow::util::Voidify() & \ - ::arrow::util::ArrowLog(__FILE__, __LINE__, \ - ::arrow::util::ArrowLogLevel::ARROW_FATAL) \ - << " Check failed: " #condition " " - -// If 'to_call' returns a bad status, CHECK immediately with a logged message -// of 'msg' followed by the status. -#define ARROW_CHECK_OK_PREPEND(to_call, msg) \ - do { \ - ::arrow::Status _s = (to_call); \ - ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ - } while (false) - -// If the status is bad, CHECK immediately, appending the status to the -// logged message. -#define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status") - -#ifdef NDEBUG -#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_WARNING - -// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros -// only do so in debug mode. - -#define DCHECK(condition) \ - while (false) ARROW_IGNORE_EXPR(condition); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_OK(s) \ - ARROW_IGNORE_EXPR(s); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_EQ(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_NE(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_LE(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_LT(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_GE(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() -#define DCHECK_GT(val1, val2) \ - while (false) ARROW_IGNORE_EXPR(val1); \ - while (false) 
ARROW_IGNORE_EXPR(val2); \ - while (false) ::arrow::util::detail::NullLog() - -#else -#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL - -#define DCHECK(condition) ARROW_CHECK(condition) -#define DCHECK_OK(status) ARROW_CHECK_OK(status) -#define DCHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2)) -#define DCHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2)) -#define DCHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2)) -#define DCHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2)) -#define DCHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2)) -#define DCHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2)) - -#endif // NDEBUG - -// This code is adapted from -// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h. - -// To make the logging lib plugable with other logging libs and make -// the implementation unawared by the user, ArrowLog is only a declaration -// which hide the implementation into logging.cc file. -// In logging.cc, we can choose different log libs using different macros. - -// This is also a null log which does not output anything. -class ARROW_EXPORT ArrowLogBase { - public: - virtual ~ArrowLogBase() {} - - virtual bool IsEnabled() const { return false; } - - template - ArrowLogBase& operator<<(const T& t) { - if (IsEnabled()) { - Stream() << t; - } - return *this; - } - - protected: - virtual std::ostream& Stream() = 0; -}; - -class ARROW_EXPORT ArrowLog : public ArrowLogBase { - public: - ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity); - ~ArrowLog() override; - - /// Return whether or not current logging instance is enabled. - /// - /// \return True if logging is enabled and false otherwise. - bool IsEnabled() const override; - - /// The init function of arrow log for a program which should be called only once. - /// - /// \param appName The app name which starts the log. - /// \param severity_threshold Logging threshold for the program. - /// \param logDir Logging output file name. 
If empty, the log won't output to file. - static void StartArrowLog(const std::string& appName, - ArrowLogLevel severity_threshold = ArrowLogLevel::ARROW_INFO, - const std::string& logDir = ""); - - /// The shutdown function of arrow log, it should be used with StartArrowLog as a pair. - static void ShutDownArrowLog(); - - /// Install the failure signal handler to output call stack when crash. - /// If glog is not installed, this function won't do anything. - static void InstallFailureSignalHandler(); - - /// Uninstall the signal actions installed by InstallFailureSignalHandler. - static void UninstallSignalAction(); - - /// Return whether or not the log level is enabled in current setting. - /// - /// \param log_level The input log level to test. - /// \return True if input log level is not lower than the threshold. - static bool IsLevelEnabled(ArrowLogLevel log_level); - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog); - - // Hide the implementation of log provider by void *. - // Otherwise, lib user may define the same macro to use the correct header file. - void* logging_provider_; - /// True if log messages should be logged and false if they should be ignored. - bool is_enabled_; - - static ArrowLogLevel severity_threshold_; - - protected: - std::ostream& Stream() override; -}; - -// This class make ARROW_CHECK compilation pass to change the << operator to void. -// This class is copied from glog. -class ARROW_EXPORT Voidify { - public: - Voidify() {} - // This has to be an operator with a precedence lower than << but - // higher than ?: - void operator&(ArrowLogBase&) {} -}; - -namespace detail { - -/// @brief A helper for the nil log sink. -/// -/// Using this helper is analogous to sending log messages to /dev/null: -/// nothing gets logged. -class NullLog { - public: - /// The no-op output operator. - /// - /// @param [in] t - /// The object to send into the nil sink. - /// @return Reference to the updated object. 
- template - NullLog& operator<<(const T& t) { - return *this; - } -}; - -} // namespace detail -} // namespace util -} // namespace arrow - -#endif // GANDIVA_IR - -#endif // ARROW_UTIL_LOGGING_H diff --git a/r/R/inst/include/arrow/util/macros.h b/r/R/inst/include/arrow/util/macros.h deleted file mode 100644 index 4516985e300..00000000000 --- a/r/R/inst/include/arrow/util/macros.h +++ /dev/null @@ -1,164 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_MACROS_H -#define ARROW_UTIL_MACROS_H - -#include - -#define ARROW_STRINGIFY(x) #x -#define ARROW_CONCAT(x, y) x##y - -// From Google gutil -#ifndef ARROW_DISALLOW_COPY_AND_ASSIGN -#define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete -#endif - -#define ARROW_UNUSED(x) (void)x -#define ARROW_ARG_UNUSED(x) -// -// GCC can be told that a certain branch is not likely to be taken (for -// instance, a CHECK failure), and use that information in static analysis. -// Giving it this information can help it optimize for the common case in -// the absence of better information (ie. -fprofile-arcs). 
-// -#if defined(__GNUC__) -#define ARROW_PREDICT_FALSE(x) (__builtin_expect(x, 0)) -#define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) -#define ARROW_NORETURN __attribute__((noreturn)) -#define ARROW_PREFETCH(addr) __builtin_prefetch(addr) -#elif defined(_MSC_VER) -#define ARROW_NORETURN __declspec(noreturn) -#define ARROW_PREDICT_FALSE(x) x -#define ARROW_PREDICT_TRUE(x) x -#define ARROW_PREFETCH(addr) -#else -#define ARROW_NORETURN -#define ARROW_PREDICT_FALSE(x) x -#define ARROW_PREDICT_TRUE(x) x -#define ARROW_PREFETCH(addr) -#endif - -#if (defined(__GNUC__) || defined(__APPLE__)) -#define ARROW_MUST_USE_RESULT __attribute__((warn_unused_result)) -#elif defined(_MSC_VER) -#define ARROW_MUST_USE_RESULT -#else -#define ARROW_MUST_USE_RESULT -#endif - -// ---------------------------------------------------------------------- -// C++/CLI support macros (see ARROW-1134) - -#ifndef NULLPTR - -#ifdef __cplusplus_cli -#define NULLPTR __nullptr -#else -#define NULLPTR nullptr -#endif - -#endif // ifndef NULLPTR - -// ---------------------------------------------------------------------- - -// clang-format off -// [[deprecated]] is only available in C++14, use this for the time being -// This macro takes an optional deprecation message -#if __cplusplus <= 201103L -# ifdef __GNUC__ -# define ARROW_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__))) -# elif defined(_MSC_VER) -# define ARROW_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__)) -# else -# define ARROW_DEPRECATED(...) -# endif -#else -# define ARROW_DEPRECATED(...) 
[[deprecated(__VA_ARGS__)]] -#endif - -// ---------------------------------------------------------------------- - -// macros to disable padding -// these macros are portable across different compilers and platforms -//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355] -#if !defined(MANUALLY_ALIGNED_STRUCT) -#if defined(_MSC_VER) -#define MANUALLY_ALIGNED_STRUCT(alignment) \ - __pragma(pack(1)); \ - struct __declspec(align(alignment)) -#define STRUCT_END(name, size) \ - __pragma(pack()); \ - static_assert(sizeof(name) == size, "compiler breaks packing rules") -#elif defined(__GNUC__) || defined(__clang__) -#define MANUALLY_ALIGNED_STRUCT(alignment) \ - _Pragma("pack(1)") struct __attribute__((aligned(alignment))) -#define STRUCT_END(name, size) \ - _Pragma("pack()") static_assert(sizeof(name) == size, "compiler breaks packing rules") -#else -#error Unknown compiler, please define structure alignment macros -#endif -#endif // !defined(MANUALLY_ALIGNED_STRUCT) - -// ---------------------------------------------------------------------- -// Convenience macro disabling a particular UBSan check in a function - -#if defined(__clang__) -#define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature))) -#else -#define ARROW_DISABLE_UBSAN(feature) -#endif - -// ---------------------------------------------------------------------- -// Machine information - -#if INTPTR_MAX == INT64_MAX -#define ARROW_BITNESS 64 -#elif INTPTR_MAX == INT32_MAX -#define ARROW_BITNESS 32 -#else -#error Unexpected INTPTR_MAX -#endif - -// ---------------------------------------------------------------------- -// From googletest -// (also in parquet-cpp) - -// When you need to test the private or protected members of a class, -// use the FRIEND_TEST macro to declare your tests as friends of the -// class. 
For example: -// -// class MyClass { -// private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); -// }; -// -// class MyClassTest : public testing::Test { -// // ... -// }; -// -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. -// } - -#define FRIEND_TEST(test_case_name, test_name) \ - friend class test_case_name##_##test_name##_Test - -#endif // ARROW_UTIL_MACROS_H diff --git a/r/R/inst/include/arrow/util/memory.h b/r/R/inst/include/arrow/util/memory.h deleted file mode 100644 index 2d2a1059214..00000000000 --- a/r/R/inst/include/arrow/util/memory.h +++ /dev/null @@ -1,46 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_MEMORY_H -#define ARROW_UTIL_MEMORY_H - -#include -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace internal { - -// A helper function for doing memcpy with multiple threads. This is required -// to saturate the memory bandwidth of modern cpus. -void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, - uintptr_t block_size, int num_threads); - -// A helper function for checking if two wrapped objects implementing `Equals` -// are equal. 
-template -bool SharedPtrEquals(const std::shared_ptr& left, const std::shared_ptr& right) { - if (left == right) return true; - if (left == NULLPTR || right == NULLPTR) return false; - return left->Equals(*right); -} - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_MEMORY_H diff --git a/r/R/inst/include/arrow/util/neon-util.h b/r/R/inst/include/arrow/util/neon-util.h deleted file mode 100644 index 714d2324f05..00000000000 --- a/r/R/inst/include/arrow/util/neon-util.h +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -namespace arrow { - -#if defined(__aarch64__) || defined(__AARCH64__) -#ifdef __ARM_FEATURE_CRC32 -#define ARROW_HAVE_ARM_CRC -#include -#endif -#endif - -#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC) - -#include -#include -#ifndef HWCAP_CRC32 -#define HWCAP_CRC32 (1 << 7) -#endif -static inline uint32_t crc32c_runtime_check(void) { - uint64_t auxv = getauxval(AT_HWCAP); - return (auxv & HWCAP_CRC32) != 0; -} - -static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) { - return __crc32cb(crc, v); -} - -static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t v) { - return __crc32ch(crc, v); -} - -static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t v) { - return __crc32cw(crc, v); -} - -static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) { - return __crc32cd(crc, v); -} - -#endif // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC) - -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/parallel.h b/r/R/inst/include/arrow/util/parallel.h deleted file mode 100644 index 8caba5f1f0d..00000000000 --- a/r/R/inst/include/arrow/util/parallel.h +++ /dev/null @@ -1,95 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_PARALLEL_H -#define ARROW_UTIL_PARALLEL_H - -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/thread-pool.h" - -namespace arrow { -namespace internal { - -// A parallelizer that takes a `Status(int)` function and calls it with -// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads. - -template -Status ParallelFor(int num_tasks, FUNCTION&& func) { - auto pool = internal::GetCpuThreadPool(); - std::vector> futures(num_tasks); - - for (int i = 0; i < num_tasks; ++i) { - futures[i] = pool->Submit(func, i); - } - auto st = Status::OK(); - for (auto& fut : futures) { - st &= fut.get(); - } - return st; -} - -// A variant of ParallelFor() with an explicit number of dedicated threads. -// In most cases it's more appropriate to use the 2-argument ParallelFor (above), -// or directly the global CPU thread pool (arrow/util/thread-pool.h). 
- -template -Status ParallelFor(int nthreads, int num_tasks, FUNCTION&& func) { - std::vector thread_pool; - thread_pool.reserve(nthreads); - std::atomic task_counter(0); - - std::mutex error_mtx; - bool error_occurred = false; - Status error; - - for (int thread_id = 0; thread_id < nthreads; ++thread_id) { - thread_pool.emplace_back( - [&num_tasks, &task_counter, &error, &error_occurred, &error_mtx, &func]() { - int task_id; - while (!error_occurred) { - task_id = task_counter.fetch_add(1); - if (task_id >= num_tasks) { - break; - } - Status s = func(task_id); - if (!s.ok()) { - std::lock_guard lock(error_mtx); - error_occurred = true; - error = s; - break; - } - } - }); - } - for (auto&& thread : thread_pool) { - thread.join(); - } - if (error_occurred) { - return error; - } - return Status::OK(); -} - -} // namespace internal -} // namespace arrow - -#endif diff --git a/r/R/inst/include/arrow/util/parsing.h b/r/R/inst/include/arrow/util/parsing.h deleted file mode 100644 index 20b749a4ecf..00000000000 --- a/r/R/inst/include/arrow/util/parsing.h +++ /dev/null @@ -1,512 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// This is a private header for string-to-number parsing utilitiers - -#ifndef ARROW_UTIL_PARSING_H -#define ARROW_UTIL_PARSING_H - -#include -#include -#include -#include -#include -#include - -#include - -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/config.h" -#include "arrow/vendored/datetime.h" - -namespace arrow { -namespace internal { - -/// \brief A class providing conversion from strings to some Arrow data types -/// -/// Conversion is triggered by calling operator(). It returns true on -/// success, false on failure. -/// -/// The class may have a non-trivial construction cost in some cases, -/// so it's recommended to use a single instance many times, if doing bulk -/// conversion. -/// -template -class StringConverter; - -template <> -class StringConverter { - public: - explicit StringConverter(const std::shared_ptr& = NULLPTR) {} - - using value_type = bool; - - bool operator()(const char* s, size_t length, value_type* out) { - if (length == 1) { - // "0" or "1"? - if (s[0] == '0') { - *out = false; - return true; - } - if (s[0] == '1') { - *out = true; - return true; - } - return false; - } - if (length == 4) { - // "true"? - *out = true; - return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') && - (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E')); - } - if (length == 5) { - // "false"? 
- *out = false; - return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') && - (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') && - (s[4] == 'e' || s[4] == 'E')); - } - return false; - } -}; - -// Ideas for faster float parsing: -// - http://rapidjson.org/md_doc_internals.html#ParsingDouble -// - https://github.com/google/double-conversion [used here] -// - https://github.com/achan001/dtoa-fast - -template -class StringToFloatConverterMixin { - public: - using value_type = typename ARROW_TYPE::c_type; - - explicit StringToFloatConverterMixin(const std::shared_ptr& = NULLPTR) - : main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"), - fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf", - "nan") {} - - bool operator()(const char* s, size_t length, value_type* out) { - value_type v; - // double-conversion doesn't give us an error flag but signals parse - // errors with sentinel values. Since a sentinel value can appear as - // legitimate input, we fallback on a second converter with a different - // sentinel to eliminate false errors. 
- TryConvert(main_converter_, s, length, &v); - if (ARROW_PREDICT_FALSE(v == static_cast(main_junk_value_))) { - TryConvert(fallback_converter_, s, length, &v); - if (ARROW_PREDICT_FALSE(v == static_cast(fallback_junk_value_))) { - return false; - } - } - *out = v; - return true; - } - - protected: -// This is only support in double-conversion 3.1+ -#ifdef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY - static const int flags_ = - double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY; -#else - static const int flags_ = double_conversion::StringToDoubleConverter::NO_FLAGS; -#endif - // Two unlikely values to signal a parsing error - static constexpr double main_junk_value_ = 0.7066424364107089; - static constexpr double fallback_junk_value_ = 0.40088499148279166; - - double_conversion::StringToDoubleConverter main_converter_; - double_conversion::StringToDoubleConverter fallback_converter_; - - inline void TryConvert(double_conversion::StringToDoubleConverter& converter, - const char* s, size_t length, float* out) { - int processed_length; - *out = converter.StringToFloat(s, static_cast(length), &processed_length); - } - - inline void TryConvert(double_conversion::StringToDoubleConverter& converter, - const char* s, size_t length, double* out) { - int processed_length; - *out = converter.StringToDouble(s, static_cast(length), &processed_length); - } -}; - -template <> -class StringConverter : public StringToFloatConverterMixin { - using StringToFloatConverterMixin::StringToFloatConverterMixin; -}; - -template <> -class StringConverter : public StringToFloatConverterMixin { - using StringToFloatConverterMixin::StringToFloatConverterMixin; -}; - -// NOTE: HalfFloatType would require a half<->float conversion library - -namespace detail { - -inline uint8_t ParseDecimalDigit(char c) { return static_cast(c - '0'); } - -#define PARSE_UNSIGNED_ITERATION(C_TYPE) \ - if (length > 0) { \ - uint8_t digit = ParseDecimalDigit(*s++); \ - result = static_cast(result * 
10U); \ - length--; \ - if (ARROW_PREDICT_FALSE(digit > 9U)) { \ - /* Non-digit */ \ - return false; \ - } \ - result = static_cast(result + digit); \ - } - -#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \ - if (length > 0) { \ - if (ARROW_PREDICT_FALSE(result > std::numeric_limits::max() / 10U)) { \ - /* Overflow */ \ - return false; \ - } \ - uint8_t digit = ParseDecimalDigit(*s++); \ - result = static_cast(result * 10U); \ - C_TYPE new_result = static_cast(result + digit); \ - if (ARROW_PREDICT_FALSE(--length > 0)) { \ - /* Too many digits */ \ - return false; \ - } \ - if (ARROW_PREDICT_FALSE(digit > 9U)) { \ - /* Non-digit */ \ - return false; \ - } \ - if (ARROW_PREDICT_FALSE(new_result < result)) { \ - /* Overflow */ \ - return false; \ - } \ - result = new_result; \ - } - -inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) { - uint8_t result = 0; - - PARSE_UNSIGNED_ITERATION(uint8_t); - PARSE_UNSIGNED_ITERATION(uint8_t); - PARSE_UNSIGNED_ITERATION_LAST(uint8_t); - *out = result; - return true; -} - -inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) { - uint16_t result = 0; - - PARSE_UNSIGNED_ITERATION(uint16_t); - PARSE_UNSIGNED_ITERATION(uint16_t); - PARSE_UNSIGNED_ITERATION(uint16_t); - PARSE_UNSIGNED_ITERATION(uint16_t); - PARSE_UNSIGNED_ITERATION_LAST(uint16_t); - *out = result; - return true; -} - -inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) { - uint32_t result = 0; - - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - PARSE_UNSIGNED_ITERATION(uint32_t); - - PARSE_UNSIGNED_ITERATION_LAST(uint32_t); - *out = result; - return true; -} - -inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { - uint64_t result = 0; - - 
PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - PARSE_UNSIGNED_ITERATION(uint64_t); - - PARSE_UNSIGNED_ITERATION_LAST(uint64_t); - *out = result; - return true; -} - -#undef PARSE_UNSIGNED_ITERATION -#undef PARSE_UNSIGNED_ITERATION_LAST - -} // namespace detail - -template -class StringToUnsignedIntConverterMixin { - public: - using value_type = typename ARROW_TYPE::c_type; - - explicit StringToUnsignedIntConverterMixin(const std::shared_ptr& = NULLPTR) { - } - - bool operator()(const char* s, size_t length, value_type* out) { - if (ARROW_PREDICT_FALSE(length == 0)) { - return false; - } - // Skip leading zeros - while (length > 0 && *s == '0') { - length--; - s++; - } - return detail::ParseUnsigned(s, length, out); - } -}; - -template <> -class StringConverter : public StringToUnsignedIntConverterMixin { - using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToUnsignedIntConverterMixin { - using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToUnsignedIntConverterMixin { - using StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToUnsignedIntConverterMixin { - using 
StringToUnsignedIntConverterMixin::StringToUnsignedIntConverterMixin; -}; - -template -class StringToSignedIntConverterMixin { - public: - using value_type = typename ARROW_TYPE::c_type; - using unsigned_type = typename std::make_unsigned::type; - - explicit StringToSignedIntConverterMixin(const std::shared_ptr& = NULLPTR) {} - - bool operator()(const char* s, size_t length, value_type* out) { - static constexpr unsigned_type max_positive = - static_cast(std::numeric_limits::max()); - // Assuming two's complement - static constexpr unsigned_type max_negative = max_positive + 1; - bool negative = false; - unsigned_type unsigned_value = 0; - - if (ARROW_PREDICT_FALSE(length == 0)) { - return false; - } - if (*s == '-') { - negative = true; - s++; - if (--length == 0) { - return false; - } - } - // Skip leading zeros - while (length > 0 && *s == '0') { - length--; - s++; - } - if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) { - return false; - } - if (negative) { - if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) { - return false; - } - // To avoid both compiler warnings (with unsigned negation) - // and undefined behaviour (with signed negation overflow), - // use the expanded formula for 2's complement negation. 
- *out = static_cast(~unsigned_value + 1); - } else { - if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) { - return false; - } - *out = static_cast(unsigned_value); - } - return true; - } -}; - -template <> -class StringConverter : public StringToSignedIntConverterMixin { - using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToSignedIntConverterMixin { - using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToSignedIntConverterMixin { - using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; -}; - -template <> -class StringConverter : public StringToSignedIntConverterMixin { - using StringToSignedIntConverterMixin::StringToSignedIntConverterMixin; -}; - -template <> -class StringConverter { - public: - using value_type = TimestampType::c_type; - - explicit StringConverter(const std::shared_ptr& type) - : unit_(checked_cast(type.get())->unit()) {} - - bool operator()(const char* s, size_t length, value_type* out) { - // We allow the following formats: - // - "YYYY-MM-DD" - // - "YYYY-MM-DD[ T]hh:mm:ss" - // - "YYYY-MM-DD[ T]hh:mm:ssZ" - // UTC is always assumed, and the DataType's timezone is ignored. 
- arrow_vendored::date::year_month_day ymd; - if (ARROW_PREDICT_FALSE(length < 10)) { - return false; - } - if (length == 10) { - if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { - return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out); - } - if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) { - return false; - } - if (s[length - 1] == 'Z') { - --length; - } - if (length == 19) { - if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) { - return false; - } - std::chrono::duration seconds; - if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) { - return false; - } - return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out); - } - return false; - } - - protected: - template - bool ConvertTimePoint(TimePoint tp, value_type* out) { - auto duration = tp.time_since_epoch(); - switch (unit_) { - case TimeUnit::SECOND: - *out = std::chrono::duration_cast(duration).count(); - return true; - case TimeUnit::MILLI: - *out = std::chrono::duration_cast(duration).count(); - return true; - case TimeUnit::MICRO: - *out = std::chrono::duration_cast(duration).count(); - return true; - case TimeUnit::NANO: - *out = std::chrono::duration_cast(duration).count(); - return true; - } - // Unreachable, but suppress compiler warning - assert(0); - *out = 0; - return true; - } - - bool ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) { - uint16_t year; - uint8_t month, day; - if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) { - return false; - } - *out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month}, - arrow_vendored::date::day{day}}; - return out->ok(); - } - - bool 
ParseHH_MM_SS(const char* s, std::chrono::duration* out) { - uint8_t hours, minutes, seconds; - if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) { - return false; - } - if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 6, 2, &seconds))) { - return false; - } - if (ARROW_PREDICT_FALSE(hours >= 24)) { - return false; - } - if (ARROW_PREDICT_FALSE(minutes >= 60)) { - return false; - } - if (ARROW_PREDICT_FALSE(seconds >= 60)) { - return false; - } - *out = std::chrono::duration(3600U * hours + 60U * minutes + seconds); - return true; - } - - const TimeUnit::type unit_; -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_PARSING_H diff --git a/r/R/inst/include/arrow/util/rle-encoding.h b/r/R/inst/include/arrow/util/rle-encoding.h deleted file mode 100644 index 739158a59a1..00000000000 --- a/r/R/inst/include/arrow/util/rle-encoding.h +++ /dev/null @@ -1,604 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use -// in parquet-cpp, Arrow - -#ifndef ARROW_UTIL_RLE_ENCODING_H -#define ARROW_UTIL_RLE_ENCODING_H - -#include -#include - -#include "arrow/util/bit-stream-utils.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -/// (literal encoding). -/// For both types of runs, there is a byte-aligned indicator which encodes the length -/// of the run and the type of the run. -/// This encoding has the benefit that when there aren't any long enough runs, values -/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -/// the run length are byte aligned. This allows for very efficient decoding -/// implementations. -/// The encoding is: -/// encoded-block := run* -/// run := literal-run | repeated-run -/// literal-run := literal-indicator < literal bytes > -/// repeated-run := repeated-indicator < repeated value. padded to byte boundary > -/// literal-indicator := varint_encode( number_of_groups << 1 | 1) -/// repeated-indicator := varint_encode( number_of_repetitions << 1 ) -// -/// Each run is preceded by a varint. The varint's least significant bit is -/// used to indicate whether the run is a literal run or a repeated run. The rest -/// of the varint is used to determine the length of the run (eg how many times the -/// value repeats). -// -/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -/// in groups of 8), so that no matter the bit-width of the value, the sequence will end -/// on a byte boundary without padding. -/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -/// the actual number of encoded ints. 
(This means that the total number of encoded values -/// can not be determined from the encoded data, since the number of values in the last -/// group may not be a multiple of 8). For the last group of literal runs, we pad -/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side -/// without the need for additional checks. -// -/// There is a break-even point when it is more storage efficient to do run length -/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -/// for both the repeated encoding or the literal encoding. This value can always -/// be computed based on the bit-width. -/// TODO: think about how to use this for strings. The bit packing isn't quite the same. -// -/// Examples with bit-width 1 (eg encoding booleans): -/// ---------------------------------------- -/// 100 1s followed by 100 0s: -/// <1, padded to 1 byte> <0, padded to 1 byte> -/// - (total 4 bytes) -// -/// alternating 1s and 0s (200 total): -/// 200 ints = 25 groups of 8 -/// <25 bytes of values, bitpacked> -/// (total 26 bytes, 1 byte overhead) -// - -/// Decoder class for RLE encoded data. -class RleDecoder { - public: - /// Create a decoder object. buffer/buffer_len is the decoded data. - /// bit_width is the width of each value (before encoding). - RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) - : bit_reader_(buffer, buffer_len), - bit_width_(bit_width), - current_value_(0), - repeat_count_(0), - literal_count_(0) { - DCHECK_GE(bit_width_, 0); - DCHECK_LE(bit_width_, 64); - } - - RleDecoder() : bit_width_(-1) {} - - void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { - DCHECK_GE(bit_width, 0); - DCHECK_LE(bit_width, 64); - bit_reader_.Reset(buffer, buffer_len); - bit_width_ = bit_width; - current_value_ = 0; - repeat_count_ = 0; - literal_count_ = 0; - } - - /// Gets the next value. Returns false if there are no more. - template - bool Get(T* val); - - /// Gets a batch of values. 
Returns the number of decoded elements. - template - int GetBatch(T* values, int batch_size); - - /// Like GetBatch but the values are then decoded using the provided dictionary - template - int GetBatchWithDict(const T* dictionary, T* values, int batch_size); - - /// Like GetBatchWithDict but add spacing for null entries - template - int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size, - int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset); - - protected: - BitUtil::BitReader bit_reader_; - /// Number of bits needed to encode the value. Must be between 0 and 64. - int bit_width_; - uint64_t current_value_; - uint32_t repeat_count_; - uint32_t literal_count_; - - private: - /// Fills literal_count_ and repeat_count_ with next values. Returns false if there - /// are no more. - template - bool NextCounts(); -}; - -/// Class to incrementally build the rle data. This class does not allocate any memory. -/// The encoding has two modes: encoding repeated runs and literal runs. -/// If the run is sufficiently short, it is more efficient to encode as a literal run. -/// This class does so by buffering 8 values at a time. If they are not all the same -/// they are added to the literal run. If they are the same, they are added to the -/// repeated run. When we switch modes, the previous run is flushed out. -class RleEncoder { - public: - /// buffer/buffer_len: preallocated output buffer. - /// bit_width: max number of bits for value. - /// TODO: consider adding a min_repeated_run_length so the caller can control - /// when values should be encoded as repeated runs. Currently this is derived - /// based on the bit_width, which can determine a storage optimal choice. 
- /// TODO: allow 0 bit_width (and have dict encoder use it) - RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) - : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { - DCHECK_GE(bit_width_, 0); - DCHECK_LE(bit_width_, 64); - max_run_byte_size_ = MinBufferSize(bit_width); - DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; - Clear(); - } - - /// Returns the minimum buffer size needed to use the encoder for 'bit_width' - /// This is the maximum length of a single run for 'bit_width'. - /// It is not valid to pass a buffer less than this length. - static int MinBufferSize(int bit_width) { - /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. - int max_literal_run_size = - 1 + - static_cast(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width)); - /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. - int max_repeated_run_size = BitUtil::BitReader::MAX_VLQ_BYTE_LEN + - static_cast(BitUtil::BytesForBits(bit_width)); - return std::max(max_literal_run_size, max_repeated_run_size); - } - - /// Returns the maximum byte size it could take to encode 'num_values'. - static int MaxBufferSize(int bit_width, int num_values) { - // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 - // and then a repeated run of length 8". - // 8 values per smallest run, 8 bits per byte - int bytes_per_run = bit_width; - int num_runs = static_cast(BitUtil::CeilDiv(num_values, 8)); - int literal_max_size = num_runs + num_runs * bytes_per_run; - - // In the very worst case scenario, the data is a concatenation of repeated - // runs of 8 values. Repeated run has a 1 byte varint followed by the - // bit-packed repeated value - int min_repeated_run_size = 1 + static_cast(BitUtil::BytesForBits(bit_width)); - int repeated_max_size = - static_cast(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size; - - return std::max(literal_max_size, repeated_max_size); - } - - /// Encode value. 
Returns true if the value fits in buffer, false otherwise. - /// This value must be representable with bit_width_ bits. - bool Put(uint64_t value); - - /// Flushes any pending values to the underlying buffer. - /// Returns the total number of bytes written - int Flush(); - - /// Resets all the state in the encoder. - void Clear(); - - /// Returns pointer to underlying buffer - uint8_t* buffer() { return bit_writer_.buffer(); } - int32_t len() { return bit_writer_.bytes_written(); } - - private: - /// Flushes any buffered values. If this is part of a repeated run, this is largely - /// a no-op. - /// If it is part of a literal run, this will call FlushLiteralRun, which writes - /// out the buffered literal values. - /// If 'done' is true, the current run would be written even if it would normally - /// have been buffered more. This should only be called at the end, when the - /// encoder has received all values even if it would normally continue to be - /// buffered. - void FlushBufferedValues(bool done); - - /// Flushes literal values to the underlying buffer. If update_indicator_byte, - /// then the current literal run is complete and the indicator byte is updated. - void FlushLiteralRun(bool update_indicator_byte); - - /// Flushes a repeated run to the underlying buffer. - void FlushRepeatedRun(); - - /// Checks and sets buffer_full_. This must be called after flushing a run to - /// make sure there are enough bytes remaining to encode the next run. - void CheckBufferFull(); - - /// The maximum number of values in a single literal run - /// (number of groups encodable by a 1-byte indicator * 8) - static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8; - - /// Number of bits needed to encode the value. Must be between 0 and 64. - const int bit_width_; - - /// Underlying buffer. - BitUtil::BitWriter bit_writer_; - - /// If true, the buffer is full and subsequent Put()'s will fail. - bool buffer_full_; - - /// The maximum byte size a single run can take. 
- int max_run_byte_size_; - - /// We need to buffer at most 8 values for literals. This happens when the - /// bit_width is 1 (so 8 values fit in one byte). - /// TODO: generalize this to other bit widths - int64_t buffered_values_[8]; - - /// Number of values in buffered_values_ - int num_buffered_values_; - - /// The current (also last) value that was written and the count of how - /// many times in a row that value has been seen. This is maintained even - /// if we are in a literal run. If the repeat_count_ get high enough, we switch - /// to encoding repeated runs. - uint64_t current_value_; - int repeat_count_; - - /// Number of literals in the current run. This does not include the literals - /// that might be in buffered_values_. Only after we've got a group big enough - /// can we decide if they should part of the literal_count_ or repeat_count_ - int literal_count_; - - /// Pointer to a byte in the underlying buffer that stores the indicator byte. - /// This is reserved as soon as we need a literal run but the value is written - /// when the literal run is complete. 
- uint8_t* literal_indicator_byte_; -}; - -template -inline bool RleDecoder::Get(T* val) { - return GetBatch(val, 1) == 1; -} - -template -inline int RleDecoder::GetBatch(T* values, int batch_size) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - - while (values_read < batch_size) { - if (repeat_count_ > 0) { - int repeat_batch = - std::min(batch_size - values_read, static_cast(repeat_count_)); - std::fill(values + values_read, values + values_read + repeat_batch, - static_cast(current_value_)); - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = - std::min(batch_size - values_read, static_cast(literal_count_)); - int actual_read = - bit_reader_.GetBatch(bit_width_, values + values_read, literal_batch); - DCHECK_EQ(actual_read, literal_batch); - literal_count_ -= literal_batch; - values_read += literal_batch; - } else { - if (!NextCounts()) return values_read; - } - } - - return values_read; -} - -template -inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batch_size) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - - while (values_read < batch_size) { - if (repeat_count_ > 0) { - int repeat_batch = - std::min(batch_size - values_read, static_cast(repeat_count_)); - std::fill(values + values_read, values + values_read + repeat_batch, - dictionary[current_value_]); - repeat_count_ -= repeat_batch; - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = - std::min(batch_size - values_read, static_cast(literal_count_)); - - const int buffer_size = 1024; - int indices[buffer_size]; - literal_batch = std::min(literal_batch, buffer_size); - int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); - DCHECK_EQ(actual_read, literal_batch); - for (int i = 0; i < literal_batch; ++i) { - values[values_read + i] = dictionary[indices[i]]; - } - literal_count_ -= literal_batch; - values_read += literal_batch; - } else { 
- if (!NextCounts()) return values_read; - } - } - - return values_read; -} - -template -inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, - int batch_size, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { - DCHECK_GE(bit_width_, 0); - int values_read = 0; - int remaining_nulls = null_count; - - arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); - - while (values_read < batch_size) { - bool is_valid = bit_reader.IsSet(); - bit_reader.Next(); - - if (is_valid) { - if ((repeat_count_ == 0) && (literal_count_ == 0)) { - if (!NextCounts()) return values_read; - } - if (repeat_count_ > 0) { - T value = dictionary[current_value_]; - // The current index is already valid, we don't need to check that again - int repeat_batch = 1; - repeat_count_--; - - while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { - if (bit_reader.IsSet()) { - repeat_count_--; - } else { - remaining_nulls--; - } - repeat_batch++; - - bit_reader.Next(); - } - std::fill(values + values_read, values + values_read + repeat_batch, value); - values_read += repeat_batch; - } else if (literal_count_ > 0) { - int literal_batch = std::min(batch_size - values_read - remaining_nulls, - static_cast(literal_count_)); - - // Decode the literals - constexpr int kBufferSize = 1024; - int indices[kBufferSize]; - literal_batch = std::min(literal_batch, kBufferSize); - int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); - DCHECK_EQ(actual_read, literal_batch); - - int skipped = 0; - int literals_read = 1; - values[values_read] = dictionary[indices[0]]; - - // Read the first bitset to the end - while (literals_read < literal_batch) { - if (bit_reader.IsSet()) { - values[values_read + literals_read + skipped] = - dictionary[indices[literals_read]]; - literals_read++; - } else { - skipped++; - } - - bit_reader.Next(); - } - literal_count_ -= literal_batch; - values_read += literal_batch 
+ skipped; - remaining_nulls -= skipped; - } - } else { - values_read++; - remaining_nulls--; - } - } - - return values_read; -} - -template -bool RleDecoder::NextCounts() { - // Read the next run's indicator int, it could be a literal or repeated run. - // The int is encoded as a vlq-encoded value. - int32_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (!result) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - } else { - repeat_count_ = indicator_value >> 1; - // XXX (ARROW-4018) this is not big-endian compatible - bool result = - bit_reader_.GetAligned(static_cast(BitUtil::CeilDiv(bit_width_, 8)), - reinterpret_cast(¤t_value_)); - DCHECK(result); - } - return true; -} - -/// This function buffers input values 8 at a time. After seeing all 8 values, -/// it decides whether they should be encoded as a literal or repeated run. -inline bool RleEncoder::Put(uint64_t value) { - DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_)); - if (ARROW_PREDICT_FALSE(buffer_full_)) return false; - - if (ARROW_PREDICT_TRUE(current_value_ == value)) { - ++repeat_count_; - if (repeat_count_ > 8) { - // This is just a continuation of the current run, no need to buffer the - // values. - // Note that this is the fast path for long repeated runs. - return true; - } - } else { - if (repeat_count_ >= 8) { - // We had a run that was long enough but it has ended. Flush the - // current repeated run. 
- DCHECK_EQ(literal_count_, 0); - FlushRepeatedRun(); - } - repeat_count_ = 1; - current_value_ = value; - } - - buffered_values_[num_buffered_values_] = value; - if (++num_buffered_values_ == 8) { - DCHECK_EQ(literal_count_ % 8, 0); - FlushBufferedValues(false); - } - return true; -} - -inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { - if (literal_indicator_byte_ == NULL) { - // The literal indicator byte has not been reserved yet, get one now. - literal_indicator_byte_ = bit_writer_.GetNextBytePtr(); - DCHECK(literal_indicator_byte_ != NULL); - } - - // Write all the buffered values as bit packed literals - for (int i = 0; i < num_buffered_values_; ++i) { - bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_); - DCHECK(success) << "There is a bug in using CheckBufferFull()"; - } - num_buffered_values_ = 0; - - if (update_indicator_byte) { - // At this point we need to write the indicator byte for the literal run. - // We only reserve one byte, to allow for streaming writes of literal values. - // The logic makes sure we flush literal runs often enough to not overrun - // the 1 byte. - DCHECK_EQ(literal_count_ % 8, 0); - int num_groups = literal_count_ / 8; - int32_t indicator_value = (num_groups << 1) | 1; - DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); - *literal_indicator_byte_ = static_cast(indicator_value); - literal_indicator_byte_ = NULL; - literal_count_ = 0; - CheckBufferFull(); - } -} - -inline void RleEncoder::FlushRepeatedRun() { - DCHECK_GT(repeat_count_, 0); - bool result = true; - // The lsb of 0 indicates this is a repeated run - int32_t indicator_value = repeat_count_ << 1 | 0; - result &= bit_writer_.PutVlqInt(indicator_value); - result &= bit_writer_.PutAligned(current_value_, - static_cast(BitUtil::CeilDiv(bit_width_, 8))); - DCHECK(result); - num_buffered_values_ = 0; - repeat_count_ = 0; - CheckBufferFull(); -} - -/// Flush the values that have been buffered. 
At this point we decide whether -/// we need to switch between the run types or continue the current one. -inline void RleEncoder::FlushBufferedValues(bool done) { - if (repeat_count_ >= 8) { - // Clear the buffered values. They are part of the repeated run now and we - // don't want to flush them out as literals. - num_buffered_values_ = 0; - if (literal_count_ != 0) { - // There was a current literal run. All the values in it have been flushed - // but we still need to update the indicator byte. - DCHECK_EQ(literal_count_ % 8, 0); - DCHECK_EQ(repeat_count_, 8); - FlushLiteralRun(true); - } - DCHECK_EQ(literal_count_, 0); - return; - } - - literal_count_ += num_buffered_values_; - DCHECK_EQ(literal_count_ % 8, 0); - int num_groups = literal_count_ / 8; - if (num_groups + 1 >= (1 << 6)) { - // We need to start a new literal run because the indicator byte we've reserved - // cannot store more values. - DCHECK(literal_indicator_byte_ != NULL); - FlushLiteralRun(true); - } else { - FlushLiteralRun(done); - } - repeat_count_ = 0; -} - -inline int RleEncoder::Flush() { - if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { - bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ || - num_buffered_values_ == 0); - // There is something pending, figure out if it's a repeated or literal run - if (repeat_count_ > 0 && all_repeat) { - FlushRepeatedRun(); - } else { - DCHECK_EQ(literal_count_ % 8, 0); - // Buffer the last group of literals to 8 by padding with 0s. 
- for (; num_buffered_values_ != 0 && num_buffered_values_ < 8; - ++num_buffered_values_) { - buffered_values_[num_buffered_values_] = 0; - } - literal_count_ += num_buffered_values_; - FlushLiteralRun(true); - repeat_count_ = 0; - } - } - bit_writer_.Flush(); - DCHECK_EQ(num_buffered_values_, 0); - DCHECK_EQ(literal_count_, 0); - DCHECK_EQ(repeat_count_, 0); - - return bit_writer_.bytes_written(); -} - -inline void RleEncoder::CheckBufferFull() { - int bytes_written = bit_writer_.bytes_written(); - if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { - buffer_full_ = true; - } -} - -inline void RleEncoder::Clear() { - buffer_full_ = false; - current_value_ = 0; - repeat_count_ = 0; - num_buffered_values_ = 0; - literal_count_ = 0; - literal_indicator_byte_ = NULL; - bit_writer_.Clear(); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_RLE_ENCODING_H diff --git a/r/R/inst/include/arrow/util/sse-util.h b/r/R/inst/include/arrow/util/sse-util.h deleted file mode 100644 index 6f451fd0efc..00000000000 --- a/r/R/inst/include/arrow/util/sse-util.h +++ /dev/null @@ -1,122 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// From Apache Impala as of 2016-01-29. 
Pared down to a minimal set of -// functions needed for parquet-cpp - -#pragma once - -#include "arrow/util/macros.h" - -#ifdef ARROW_USE_SIMD - -// MSVC x86-64 - -#if (defined(_M_AMD64) || defined(_M_X64)) -#define ARROW_HAVE_SSE2 1 -#define ARROW_HAVE_SSE4_2 1 -#include -#endif - -// gcc/clang (possibly others) - -#if defined(__SSE2__) -#define ARROW_HAVE_SSE2 1 -#include -#endif - -#if defined(__SSE4_2__) -#define ARROW_HAVE_SSE4_2 1 -#include -#endif - -#endif // ARROW_USE_SIMD - -// MSVC x86-64 - -namespace arrow { - -/// This class contains constants useful for text processing with SSE4.2 intrinsics. -namespace SSEUtil { -/// Number of characters that fit in 64/128 bit register. SSE provides instructions -/// for loading 64 or 128 bits into a register at a time. -static const int CHARS_PER_64_BIT_REGISTER = 8; -static const int CHARS_PER_128_BIT_REGISTER = 16; - -/// SSE4.2 adds instructions for text processing. The instructions have a control -/// byte that determines some of functionality of the instruction. (Equivalent to -/// GCC's _SIDD_CMP_EQUAL_ANY, etc). -static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr -static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp -static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) -static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. - -/// In this mode, SSE text processing functions will return a mask of all the -/// characters that matched. -static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; - -/// In this mode, SSE text processing functions will return the number of -/// bytes that match consecutively from the beginning. -static const int STRCMP_MODE = - PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | PCMPSTR_NEG_POLARITY; - -/// Precomputed mask values up to 16 bits. 
-static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, - 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, -}; -} // namespace SSEUtil - -#ifdef ARROW_HAVE_SSE4_2 - -/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen -/// IR load time) that the processor supports SSE 4.2 before calling these. These are -/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestrm(str1, len1, str2, len2, MODE); -} - -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestri(str1, len1, str2, len2, MODE); -} - -static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - return _mm_crc32_u8(crc, v); -} - -static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - return _mm_crc32_u16(crc, v); -} - -static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - return _mm_crc32_u32(crc, v); -} - -static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { -#if ARROW_BITNESS == 32 - return 0; -#else - return static_cast(_mm_crc32_u64(crc, v)); -#endif -} - -#endif // ARROW_HAVE_SSE4_2 - -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/stl.h b/r/R/inst/include/arrow/util/stl.h deleted file mode 100644 index 48898140bf1..00000000000 --- a/r/R/inst/include/arrow/util/stl.h +++ /dev/null @@ -1,95 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_STL_H -#define ARROW_UTIL_STL_H - -#include -#include -#include -#include - -#include "arrow/util/logging.h" - -namespace arrow { -namespace internal { - -template -typename std::enable_if::value, std::unique_ptr>::type make_unique( - A&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -typename std::enable_if::value && std::extent::value == 0, - std::unique_ptr>::type -make_unique(std::size_t n) { - using value_type = typename std::remove_extent::type; - return std::unique_ptr(new value_type[n]); -} - -template -inline std::vector DeleteVectorElement(const std::vector& values, size_t index) { - DCHECK(!values.empty()); - DCHECK_LT(index, values.size()); - std::vector out; - out.reserve(values.size() - 1); - for (size_t i = 0; i < index; ++i) { - out.push_back(values[i]); - } - for (size_t i = index + 1; i < values.size(); ++i) { - out.push_back(values[i]); - } - return out; -} - -template -inline std::vector AddVectorElement(const std::vector& values, size_t index, - const T& new_element) { - DCHECK_LE(index, values.size()); - std::vector out; - out.reserve(values.size() + 1); - for (size_t i = 0; i < index; ++i) { - out.push_back(values[i]); - } - out.push_back(new_element); - for (size_t i = index; i < values.size(); ++i) { - out.push_back(values[i]); - } - return out; -} - -template -inline std::vector ReplaceVectorElement(const std::vector& values, size_t index, - const T& new_element) { - DCHECK_LE(index, values.size()); - std::vector out; - out.reserve(values.size()); - for (size_t i = 
0; i < index; ++i) { - out.push_back(values[i]); - } - out.push_back(new_element); - for (size_t i = index + 1; i < values.size(); ++i) { - out.push_back(values[i]); - } - return out; -} - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_STL_H diff --git a/r/R/inst/include/arrow/util/stopwatch.h b/r/R/inst/include/arrow/util/stopwatch.h deleted file mode 100644 index db4e67f59ed..00000000000 --- a/r/R/inst/include/arrow/util/stopwatch.h +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -namespace arrow { -namespace internal { - -class StopWatch { - // This clock should give us wall clock time - using ClockType = std::chrono::steady_clock; - - public: - StopWatch() {} - - void Start() { start_ = ClockType::now(); } - - // Returns time in nanoseconds. 
- uint64_t Stop() { - auto stop = ClockType::now(); - std::chrono::nanoseconds d = stop - start_; - assert(d.count() >= 0); - return static_cast(d.count()); - } - - private: - std::chrono::time_point start_; -}; - -} // namespace internal -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/string.h b/r/R/inst/include/arrow/util/string.h deleted file mode 100644 index 1d716c5a156..00000000000 --- a/r/R/inst/include/arrow/util/string.h +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_STRING_UTIL_H -#define ARROW_UTIL_STRING_UTIL_H - -#include -#include - -#include "arrow/status.h" -#include "arrow/util/string_view.h" - -namespace arrow { - -static const char* kAsciiTable = "0123456789ABCDEF"; - -static inline std::string HexEncode(const uint8_t* data, size_t length) { - std::string hex_string; - hex_string.reserve(length * 2); - for (size_t j = 0; j < length; ++j) { - // Convert to 2 base16 digits - hex_string.push_back(kAsciiTable[data[j] >> 4]); - hex_string.push_back(kAsciiTable[data[j] & 15]); - } - return hex_string; -} - -static inline std::string HexEncode(const char* data, size_t length) { - return HexEncode(reinterpret_cast(data), length); -} - -static inline std::string HexEncode(util::string_view str) { - return HexEncode(str.data(), str.size()); -} - -static inline Status ParseHexValue(const char* data, uint8_t* out) { - char c1 = data[0]; - char c2 = data[1]; - - const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c1); - const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2); - - // Error checking - if (*pos1 != c1 || *pos2 != c2) { - return Status::Invalid("Encountered non-hex digit"); - } - - *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); - return Status::OK(); -} - -} // namespace arrow - -#endif // ARROW_UTIL_STRING_UTIL_H diff --git a/r/R/inst/include/arrow/util/string_builder.h b/r/R/inst/include/arrow/util/string_builder.h deleted file mode 100644 index 9129f12c681..00000000000 --- a/r/R/inst/include/arrow/util/string_builder.h +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. template - -#ifndef ARROW_UTIL_STRING_BUILDER_H -#define ARROW_UTIL_STRING_BUILDER_H - -#include -#include -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -namespace detail { - -class ARROW_EXPORT StringStreamWrapper { - public: - StringStreamWrapper(); - ~StringStreamWrapper(); - - std::ostream& stream() { return ostream_; } - std::string str(); - - protected: - std::unique_ptr sstream_; - std::ostream& ostream_; -}; - -} // namespace detail - -template -void StringBuilderRecursive(std::ostream& stream, Head&& head) { - stream << head; -} - -template -void StringBuilderRecursive(std::ostream& stream, Head&& head, Tail&&... tail) { - StringBuilderRecursive(stream, std::forward(head)); - StringBuilderRecursive(stream, std::forward(tail)...); -} - -template -std::string StringBuilder(Args&&... args) { - detail::StringStreamWrapper ss; - StringBuilderRecursive(ss.stream(), std::forward(args)...); - return ss.str(); -} - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_STRING_BUILDER_H diff --git a/r/R/inst/include/arrow/util/string_view.h b/r/R/inst/include/arrow/util/string_view.h deleted file mode 100644 index 88748429b7e..00000000000 --- a/r/R/inst/include/arrow/util/string_view.h +++ /dev/null @@ -1,33 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_STRING_VIEW_H -#define ARROW_UTIL_STRING_VIEW_H - -#define nssv_CONFIG_SELECT_STRING_VIEW nssv_STRING_VIEW_NONSTD - -#include "arrow/vendored/string_view.hpp" // IWYU pragma: export - -namespace arrow { -namespace util { - -using nonstd::string_view; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_STRING_VIEW_H diff --git a/r/R/inst/include/arrow/util/task-group.h b/r/R/inst/include/arrow/util/task-group.h deleted file mode 100644 index 390d9476e59..00000000000 --- a/r/R/inst/include/arrow/util/task-group.h +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_TASK_GROUP_H -#define ARROW_UTIL_TASK_GROUP_H - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -class ThreadPool; - -// TODO Simplify this. Subgroups don't seem necessary. - -/// \brief A group of related tasks -/// -/// A TaskGroup executes tasks with the signature `Status()`. -/// Execution can be serial or parallel, depending on the TaskGroup -/// implementation. When Finish() returns, it is guaranteed that all -/// tasks have finished, or at least one has errored. -/// -class ARROW_EXPORT TaskGroup { - public: - /// Add a Status-returning function to execute. Execution order is - /// undefined. The function may be executed immediately or later. - template - void Append(Function&& func) { - return AppendReal(std::forward(func)); - } - - /// Wait for execution of all tasks (and subgroups) to be finished, - /// or for at least one task (or subgroup) to error out. - /// The returned Status propagates the error status of the first failing - /// task (or subgroup). - virtual Status Finish() = 0; - - /// The current agregate error Status. Non-blocking, useful for stopping early. - virtual Status current_status() = 0; - - /// Whether some tasks have already failed. Non-blocking , useful for stopping early. - virtual bool ok() = 0; - - /// How many tasks can typically be executed in parallel. - /// This is only a hint, useful for testing or debugging. - virtual int parallelism() = 0; - - /// Create a subgroup of this group. This group can only finish - /// when all subgroups have finished (this means you must be - /// be careful to call Finish() on subgroups before calling it - /// on the main group). - // XXX if a subgroup errors out, should it propagate immediately to the parent - // and to children? 
- virtual std::shared_ptr MakeSubGroup() = 0; - - static std::shared_ptr MakeSerial(); - static std::shared_ptr MakeThreaded(internal::ThreadPool*); - - virtual ~TaskGroup() = default; - - protected: - TaskGroup() = default; - ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup); - - virtual void AppendReal(std::function task) = 0; -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_TASK_GROUP_H diff --git a/r/R/inst/include/arrow/util/thread-pool.h b/r/R/inst/include/arrow/util/thread-pool.h deleted file mode 100644 index 2de212e64c5..00000000000 --- a/r/R/inst/include/arrow/util/thread-pool.h +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_THREAD_POOL_H -#define ARROW_UTIL_THREAD_POOL_H - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -/// \brief Get the capacity of the global thread pool -/// -/// Return the number of worker threads in the thread pool to which -/// Arrow dispatches various CPU-bound tasks. 
This is an ideal number, -/// not necessarily the exact number of threads at a given point in time. -/// -/// You can change this number using SetCpuThreadPoolCapacity(). -ARROW_EXPORT int GetCpuThreadPoolCapacity(); - -/// \brief Set the capacity of the global thread pool -/// -/// Set the number of worker threads int the thread pool to which -/// Arrow dispatches various CPU-bound tasks. -/// -/// The current number is returned by GetCpuThreadPoolCapacity(). -ARROW_EXPORT Status SetCpuThreadPoolCapacity(int threads); - -namespace internal { - -namespace detail { - -// Needed because std::packaged_task is not copyable and hence not convertible -// to std::function. -template -struct packaged_task_wrapper { - using PackagedTask = std::packaged_task; - - explicit packaged_task_wrapper(PackagedTask&& task) - : task_(std::make_shared(std::forward(task))) {} - - void operator()(Args&&... args) { return (*task_)(std::forward(args)...); } - std::shared_ptr task_; -}; - -} // namespace detail - -class ARROW_EXPORT ThreadPool { - public: - // Construct a thread pool with the given number of worker threads - static Status Make(int threads, std::shared_ptr* out); - - // Destroy thread pool; the pool will first be shut down - ~ThreadPool(); - - // Return the desired number of worker threads. - // The actual number of workers may lag a bit before being adjusted to - // match this value. - int GetCapacity(); - - // Dynamically change the number of worker threads. - // This function returns quickly, but it may take more time before the - // thread count is fully adjusted. - Status SetCapacity(int threads); - - // Heuristic for the default capacity of a thread pool for CPU-bound tasks. - // This is exposed as a static method to help with testing. - static int DefaultCapacity(); - - // Shutdown the pool. Once the pool starts shutting down, new tasks - // cannot be submitted anymore. - // If "wait" is true, shutdown waits for all pending tasks to be finished. 
- // If "wait" is false, workers are stopped as soon as currently executing - // tasks are finished. - Status Shutdown(bool wait = true); - - // Spawn a fire-and-forget task on one of the workers. - template - Status Spawn(Function&& func) { - return SpawnReal(std::forward(func)); - } - - // Submit a callable and arguments for execution. Return a future that - // will return the callable's result value once. - // The callable's arguments are copied before execution. - // Since the function is variadic and needs to return a result (the future), - // an exception is raised if the task fails spawning (which currently - // only occurs if the ThreadPool is shutting down). - template ::type> - std::future Submit(Function&& func, Args&&... args) { - // Trying to templatize std::packaged_task with Function doesn't seem - // to work, so go through std::bind to simplify the packaged signature - using PackagedTask = std::packaged_task; - auto task = PackagedTask(std::bind(std::forward(func), args...)); - auto fut = task.get_future(); - - Status st = SpawnReal(detail::packaged_task_wrapper(std::move(task))); - if (!st.ok()) { - st.Abort("ThreadPool::Submit() was probably called after Shutdown()"); - } - return fut; - } - - struct State; - - protected: - FRIEND_TEST(TestThreadPool, SetCapacity); - FRIEND_TEST(TestGlobalThreadPool, Capacity); - friend ARROW_EXPORT ThreadPool* GetCpuThreadPool(); - - ThreadPool(); - - ARROW_DISALLOW_COPY_AND_ASSIGN(ThreadPool); - - Status SpawnReal(std::function task); - // Collect finished worker threads, making sure the OS threads have exited - void CollectFinishedWorkersUnlocked(); - // Launch a given number of additional workers - void LaunchWorkersUnlocked(int threads); - // Get the current actual capacity - int GetActualCapacity(); - // Reinitialize the thread pool if the pid changed - void ProtectAgainstFork(); - - static std::shared_ptr MakeCpuThreadPool(); - - std::shared_ptr sp_state_; - State* state_; - bool shutdown_on_destroy_; 
-#ifndef _WIN32 - pid_t pid_; -#endif -}; - -// Return the process-global thread pool for CPU-bound tasks. -ARROW_EXPORT ThreadPool* GetCpuThreadPool(); - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_THREAD_POOL_H diff --git a/r/R/inst/include/arrow/util/trie.h b/r/R/inst/include/arrow/util/trie.h deleted file mode 100644 index 3e82bfd8ee2..00000000000 --- a/r/R/inst/include/arrow/util/trie.h +++ /dev/null @@ -1,245 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_TRIE_H -#define ARROW_UTIL_TRIE_H - -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -// A non-zero-terminated small string class. -// std::string usually has a small string optimization -// (see review at https://shaharmike.com/cpp/std-string/) -// but this one allows tight control and optimization of memory layout. 
-template -class SmallString { - public: - SmallString() : length_(0) {} - - template - SmallString(const T& v) { // NOLINT implicit constructor - *this = util::string_view(v); - } - - SmallString& operator=(const util::string_view s) { -#ifndef NDEBUG - CheckSize(s.size()); -#endif - length_ = static_cast(s.size()); - std::memcpy(data_, s.data(), length_); - return *this; - } - - SmallString& operator=(const std::string& s) { - *this = util::string_view(s); - return *this; - } - - SmallString& operator=(const char* s) { - *this = util::string_view(s); - return *this; - } - - explicit operator util::string_view() const { - return util::string_view(data_, length_); - } - - const char* data() const { return data_; } - size_t length() const { return length_; } - bool empty() const { return length_ == 0; } - char operator[](size_t pos) const { -#ifdef NDEBUG - assert(pos <= length_); -#endif - return data_[pos]; - } - - SmallString substr(size_t pos) const { - return SmallString(util::string_view(*this).substr(pos)); - } - - SmallString substr(size_t pos, size_t count) const { - return SmallString(util::string_view(*this).substr(pos, count)); - } - - template - bool operator==(T&& other) const { - return util::string_view(*this) == util::string_view(std::forward(other)); - } - - template - bool operator!=(T&& other) const { - return util::string_view(*this) != util::string_view(std::forward(other)); - } - - protected: - uint8_t length_; - char data_[N]; - -#ifndef NDEBUG - void CheckSize(size_t n) { assert(n <= N); } -#endif -}; - -template -std::ostream& operator<<(std::ostream& os, const SmallString& str) { - return os << util::string_view(str); -} - -// A trie class for byte strings, optimized for small sets of short strings. -// This class is immutable by design, use a TrieBuilder to construct it. 
-class ARROW_EXPORT Trie { - using index_type = int16_t; - using fast_index_type = int_fast16_t; - - public: - Trie() : size_(0) {} - Trie(Trie&&) = default; - Trie& operator=(Trie&&) = default; - - int32_t Find(util::string_view s) const { - const Node* node = &nodes_[0]; - fast_index_type pos = 0; - fast_index_type remaining = static_cast(s.length()); - - while (remaining > 0) { - auto substring_length = node->substring_length(); - if (substring_length > 0) { - auto substring_data = node->substring_data(); - if (remaining < substring_length) { - // Input too short - return -1; - } - for (fast_index_type i = 0; i < substring_length; ++i) { - if (s[pos++] != substring_data[i]) { - // Mismatching substring - return -1; - } - --remaining; - } - if (remaining == 0) { - // Matched node exactly - return node->found_index_; - } - } - // Lookup child using next input character - if (node->child_lookup_ == -1) { - // Input too long - return -1; - } - auto c = static_cast(s[pos++]); - --remaining; - auto child_index = lookup_table_[node->child_lookup_ * 256 + c]; - if (child_index == -1) { - // Child not found - return -1; - } - node = &nodes_[child_index]; - } - - // Input exhausted - if (node->substring_.empty()) { - // Matched node exactly - return node->found_index_; - } else { - return -1; - } - } - - Status Validate() const; - - void Dump() const; - - protected: - static constexpr size_t kNodeSize = 16; - static constexpr auto kMaxSubstringLength = - kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t); - - struct Node { - // If this node is a valid end of string, index of found string, otherwise -1 - index_type found_index_; - // Base index for child lookup in lookup_table_ (-1 if no child nodes) - index_type child_lookup_; - // The substring for this node. 
- SmallString substring_; - - fast_index_type substring_length() const { - return static_cast(substring_.length()); - } - const char* substring_data() const { return substring_.data(); } - }; - - static_assert(sizeof(Node) == kNodeSize, "Unexpected node size"); - - ARROW_DISALLOW_COPY_AND_ASSIGN(Trie); - - void Dump(const Node* node, const std::string& indent) const; - - // Node table: entry 0 is the root node - std::vector nodes_; - - // Indexed lookup structure: gives index in node table, or -1 if not found - std::vector lookup_table_; - - // Number of entries - index_type size_; - - friend class TrieBuilder; -}; - -class ARROW_EXPORT TrieBuilder { - using index_type = Trie::index_type; - using fast_index_type = Trie::fast_index_type; - - public: - TrieBuilder(); - Status Append(util::string_view s, bool allow_duplicate = false); - Trie Finish(); - - protected: - // Extend the lookup table by 256 entries, return the index of the new span - Status ExtendLookupTable(index_type* out_lookup_index); - // Split the node given by the index at the substring index `split_at` - Status SplitNode(fast_index_type node_index, fast_index_type split_at); - // Append an already constructed child node to the parent - Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node); - // Create a matching child node from this parent - Status CreateChildNode(Trie::Node* parent, uint8_t ch, util::string_view substring); - Status CreateChildNode(Trie::Node* parent, char ch, util::string_view substring); - - Trie trie_; - - static constexpr auto kMaxIndex = std::numeric_limits::max(); -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_TRIE_H diff --git a/r/R/inst/include/arrow/util/type_traits.h b/r/R/inst/include/arrow/util/type_traits.h deleted file mode 100644 index 570f6486789..00000000000 --- a/r/R/inst/include/arrow/util/type_traits.h +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_TYPE_TRAITS_H -#define ARROW_UTIL_TYPE_TRAITS_H - -#include - -namespace arrow { -namespace internal { - -/// \brief Metafunction to allow checking if a type matches any of another set of types -template -struct IsOneOf : std::false_type {}; /// Base case: nothing has matched - -template -struct IsOneOf { - /// Recursive case: T == U or T matches any other types provided (not including U). - static constexpr bool value = std::is_same::value || IsOneOf::value; -}; - -/// \brief Shorthand for using IsOneOf + std::enable_if -template -using EnableIfIsOneOf = typename std::enable_if::value, T>::type; - -/// \brief is_null_pointer from C++17 -template -struct is_null_pointer : std::is_same::type> { -}; - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_TYPE_TRAITS_H diff --git a/r/R/inst/include/arrow/util/ubsan.h b/r/R/inst/include/arrow/util/ubsan.h deleted file mode 100644 index f9fcfb54022..00000000000 --- a/r/R/inst/include/arrow/util/ubsan.h +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Contains utilities for making UBSan happy. - -#pragma once - -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -namespace internal { - -static uint8_t non_null_filler; - -} // namespace internal - -/// \brief Returns maybe_null if not null or a non-null pointer to an arbitrary memory -/// that shouldn't be dereferenced. -/// -/// Memset/Memcpy are undefinfed when a nullptr is passed as an argument use this utility -/// method to wrap locations where this could happen. -/// -/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed. -/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve them. -template -inline T* MakeNonNull(T* maybe_null) { - if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) { - return maybe_null; - } - - return reinterpret_cast(&internal::non_null_filler); -} - -} // namespace util -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/uri.h b/r/R/inst/include/arrow/util/uri.h deleted file mode 100644 index ce082ccc8e6..00000000000 --- a/r/R/inst/include/arrow/util/uri.h +++ /dev/null @@ -1,70 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace internal { - -/// \brief A parsed URI -class ARROW_EXPORT Uri { - public: - Uri(); - ~Uri(); - - // XXX Should we use util::string_view instead? These functions are - // not performance-critical. - - /// The URI scheme, such as "http", or the empty string if the URI has no - /// explicit scheme. - std::string scheme() const; - /// Whether the URI has an explicit host name. This may return true if - /// the URI has an empty host (e.g. "file:///tmp/foo"), while it returns - /// false is the URI has not host component at all (e.g. "file:/tmp/foo"). - bool has_host() const; - /// The URI host name, such as "localhost", "127.0.0.1" or "::1", or the empty - /// string is the URI does not have a host component. - std::string host() const; - /// The URI port number, as a string such as "80", or the empty string is the URI - /// does not have a port number component. - std::string port_text() const; - /// The URI port parsed as an integer, or -1 if the URI does not have a port - /// number component. - int32_t port() const; - /// The URI path component. - std::string path() const; - - /// Get the string representation of this URI. - const std::string& ToString() const; - - /// Factory function to parse a URI from its string representation. 
- Status Parse(const std::string& uri_string); - - private: - struct Impl; - std::unique_ptr impl_; -}; - -} // namespace internal -} // namespace arrow diff --git a/r/R/inst/include/arrow/util/utf8.h b/r/R/inst/include/arrow/util/utf8.h deleted file mode 100644 index 739c7566c05..00000000000 --- a/r/R/inst/include/arrow/util/utf8.h +++ /dev/null @@ -1,176 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_UTF8_H -#define ARROW_UTIL_UTF8_H - -#include -#include -#include -#include - -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace util { - -namespace internal { - -// Copyright (c) 2008-2010 Bjoern Hoehrmann -// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -// A compact state table allowing UTF8 decoding using two dependent -// lookups per byte. The first lookup determines the character class -// and the second lookup reads the next state. -// In this table states are multiples of 12. 
-ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12]; - -// Success / reject states when looked up in the small table -static constexpr uint8_t kUTF8DecodeAccept = 0; -static constexpr uint8_t kUTF8DecodeReject = 12; - -// An expanded state table allowing transitions using a single lookup -// at the expense of a larger memory footprint (but on non-random data, -// not all the table will end up accessed and cached). -// In this table states are multiples of 256. -ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256]; - -// Success / reject states when looked up in the large table -static constexpr uint16_t kUTF8ValidateAccept = 0; -static constexpr uint16_t kUTF8ValidateReject = 256; - -static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) { - uint8_t type = utf8_small_table[byte]; - - *codep = (state != kUTF8DecodeAccept) ? (byte & 0x3fu) | (*codep << 6) - : (0xff >> type) & (byte); - - state = utf8_small_table[256 + state + type]; - return state; -} - -static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) { - return utf8_large_table[state + byte]; -} - -#ifndef NDEBUG -ARROW_EXPORT void CheckUTF8Initialized(); -#endif - -} // namespace internal - -// This function needs to be called before doing UTF8 validation. -ARROW_EXPORT void InitializeUTF8(); - -inline bool ValidateUTF8(const uint8_t* data, int64_t size) { - static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL; - // For some reason, defining this variable outside the loop helps clang - uint64_t mask; - -#ifndef NDEBUG - internal::CheckUTF8Initialized(); -#endif - - while (size >= 8) { - // XXX This is doing an unaligned access. Contemporary architectures - // (x86-64, AArch64, PPC64) support it natively and often have good - // performance nevertheless. 
- memcpy(&mask, data, 8); - if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) { - // 8 bytes of pure ASCII, move forward - size -= 8; - data += 8; - continue; - } - // Non-ASCII run detected. - // We process at least 4 bytes, to avoid too many spurious 64-bit reads - // in case the non-ASCII bytes are at the end of the tested 64-bit word. - // We also only check for rejection at the end since that state is stable - // (once in reject state, we always remain in reject state). - // It is guaranteed that size >= 8 when arriving here, which allows - // us to avoid size checks. - uint16_t state = internal::kUTF8ValidateAccept; - // Byte 0 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - // Byte 1 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - // Byte 2 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - // Byte 3 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - // Byte 4 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - if (state == internal::kUTF8ValidateAccept) { - continue; // Got full char, switch back to ASCII detection - } - // Byte 5 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - if (state == internal::kUTF8ValidateAccept) { - continue; // Got full char, switch back to ASCII detection - } - // Byte 6 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - if (state == internal::kUTF8ValidateAccept) { - continue; // Got full char, switch back to ASCII detection - } - // Byte 7 - state = internal::ValidateOneUTF8Byte(*data++, state); - --size; - if (state == internal::kUTF8ValidateAccept) { - continue; // Got full char, switch back to ASCII detection - } - // kUTF8ValidateAccept not reached along 4 transitions has to mean a rejection - assert(state == internal::kUTF8ValidateReject); - return false; - } - - // Validate string tail one byte at a time - // Note the state table is designed so that, once in the reject state, - // we 
remain in that state until the end. So we needn't check for - // rejection at each char (we don't gain much by short-circuiting here). - uint16_t state = internal::kUTF8ValidateAccept; - while (size-- > 0) { - state = internal::ValidateOneUTF8Byte(*data++, state); - } - return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); -} - -inline bool ValidateUTF8(const util::string_view& str) { - const uint8_t* data = reinterpret_cast(str.data()); - const size_t length = str.size(); - - return ValidateUTF8(data, length); -} - -// Skip UTF8 byte order mark, if any. -ARROW_EXPORT -Status SkipUTF8BOM(const uint8_t* data, int64_t size, const uint8_t** out); - -} // namespace util -} // namespace arrow - -#endif diff --git a/r/R/inst/include/arrow/util/variant.h b/r/R/inst/include/arrow/util/variant.h deleted file mode 100644 index 0097c5afb2a..00000000000 --- a/r/R/inst/include/arrow/util/variant.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_VARIANT_H -#define ARROW_UTIL_VARIANT_H - -#include "arrow/vendored/variant.hpp" // IWYU pragma: export - -namespace arrow { -namespace util { - -using ::mpark::bad_variant_access; -using ::mpark::get; -using ::mpark::get_if; -using ::mpark::holds_alternative; -using ::mpark::variant; -using ::mpark::visit; - -} // namespace util -} // namespace arrow - -#endif // ARROW_UTIL_VARIANT_H diff --git a/r/R/inst/include/arrow/util/visibility.h b/r/R/inst/include/arrow/util/visibility.h deleted file mode 100644 index b224717a62d..00000000000 --- a/r/R/inst/include/arrow/util/visibility.h +++ /dev/null @@ -1,56 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_UTIL_VISIBILITY_H -#define ARROW_UTIL_VISIBILITY_H - -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(_MSC_VER) -#pragma warning(disable : 4251) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef ARROW_STATIC -#define ARROW_EXPORT -#elif defined(ARROW_EXPORTING) -#define ARROW_EXPORT __declspec(dllexport) -#else -#define ARROW_EXPORT __declspec(dllimport) -#endif - -#define ARROW_NO_EXPORT -#else // Not Windows -#ifndef ARROW_EXPORT -#define ARROW_EXPORT __attribute__((visibility("default"))) -#endif -#ifndef ARROW_NO_EXPORT -#define ARROW_NO_EXPORT __attribute__((visibility("hidden"))) -#endif -#endif // Non-Windows - -// This is a complicated topic, some reading on it: -// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ -#if defined(_MSC_VER) || defined(__clang__) -#define ARROW_TEMPLATE_CLASS_EXPORT -#define ARROW_TEMPLATE_EXPORT ARROW_EXPORT -#else -#define ARROW_TEMPLATE_CLASS_EXPORT ARROW_EXPORT -#define ARROW_TEMPLATE_EXPORT -#endif - -#endif // ARROW_UTIL_VISIBILITY_H diff --git a/r/R/inst/include/arrow/util/windows_compatibility.h b/r/R/inst/include/arrow/util/windows_compatibility.h deleted file mode 100644 index 70c4313a542..00000000000 --- a/r/R/inst/include/arrow/util/windows_compatibility.h +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#ifdef _WIN32 - -// Windows defines min and max macros that mess up std::min/max -#ifndef NOMINMAX -#define NOMINMAX -#endif - -#define WIN32_LEAN_AND_MEAN - -// Set Windows 7 as a conservative minimum for Apache Arrow -#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x601 -#undef _WIN32_WINNT -#endif -#ifndef _WIN32_WINNT -#define _WIN32_WINNT 0x601 -#endif - -#include -#include - -#endif // _WIN32 diff --git a/r/R/inst/include/arrow/vendored/datetime.h b/r/R/inst/include/arrow/vendored/datetime.h deleted file mode 100644 index 424313a5f5d..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime.h +++ /dev/null @@ -1,21 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include "arrow/vendored/datetime/date.h" -#include "arrow/vendored/datetime/tz.h" diff --git a/r/R/inst/include/arrow/vendored/datetime/date.h b/r/R/inst/include/arrow/vendored/datetime/date.h deleted file mode 100644 index c8e14e53704..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime/date.h +++ /dev/null @@ -1,8025 +0,0 @@ -#ifndef DATE_H -#define DATE_H - -// The MIT License (MIT) -// -// Copyright (c) 2015, 2016, 2017 Howard Hinnant -// Copyright (c) 2016 Adrian Colomitchi -// Copyright (c) 2017 Florian Dang -// Copyright (c) 2017 Paul Thompson -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Our apologies. When the previous paragraph was written, lowercase had not yet -// been invented (that would involve another several millennia of evolution). -// We did not mean to shout. 
- -#ifndef HAS_STRING_VIEW -# if __cplusplus >= 201703 -# define HAS_STRING_VIEW 1 -# else -# define HAS_STRING_VIEW 0 -# endif -#endif // HAS_STRING_VIEW - -#include -#include -#include -#include -#include -#if !(__cplusplus >= 201402) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if HAS_STRING_VIEW -# include -#endif -#include -#include - -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wpedantic" -# if __GNUC__ < 5 - // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers -# pragma GCC diagnostic ignored "-Wmissing-field-initializers" -# endif -#endif - -namespace arrow_vendored -{ -namespace date -{ - -//---------------+ -// Configuration | -//---------------+ - -#ifndef ONLY_C_LOCALE -# define ONLY_C_LOCALE 0 -#endif - -#if defined(_MSC_VER) && (!defined(__clang__) || (_MSC_VER < 1910)) -// MSVC -# if _MSC_VER < 1910 -// before VS2017 -# define CONSTDATA const -# define CONSTCD11 -# define CONSTCD14 -# define NOEXCEPT _NOEXCEPT -# else -// VS2017 and later -# define CONSTDATA constexpr const -# define CONSTCD11 constexpr -# define CONSTCD14 constexpr -# define NOEXCEPT noexcept -# endif - -#elif defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5150 -// Oracle Developer Studio 12.6 and earlier -# define CONSTDATA constexpr const -# define CONSTCD11 constexpr -# define CONSTCD14 -# define NOEXCEPT noexcept - -#elif __cplusplus >= 201402 -// C++14 -# define CONSTDATA constexpr const -# define CONSTCD11 constexpr -# define CONSTCD14 constexpr -# define NOEXCEPT noexcept -#else -// C++11 -# define CONSTDATA constexpr const -# define CONSTCD11 constexpr -# define CONSTCD14 -# define NOEXCEPT noexcept -#endif - -#ifndef HAS_VOID_T -# if __cplusplus >= 201703 -# define HAS_VOID_T 1 -# else -# define HAS_VOID_T 0 -# endif -#endif // HAS_VOID_T - -// Protect from Oracle sun macro -#ifdef sun -# undef sun 
-#endif - -//-----------+ -// Interface | -//-----------+ - -// durations - -using days = std::chrono::duration - , std::chrono::hours::period>>; - -using weeks = std::chrono::duration - , days::period>>; - -using years = std::chrono::duration - , days::period>>; - -using months = std::chrono::duration - >>; - -// time_point - -template - using sys_time = std::chrono::time_point; - -using sys_days = sys_time; -using sys_seconds = sys_time; - -struct local_t {}; - -template - using local_time = std::chrono::time_point; - -using local_seconds = local_time; -using local_days = local_time; - -// types - -struct last_spec -{ - explicit last_spec() = default; -}; - -class day; -class month; -class year; - -class weekday; -class weekday_indexed; -class weekday_last; - -class month_day; -class month_day_last; -class month_weekday; -class month_weekday_last; - -class year_month; - -class year_month_day; -class year_month_day_last; -class year_month_weekday; -class year_month_weekday_last; - -// date composition operators - -CONSTCD11 year_month operator/(const year& y, const month& m) NOEXCEPT; -CONSTCD11 year_month operator/(const year& y, int m) NOEXCEPT; - -CONSTCD11 month_day operator/(const day& d, const month& m) NOEXCEPT; -CONSTCD11 month_day operator/(const day& d, int m) NOEXCEPT; -CONSTCD11 month_day operator/(const month& m, const day& d) NOEXCEPT; -CONSTCD11 month_day operator/(const month& m, int d) NOEXCEPT; -CONSTCD11 month_day operator/(int m, const day& d) NOEXCEPT; - -CONSTCD11 month_day_last operator/(const month& m, last_spec) NOEXCEPT; -CONSTCD11 month_day_last operator/(int m, last_spec) NOEXCEPT; -CONSTCD11 month_day_last operator/(last_spec, const month& m) NOEXCEPT; -CONSTCD11 month_day_last operator/(last_spec, int m) NOEXCEPT; - -CONSTCD11 month_weekday operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT; -CONSTCD11 month_weekday operator/(int m, const weekday_indexed& wdi) NOEXCEPT; -CONSTCD11 month_weekday operator/(const 
weekday_indexed& wdi, const month& m) NOEXCEPT; -CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, int m) NOEXCEPT; - -CONSTCD11 month_weekday_last operator/(const month& m, const weekday_last& wdl) NOEXCEPT; -CONSTCD11 month_weekday_last operator/(int m, const weekday_last& wdl) NOEXCEPT; -CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, const month& m) NOEXCEPT; -CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, int m) NOEXCEPT; - -CONSTCD11 year_month_day operator/(const year_month& ym, const day& d) NOEXCEPT; -CONSTCD11 year_month_day operator/(const year_month& ym, int d) NOEXCEPT; -CONSTCD11 year_month_day operator/(const year& y, const month_day& md) NOEXCEPT; -CONSTCD11 year_month_day operator/(int y, const month_day& md) NOEXCEPT; -CONSTCD11 year_month_day operator/(const month_day& md, const year& y) NOEXCEPT; -CONSTCD11 year_month_day operator/(const month_day& md, int y) NOEXCEPT; - -CONSTCD11 - year_month_day_last operator/(const year_month& ym, last_spec) NOEXCEPT; -CONSTCD11 - year_month_day_last operator/(const year& y, const month_day_last& mdl) NOEXCEPT; -CONSTCD11 - year_month_day_last operator/(int y, const month_day_last& mdl) NOEXCEPT; -CONSTCD11 - year_month_day_last operator/(const month_day_last& mdl, const year& y) NOEXCEPT; -CONSTCD11 - year_month_day_last operator/(const month_day_last& mdl, int y) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator/(const year& y, const month_weekday& mwd) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator/(int y, const month_weekday& mwd) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator/(const month_weekday& mwd, const year& y) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator/(const month_weekday& mwd, int y) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT; - -CONSTCD11 
-year_month_weekday_last -operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator/(int y, const month_weekday_last& mwdl) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator/(const month_weekday_last& mwdl, int y) NOEXCEPT; - -// Detailed interface - -// day - -class day -{ - unsigned char d_; - -public: - day() = default; - explicit CONSTCD11 day(unsigned d) NOEXCEPT; - - CONSTCD14 day& operator++() NOEXCEPT; - CONSTCD14 day operator++(int) NOEXCEPT; - CONSTCD14 day& operator--() NOEXCEPT; - CONSTCD14 day operator--(int) NOEXCEPT; - - CONSTCD14 day& operator+=(const days& d) NOEXCEPT; - CONSTCD14 day& operator-=(const days& d) NOEXCEPT; - - CONSTCD11 explicit operator unsigned() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const day& x, const day& y) NOEXCEPT; -CONSTCD11 bool operator!=(const day& x, const day& y) NOEXCEPT; -CONSTCD11 bool operator< (const day& x, const day& y) NOEXCEPT; -CONSTCD11 bool operator> (const day& x, const day& y) NOEXCEPT; -CONSTCD11 bool operator<=(const day& x, const day& y) NOEXCEPT; -CONSTCD11 bool operator>=(const day& x, const day& y) NOEXCEPT; - -CONSTCD11 day operator+(const day& x, const days& y) NOEXCEPT; -CONSTCD11 day operator+(const days& x, const day& y) NOEXCEPT; -CONSTCD11 day operator-(const day& x, const days& y) NOEXCEPT; -CONSTCD11 days operator-(const day& x, const day& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const day& d); - -// month - -class month -{ - unsigned char m_; - -public: - month() = default; - explicit CONSTCD11 month(unsigned m) NOEXCEPT; - - CONSTCD14 month& operator++() NOEXCEPT; - CONSTCD14 month operator++(int) NOEXCEPT; - CONSTCD14 month& operator--() NOEXCEPT; - CONSTCD14 month operator--(int) NOEXCEPT; - - CONSTCD14 month& operator+=(const 
months& m) NOEXCEPT; - CONSTCD14 month& operator-=(const months& m) NOEXCEPT; - - CONSTCD11 explicit operator unsigned() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const month& x, const month& y) NOEXCEPT; -CONSTCD11 bool operator!=(const month& x, const month& y) NOEXCEPT; -CONSTCD11 bool operator< (const month& x, const month& y) NOEXCEPT; -CONSTCD11 bool operator> (const month& x, const month& y) NOEXCEPT; -CONSTCD11 bool operator<=(const month& x, const month& y) NOEXCEPT; -CONSTCD11 bool operator>=(const month& x, const month& y) NOEXCEPT; - -CONSTCD14 month operator+(const month& x, const months& y) NOEXCEPT; -CONSTCD14 month operator+(const months& x, const month& y) NOEXCEPT; -CONSTCD14 month operator-(const month& x, const months& y) NOEXCEPT; -CONSTCD14 months operator-(const month& x, const month& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const month& m); - -// year - -class year -{ - short y_; - -public: - year() = default; - explicit CONSTCD11 year(int y) NOEXCEPT; - - CONSTCD14 year& operator++() NOEXCEPT; - CONSTCD14 year operator++(int) NOEXCEPT; - CONSTCD14 year& operator--() NOEXCEPT; - CONSTCD14 year operator--(int) NOEXCEPT; - - CONSTCD14 year& operator+=(const years& y) NOEXCEPT; - CONSTCD14 year& operator-=(const years& y) NOEXCEPT; - - CONSTCD11 year operator-() const NOEXCEPT; - CONSTCD11 year operator+() const NOEXCEPT; - - CONSTCD11 bool is_leap() const NOEXCEPT; - - CONSTCD11 explicit operator int() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; - - static CONSTCD11 year min() NOEXCEPT; - static CONSTCD11 year max() NOEXCEPT; -}; - -CONSTCD11 bool operator==(const year& x, const year& y) NOEXCEPT; -CONSTCD11 bool operator!=(const year& x, const year& y) NOEXCEPT; -CONSTCD11 bool operator< (const year& x, const year& y) NOEXCEPT; -CONSTCD11 bool operator> (const year& x, const year& y) NOEXCEPT; -CONSTCD11 bool operator<=(const year& x, const year& 
y) NOEXCEPT; -CONSTCD11 bool operator>=(const year& x, const year& y) NOEXCEPT; - -CONSTCD11 year operator+(const year& x, const years& y) NOEXCEPT; -CONSTCD11 year operator+(const years& x, const year& y) NOEXCEPT; -CONSTCD11 year operator-(const year& x, const years& y) NOEXCEPT; -CONSTCD11 years operator-(const year& x, const year& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const year& y); - -// weekday - -class weekday -{ - unsigned char wd_; -public: - weekday() = default; - explicit CONSTCD11 weekday(unsigned wd) NOEXCEPT; - CONSTCD11 weekday(const sys_days& dp) NOEXCEPT; - CONSTCD11 explicit weekday(const local_days& dp) NOEXCEPT; - - CONSTCD14 weekday& operator++() NOEXCEPT; - CONSTCD14 weekday operator++(int) NOEXCEPT; - CONSTCD14 weekday& operator--() NOEXCEPT; - CONSTCD14 weekday operator--(int) NOEXCEPT; - - CONSTCD14 weekday& operator+=(const days& d) NOEXCEPT; - CONSTCD14 weekday& operator-=(const days& d) NOEXCEPT; - - CONSTCD11 explicit operator unsigned() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; - - CONSTCD11 weekday_indexed operator[](unsigned index) const NOEXCEPT; - CONSTCD11 weekday_last operator[](last_spec) const NOEXCEPT; - -private: - static CONSTCD11 unsigned char weekday_from_days(int z) NOEXCEPT; -}; - -CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT; -CONSTCD11 bool operator!=(const weekday& x, const weekday& y) NOEXCEPT; - -CONSTCD14 weekday operator+(const weekday& x, const days& y) NOEXCEPT; -CONSTCD14 weekday operator+(const days& x, const weekday& y) NOEXCEPT; -CONSTCD14 weekday operator-(const weekday& x, const days& y) NOEXCEPT; -CONSTCD14 days operator-(const weekday& x, const weekday& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday& wd); - -// weekday_indexed - -class weekday_indexed -{ - unsigned char wd_ : 4; - unsigned char index_ : 4; - -public: - weekday_indexed() = default; - CONSTCD11 
weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT; - - CONSTCD11 date::weekday weekday() const NOEXCEPT; - CONSTCD11 unsigned index() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT; -CONSTCD11 bool operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday_indexed& wdi); - -// weekday_last - -class weekday_last -{ - date::weekday wd_; - -public: - explicit CONSTCD11 weekday_last(const date::weekday& wd) NOEXCEPT; - - CONSTCD11 date::weekday weekday() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT; -CONSTCD11 bool operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday_last& wdl); - -// year_month - -class year_month -{ - date::year y_; - date::month m_; - -public: - year_month() = default; - CONSTCD11 year_month(const date::year& y, const date::month& m) NOEXCEPT; - - CONSTCD11 date::year year() const NOEXCEPT; - CONSTCD11 date::month month() const NOEXCEPT; - - CONSTCD14 year_month& operator+=(const months& dm) NOEXCEPT; - CONSTCD14 year_month& operator-=(const months& dm) NOEXCEPT; - CONSTCD14 year_month& operator+=(const years& dy) NOEXCEPT; - CONSTCD14 year_month& operator-=(const years& dy) NOEXCEPT; - - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 bool operator!=(const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 bool operator< (const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 bool operator> (const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 bool operator<=(const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 bool operator>=(const 
year_month& x, const year_month& y) NOEXCEPT; - -CONSTCD14 year_month operator+(const year_month& ym, const months& dm) NOEXCEPT; -CONSTCD14 year_month operator+(const months& dm, const year_month& ym) NOEXCEPT; -CONSTCD14 year_month operator-(const year_month& ym, const months& dm) NOEXCEPT; - -CONSTCD11 months operator-(const year_month& x, const year_month& y) NOEXCEPT; -CONSTCD11 year_month operator+(const year_month& ym, const years& dy) NOEXCEPT; -CONSTCD11 year_month operator+(const years& dy, const year_month& ym) NOEXCEPT; -CONSTCD11 year_month operator-(const year_month& ym, const years& dy) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month& ym); - -// month_day - -class month_day -{ - date::month m_; - date::day d_; - -public: - month_day() = default; - CONSTCD11 month_day(const date::month& m, const date::day& d) NOEXCEPT; - - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::day day() const NOEXCEPT; - - CONSTCD14 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const month_day& x, const month_day& y) NOEXCEPT; -CONSTCD11 bool operator!=(const month_day& x, const month_day& y) NOEXCEPT; -CONSTCD11 bool operator< (const month_day& x, const month_day& y) NOEXCEPT; -CONSTCD11 bool operator> (const month_day& x, const month_day& y) NOEXCEPT; -CONSTCD11 bool operator<=(const month_day& x, const month_day& y) NOEXCEPT; -CONSTCD11 bool operator>=(const month_day& x, const month_day& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_day& md); - -// month_day_last - -class month_day_last -{ - date::month m_; - -public: - CONSTCD11 explicit month_day_last(const date::month& m) NOEXCEPT; - - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT; -CONSTCD11 bool operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT; 
-CONSTCD11 bool operator< (const month_day_last& x, const month_day_last& y) NOEXCEPT; -CONSTCD11 bool operator> (const month_day_last& x, const month_day_last& y) NOEXCEPT; -CONSTCD11 bool operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT; -CONSTCD11 bool operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_day_last& mdl); - -// month_weekday - -class month_weekday -{ - date::month m_; - date::weekday_indexed wdi_; -public: - CONSTCD11 month_weekday(const date::month& m, - const date::weekday_indexed& wdi) NOEXCEPT; - - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT; - - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT; -CONSTCD11 bool operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_weekday& mwd); - -// month_weekday_last - -class month_weekday_last -{ - date::month m_; - date::weekday_last wdl_; - -public: - CONSTCD11 month_weekday_last(const date::month& m, - const date::weekday_last& wd) NOEXCEPT; - - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT; - - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 - bool operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT; -CONSTCD11 - bool operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_weekday_last& mwdl); - -// class year_month_day - -class year_month_day -{ - date::year y_; - date::month m_; - date::day d_; - -public: - year_month_day() = default; - CONSTCD11 year_month_day(const date::year& y, const date::month& m, - const date::day& d) NOEXCEPT; - CONSTCD14 
year_month_day(const year_month_day_last& ymdl) NOEXCEPT; - - CONSTCD14 year_month_day(sys_days dp) NOEXCEPT; - CONSTCD14 explicit year_month_day(local_days dp) NOEXCEPT; - - CONSTCD14 year_month_day& operator+=(const months& m) NOEXCEPT; - CONSTCD14 year_month_day& operator-=(const months& m) NOEXCEPT; - CONSTCD14 year_month_day& operator+=(const years& y) NOEXCEPT; - CONSTCD14 year_month_day& operator-=(const years& y) NOEXCEPT; - - CONSTCD11 date::year year() const NOEXCEPT; - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::day day() const NOEXCEPT; - - CONSTCD14 operator sys_days() const NOEXCEPT; - CONSTCD14 explicit operator local_days() const NOEXCEPT; - CONSTCD14 bool ok() const NOEXCEPT; - -private: - static CONSTCD14 year_month_day from_days(days dp) NOEXCEPT; - CONSTCD14 days to_days() const NOEXCEPT; -}; - -CONSTCD11 bool operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT; -CONSTCD11 bool operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT; -CONSTCD11 bool operator< (const year_month_day& x, const year_month_day& y) NOEXCEPT; -CONSTCD11 bool operator> (const year_month_day& x, const year_month_day& y) NOEXCEPT; -CONSTCD11 bool operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT; -CONSTCD11 bool operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT; - -CONSTCD14 year_month_day operator+(const year_month_day& ymd, const months& dm) NOEXCEPT; -CONSTCD14 year_month_day operator+(const months& dm, const year_month_day& ymd) NOEXCEPT; -CONSTCD14 year_month_day operator-(const year_month_day& ymd, const months& dm) NOEXCEPT; -CONSTCD11 year_month_day operator+(const year_month_day& ymd, const years& dy) NOEXCEPT; -CONSTCD11 year_month_day operator+(const years& dy, const year_month_day& ymd) NOEXCEPT; -CONSTCD11 year_month_day operator-(const year_month_day& ymd, const years& dy) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const 
year_month_day& ymd); - -// year_month_day_last - -class year_month_day_last -{ - date::year y_; - date::month_day_last mdl_; - -public: - CONSTCD11 year_month_day_last(const date::year& y, - const date::month_day_last& mdl) NOEXCEPT; - - CONSTCD14 year_month_day_last& operator+=(const months& m) NOEXCEPT; - CONSTCD14 year_month_day_last& operator-=(const months& m) NOEXCEPT; - CONSTCD14 year_month_day_last& operator+=(const years& y) NOEXCEPT; - CONSTCD14 year_month_day_last& operator-=(const years& y) NOEXCEPT; - - CONSTCD11 date::year year() const NOEXCEPT; - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::month_day_last month_day_last() const NOEXCEPT; - CONSTCD14 date::day day() const NOEXCEPT; - - CONSTCD14 operator sys_days() const NOEXCEPT; - CONSTCD14 explicit operator local_days() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; -}; - -CONSTCD11 - bool operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; -CONSTCD11 - bool operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; -CONSTCD11 - bool operator< (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; -CONSTCD11 - bool operator> (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; -CONSTCD11 - bool operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; -CONSTCD11 - bool operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT; - -CONSTCD14 -year_month_day_last -operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT; - -CONSTCD14 -year_month_day_last -operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT; - -CONSTCD11 -year_month_day_last -operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT; - -CONSTCD11 -year_month_day_last -operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT; - -CONSTCD14 -year_month_day_last -operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT; 
- -CONSTCD11 -year_month_day_last -operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_day_last& ymdl); - -// year_month_weekday - -class year_month_weekday -{ - date::year y_; - date::month m_; - date::weekday_indexed wdi_; - -public: - year_month_weekday() = default; - CONSTCD11 year_month_weekday(const date::year& y, const date::month& m, - const date::weekday_indexed& wdi) NOEXCEPT; - CONSTCD14 year_month_weekday(const sys_days& dp) NOEXCEPT; - CONSTCD14 explicit year_month_weekday(const local_days& dp) NOEXCEPT; - - CONSTCD14 year_month_weekday& operator+=(const months& m) NOEXCEPT; - CONSTCD14 year_month_weekday& operator-=(const months& m) NOEXCEPT; - CONSTCD14 year_month_weekday& operator+=(const years& y) NOEXCEPT; - CONSTCD14 year_month_weekday& operator-=(const years& y) NOEXCEPT; - - CONSTCD11 date::year year() const NOEXCEPT; - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::weekday weekday() const NOEXCEPT; - CONSTCD11 unsigned index() const NOEXCEPT; - CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT; - - CONSTCD14 operator sys_days() const NOEXCEPT; - CONSTCD14 explicit operator local_days() const NOEXCEPT; - CONSTCD14 bool ok() const NOEXCEPT; - -private: - static CONSTCD14 year_month_weekday from_days(days dp) NOEXCEPT; - CONSTCD14 days to_days() const NOEXCEPT; -}; - -CONSTCD11 - bool operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT; -CONSTCD11 - bool operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT; - -CONSTCD14 -year_month_weekday -operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT; - -CONSTCD14 -year_month_weekday -operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator+(const 
years& dy, const year_month_weekday& ymwd) NOEXCEPT; - -CONSTCD14 -year_month_weekday -operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT; - -CONSTCD11 -year_month_weekday -operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_weekday& ymwdi); - -// year_month_weekday_last - -class year_month_weekday_last -{ - date::year y_; - date::month m_; - date::weekday_last wdl_; - -public: - CONSTCD11 year_month_weekday_last(const date::year& y, const date::month& m, - const date::weekday_last& wdl) NOEXCEPT; - - CONSTCD14 year_month_weekday_last& operator+=(const months& m) NOEXCEPT; - CONSTCD14 year_month_weekday_last& operator-=(const months& m) NOEXCEPT; - CONSTCD14 year_month_weekday_last& operator+=(const years& y) NOEXCEPT; - CONSTCD14 year_month_weekday_last& operator-=(const years& y) NOEXCEPT; - - CONSTCD11 date::year year() const NOEXCEPT; - CONSTCD11 date::month month() const NOEXCEPT; - CONSTCD11 date::weekday weekday() const NOEXCEPT; - CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT; - - CONSTCD14 operator sys_days() const NOEXCEPT; - CONSTCD14 explicit operator local_days() const NOEXCEPT; - CONSTCD11 bool ok() const NOEXCEPT; - -private: - CONSTCD14 days to_days() const NOEXCEPT; -}; - -CONSTCD11 -bool -operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT; - -CONSTCD11 -bool -operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT; - -CONSTCD14 -year_month_weekday_last -operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT; - -CONSTCD14 -year_month_weekday_last -operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator+(const years& dy, const year_month_weekday_last& 
ymwdl) NOEXCEPT; - -CONSTCD14 -year_month_weekday_last -operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT; - -CONSTCD11 -year_month_weekday_last -operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_weekday_last& ymwdl); - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline namespace literals -{ - -CONSTCD11 date::day operator "" _d(unsigned long long d) NOEXCEPT; -CONSTCD11 date::year operator "" _y(unsigned long long y) NOEXCEPT; - -// CONSTDATA date::month jan{1}; -// CONSTDATA date::month feb{2}; -// CONSTDATA date::month mar{3}; -// CONSTDATA date::month apr{4}; -// CONSTDATA date::month may{5}; -// CONSTDATA date::month jun{6}; -// CONSTDATA date::month jul{7}; -// CONSTDATA date::month aug{8}; -// CONSTDATA date::month sep{9}; -// CONSTDATA date::month oct{10}; -// CONSTDATA date::month nov{11}; -// CONSTDATA date::month dec{12}; -// -// CONSTDATA date::weekday sun{0u}; -// CONSTDATA date::weekday mon{1u}; -// CONSTDATA date::weekday tue{2u}; -// CONSTDATA date::weekday wed{3u}; -// CONSTDATA date::weekday thu{4u}; -// CONSTDATA date::weekday fri{5u}; -// CONSTDATA date::weekday sat{6u}; - -} // inline namespace literals -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -#if HAS_VOID_T - -template > -struct is_clock - : std::false_type -{}; - -template -struct is_clock> - : std::true_type -{}; - -#endif // HAS_VOID_T - -//----------------+ -// Implementation | -//----------------+ - -// utilities -namespace detail { - -template> -class save_stream -{ - std::basic_ostream& os_; - CharT fill_; - std::ios::fmtflags flags_; - std::locale loc_; - -public: - ~save_stream() - { - os_.fill(fill_); - os_.flags(flags_); - os_.imbue(loc_); - } - - save_stream(const save_stream&) = delete; - save_stream& operator=(const save_stream&) = delete; - - explicit save_stream(std::basic_ostream& os) - : os_(os) - , fill_(os.fill()) - , 
flags_(os.flags()) - , loc_(os.getloc()) - {} -}; - -template -struct choose_trunc_type -{ - static const int digits = std::numeric_limits::digits; - using type = typename std::conditional - < - digits < 32, - std::int32_t, - typename std::conditional - < - digits < 64, - std::int64_t, -#ifdef __SIZEOF_INT128__ - __int128 -#else - std::int64_t -#endif - >::type - >::type; -}; - -template -CONSTCD11 -inline -typename std::enable_if -< - !std::chrono::treat_as_floating_point::value, - T ->::type -trunc(T t) NOEXCEPT -{ - return t; -} - -template -CONSTCD14 -inline -typename std::enable_if -< - std::chrono::treat_as_floating_point::value, - T ->::type -trunc(T t) NOEXCEPT -{ - using namespace std; - using I = typename choose_trunc_type::type; - CONSTDATA auto digits = numeric_limits::digits; - static_assert(digits < numeric_limits::digits, ""); - CONSTDATA auto max = I{1} << (digits-1); - CONSTDATA auto min = -max; - const auto negative = t < T{0}; - if (min <= t && t <= max && t != 0 && t == t) - { - t = static_cast(static_cast(t)); - if (t == 0 && negative) - t = -t; - } - return t; -} - -template -struct static_gcd -{ - static const std::intmax_t value = static_gcd::value; -}; - -template -struct static_gcd -{ - static const std::intmax_t value = Xp; -}; - -template <> -struct static_gcd<0, 0> -{ - static const std::intmax_t value = 1; -}; - -template -struct no_overflow -{ -private: - static const std::intmax_t gcd_n1_n2 = static_gcd::value; - static const std::intmax_t gcd_d1_d2 = static_gcd::value; - static const std::intmax_t n1 = R1::num / gcd_n1_n2; - static const std::intmax_t d1 = R1::den / gcd_d1_d2; - static const std::intmax_t n2 = R2::num / gcd_n1_n2; - static const std::intmax_t d2 = R2::den / gcd_d1_d2; - static const std::intmax_t max = -((std::intmax_t(1) << - (sizeof(std::intmax_t) * CHAR_BIT - 1)) + 1); - - template - struct mul // overflow == false - { - static const std::intmax_t value = Xp * Yp; - }; - - template - struct mul - { - static const 
std::intmax_t value = 1; - }; - -public: - static const bool value = (n1 <= max / d2) && (n2 <= max / d1); - typedef std::ratio::value, - mul::value> type; -}; - -} // detail - -// trunc towards zero -template -CONSTCD11 -inline -typename std::enable_if -< - detail::no_overflow::value, - To ->::type -trunc(const std::chrono::duration& d) -{ - return To{detail::trunc(std::chrono::duration_cast(d).count())}; -} - -template -CONSTCD11 -inline -typename std::enable_if -< - !detail::no_overflow::value, - To ->::type -trunc(const std::chrono::duration& d) -{ - using namespace std::chrono; - using rep = typename std::common_type::type; - return To{detail::trunc(duration_cast(duration_cast>(d)).count())}; -} - -#ifndef HAS_CHRONO_ROUNDING -# if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190023918 || (_MSC_FULL_VER >= 190000000 && defined (__clang__))) -# define HAS_CHRONO_ROUNDING 1 -# elif defined(__cpp_lib_chrono) && __cplusplus > 201402 && __cpp_lib_chrono >= 201510 -# define HAS_CHRONO_ROUNDING 1 -# elif defined(_LIBCPP_VERSION) && __cplusplus > 201402 && _LIBCPP_VERSION >= 3800 -# define HAS_CHRONO_ROUNDING 1 -# else -# define HAS_CHRONO_ROUNDING 0 -# endif -#endif // HAS_CHRONO_ROUNDING - -#if HAS_CHRONO_ROUNDING == 0 - -// round down -template -CONSTCD14 -inline -typename std::enable_if -< - detail::no_overflow::value, - To ->::type -floor(const std::chrono::duration& d) -{ - auto t = trunc(d); - if (t > d) - return t - To{1}; - return t; -} - -template -CONSTCD14 -inline -typename std::enable_if -< - !detail::no_overflow::value, - To ->::type -floor(const std::chrono::duration& d) -{ - using namespace std::chrono; - using rep = typename std::common_type::type; - return floor(floor>(d)); -} - -// round to nearest, to even on tie -template -CONSTCD14 -inline -To -round(const std::chrono::duration& d) -{ - auto t0 = floor(d); - auto t1 = t0 + To{1}; - if (t1 == To{0} && t0 < To{0}) - t1 = -t1; - auto diff0 = d - t0; - auto diff1 = t1 - d; - if (diff0 == diff1) - { - 
if (t0 - trunc(t0/2)*2 == To{0}) - return t0; - return t1; - } - if (diff0 < diff1) - return t0; - return t1; -} - -// round up -template -CONSTCD14 -inline -To -ceil(const std::chrono::duration& d) -{ - auto t = trunc(d); - if (t < d) - return t + To{1}; - return t; -} - -template ::is_signed - >::type> -CONSTCD11 -std::chrono::duration -abs(std::chrono::duration d) -{ - return d >= d.zero() ? d : -d; -} - -// round down -template -CONSTCD11 -inline -std::chrono::time_point -floor(const std::chrono::time_point& tp) -{ - using std::chrono::time_point; - return time_point{date::floor(tp.time_since_epoch())}; -} - -// round to nearest, to even on tie -template -CONSTCD11 -inline -std::chrono::time_point -round(const std::chrono::time_point& tp) -{ - using std::chrono::time_point; - return time_point{round(tp.time_since_epoch())}; -} - -// round up -template -CONSTCD11 -inline -std::chrono::time_point -ceil(const std::chrono::time_point& tp) -{ - using std::chrono::time_point; - return time_point{ceil(tp.time_since_epoch())}; -} - -#else // HAS_CHRONO_ROUNDING == 1 - -using std::chrono::floor; -using std::chrono::ceil; -using std::chrono::round; -using std::chrono::abs; - -#endif // HAS_CHRONO_ROUNDING - -// trunc towards zero -template -CONSTCD11 -inline -std::chrono::time_point -trunc(const std::chrono::time_point& tp) -{ - using std::chrono::time_point; - return time_point{trunc(tp.time_since_epoch())}; -} - -// day - -CONSTCD11 inline day::day(unsigned d) NOEXCEPT : d_(static_cast(d)) {} -CONSTCD14 inline day& day::operator++() NOEXCEPT {++d_; return *this;} -CONSTCD14 inline day day::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} -CONSTCD14 inline day& day::operator--() NOEXCEPT {--d_; return *this;} -CONSTCD14 inline day day::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} -CONSTCD14 inline day& day::operator+=(const days& d) NOEXCEPT {*this = *this + d; return *this;} -CONSTCD14 inline day& day::operator-=(const days& d) 
NOEXCEPT {*this = *this - d; return *this;} -CONSTCD11 inline day::operator unsigned() const NOEXCEPT {return d_;} -CONSTCD11 inline bool day::ok() const NOEXCEPT {return 1 <= d_ && d_ <= 31;} - -CONSTCD11 -inline -bool -operator==(const day& x, const day& y) NOEXCEPT -{ - return static_cast(x) == static_cast(y); -} - -CONSTCD11 -inline -bool -operator!=(const day& x, const day& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const day& x, const day& y) NOEXCEPT -{ - return static_cast(x) < static_cast(y); -} - -CONSTCD11 -inline -bool -operator>(const day& x, const day& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const day& x, const day& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const day& x, const day& y) NOEXCEPT -{ - return !(x < y); -} - -CONSTCD11 -inline -days -operator-(const day& x, const day& y) NOEXCEPT -{ - return days{static_cast(static_cast(x) - - static_cast(y))}; -} - -CONSTCD11 -inline -day -operator+(const day& x, const days& y) NOEXCEPT -{ - return day{static_cast(x) + static_cast(y.count())}; -} - -CONSTCD11 -inline -day -operator+(const days& x, const day& y) NOEXCEPT -{ - return y + x; -} - -CONSTCD11 -inline -day -operator-(const day& x, const days& y) NOEXCEPT -{ - return x + -y; -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const day& d) -{ - detail::save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << static_cast(d); - if (!d.ok()) - os << " is not a valid day"; - return os; -} - -// month - -CONSTCD11 inline month::month(unsigned m) NOEXCEPT : m_(static_cast(m)) {} -CONSTCD14 inline month& month::operator++() NOEXCEPT {*this += months{1}; return *this;} -CONSTCD14 inline month month::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} -CONSTCD14 inline month& month::operator--() NOEXCEPT {*this -= months{1}; return *this;} -CONSTCD14 inline month 
month::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} - -CONSTCD14 -inline -month& -month::operator+=(const months& m) NOEXCEPT -{ - *this = *this + m; - return *this; -} - -CONSTCD14 -inline -month& -month::operator-=(const months& m) NOEXCEPT -{ - *this = *this - m; - return *this; -} - -CONSTCD11 inline month::operator unsigned() const NOEXCEPT {return m_;} -CONSTCD11 inline bool month::ok() const NOEXCEPT {return 1 <= m_ && m_ <= 12;} - -CONSTCD11 -inline -bool -operator==(const month& x, const month& y) NOEXCEPT -{ - return static_cast(x) == static_cast(y); -} - -CONSTCD11 -inline -bool -operator!=(const month& x, const month& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const month& x, const month& y) NOEXCEPT -{ - return static_cast(x) < static_cast(y); -} - -CONSTCD11 -inline -bool -operator>(const month& x, const month& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const month& x, const month& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const month& x, const month& y) NOEXCEPT -{ - return !(x < y); -} - -CONSTCD14 -inline -months -operator-(const month& x, const month& y) NOEXCEPT -{ - auto const d = static_cast(x) - static_cast(y); - return months(d <= 11 ? d : d + 12); -} - -CONSTCD14 -inline -month -operator+(const month& x, const months& y) NOEXCEPT -{ - auto const mu = static_cast(static_cast(x)) + (y.count() - 1); - auto const yr = (mu >= 0 ? 
mu : mu-11) / 12; - return month{static_cast(mu - yr * 12 + 1)}; -} - -CONSTCD14 -inline -month -operator+(const months& x, const month& y) NOEXCEPT -{ - return y + x; -} - -CONSTCD14 -inline -month -operator-(const month& x, const months& y) NOEXCEPT -{ - return x + -y; -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const month& m) -{ - if (m.ok()) - { - CharT fmt[] = {'%', 'b', 0}; - os << format(os.getloc(), fmt, m); - } - else - os << static_cast(m) << " is not a valid month"; - return os; -} - -// year - -CONSTCD11 inline year::year(int y) NOEXCEPT : y_(static_cast(y)) {} -CONSTCD14 inline year& year::operator++() NOEXCEPT {++y_; return *this;} -CONSTCD14 inline year year::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} -CONSTCD14 inline year& year::operator--() NOEXCEPT {--y_; return *this;} -CONSTCD14 inline year year::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} -CONSTCD14 inline year& year::operator+=(const years& y) NOEXCEPT {*this = *this + y; return *this;} -CONSTCD14 inline year& year::operator-=(const years& y) NOEXCEPT {*this = *this - y; return *this;} -CONSTCD11 inline year year::operator-() const NOEXCEPT {return year{-y_};} -CONSTCD11 inline year year::operator+() const NOEXCEPT {return *this;} - -CONSTCD11 -inline -bool -year::is_leap() const NOEXCEPT -{ - return y_ % 4 == 0 && (y_ % 100 != 0 || y_ % 400 == 0); -} - -CONSTCD11 inline year::operator int() const NOEXCEPT {return y_;} - -CONSTCD11 -inline -bool -year::ok() const NOEXCEPT -{ - return y_ != std::numeric_limits::min(); -} - -CONSTCD11 -inline -year -year::min() NOEXCEPT -{ - return year{-32767}; -} - -CONSTCD11 -inline -year -year::max() NOEXCEPT -{ - return year{32767}; -} - -CONSTCD11 -inline -bool -operator==(const year& x, const year& y) NOEXCEPT -{ - return static_cast(x) == static_cast(y); -} - -CONSTCD11 -inline -bool -operator!=(const year& x, const year& y) NOEXCEPT -{ - return !(x == y); -} - 
-CONSTCD11 -inline -bool -operator<(const year& x, const year& y) NOEXCEPT -{ - return static_cast(x) < static_cast(y); -} - -CONSTCD11 -inline -bool -operator>(const year& x, const year& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const year& x, const year& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const year& x, const year& y) NOEXCEPT -{ - return !(x < y); -} - -CONSTCD11 -inline -years -operator-(const year& x, const year& y) NOEXCEPT -{ - return years{static_cast(x) - static_cast(y)}; -} - -CONSTCD11 -inline -year -operator+(const year& x, const years& y) NOEXCEPT -{ - return year{static_cast(x) + y.count()}; -} - -CONSTCD11 -inline -year -operator+(const years& x, const year& y) NOEXCEPT -{ - return y + x; -} - -CONSTCD11 -inline -year -operator-(const year& x, const years& y) NOEXCEPT -{ - return year{static_cast(x) - y.count()}; -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const year& y) -{ - detail::save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::internal); - os.width(4 + (y < year{0})); - os << static_cast(y); - if (!y.ok()) - os << " is not a valid year"; - return os; -} - -// weekday - -CONSTCD11 -inline -unsigned char -weekday::weekday_from_days(int z) NOEXCEPT -{ - return static_cast(static_cast( - z >= -4 ? 
(z+4) % 7 : (z+5) % 7 + 6)); -} - -CONSTCD11 -inline -weekday::weekday(unsigned wd) NOEXCEPT - : wd_(static_cast(wd)) - {} - -CONSTCD11 -inline -weekday::weekday(const sys_days& dp) NOEXCEPT - : wd_(weekday_from_days(dp.time_since_epoch().count())) - {} - -CONSTCD11 -inline -weekday::weekday(const local_days& dp) NOEXCEPT - : wd_(weekday_from_days(dp.time_since_epoch().count())) - {} - -CONSTCD14 inline weekday& weekday::operator++() NOEXCEPT {*this += days{1}; return *this;} -CONSTCD14 inline weekday weekday::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;} -CONSTCD14 inline weekday& weekday::operator--() NOEXCEPT {*this -= days{1}; return *this;} -CONSTCD14 inline weekday weekday::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;} - -CONSTCD14 -inline -weekday& -weekday::operator+=(const days& d) NOEXCEPT -{ - *this = *this + d; - return *this; -} - -CONSTCD14 -inline -weekday& -weekday::operator-=(const days& d) NOEXCEPT -{ - *this = *this - d; - return *this; -} - -CONSTCD11 -inline -weekday::operator unsigned() const NOEXCEPT -{ - return static_cast(wd_); -} - -CONSTCD11 inline bool weekday::ok() const NOEXCEPT {return wd_ <= 6;} - -CONSTCD11 -inline -bool -operator==(const weekday& x, const weekday& y) NOEXCEPT -{ - return static_cast(x) == static_cast(y); -} - -CONSTCD11 -inline -bool -operator!=(const weekday& x, const weekday& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD14 -inline -days -operator-(const weekday& x, const weekday& y) NOEXCEPT -{ - auto const diff = static_cast(x) - static_cast(y); - return days{diff <= 6 ? diff : diff + 7}; -} - -CONSTCD14 -inline -weekday -operator+(const weekday& x, const days& y) NOEXCEPT -{ - auto const wdu = static_cast(static_cast(x)) + y.count(); - auto const wk = (wdu >= 0 ? 
wdu : wdu-6) / 7; - return weekday{static_cast(wdu - wk * 7)}; -} - -CONSTCD14 -inline -weekday -operator+(const days& x, const weekday& y) NOEXCEPT -{ - return y + x; -} - -CONSTCD14 -inline -weekday -operator-(const weekday& x, const days& y) NOEXCEPT -{ - return x + -y; -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday& wd) -{ - if (wd.ok()) - { - CharT fmt[] = {'%', 'a', 0}; - os << format(fmt, wd); - } - else - os << static_cast(wd) << " is not a valid weekday"; - return os; -} - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -inline namespace literals -{ - -CONSTCD11 -inline -date::day -operator "" _d(unsigned long long d) NOEXCEPT -{ - return date::day{static_cast(d)}; -} - -CONSTCD11 -inline -date::year -operator "" _y(unsigned long long y) NOEXCEPT -{ - return date::year(static_cast(y)); -} -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -CONSTDATA date::last_spec last{}; - -CONSTDATA date::month jan{1}; -CONSTDATA date::month feb{2}; -CONSTDATA date::month mar{3}; -CONSTDATA date::month apr{4}; -CONSTDATA date::month may{5}; -CONSTDATA date::month jun{6}; -CONSTDATA date::month jul{7}; -CONSTDATA date::month aug{8}; -CONSTDATA date::month sep{9}; -CONSTDATA date::month oct{10}; -CONSTDATA date::month nov{11}; -CONSTDATA date::month dec{12}; - -CONSTDATA date::weekday sun{0u}; -CONSTDATA date::weekday mon{1u}; -CONSTDATA date::weekday tue{2u}; -CONSTDATA date::weekday wed{3u}; -CONSTDATA date::weekday thu{4u}; -CONSTDATA date::weekday fri{5u}; -CONSTDATA date::weekday sat{6u}; - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -} // inline namespace literals -#endif - -CONSTDATA date::month January{1}; -CONSTDATA date::month February{2}; -CONSTDATA date::month March{3}; -CONSTDATA date::month April{4}; -CONSTDATA date::month May{5}; -CONSTDATA date::month June{6}; -CONSTDATA date::month July{7}; -CONSTDATA date::month August{8}; -CONSTDATA date::month September{9}; -CONSTDATA date::month October{10}; -CONSTDATA 
date::month November{11}; -CONSTDATA date::month December{12}; - -CONSTDATA date::weekday Sunday{0u}; -CONSTDATA date::weekday Monday{1u}; -CONSTDATA date::weekday Tuesday{2u}; -CONSTDATA date::weekday Wednesday{3u}; -CONSTDATA date::weekday Thursday{4u}; -CONSTDATA date::weekday Friday{5u}; -CONSTDATA date::weekday Saturday{6u}; - -// weekday_indexed - -CONSTCD11 -inline -weekday -weekday_indexed::weekday() const NOEXCEPT -{ - return date::weekday{static_cast(wd_)}; -} - -CONSTCD11 inline unsigned weekday_indexed::index() const NOEXCEPT {return index_;} - -CONSTCD11 -inline -bool -weekday_indexed::ok() const NOEXCEPT -{ - return weekday().ok() && 1 <= index_ && index_ <= 5; -} - -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wconversion" -#endif // __GNUC__ - -CONSTCD11 -inline -weekday_indexed::weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT - : wd_(static_cast(static_cast(wd))) - , index_(static_cast(index)) - {} - -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif // __GNUC__ - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday_indexed& wdi) -{ - os << wdi.weekday() << '[' << wdi.index(); - if (!(1 <= wdi.index() && wdi.index() <= 5)) - os << " is not a valid index"; - os << ']'; - return os; -} - -CONSTCD11 -inline -weekday_indexed -weekday::operator[](unsigned index) const NOEXCEPT -{ - return {*this, index}; -} - -CONSTCD11 -inline -bool -operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT -{ - return x.weekday() == y.weekday() && x.index() == y.index(); -} - -CONSTCD11 -inline -bool -operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT -{ - return !(x == y); -} - -// weekday_last - -CONSTCD11 inline date::weekday weekday_last::weekday() const NOEXCEPT {return wd_;} -CONSTCD11 inline bool weekday_last::ok() const NOEXCEPT {return wd_.ok();} -CONSTCD11 inline weekday_last::weekday_last(const date::weekday& wd) NOEXCEPT : wd_(wd) 
{} - -CONSTCD11 -inline -bool -operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT -{ - return x.weekday() == y.weekday(); -} - -CONSTCD11 -inline -bool -operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT -{ - return !(x == y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const weekday_last& wdl) -{ - return os << wdl.weekday() << "[last]"; -} - -CONSTCD11 -inline -weekday_last -weekday::operator[](last_spec) const NOEXCEPT -{ - return weekday_last{*this}; -} - -// year_month - -CONSTCD11 -inline -year_month::year_month(const date::year& y, const date::month& m) NOEXCEPT - : y_(y) - , m_(m) - {} - -CONSTCD11 inline year year_month::year() const NOEXCEPT {return y_;} -CONSTCD11 inline month year_month::month() const NOEXCEPT {return m_;} -CONSTCD11 inline bool year_month::ok() const NOEXCEPT {return y_.ok() && m_.ok();} - -CONSTCD14 -inline -year_month& -year_month::operator+=(const months& dm) NOEXCEPT -{ - *this = *this + dm; - return *this; -} - -CONSTCD14 -inline -year_month& -year_month::operator-=(const months& dm) NOEXCEPT -{ - *this = *this - dm; - return *this; -} - -CONSTCD14 -inline -year_month& -year_month::operator+=(const years& dy) NOEXCEPT -{ - *this = *this + dy; - return *this; -} - -CONSTCD14 -inline -year_month& -year_month::operator-=(const years& dy) NOEXCEPT -{ - *this = *this - dy; - return *this; -} - -CONSTCD11 -inline -bool -operator==(const year_month& x, const year_month& y) NOEXCEPT -{ - return x.year() == y.year() && x.month() == y.month(); -} - -CONSTCD11 -inline -bool -operator!=(const year_month& x, const year_month& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const year_month& x, const year_month& y) NOEXCEPT -{ - return x.year() < y.year() ? true - : (x.year() > y.year() ? 
false - : (x.month() < y.month())); -} - -CONSTCD11 -inline -bool -operator>(const year_month& x, const year_month& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const year_month& x, const year_month& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const year_month& x, const year_month& y) NOEXCEPT -{ - return !(x < y); -} - -CONSTCD14 -inline -year_month -operator+(const year_month& ym, const months& dm) NOEXCEPT -{ - auto dmi = static_cast(static_cast(ym.month())) - 1 + dm.count(); - auto dy = (dmi >= 0 ? dmi : dmi-11) / 12; - dmi = dmi - dy * 12 + 1; - return (ym.year() + years(dy)) / month(static_cast(dmi)); -} - -CONSTCD14 -inline -year_month -operator+(const months& dm, const year_month& ym) NOEXCEPT -{ - return ym + dm; -} - -CONSTCD14 -inline -year_month -operator-(const year_month& ym, const months& dm) NOEXCEPT -{ - return ym + -dm; -} - -CONSTCD11 -inline -months -operator-(const year_month& x, const year_month& y) NOEXCEPT -{ - return (x.year() - y.year()) + - months(static_cast(x.month()) - static_cast(y.month())); -} - -CONSTCD11 -inline -year_month -operator+(const year_month& ym, const years& dy) NOEXCEPT -{ - return (ym.year() + dy) / ym.month(); -} - -CONSTCD11 -inline -year_month -operator+(const years& dy, const year_month& ym) NOEXCEPT -{ - return ym + dy; -} - -CONSTCD11 -inline -year_month -operator-(const year_month& ym, const years& dy) NOEXCEPT -{ - return ym + -dy; -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month& ym) -{ - return os << ym.year() << '/' << ym.month(); -} - -// month_day - -CONSTCD11 -inline -month_day::month_day(const date::month& m, const date::day& d) NOEXCEPT - : m_(m) - , d_(d) - {} - -CONSTCD11 inline date::month month_day::month() const NOEXCEPT {return m_;} -CONSTCD11 inline date::day month_day::day() const NOEXCEPT {return d_;} - -CONSTCD14 -inline -bool -month_day::ok() const NOEXCEPT -{ - CONSTDATA date::day d[] 
= - { - date::day(31), date::day(29), date::day(31), - date::day(30), date::day(31), date::day(30), - date::day(31), date::day(31), date::day(30), - date::day(31), date::day(30), date::day(31) - }; - return m_.ok() && date::day{1} <= d_ && d_ <= d[static_cast(m_)-1]; -} - -CONSTCD11 -inline -bool -operator==(const month_day& x, const month_day& y) NOEXCEPT -{ - return x.month() == y.month() && x.day() == y.day(); -} - -CONSTCD11 -inline -bool -operator!=(const month_day& x, const month_day& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const month_day& x, const month_day& y) NOEXCEPT -{ - return x.month() < y.month() ? true - : (x.month() > y.month() ? false - : (x.day() < y.day())); -} - -CONSTCD11 -inline -bool -operator>(const month_day& x, const month_day& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const month_day& x, const month_day& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const month_day& x, const month_day& y) NOEXCEPT -{ - return !(x < y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_day& md) -{ - return os << md.month() << '/' << md.day(); -} - -// month_day_last - -CONSTCD11 inline month month_day_last::month() const NOEXCEPT {return m_;} -CONSTCD11 inline bool month_day_last::ok() const NOEXCEPT {return m_.ok();} -CONSTCD11 inline month_day_last::month_day_last(const date::month& m) NOEXCEPT : m_(m) {} - -CONSTCD11 -inline -bool -operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return x.month() == y.month(); -} - -CONSTCD11 -inline -bool -operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return x.month() < y.month(); -} - -CONSTCD11 -inline -bool -operator>(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return y < x; -} - 
-CONSTCD11 -inline -bool -operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT -{ - return !(x < y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_day_last& mdl) -{ - return os << mdl.month() << "/last"; -} - -// month_weekday - -CONSTCD11 -inline -month_weekday::month_weekday(const date::month& m, - const date::weekday_indexed& wdi) NOEXCEPT - : m_(m) - , wdi_(wdi) - {} - -CONSTCD11 inline month month_weekday::month() const NOEXCEPT {return m_;} - -CONSTCD11 -inline -weekday_indexed -month_weekday::weekday_indexed() const NOEXCEPT -{ - return wdi_; -} - -CONSTCD11 -inline -bool -month_weekday::ok() const NOEXCEPT -{ - return m_.ok() && wdi_.ok(); -} - -CONSTCD11 -inline -bool -operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT -{ - return x.month() == y.month() && x.weekday_indexed() == y.weekday_indexed(); -} - -CONSTCD11 -inline -bool -operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT -{ - return !(x == y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_weekday& mwd) -{ - return os << mwd.month() << '/' << mwd.weekday_indexed(); -} - -// month_weekday_last - -CONSTCD11 -inline -month_weekday_last::month_weekday_last(const date::month& m, - const date::weekday_last& wdl) NOEXCEPT - : m_(m) - , wdl_(wdl) - {} - -CONSTCD11 inline month month_weekday_last::month() const NOEXCEPT {return m_;} - -CONSTCD11 -inline -weekday_last -month_weekday_last::weekday_last() const NOEXCEPT -{ - return wdl_; -} - -CONSTCD11 -inline -bool -month_weekday_last::ok() const NOEXCEPT -{ - return m_.ok() && wdl_.ok(); -} - -CONSTCD11 -inline -bool -operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT -{ - return x.month() == y.month() && x.weekday_last() == y.weekday_last(); -} - -CONSTCD11 -inline 
-bool -operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT -{ - return !(x == y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const month_weekday_last& mwdl) -{ - return os << mwdl.month() << '/' << mwdl.weekday_last(); -} - -// year_month_day_last - -CONSTCD11 -inline -year_month_day_last::year_month_day_last(const date::year& y, - const date::month_day_last& mdl) NOEXCEPT - : y_(y) - , mdl_(mdl) - {} - -CONSTCD14 -inline -year_month_day_last& -year_month_day_last::operator+=(const months& m) NOEXCEPT -{ - *this = *this + m; - return *this; -} - -CONSTCD14 -inline -year_month_day_last& -year_month_day_last::operator-=(const months& m) NOEXCEPT -{ - *this = *this - m; - return *this; -} - -CONSTCD14 -inline -year_month_day_last& -year_month_day_last::operator+=(const years& y) NOEXCEPT -{ - *this = *this + y; - return *this; -} - -CONSTCD14 -inline -year_month_day_last& -year_month_day_last::operator-=(const years& y) NOEXCEPT -{ - *this = *this - y; - return *this; -} - -CONSTCD11 inline year year_month_day_last::year() const NOEXCEPT {return y_;} -CONSTCD11 inline month year_month_day_last::month() const NOEXCEPT {return mdl_.month();} - -CONSTCD11 -inline -month_day_last -year_month_day_last::month_day_last() const NOEXCEPT -{ - return mdl_; -} - -CONSTCD14 -inline -day -year_month_day_last::day() const NOEXCEPT -{ - CONSTDATA date::day d[] = - { - date::day(31), date::day(28), date::day(31), - date::day(30), date::day(31), date::day(30), - date::day(31), date::day(31), date::day(30), - date::day(31), date::day(30), date::day(31) - }; - return month() != feb || !y_.is_leap() ? 
- d[static_cast(month()) - 1] : date::day{29}; -} - -CONSTCD14 -inline -year_month_day_last::operator sys_days() const NOEXCEPT -{ - return sys_days(year()/month()/day()); -} - -CONSTCD14 -inline -year_month_day_last::operator local_days() const NOEXCEPT -{ - return local_days(year()/month()/day()); -} - -CONSTCD11 -inline -bool -year_month_day_last::ok() const NOEXCEPT -{ - return y_.ok() && mdl_.ok(); -} - -CONSTCD11 -inline -bool -operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return x.year() == y.year() && x.month_day_last() == y.month_day_last(); -} - -CONSTCD11 -inline -bool -operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return x.year() < y.year() ? true - : (x.year() > y.year() ? false - : (x.month_day_last() < y.month_day_last())); -} - -CONSTCD11 -inline -bool -operator>(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT -{ - return !(x < y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_day_last& ymdl) -{ - return os << ymdl.year() << '/' << ymdl.month_day_last(); -} - -CONSTCD14 -inline -year_month_day_last -operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT -{ - return (ymdl.year() / ymdl.month() + dm) / last; -} - -CONSTCD14 -inline -year_month_day_last -operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT -{ - return ymdl + dm; -} - -CONSTCD14 -inline -year_month_day_last -operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT -{ - return ymdl + (-dm); -} - -CONSTCD11 
-inline -year_month_day_last -operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT -{ - return {ymdl.year()+dy, ymdl.month_day_last()}; -} - -CONSTCD11 -inline -year_month_day_last -operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT -{ - return ymdl + dy; -} - -CONSTCD11 -inline -year_month_day_last -operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT -{ - return ymdl + (-dy); -} - -// year_month_day - -CONSTCD11 -inline -year_month_day::year_month_day(const date::year& y, const date::month& m, - const date::day& d) NOEXCEPT - : y_(y) - , m_(m) - , d_(d) - {} - -CONSTCD14 -inline -year_month_day::year_month_day(const year_month_day_last& ymdl) NOEXCEPT - : y_(ymdl.year()) - , m_(ymdl.month()) - , d_(ymdl.day()) - {} - -CONSTCD14 -inline -year_month_day::year_month_day(sys_days dp) NOEXCEPT - : year_month_day(from_days(dp.time_since_epoch())) - {} - -CONSTCD14 -inline -year_month_day::year_month_day(local_days dp) NOEXCEPT - : year_month_day(from_days(dp.time_since_epoch())) - {} - -CONSTCD11 inline year year_month_day::year() const NOEXCEPT {return y_;} -CONSTCD11 inline month year_month_day::month() const NOEXCEPT {return m_;} -CONSTCD11 inline day year_month_day::day() const NOEXCEPT {return d_;} - -CONSTCD14 -inline -year_month_day& -year_month_day::operator+=(const months& m) NOEXCEPT -{ - *this = *this + m; - return *this; -} - -CONSTCD14 -inline -year_month_day& -year_month_day::operator-=(const months& m) NOEXCEPT -{ - *this = *this - m; - return *this; -} - -CONSTCD14 -inline -year_month_day& -year_month_day::operator+=(const years& y) NOEXCEPT -{ - *this = *this + y; - return *this; -} - -CONSTCD14 -inline -year_month_day& -year_month_day::operator-=(const years& y) NOEXCEPT -{ - *this = *this - y; - return *this; -} - -CONSTCD14 -inline -days -year_month_day::to_days() const NOEXCEPT -{ - static_assert(std::numeric_limits::digits >= 18, - "This algorithm has not been ported to a 16 bit unsigned integer"); 
- static_assert(std::numeric_limits::digits >= 20, - "This algorithm has not been ported to a 16 bit signed integer"); - auto const y = static_cast(y_) - (m_ <= feb); - auto const m = static_cast(m_); - auto const d = static_cast(d_); - auto const era = (y >= 0 ? y : y-399) / 400; - auto const yoe = static_cast(y - era * 400); // [0, 399] - auto const doy = (153*(m > 2 ? m-3 : m+9) + 2)/5 + d-1; // [0, 365] - auto const doe = yoe * 365 + yoe/4 - yoe/100 + doy; // [0, 146096] - return days{era * 146097 + static_cast(doe) - 719468}; -} - -CONSTCD14 -inline -year_month_day::operator sys_days() const NOEXCEPT -{ - return sys_days{to_days()}; -} - -CONSTCD14 -inline -year_month_day::operator local_days() const NOEXCEPT -{ - return local_days{to_days()}; -} - -CONSTCD14 -inline -bool -year_month_day::ok() const NOEXCEPT -{ - if (!(y_.ok() && m_.ok())) - return false; - return date::day{1} <= d_ && d_ <= (y_ / m_ / last).day(); -} - -CONSTCD11 -inline -bool -operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return x.year() == y.year() && x.month() == y.month() && x.day() == y.day(); -} - -CONSTCD11 -inline -bool -operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return !(x == y); -} - -CONSTCD11 -inline -bool -operator<(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return x.year() < y.year() ? true - : (x.year() > y.year() ? false - : (x.month() < y.month() ? true - : (x.month() > y.month() ? 
false - : (x.day() < y.day())))); -} - -CONSTCD11 -inline -bool -operator>(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return y < x; -} - -CONSTCD11 -inline -bool -operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return !(y < x); -} - -CONSTCD11 -inline -bool -operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT -{ - return !(x < y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_day& ymd) -{ - detail::save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os << ymd.year() << '-'; - os.width(2); - os << static_cast(ymd.month()) << '-'; - os << ymd.day(); - if (!ymd.ok()) - os << " is not a valid date"; - return os; -} - -CONSTCD14 -inline -year_month_day -year_month_day::from_days(days dp) NOEXCEPT -{ - static_assert(std::numeric_limits::digits >= 18, - "This algorithm has not been ported to a 16 bit unsigned integer"); - static_assert(std::numeric_limits::digits >= 20, - "This algorithm has not been ported to a 16 bit signed integer"); - auto const z = dp.count() + 719468; - auto const era = (z >= 0 ? z : z - 146096) / 146097; - auto const doe = static_cast(z - era * 146097); // [0, 146096] - auto const yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365; // [0, 399] - auto const y = static_cast(yoe) + era * 400; - auto const doy = doe - (365*yoe + yoe/4 - yoe/100); // [0, 365] - auto const mp = (5*doy + 2)/153; // [0, 11] - auto const d = doy - (153*mp+2)/5 + 1; // [1, 31] - auto const m = mp < 10 ? 
mp+3 : mp-9; // [1, 12] - return year_month_day{date::year{y + (m <= 2)}, date::month(m), date::day(d)}; -} - -CONSTCD14 -inline -year_month_day -operator+(const year_month_day& ymd, const months& dm) NOEXCEPT -{ - return (ymd.year() / ymd.month() + dm) / ymd.day(); -} - -CONSTCD14 -inline -year_month_day -operator+(const months& dm, const year_month_day& ymd) NOEXCEPT -{ - return ymd + dm; -} - -CONSTCD14 -inline -year_month_day -operator-(const year_month_day& ymd, const months& dm) NOEXCEPT -{ - return ymd + (-dm); -} - -CONSTCD11 -inline -year_month_day -operator+(const year_month_day& ymd, const years& dy) NOEXCEPT -{ - return (ymd.year() + dy) / ymd.month() / ymd.day(); -} - -CONSTCD11 -inline -year_month_day -operator+(const years& dy, const year_month_day& ymd) NOEXCEPT -{ - return ymd + dy; -} - -CONSTCD11 -inline -year_month_day -operator-(const year_month_day& ymd, const years& dy) NOEXCEPT -{ - return ymd + (-dy); -} - -// year_month_weekday - -CONSTCD11 -inline -year_month_weekday::year_month_weekday(const date::year& y, const date::month& m, - const date::weekday_indexed& wdi) - NOEXCEPT - : y_(y) - , m_(m) - , wdi_(wdi) - {} - -CONSTCD14 -inline -year_month_weekday::year_month_weekday(const sys_days& dp) NOEXCEPT - : year_month_weekday(from_days(dp.time_since_epoch())) - {} - -CONSTCD14 -inline -year_month_weekday::year_month_weekday(const local_days& dp) NOEXCEPT - : year_month_weekday(from_days(dp.time_since_epoch())) - {} - -CONSTCD14 -inline -year_month_weekday& -year_month_weekday::operator+=(const months& m) NOEXCEPT -{ - *this = *this + m; - return *this; -} - -CONSTCD14 -inline -year_month_weekday& -year_month_weekday::operator-=(const months& m) NOEXCEPT -{ - *this = *this - m; - return *this; -} - -CONSTCD14 -inline -year_month_weekday& -year_month_weekday::operator+=(const years& y) NOEXCEPT -{ - *this = *this + y; - return *this; -} - -CONSTCD14 -inline -year_month_weekday& -year_month_weekday::operator-=(const years& y) NOEXCEPT -{ - 
*this = *this - y; - return *this; -} - -CONSTCD11 inline year year_month_weekday::year() const NOEXCEPT {return y_;} -CONSTCD11 inline month year_month_weekday::month() const NOEXCEPT {return m_;} - -CONSTCD11 -inline -weekday -year_month_weekday::weekday() const NOEXCEPT -{ - return wdi_.weekday(); -} - -CONSTCD11 -inline -unsigned -year_month_weekday::index() const NOEXCEPT -{ - return wdi_.index(); -} - -CONSTCD11 -inline -weekday_indexed -year_month_weekday::weekday_indexed() const NOEXCEPT -{ - return wdi_; -} - -CONSTCD14 -inline -year_month_weekday::operator sys_days() const NOEXCEPT -{ - return sys_days{to_days()}; -} - -CONSTCD14 -inline -year_month_weekday::operator local_days() const NOEXCEPT -{ - return local_days{to_days()}; -} - -CONSTCD14 -inline -bool -year_month_weekday::ok() const NOEXCEPT -{ - if (!y_.ok() || !m_.ok() || !wdi_.weekday().ok() || wdi_.index() < 1) - return false; - if (wdi_.index() <= 4) - return true; - auto d2 = wdi_.weekday() - date::weekday(static_cast(y_/m_/1)) + days((wdi_.index()-1)*7 + 1); - return static_cast(d2.count()) <= static_cast((y_/m_/last).day()); -} - -CONSTCD14 -inline -year_month_weekday -year_month_weekday::from_days(days d) NOEXCEPT -{ - sys_days dp{d}; - auto const wd = date::weekday(dp); - auto const ymd = year_month_day(dp); - return {ymd.year(), ymd.month(), wd[(static_cast(ymd.day())-1)/7+1]}; -} - -CONSTCD14 -inline -days -year_month_weekday::to_days() const NOEXCEPT -{ - auto d = sys_days(y_/m_/1); - return (d + (wdi_.weekday() - date::weekday(d) + days{(wdi_.index()-1)*7}) - ).time_since_epoch(); -} - -CONSTCD11 -inline -bool -operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT -{ - return x.year() == y.year() && x.month() == y.month() && - x.weekday_indexed() == y.weekday_indexed(); -} - -CONSTCD11 -inline -bool -operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT -{ - return !(x == y); -} - -template -inline -std::basic_ostream& 
-operator<<(std::basic_ostream& os, const year_month_weekday& ymwdi) -{ - return os << ymwdi.year() << '/' << ymwdi.month() - << '/' << ymwdi.weekday_indexed(); -} - -CONSTCD14 -inline -year_month_weekday -operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT -{ - return (ymwd.year() / ymwd.month() + dm) / ymwd.weekday_indexed(); -} - -CONSTCD14 -inline -year_month_weekday -operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT -{ - return ymwd + dm; -} - -CONSTCD14 -inline -year_month_weekday -operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT -{ - return ymwd + (-dm); -} - -CONSTCD11 -inline -year_month_weekday -operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT -{ - return {ymwd.year()+dy, ymwd.month(), ymwd.weekday_indexed()}; -} - -CONSTCD11 -inline -year_month_weekday -operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT -{ - return ymwd + dy; -} - -CONSTCD11 -inline -year_month_weekday -operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT -{ - return ymwd + (-dy); -} - -// year_month_weekday_last - -CONSTCD11 -inline -year_month_weekday_last::year_month_weekday_last(const date::year& y, - const date::month& m, - const date::weekday_last& wdl) NOEXCEPT - : y_(y) - , m_(m) - , wdl_(wdl) - {} - -CONSTCD14 -inline -year_month_weekday_last& -year_month_weekday_last::operator+=(const months& m) NOEXCEPT -{ - *this = *this + m; - return *this; -} - -CONSTCD14 -inline -year_month_weekday_last& -year_month_weekday_last::operator-=(const months& m) NOEXCEPT -{ - *this = *this - m; - return *this; -} - -CONSTCD14 -inline -year_month_weekday_last& -year_month_weekday_last::operator+=(const years& y) NOEXCEPT -{ - *this = *this + y; - return *this; -} - -CONSTCD14 -inline -year_month_weekday_last& -year_month_weekday_last::operator-=(const years& y) NOEXCEPT -{ - *this = *this - y; - return *this; -} - -CONSTCD11 inline year year_month_weekday_last::year() const NOEXCEPT {return 
y_;} -CONSTCD11 inline month year_month_weekday_last::month() const NOEXCEPT {return m_;} - -CONSTCD11 -inline -weekday -year_month_weekday_last::weekday() const NOEXCEPT -{ - return wdl_.weekday(); -} - -CONSTCD11 -inline -weekday_last -year_month_weekday_last::weekday_last() const NOEXCEPT -{ - return wdl_; -} - -CONSTCD14 -inline -year_month_weekday_last::operator sys_days() const NOEXCEPT -{ - return sys_days{to_days()}; -} - -CONSTCD14 -inline -year_month_weekday_last::operator local_days() const NOEXCEPT -{ - return local_days{to_days()}; -} - -CONSTCD11 -inline -bool -year_month_weekday_last::ok() const NOEXCEPT -{ - return y_.ok() && m_.ok() && wdl_.ok(); -} - -CONSTCD14 -inline -days -year_month_weekday_last::to_days() const NOEXCEPT -{ - auto const d = sys_days(y_/m_/last); - return (d - (date::weekday{d} - wdl_.weekday())).time_since_epoch(); -} - -CONSTCD11 -inline -bool -operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT -{ - return x.year() == y.year() && x.month() == y.month() && - x.weekday_last() == y.weekday_last(); -} - -CONSTCD11 -inline -bool -operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT -{ - return !(x == y); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const year_month_weekday_last& ymwdl) -{ - return os << ymwdl.year() << '/' << ymwdl.month() << '/' << ymwdl.weekday_last(); -} - -CONSTCD14 -inline -year_month_weekday_last -operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT -{ - return (ymwdl.year() / ymwdl.month() + dm) / ymwdl.weekday_last(); -} - -CONSTCD14 -inline -year_month_weekday_last -operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT -{ - return ymwdl + dm; -} - -CONSTCD14 -inline -year_month_weekday_last -operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT -{ - return ymwdl + (-dm); -} - -CONSTCD11 -inline -year_month_weekday_last -operator+(const 
year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT -{ - return {ymwdl.year()+dy, ymwdl.month(), ymwdl.weekday_last()}; -} - -CONSTCD11 -inline -year_month_weekday_last -operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT -{ - return ymwdl + dy; -} - -CONSTCD11 -inline -year_month_weekday_last -operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT -{ - return ymwdl + (-dy); -} - -// year_month from operator/() - -CONSTCD11 -inline -year_month -operator/(const year& y, const month& m) NOEXCEPT -{ - return {y, m}; -} - -CONSTCD11 -inline -year_month -operator/(const year& y, int m) NOEXCEPT -{ - return y / month(static_cast(m)); -} - -// month_day from operator/() - -CONSTCD11 -inline -month_day -operator/(const month& m, const day& d) NOEXCEPT -{ - return {m, d}; -} - -CONSTCD11 -inline -month_day -operator/(const day& d, const month& m) NOEXCEPT -{ - return m / d; -} - -CONSTCD11 -inline -month_day -operator/(const month& m, int d) NOEXCEPT -{ - return m / day(static_cast(d)); -} - -CONSTCD11 -inline -month_day -operator/(int m, const day& d) NOEXCEPT -{ - return month(static_cast(m)) / d; -} - -CONSTCD11 inline month_day operator/(const day& d, int m) NOEXCEPT {return m / d;} - -// month_day_last from operator/() - -CONSTCD11 -inline -month_day_last -operator/(const month& m, last_spec) NOEXCEPT -{ - return month_day_last{m}; -} - -CONSTCD11 -inline -month_day_last -operator/(last_spec, const month& m) NOEXCEPT -{ - return m/last; -} - -CONSTCD11 -inline -month_day_last -operator/(int m, last_spec) NOEXCEPT -{ - return month(static_cast(m))/last; -} - -CONSTCD11 -inline -month_day_last -operator/(last_spec, int m) NOEXCEPT -{ - return m/last; -} - -// month_weekday from operator/() - -CONSTCD11 -inline -month_weekday -operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT -{ - return {m, wdi}; -} - -CONSTCD11 -inline -month_weekday -operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT -{ - return m 
/ wdi; -} - -CONSTCD11 -inline -month_weekday -operator/(int m, const weekday_indexed& wdi) NOEXCEPT -{ - return month(static_cast(m)) / wdi; -} - -CONSTCD11 -inline -month_weekday -operator/(const weekday_indexed& wdi, int m) NOEXCEPT -{ - return m / wdi; -} - -// month_weekday_last from operator/() - -CONSTCD11 -inline -month_weekday_last -operator/(const month& m, const weekday_last& wdl) NOEXCEPT -{ - return {m, wdl}; -} - -CONSTCD11 -inline -month_weekday_last -operator/(const weekday_last& wdl, const month& m) NOEXCEPT -{ - return m / wdl; -} - -CONSTCD11 -inline -month_weekday_last -operator/(int m, const weekday_last& wdl) NOEXCEPT -{ - return month(static_cast(m)) / wdl; -} - -CONSTCD11 -inline -month_weekday_last -operator/(const weekday_last& wdl, int m) NOEXCEPT -{ - return m / wdl; -} - -// year_month_day from operator/() - -CONSTCD11 -inline -year_month_day -operator/(const year_month& ym, const day& d) NOEXCEPT -{ - return {ym.year(), ym.month(), d}; -} - -CONSTCD11 -inline -year_month_day -operator/(const year_month& ym, int d) NOEXCEPT -{ - return ym / day(static_cast(d)); -} - -CONSTCD11 -inline -year_month_day -operator/(const year& y, const month_day& md) NOEXCEPT -{ - return y / md.month() / md.day(); -} - -CONSTCD11 -inline -year_month_day -operator/(int y, const month_day& md) NOEXCEPT -{ - return year(y) / md; -} - -CONSTCD11 -inline -year_month_day -operator/(const month_day& md, const year& y) NOEXCEPT -{ - return y / md; -} - -CONSTCD11 -inline -year_month_day -operator/(const month_day& md, int y) NOEXCEPT -{ - return year(y) / md; -} - -// year_month_day_last from operator/() - -CONSTCD11 -inline -year_month_day_last -operator/(const year_month& ym, last_spec) NOEXCEPT -{ - return {ym.year(), month_day_last{ym.month()}}; -} - -CONSTCD11 -inline -year_month_day_last -operator/(const year& y, const month_day_last& mdl) NOEXCEPT -{ - return {y, mdl}; -} - -CONSTCD11 -inline -year_month_day_last -operator/(int y, const month_day_last& mdl) 
NOEXCEPT -{ - return year(y) / mdl; -} - -CONSTCD11 -inline -year_month_day_last -operator/(const month_day_last& mdl, const year& y) NOEXCEPT -{ - return y / mdl; -} - -CONSTCD11 -inline -year_month_day_last -operator/(const month_day_last& mdl, int y) NOEXCEPT -{ - return year(y) / mdl; -} - -// year_month_weekday from operator/() - -CONSTCD11 -inline -year_month_weekday -operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT -{ - return {ym.year(), ym.month(), wdi}; -} - -CONSTCD11 -inline -year_month_weekday -operator/(const year& y, const month_weekday& mwd) NOEXCEPT -{ - return {y, mwd.month(), mwd.weekday_indexed()}; -} - -CONSTCD11 -inline -year_month_weekday -operator/(int y, const month_weekday& mwd) NOEXCEPT -{ - return year(y) / mwd; -} - -CONSTCD11 -inline -year_month_weekday -operator/(const month_weekday& mwd, const year& y) NOEXCEPT -{ - return y / mwd; -} - -CONSTCD11 -inline -year_month_weekday -operator/(const month_weekday& mwd, int y) NOEXCEPT -{ - return year(y) / mwd; -} - -// year_month_weekday_last from operator/() - -CONSTCD11 -inline -year_month_weekday_last -operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT -{ - return {ym.year(), ym.month(), wdl}; -} - -CONSTCD11 -inline -year_month_weekday_last -operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT -{ - return {y, mwdl.month(), mwdl.weekday_last()}; -} - -CONSTCD11 -inline -year_month_weekday_last -operator/(int y, const month_weekday_last& mwdl) NOEXCEPT -{ - return year(y) / mwdl; -} - -CONSTCD11 -inline -year_month_weekday_last -operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT -{ - return y / mwdl; -} - -CONSTCD11 -inline -year_month_weekday_last -operator/(const month_weekday_last& mwdl, int y) NOEXCEPT -{ - return year(y) / mwdl; -} - -template -struct fields; - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const fields& fds, const std::string* abbrev = nullptr, - const 
std::chrono::seconds* offset_sec = nullptr); - -template -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - fields& fds, std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr); - -// time_of_day - -enum {am = 1, pm}; - -namespace detail -{ - -// width::value is the number of fractional decimal digits in 1/n -// width<0>::value and width<1>::value are defined to be 0 -// If 1/n takes more than 18 fractional decimal digits, -// the result is truncated to 19. -// Example: width<2>::value == 1 -// Example: width<3>::value == 19 -// Example: width<4>::value == 2 -// Example: width<10>::value == 1 -// Example: width<1000>::value == 3 -template -struct width -{ - static CONSTDATA unsigned value = 1 + width::value; -}; - -template -struct width -{ - static CONSTDATA unsigned value = 0; -}; - -template -struct static_pow10 -{ -private: - static CONSTDATA std::uint64_t h = static_pow10::value; -public: - static CONSTDATA std::uint64_t value = h * h * (exp % 2 ? 
10 : 1); -}; - -template <> -struct static_pow10<0> -{ - static CONSTDATA std::uint64_t value = 1; -}; - -template -struct make_precision -{ - using type = std::chrono::duration::value>>; - static CONSTDATA unsigned width = w; -}; - -template -struct make_precision -{ - using type = std::chrono::duration; - static CONSTDATA unsigned width = 6; -}; - -template ::type::period::den>::value> -class decimal_format_seconds -{ -public: - using rep = typename std::common_type::type::rep; - using precision = typename make_precision::type; - static auto CONSTDATA width = make_precision::width; - -private: - std::chrono::seconds s_; - precision sub_s_; - -public: - CONSTCD11 decimal_format_seconds() - : s_() - , sub_s_() - {} - - CONSTCD11 explicit decimal_format_seconds(const Duration& d) NOEXCEPT - : s_(std::chrono::duration_cast(d)) - , sub_s_(std::chrono::duration_cast(d - s_)) - {} - - CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;} - CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;} - CONSTCD11 precision subseconds() const NOEXCEPT {return sub_s_;} - - CONSTCD14 precision to_duration() const NOEXCEPT - { - return s_ + sub_s_; - } - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - using namespace std::chrono; - return sub_s_ < std::chrono::seconds{1} && s_ < minutes{1}; - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const decimal_format_seconds& x) - { - date::detail::save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << x.s_.count() << - std::use_facet>(os.getloc()).decimal_point(); - os.width(width); - os << static_cast(x.sub_s_.count()); - return os; - } -}; - -template -class decimal_format_seconds -{ - static CONSTDATA unsigned w = 0; -public: - using rep = typename std::common_type::type::rep; - using precision = std::chrono::duration; - static auto CONSTDATA width = make_precision::width; -private: - - std::chrono::seconds s_; - 
-public: - CONSTCD11 decimal_format_seconds() : s_() {} - CONSTCD11 explicit decimal_format_seconds(const precision& s) NOEXCEPT - : s_(s) - {} - - CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;} - CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;} - CONSTCD14 precision to_duration() const NOEXCEPT {return s_;} - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - using namespace std::chrono; - return s_ < minutes{1}; - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const decimal_format_seconds& x) - { - date::detail::save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << x.s_.count(); - return os; - } -}; - -enum class classify -{ - not_valid, - hour, - minute, - second, - subsecond -}; - -template -struct classify_duration -{ - static CONSTDATA classify value = - std::is_convertible::value - ? classify::hour : - std::is_convertible::value - ? classify::minute : - std::is_convertible::value - ? classify::second : - std::chrono::treat_as_floating_point::value - ? classify::not_valid : - classify::subsecond; -}; - -template -inline -CONSTCD11 -typename std::enable_if - < - std::numeric_limits::is_signed, - std::chrono::duration - >::type -abs(std::chrono::duration d) -{ - return d >= d.zero() ? 
d : -d; -} - -template -inline -CONSTCD11 -typename std::enable_if - < - !std::numeric_limits::is_signed, - std::chrono::duration - >::type -abs(std::chrono::duration d) -{ - return d; -} - -class time_of_day_base -{ -protected: - std::chrono::hours h_; - unsigned char mode_; - bool neg_; - - enum {is24hr}; - - CONSTCD11 time_of_day_base() NOEXCEPT - : h_(0) - , mode_(static_cast(is24hr)) - , neg_(false) - {} - - - CONSTCD11 time_of_day_base(std::chrono::hours h, bool neg, unsigned m) NOEXCEPT - : h_(detail::abs(h)) - , mode_(static_cast(m)) - , neg_(neg) - {} - - CONSTCD14 void make24() NOEXCEPT; - CONSTCD14 void make12() NOEXCEPT; - - CONSTCD14 std::chrono::hours to24hr() const; - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - return !neg_ && h_ < days{1}; - } -}; - -CONSTCD14 -inline -std::chrono::hours -time_of_day_base::to24hr() const -{ - auto h = h_; - if (mode_ == am || mode_ == pm) - { - CONSTDATA auto h12 = std::chrono::hours(12); - if (mode_ == pm) - { - if (h != h12) - h = h + h12; - } - else if (h == h12) - h = std::chrono::hours(0); - } - return h; -} - -CONSTCD14 -inline -void -time_of_day_base::make24() NOEXCEPT -{ - h_ = to24hr(); - mode_ = is24hr; -} - -CONSTCD14 -inline -void -time_of_day_base::make12() NOEXCEPT -{ - if (mode_ == is24hr) - { - CONSTDATA auto h12 = std::chrono::hours(12); - if (h_ >= h12) - { - if (h_ > h12) - h_ = h_ - h12; - mode_ = pm; - } - else - { - if (h_ == std::chrono::hours(0)) - h_ = h12; - mode_ = am; - } - } -} - -template ::value> -class time_of_day_storage; - -template -class time_of_day_storage, detail::classify::hour> - : private detail::time_of_day_base -{ - using base = detail::time_of_day_base; - -public: - using precision = std::chrono::hours; - -#if !defined(_MSC_VER) || _MSC_VER >= 1900 - CONSTCD11 time_of_day_storage() NOEXCEPT = default; -#else - CONSTCD11 time_of_day_storage() = default; -#endif /* !defined(_MSC_VER) || _MSC_VER >= 1900 */ - - CONSTCD11 explicit 
time_of_day_storage(std::chrono::hours since_midnight) NOEXCEPT - : base(since_midnight, since_midnight < std::chrono::hours{0}, is24hr) - {} - - CONSTCD11 explicit time_of_day_storage(std::chrono::hours h, unsigned md) NOEXCEPT - : base(h, h < std::chrono::hours{0}, md) - {} - - CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} - CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} - - CONSTCD14 explicit operator precision() const NOEXCEPT - { - auto p = to24hr(); - if (neg_) - p = -p; - return p; - } - - CONSTCD14 precision to_duration() const NOEXCEPT - { - return static_cast(*this); - } - - CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} - CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - return base::in_conventional_range(); - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const time_of_day_storage& t) - { - using namespace std; - detail::save_stream _(os); - if (t.neg_) - os << '-'; - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - if (t.mode_ != am && t.mode_ != pm) - os.width(2); - os << t.h_.count(); - switch (t.mode_) - { - case time_of_day_storage::is24hr: - os << "00"; - break; - case am: - os << "am"; - break; - case pm: - os << "pm"; - break; - } - return os; - } -}; - -template -class time_of_day_storage, detail::classify::minute> - : private detail::time_of_day_base -{ - using base = detail::time_of_day_base; - - std::chrono::minutes m_; - -public: - using precision = std::chrono::minutes; - - CONSTCD11 time_of_day_storage() NOEXCEPT - : base() - , m_(0) - {} - - CONSTCD11 explicit time_of_day_storage(std::chrono::minutes since_midnight) NOEXCEPT - : base(std::chrono::duration_cast(since_midnight), - since_midnight < std::chrono::minutes{0}, is24hr) - , m_(detail::abs(since_midnight) - h_) - {} - - CONSTCD11 explicit time_of_day_storage(std::chrono::hours 
h, std::chrono::minutes m, - unsigned md) NOEXCEPT - : base(h, false, md) - , m_(m) - {} - - CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} - CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} - CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} - - CONSTCD14 explicit operator precision() const NOEXCEPT - { - auto p = to24hr() + m_; - if (neg_) - p = -p; - return p; - } - - CONSTCD14 precision to_duration() const NOEXCEPT - { - return static_cast(*this); - } - - CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} - CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - return base::in_conventional_range() && m_ < std::chrono::hours{1}; - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const time_of_day_storage& t) - { - using namespace std; - detail::save_stream _(os); - if (t.neg_) - os << '-'; - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - if (t.mode_ != am && t.mode_ != pm) - os.width(2); - os << t.h_.count() << ':'; - os.width(2); - os << t.m_.count(); - switch (t.mode_) - { - case am: - os << "am"; - break; - case pm: - os << "pm"; - break; - } - return os; - } -}; - -template -class time_of_day_storage, detail::classify::second> - : private detail::time_of_day_base -{ - using base = detail::time_of_day_base; - using dfs = decimal_format_seconds; - - std::chrono::minutes m_; - dfs s_; - -public: - using precision = std::chrono::seconds; - - CONSTCD11 time_of_day_storage() NOEXCEPT - : base() - , m_(0) - , s_() - {} - - CONSTCD11 explicit time_of_day_storage(std::chrono::seconds since_midnight) NOEXCEPT - : base(std::chrono::duration_cast(since_midnight), - since_midnight < std::chrono::seconds{0}, is24hr) - , m_(std::chrono::duration_cast(detail::abs(since_midnight) - h_)) - , s_(detail::abs(since_midnight) - h_ - m_) - {} - - CONSTCD11 explicit 
time_of_day_storage(std::chrono::hours h, std::chrono::minutes m, - std::chrono::seconds s, unsigned md) NOEXCEPT - : base(h, false, md) - , m_(m) - , s_(s) - {} - - CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} - CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} - CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_.seconds();} - CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();} - CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} - - CONSTCD14 explicit operator precision() const NOEXCEPT - { - auto p = to24hr() + s_.to_duration() + m_; - if (neg_) - p = -p; - return p; - } - - CONSTCD14 precision to_duration() const NOEXCEPT - { - return static_cast(*this); - } - - CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} - CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - return base::in_conventional_range() && m_ < std::chrono::hours{1} && - s_.in_conventional_range(); - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const time_of_day_storage& t) - { - using namespace std; - detail::save_stream _(os); - if (t.neg_) - os << '-'; - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - if (t.mode_ != am && t.mode_ != pm) - os.width(2); - os << t.h_.count() << ':'; - os.width(2); - os << t.m_.count() << ':' << t.s_; - switch (t.mode_) - { - case am: - os << "am"; - break; - case pm: - os << "pm"; - break; - } - return os; - } - - template - friend - std::basic_ostream& - date::to_stream(std::basic_ostream& os, const CharT* fmt, - const fields& fds, const std::string* abbrev, - const std::chrono::seconds* offset_sec); - - template - friend - std::basic_istream& - date::from_stream(std::basic_istream& is, const CharT* fmt, - fields& fds, - std::basic_string* abbrev, std::chrono::minutes* offset); -}; - -template -class 
time_of_day_storage, detail::classify::subsecond> - : private detail::time_of_day_base -{ -public: - using Duration = std::chrono::duration; - using dfs = decimal_format_seconds::type>; - using precision = typename dfs::precision; - -private: - using base = detail::time_of_day_base; - - std::chrono::minutes m_; - dfs s_; - -public: - CONSTCD11 time_of_day_storage() NOEXCEPT - : base() - , m_(0) - , s_() - {} - - CONSTCD11 explicit time_of_day_storage(Duration since_midnight) NOEXCEPT - : base(date::trunc(since_midnight), - since_midnight < Duration{0}, is24hr) - , m_(date::trunc(detail::abs(since_midnight) - h_)) - , s_(detail::abs(since_midnight) - h_ - m_) - {} - - CONSTCD11 explicit time_of_day_storage(std::chrono::hours h, std::chrono::minutes m, - std::chrono::seconds s, precision sub_s, - unsigned md) NOEXCEPT - : base(h, false, md) - , m_(m) - , s_(s + sub_s) - {} - - CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;} - CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;} - CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_.seconds();} - CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();} - CONSTCD11 precision subseconds() const NOEXCEPT {return s_.subseconds();} - CONSTCD11 unsigned mode() const NOEXCEPT {return mode_;} - - CONSTCD14 explicit operator precision() const NOEXCEPT - { - auto p = to24hr() + s_.to_duration() + m_; - if (neg_) - p = -p; - return p; - } - - CONSTCD14 precision to_duration() const NOEXCEPT - { - return static_cast(*this); - } - - CONSTCD14 time_of_day_storage& make24() NOEXCEPT {base::make24(); return *this;} - CONSTCD14 time_of_day_storage& make12() NOEXCEPT {base::make12(); return *this;} - - CONSTCD11 bool in_conventional_range() const NOEXCEPT - { - return base::in_conventional_range() && m_ < std::chrono::hours{1} && - s_.in_conventional_range(); - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const time_of_day_storage& t) 
- { - using namespace std; - detail::save_stream _(os); - if (t.neg_) - os << '-'; - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - if (t.mode_ != am && t.mode_ != pm) - os.width(2); - os << t.h_.count() << ':'; - os.width(2); - os << t.m_.count() << ':' << t.s_; - switch (t.mode_) - { - case am: - os << "am"; - break; - case pm: - os << "pm"; - break; - } - return os; - } - - template - friend - std::basic_ostream& - date::to_stream(std::basic_ostream& os, const CharT* fmt, - const fields& fds, const std::string* abbrev, - const std::chrono::seconds* offset_sec); - - template - friend - std::basic_istream& - date::from_stream(std::basic_istream& is, const CharT* fmt, - fields& fds, - std::basic_string* abbrev, std::chrono::minutes* offset); -}; - -} // namespace detail - -template -class time_of_day - : public detail::time_of_day_storage -{ - using base = detail::time_of_day_storage; -public: -#if !defined(_MSC_VER) || _MSC_VER >= 1900 - CONSTCD11 time_of_day() NOEXCEPT = default; -#else - CONSTCD11 time_of_day() = default; -#endif /* !defined(_MSC_VER) || _MSC_VER >= 1900 */ - - CONSTCD11 explicit time_of_day(Duration since_midnight) NOEXCEPT - : base(since_midnight) - {} - - template - CONSTCD11 - explicit time_of_day(Arg0&& arg0, Arg1&& arg1, Args&& ...args) NOEXCEPT - : base(std::forward(arg0), std::forward(arg1), std::forward(args)...) 
- {} -}; - -template ::value>::type> -CONSTCD11 -inline -time_of_day> -make_time(const std::chrono::duration& d) -{ - return time_of_day>(d); -} - -CONSTCD11 -inline -time_of_day -make_time(const std::chrono::hours& h, unsigned md) -{ - return time_of_day(h, md); -} - -CONSTCD11 -inline -time_of_day -make_time(const std::chrono::hours& h, const std::chrono::minutes& m, - unsigned md) -{ - return time_of_day(h, m, md); -} - -CONSTCD11 -inline -time_of_day -make_time(const std::chrono::hours& h, const std::chrono::minutes& m, - const std::chrono::seconds& s, unsigned md) -{ - return time_of_day(h, m, s, md); -} - -template >::value>::type> -CONSTCD11 -inline -time_of_day> -make_time(const std::chrono::hours& h, const std::chrono::minutes& m, - const std::chrono::seconds& s, const std::chrono::duration& sub_s, - unsigned md) -{ - return time_of_day>(h, m, s, sub_s, md); -} - -template -inline -typename std::enable_if -< - !std::chrono::treat_as_floating_point::value && - std::ratio_less::value - , std::basic_ostream& ->::type -operator<<(std::basic_ostream& os, const sys_time& tp) -{ - auto const dp = date::floor(tp); - return os << year_month_day(dp) << ' ' << make_time(tp-dp); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const sys_days& dp) -{ - return os << year_month_day(dp); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const local_time& ut) -{ - return (os << sys_time{ut.time_since_epoch()}); -} - -// to_stream - -template -struct fields -{ - year_month_day ymd{year{0}/0/0}; - weekday wd{7u}; - time_of_day tod{}; - - fields() = default; - - fields(year_month_day ymd_) : ymd(ymd_) {} - fields(weekday wd_) : wd(wd_) {} - fields(time_of_day tod_) : tod(tod_) {} - - fields(year_month_day ymd_, weekday wd_) : ymd(ymd_), wd(wd_) {} - fields(year_month_day ymd_, time_of_day tod_) : ymd(ymd_), tod(tod_) {} - - fields(weekday wd_, time_of_day tod_) : wd(wd_), tod(tod_) {} - - fields(year_month_day ymd_, 
weekday wd_, time_of_day tod_) - : ymd(ymd_) - , wd(wd_) - , tod(tod_) - {} -}; - -namespace detail -{ - -template -unsigned -extract_weekday(std::basic_ostream& os, const fields& fds) -{ - if (!fds.ymd.ok() && !fds.wd.ok()) - { - // fds does not contain a valid weekday - os.setstate(std::ios::failbit); - return 7; - } - unsigned wd; - if (fds.ymd.ok()) - { - wd = static_cast(weekday{fds.ymd}); - if (fds.wd.ok() && wd != static_cast(fds.wd)) - { - // fds.ymd and fds.wd are inconsistent - os.setstate(std::ios::failbit); - return 7; - } - } - else - wd = static_cast(fds.wd); - return wd; -} - -template -unsigned -extract_month(std::basic_ostream& os, const fields& fds) -{ - if (!fds.ymd.month().ok()) - { - // fds does not contain a valid month - os.setstate(std::ios::failbit); - return 0; - } - return static_cast(fds.ymd.month()); -} - -} // namespace detail - -#if ONLY_C_LOCALE - -namespace detail -{ - -inline -std::pair -weekday_names() -{ - using namespace std; - static const string nm[] = - { - "Sunday", - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", - "Sun", - "Mon", - "Tue", - "Wed", - "Thu", - "Fri", - "Sat" - }; - return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); -} - -inline -std::pair -month_names() -{ - using namespace std; - static const string nm[] = - { - "January", - "February", - "March", - "April", - "May", - "June", - "July", - "August", - "September", - "October", - "November", - "December", - "Jan", - "Feb", - "Mar", - "Apr", - "May", - "Jun", - "Jul", - "Aug", - "Sep", - "Oct", - "Nov", - "Dec" - }; - return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); -} - -inline -std::pair -ampm_names() -{ - using namespace std; - static const string nm[] = - { - "AM", - "PM" - }; - return make_pair(nm, nm+sizeof(nm)/sizeof(nm[0])); -} - -template -FwdIter -scan_keyword(std::basic_istream& is, FwdIter kb, FwdIter ke) -{ - using namespace std; - size_t nkw = static_cast(std::distance(kb, ke)); - const unsigned char doesnt_match 
= '\0'; - const unsigned char might_match = '\1'; - const unsigned char does_match = '\2'; - unsigned char statbuf[100]; - unsigned char* status = statbuf; - unique_ptr stat_hold(0, free); - if (nkw > sizeof(statbuf)) - { - status = (unsigned char*)malloc(nkw); - if (status == nullptr) - throw bad_alloc(); - stat_hold.reset(status); - } - size_t n_might_match = nkw; // At this point, any keyword might match - size_t n_does_match = 0; // but none of them definitely do - // Initialize all statuses to might_match, except for "" keywords are does_match - unsigned char* st = status; - for (auto ky = kb; ky != ke; ++ky, ++st) - { - if (!ky->empty()) - *st = might_match; - else - { - *st = does_match; - --n_might_match; - ++n_does_match; - } - } - // While there might be a match, test keywords against the next CharT - for (size_t indx = 0; is && n_might_match > 0; ++indx) - { - // Peek at the next CharT but don't consume it - auto ic = is.peek(); - if (ic == EOF) - { - is.setstate(ios::eofbit); - break; - } - auto c = static_cast(toupper(ic)); - bool consume = false; - // For each keyword which might match, see if the indx character is c - // If a match if found, consume c - // If a match is found, and that is the last character in the keyword, - // then that keyword matches. - // If the keyword doesn't match this character, then change the keyword - // to doesn't match - st = status; - for (auto ky = kb; ky != ke; ++ky, ++st) - { - if (*st == might_match) - { - if (c == static_cast(toupper((*ky)[indx]))) - { - consume = true; - if (ky->size() == indx+1) - { - *st = does_match; - --n_might_match; - ++n_does_match; - } - } - else - { - *st = doesnt_match; - --n_might_match; - } - } - } - // consume if we matched a character - if (consume) - { - (void)is.get(); - // If we consumed a character and there might be a matched keyword that - // was marked matched on a previous iteration, then such keywords - // are now marked as not matching. 
- if (n_might_match + n_does_match > 1) - { - st = status; - for (auto ky = kb; ky != ke; ++ky, ++st) - { - if (*st == does_match && ky->size() != indx+1) - { - *st = doesnt_match; - --n_does_match; - } - } - } - } - } - // We've exited the loop because we hit eof and/or we have no more "might matches". - // Return the first matching result - for (st = status; kb != ke; ++kb, ++st) - if (*st == does_match) - break; - if (kb == ke) - is.setstate(ios_base::failbit); - return kb; -} - -} // namespace detail - -#endif // ONLY_C_LOCALE - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const fields& fds, const std::string* abbrev, - const std::chrono::seconds* offset_sec) -{ - using namespace std; - using namespace std::chrono; - using namespace detail; - tm tm{}; -#if !ONLY_C_LOCALE - auto& facet = use_facet>(os.getloc()); -#endif - const CharT* command = nullptr; - CharT modified = CharT{}; - for (; *fmt; ++fmt) - { - switch (*fmt) - { - case 'a': - case 'A': - if (command) - { - if (modified == CharT{}) - { - tm.tm_wday = static_cast(extract_weekday(os, fds)); - if (os.fail()) - return os; -#if !ONLY_C_LOCALE - const CharT f[] = {'%', *fmt}; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); -#else // ONLY_C_LOCALE - os << weekday_names().first[tm.tm_wday+7*(*fmt == 'a')]; -#endif // ONLY_C_LOCALE - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'b': - case 'B': - case 'h': - if (command) - { - if (modified == CharT{}) - { - tm.tm_mon = static_cast(extract_month(os, fds)) - 1; -#if !ONLY_C_LOCALE - const CharT f[] = {'%', *fmt}; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); -#else // ONLY_C_LOCALE - os << month_names().first[tm.tm_mon+12*(*fmt == 'b')]; -#endif // ONLY_C_LOCALE - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'c': - 
case 'x': - if (command) - { - if (modified == CharT{'O'}) - os << CharT{'%'} << modified << *fmt; - else - { -#if !ONLY_C_LOCALE - tm = std::tm{}; - auto const& ymd = fds.ymd; - auto ld = local_days(ymd); - tm.tm_sec = static_cast(fds.tod.seconds().count()); - tm.tm_min = static_cast(fds.tod.minutes().count()); - tm.tm_hour = static_cast(fds.tod.hours().count()); - tm.tm_mday = static_cast(static_cast(ymd.day())); - tm.tm_mon = static_cast(extract_month(os, fds) - 1); - tm.tm_year = static_cast(ymd.year()) - 1900; - tm.tm_wday = static_cast(extract_weekday(os, fds)); - if (os.fail()) - return os; - tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); - CharT f[3] = {'%'}; - auto fe = begin(f) + 1; - if (modified == CharT{'E'}) - *fe++ = modified; - *fe++ = *fmt; - facet.put(os, os, os.fill(), &tm, begin(f), fe); -#else // ONLY_C_LOCALE - if (*fmt == 'c') - { - auto wd = static_cast(extract_weekday(os, fds)); - os << weekday_names().first[static_cast(wd)+7] - << ' '; - os << month_names().first[extract_month(os, fds)-1+12] << ' '; - auto d = static_cast(static_cast(fds.ymd.day())); - if (d < 10) - os << ' '; - os << d << ' ' - << make_time(duration_cast(fds.tod.to_duration())) - << ' ' << fds.ymd.year(); - - } - else // *fmt == 'x' - { - auto const& ymd = fds.ymd; - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << static_cast(ymd.month()) << CharT{'/'}; - os.width(2); - os << static_cast(ymd.day()) << CharT{'/'}; - os.width(2); - os << static_cast(ymd.year()) % 100; - } -#endif // ONLY_C_LOCALE - } - command = nullptr; - modified = CharT{}; - } - else - os << *fmt; - break; - case 'C': - if (command) - { - auto y = static_cast(fds.ymd.year()); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - if (y >= 0) - { - os.width(2); - os << y/100; - } - else - { - os << CharT{'-'}; - os.width(2); - os << 
-(y-99)/100; - } -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'E'}) - { - tm.tm_year = y - 1900; - CharT f[3] = {'%', 'E', 'C'}; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - command = nullptr; - modified = CharT{}; - } - else - os << *fmt; - break; - case 'd': - case 'e': - if (command) - { - auto d = static_cast(static_cast(fds.ymd.day())); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - save_stream _(os); - if (*fmt == CharT{'d'}) - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << d; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - tm.tm_mday = d; - CharT f[3] = {'%', 'O', *fmt}; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - command = nullptr; - modified = CharT{}; - } - else - os << *fmt; - break; - case 'D': - if (command) - { - if (modified == CharT{}) - { - auto const& ymd = fds.ymd; - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << static_cast(ymd.month()) << CharT{'/'}; - os.width(2); - os << static_cast(ymd.day()) << CharT{'/'}; - os.width(2); - os << static_cast(ymd.year()) % 100; - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'F': - if (command) - { - if (modified == CharT{}) - { - auto const& ymd = fds.ymd; - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(4); - os << static_cast(ymd.year()) << CharT{'-'}; - os.width(2); - os << static_cast(ymd.month()) << CharT{'-'}; - os.width(2); - os << static_cast(ymd.day()); - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'g': - case 'G': - if (command) - { - if (modified == CharT{}) - { - auto ld = 
local_days(fds.ymd); - auto y = year_month_day{ld + days{3}}.year(); - auto start = local_days((y - years{1})/date::dec/thu[last]) + (mon-thu); - if (ld < start) - --y; - if (*fmt == CharT{'G'}) - os << y; - else - { - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(2); - os << std::abs(static_cast(y)) % 100; - } - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'H': - case 'I': - if (command) - { - auto hms = fds.tod; -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - if (*fmt == CharT{'I'}) - hms.make12(); - if (hms.hours() < hours{10}) - os << CharT{'0'}; - os << hms.hours().count(); -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_hour = static_cast(hms.hours().count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'j': - if (command) - { - if (modified == CharT{}) - { - auto ld = local_days(fds.ymd); - auto y = fds.ymd.year(); - auto doy = ld - local_days(y/jan/1) + days{1}; - save_stream _(os); - os.fill('0'); - os.flags(std::ios::dec | std::ios::right); - os.width(3); - os << doy.count(); - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'm': - if (command) - { - auto m = static_cast(fds.ymd.month()); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - if (m < 10) - os << CharT{'0'}; - os << m; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_mon = static_cast(m-1); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - 
} - else - os << *fmt; - break; - case 'M': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - if (fds.tod.minutes() < minutes{10}) - os << CharT{'0'}; - os << fds.tod.minutes().count(); -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_min = static_cast(fds.tod.minutes().count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'n': - if (command) - { - if (modified == CharT{}) - os << CharT{'\n'}; - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'p': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { - const CharT f[] = {'%', *fmt}; - tm.tm_hour = static_cast(fds.tod.hours().count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#else - if (fds.tod.hours() < hours{12}) - os << ampm_names().first[0]; - else - os << ampm_names().first[1]; -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'r': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { - const CharT f[] = {'%', *fmt}; - tm.tm_hour = static_cast(fds.tod.hours().count()); - tm.tm_min = static_cast(fds.tod.minutes().count()); - tm.tm_sec = static_cast(fds.tod.seconds().count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#else - time_of_day tod(duration_cast(fds.tod.to_duration())); - tod.make12(); - save_stream _(os); - os.fill('0'); - os.width(2); - os << tod.hours().count() << CharT{':'}; - os.width(2); - os << tod.minutes().count() << CharT{':'}; - os.width(2); - os << tod.seconds().count() << CharT{' '}; - tod.make24(); - if (tod.hours() < hours{12}) - 
os << ampm_names().first[0]; - else - os << ampm_names().first[1]; -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'R': - if (command) - { - if (modified == CharT{}) - { - if (fds.tod.hours() < hours{10}) - os << CharT{'0'}; - os << fds.tod.hours().count() << CharT{':'}; - if (fds.tod.minutes() < minutes{10}) - os << CharT{'0'}; - os << fds.tod.minutes().count(); - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'S': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - os << fds.tod.s_; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_sec = static_cast(fds.tod.s_.seconds().count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 't': - if (command) - { - if (modified == CharT{}) - os << CharT{'\t'}; - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'T': - if (command) - { - if (modified == CharT{}) - { - os << fds.tod; - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 'u': - if (command) - { - auto wd = extract_weekday(os, fds); - if (os.fail()) - return os; -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - os << (wd != 0 ? 
wd : 7u); -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_wday = static_cast(wd); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'U': - if (command) - { - auto const& ymd = fds.ymd; - auto ld = local_days(ymd); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - auto st = local_days(sun[1]/jan/ymd.year()); - if (ld < st) - os << CharT{'0'} << CharT{'0'}; - else - { - auto wn = duration_cast(ld - st).count() + 1; - if (wn < 10) - os << CharT{'0'}; - os << wn; - } - #if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_year = static_cast(ymd.year()) - 1900; - tm.tm_wday = static_cast(extract_weekday(os, fds)); - if (os.fail()) - return os; - tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'V': - if (command) - { - auto ld = local_days(fds.ymd); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - auto y = year_month_day{ld + days{3}}.year(); - auto st = local_days((y - years{1})/12/thu[last]) + (mon-thu); - if (ld < st) - { - --y; - st = local_days((y - years{1})/12/thu[last]) + (mon-thu); - } - auto wn = duration_cast(ld - st).count() + 1; - if (wn < 10) - os << CharT{'0'}; - os << wn; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - auto const& ymd = fds.ymd; - tm.tm_year = static_cast(ymd.year()) - 1900; - tm.tm_wday = static_cast(extract_weekday(os, fds)); - if (os.fail()) - return os; - tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); - facet.put(os, os, os.fill(), 
&tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'w': - if (command) - { - auto wd = extract_weekday(os, fds); - if (os.fail()) - return os; -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - os << wd; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_wday = static_cast(wd); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'W': - if (command) - { - auto const& ymd = fds.ymd; - auto ld = local_days(ymd); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - auto st = local_days(mon[1]/jan/ymd.year()); - if (ld < st) - os << CharT{'0'} << CharT{'0'}; - else - { - auto wn = duration_cast(ld - st).count() + 1; - if (wn < 10) - os << CharT{'0'}; - os << wn; - } -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_year = static_cast(ymd.year()) - 1900; - tm.tm_wday = static_cast(extract_weekday(os, fds)); - if (os.fail()) - return os; - tm.tm_yday = static_cast((ld - local_days(ymd.year()/1/1)).count()); - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'X': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{'O'}) - os << CharT{'%'} << modified << *fmt; - else - { - tm = std::tm{}; - tm.tm_sec = static_cast(fds.tod.seconds().count()); - tm.tm_min = static_cast(fds.tod.minutes().count()); - tm.tm_hour = static_cast(fds.tod.hours().count()); - CharT f[3] = {'%'}; - auto fe = begin(f) + 1; - if (modified == CharT{'E'}) - *fe++ = modified; - *fe++ = *fmt; - facet.put(os, os, os.fill(), &tm, 
begin(f), fe); - } -#else - os << fds.tod; -#endif - command = nullptr; - modified = CharT{}; - } - else - os << *fmt; - break; - case 'y': - if (command) - { - auto y = static_cast(fds.ymd.year()); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - y = std::abs(y) % 100; - if (y < 10) - os << CharT{'0'}; - os << y; -#if !ONLY_C_LOCALE - } - else - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_year = y - 1900; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'Y': - if (command) - { - auto y = fds.ymd.year(); -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - os << y; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'E'}) - { - const CharT f[] = {'%', modified, *fmt}; - tm.tm_year = static_cast(y) - 1900; - facet.put(os, os, os.fill(), &tm, begin(f), end(f)); - } - else - { - os << CharT{'%'} << modified << *fmt; - } -#endif - modified = CharT{}; - command = nullptr; - } - else - os << *fmt; - break; - case 'z': - if (command) - { - if (offset_sec == nullptr) - { - // Can not format %z with unknown offset - os.setstate(ios::failbit); - return os; - } - auto m = duration_cast(*offset_sec); - auto neg = m < minutes{0}; - m = date::abs(m); - auto h = duration_cast(m); - m -= h; - if (neg) - os << CharT{'-'}; - else - os << CharT{'+'}; - if (h < hours{10}) - os << CharT{'0'}; - os << h.count(); - if (modified != CharT{}) - os << CharT{':'}; - if (m < minutes{10}) - os << CharT{'0'}; - os << m.count(); - command = nullptr; - modified = CharT{}; - } - else - os << *fmt; - break; - case 'Z': - if (command) - { - if (modified == CharT{}) - { - if (abbrev == nullptr) - { - // Can not format %Z with unknown time_zone - os.setstate(ios::failbit); - return os; - } - for (auto c : *abbrev) - os << CharT(c); - } - else - { - os << CharT{'%'} << modified << *fmt; - modified = CharT{}; - } - command = nullptr; - } - else - os << *fmt; - break; - case 
'E': - case 'O': - if (command) - { - if (modified == CharT{}) - { - modified = *fmt; - } - else - { - os << CharT{'%'} << modified << *fmt; - command = nullptr; - modified = CharT{}; - } - } - else - os << *fmt; - break; - case '%': - if (command) - { - if (modified == CharT{}) - { - os << CharT{'%'}; - command = nullptr; - } - else - { - os << CharT{'%'} << modified << CharT{'%'}; - command = nullptr; - modified = CharT{}; - } - } - else - command = fmt; - break; - default: - if (command) - { - os << CharT{'%'}; - command = nullptr; - } - if (modified != CharT{}) - { - os << modified; - modified = CharT{}; - } - os << *fmt; - break; - } - } - if (command) - os << CharT{'%'}; - if (modified != CharT{}) - os << modified; - return os; -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const year& y) -{ - using CT = std::chrono::seconds; - fields fds{y/0/0}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const month& m) -{ - using CT = std::chrono::seconds; - fields fds{m/0/0}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const day& d) -{ - using CT = std::chrono::seconds; - fields fds{d/0/0}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const weekday& wd) -{ - using CT = std::chrono::seconds; - fields fds{wd}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const year_month& ym) -{ - using CT = std::chrono::seconds; - fields fds{ym/0}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, const month_day& md) -{ - using CT = std::chrono::seconds; - fields fds{md/0}; - return to_stream(os, fmt, fds); -} - 
-template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const year_month_day& ymd) -{ - using CT = std::chrono::seconds; - fields fds{ymd}; - return to_stream(os, fmt, fds); -} - -template -inline -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const std::chrono::duration& d) -{ - using Duration = std::chrono::duration; - using CT = typename std::common_type::type; - fields fds{time_of_day{d}}; - return to_stream(os, fmt, fds); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const local_time& tp, const std::string* abbrev = nullptr, - const std::chrono::seconds* offset_sec = nullptr) -{ - using CT = typename std::common_type::type; - auto ld = floor(tp); - fields fds{year_month_day{ld}, time_of_day{tp-local_seconds{ld}}}; - return to_stream(os, fmt, fds, abbrev, offset_sec); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const sys_time& tp) -{ - using namespace std::chrono; - using CT = typename std::common_type::type; - const std::string abbrev("UTC"); - CONSTDATA seconds offset{0}; - auto sd = floor(tp); - fields fds{year_month_day{sd}, time_of_day{tp-sys_seconds{sd}}}; - return to_stream(os, fmt, fds, &abbrev, &offset); -} - -// format - -template -auto -format(const std::locale& loc, const CharT* fmt, const Streamable& tp) - -> decltype(to_stream(std::declval&>(), fmt, tp), - std::basic_string{}) -{ - std::basic_ostringstream os; - os.exceptions(std::ios::failbit | std::ios::badbit); - os.imbue(loc); - to_stream(os, fmt, tp); - return os.str(); -} - -template -auto -format(const CharT* fmt, const Streamable& tp) - -> decltype(to_stream(std::declval&>(), fmt, tp), - std::basic_string{}) -{ - std::basic_ostringstream os; - os.exceptions(std::ios::failbit | std::ios::badbit); - to_stream(os, fmt, tp); - return os.str(); -} - -template -auto -format(const std::locale& loc, const std::basic_string& fmt, - const 
Streamable& tp) - -> decltype(to_stream(std::declval&>(), fmt.c_str(), tp), - std::basic_string{}) -{ - std::basic_ostringstream os; - os.exceptions(std::ios::failbit | std::ios::badbit); - os.imbue(loc); - to_stream(os, fmt.c_str(), tp); - return os.str(); -} - -template -auto -format(const std::basic_string& fmt, const Streamable& tp) - -> decltype(to_stream(std::declval&>(), fmt.c_str(), tp), - std::basic_string{}) -{ - std::basic_ostringstream os; - os.exceptions(std::ios::failbit | std::ios::badbit); - to_stream(os, fmt.c_str(), tp); - return os.str(); -} - -// parse - -namespace detail -{ - -template -bool -read_char(std::basic_istream& is, CharT fmt, std::ios::iostate& err) -{ - auto ic = is.get(); - if (Traits::eq_int_type(ic, Traits::eof()) || - !Traits::eq(Traits::to_char_type(ic), fmt)) - { - err |= std::ios::failbit; - is.setstate(std::ios::failbit); - return false; - } - return true; -} - -template -unsigned -read_unsigned(std::basic_istream& is, unsigned m = 1, unsigned M = 10) -{ - unsigned x = 0; - unsigned count = 0; - while (true) - { - auto ic = is.peek(); - if (Traits::eq_int_type(ic, Traits::eof())) - break; - auto c = static_cast(Traits::to_char_type(ic)); - if (!('0' <= c && c <= '9')) - break; - (void)is.get(); - ++count; - x = 10*x + static_cast(c - '0'); - if (count == M) - break; - } - if (count < m) - is.setstate(std::ios::failbit); - return x; -} - -template -int -read_signed(std::basic_istream& is, unsigned m = 1, unsigned M = 10) -{ - auto ic = is.peek(); - if (!Traits::eq_int_type(ic, Traits::eof())) - { - auto c = static_cast(Traits::to_char_type(ic)); - if (('0' <= c && c <= '9') || c == '-' || c == '+') - { - if (c == '-' || c == '+') - (void)is.get(); - auto x = static_cast(read_unsigned(is, std::max(m, 1u), M)); - if (!is.fail()) - { - if (c == '-') - x = -x; - return x; - } - } - } - if (m > 0) - is.setstate(std::ios::failbit); - return 0; -} - -template -long double -read_long_double(std::basic_istream& is, unsigned m = 1, 
unsigned M = 10) -{ - using namespace std; - unsigned count = 0; - auto decimal_point = Traits::to_int_type( - use_facet>(is.getloc()).decimal_point()); - std::string buf; - while (true) - { - auto ic = is.peek(); - if (Traits::eq_int_type(ic, Traits::eof())) - break; - if (Traits::eq_int_type(ic, decimal_point)) - { - buf += '.'; - decimal_point = Traits::eof(); - is.get(); - } - else - { - auto c = static_cast(Traits::to_char_type(ic)); - if (!('0' <= c && c <= '9')) - break; - buf += c; - (void)is.get(); - } - if (++count == M) - break; - } - if (count < m) - { - is.setstate(std::ios::failbit); - return 0; - } - return std::stold(buf); -} - -struct rs -{ - int& i; - unsigned m; - unsigned M; -}; - -struct ru -{ - int& i; - unsigned m; - unsigned M; -}; - -struct rld -{ - long double& i; - unsigned m; - unsigned M; -}; - -template -void -read(std::basic_istream&) -{ -} - -template -void -read(std::basic_istream& is, CharT a0, Args&& ...args); - -template -void -read(std::basic_istream& is, rs a0, Args&& ...args); - -template -void -read(std::basic_istream& is, ru a0, Args&& ...args); - -template -void -read(std::basic_istream& is, int a0, Args&& ...args); - -template -void -read(std::basic_istream& is, rld a0, Args&& ...args); - -template -void -read(std::basic_istream& is, CharT a0, Args&& ...args) -{ - // No-op if a0 == CharT{} - if (a0 != CharT{}) - { - auto ic = is.peek(); - if (Traits::eq_int_type(ic, Traits::eof())) - { - is.setstate(std::ios::failbit | std::ios::eofbit); - return; - } - if (!Traits::eq(Traits::to_char_type(ic), a0)) - { - is.setstate(std::ios::failbit); - return; - } - (void)is.get(); - } - read(is, std::forward(args)...); -} - -template -void -read(std::basic_istream& is, rs a0, Args&& ...args) -{ - auto x = read_signed(is, a0.m, a0.M); - if (is.fail()) - return; - a0.i = x; - read(is, std::forward(args)...); -} - -template -void -read(std::basic_istream& is, ru a0, Args&& ...args) -{ - auto x = read_unsigned(is, a0.m, a0.M); - if 
(is.fail()) - return; - a0.i = static_cast(x); - read(is, std::forward(args)...); -} - -template -void -read(std::basic_istream& is, int a0, Args&& ...args) -{ - if (a0 != -1) - { - auto u = static_cast(a0); - CharT buf[std::numeric_limits::digits10+2] = {}; - auto e = buf; - do - { - *e++ = CharT(u % 10) + CharT{'0'}; - u /= 10; - } while (u > 0); - std::reverse(buf, e); - for (auto p = buf; p != e && is.rdstate() == std::ios::goodbit; ++p) - read(is, *p); - } - if (is.rdstate() == std::ios::goodbit) - read(is, std::forward(args)...); -} - -template -void -read(std::basic_istream& is, rld a0, Args&& ...args) -{ - auto x = read_long_double(is, a0.m, a0.M); - if (is.fail()) - return; - a0.i = x; - read(is, std::forward(args)...); -} - -} // namespace detail; - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - fields& fds, std::basic_string* abbrev, - std::chrono::minutes* offset) -{ - using namespace std; - using namespace std::chrono; - typename basic_istream::sentry ok{is, true}; - if (ok) - { -#if !ONLY_C_LOCALE - auto& f = use_facet>(is.getloc()); - std::tm tm{}; -#endif - std::basic_string temp_abbrev; - minutes temp_offset{}; - const CharT* command = nullptr; - auto modified = CharT{}; - auto width = -1; - CONSTDATA int not_a_year = numeric_limits::min(); - int Y = not_a_year; - CONSTDATA int not_a_century = not_a_year / 100; - int C = not_a_century; - CONSTDATA int not_a_2digit_year = 100; - int y = not_a_2digit_year; - int m{}; - int d{}; - int j{}; - CONSTDATA int not_a_weekday = 7; - int wd = not_a_weekday; - CONSTDATA int not_a_hour_12_value = 0; - int I = not_a_hour_12_value; - hours h{}; - minutes min{}; - Duration s{}; - int g = not_a_2digit_year; - int G = not_a_year; - CONSTDATA int not_a_week_num = 100; - int V = not_a_week_num; - int U = not_a_week_num; - int W = not_a_week_num; - using detail::read; - using detail::rs; - using detail::ru; - using detail::rld; - for (; *fmt && is.rdstate() == 
std::ios::goodbit; ++fmt) - { - switch (*fmt) - { - case 'a': - case 'A': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - wd = tm.tm_wday; - is.setstate(err); -#else - auto nm = detail::weekday_names(); - auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (!is.fail()) - wd = i % 7; -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'b': - case 'B': - case 'h': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - m = tm.tm_mon + 1; - is.setstate(err); -#else - auto nm = detail::month_names(); - auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (!is.fail()) - m = i % 12 + 1; -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'c': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - { - Y = tm.tm_year + 1900; - m = tm.tm_mon + 1; - d = tm.tm_mday; - h = hours{tm.tm_hour}; - min = minutes{tm.tm_min}; - s = duration_cast(seconds{tm.tm_sec}); - } - is.setstate(err); -#else - auto nm = detail::weekday_names(); - auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (is.fail()) - goto broken; - wd = i % 7; - ws(is); - nm = detail::month_names(); - i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (is.fail()) - goto broken; - m = i % 12 + 1; - ws(is); - read(is, rs{d, 1, 2}); - if (is.fail()) - goto broken; - ws(is); - using dfs = detail::decimal_format_seconds; - CONSTDATA auto w = Duration::period::den == 1 ? 
2 : 3 + dfs::width; - int H; - int M; - long double S; - read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, - CharT{':'}, rld{S, 1, w}); - if (is.fail()) - goto broken; - h = hours{H}; - min = minutes{M}; - s = round(duration{S}); - ws(is); - read(is, rs{Y, 1, 4u}); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'x': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - { - Y = tm.tm_year + 1900; - m = tm.tm_mon + 1; - d = tm.tm_mday; - } - is.setstate(err); -#else - read(is, ru{m, 1, 2}, CharT{'/'}, ru{d, 1, 2}, CharT{'/'}, - rs{y, 1, 2}); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'X': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - { - h = hours{tm.tm_hour}; - min = minutes{tm.tm_min}; - s = duration_cast(seconds{tm.tm_sec}); - } - is.setstate(err); -#else - using dfs = detail::decimal_format_seconds; - CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; - int H; - int M; - long double S; - read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, - CharT{':'}, rld{S, 1, w}); - if (!is.fail()) - { - h = hours{H}; - min = minutes{M}; - s = round(duration{S}); - } -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'C': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - read(is, rs{C, 1, width == -1 ? 2u : static_cast(width)}); -#if !ONLY_C_LOCALE - } - else - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - { - auto tY = tm.tm_year + 1900; - C = (tY >= 0 ? 
tY : tY-99) / 100; - } - is.setstate(err); - } -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'D': - if (command) - { - if (modified == CharT{}) - read(is, ru{m, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'}, - ru{d, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'}, - rs{y, 1, 2}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'F': - if (command) - { - if (modified == CharT{}) - read(is, rs{Y, 1, width == -1 ? 4u : static_cast(width)}, - CharT{'-'}, ru{m, 1, 2}, CharT{'-'}, ru{d, 1, 2}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'd': - case 'e': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) -#endif - read(is, rs{d, 1, width == -1 ? 2u : static_cast(width)}); -#if !ONLY_C_LOCALE - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - command = nullptr; - width = -1; - modified = CharT{}; - if ((err & ios::failbit) == 0) - d = tm.tm_mday; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'H': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - int H; - read(is, ru{H, 1, width == -1 ? 
2u : static_cast(width)}); - if (!is.fail()) - h = hours{H}; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - h = hours{tm.tm_hour}; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'I': - if (command) - { - if (modified == CharT{}) - { - // reads in an hour into I, but most be in [1, 12] - read(is, rs{I, 1, width == -1 ? 2u : static_cast(width)}); - if (I != not_a_hour_12_value) - { - if (!(1 <= I && I <= 12)) - { - I = not_a_hour_12_value; - goto broken; - } - } - } - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'j': - if (command) - { - if (modified == CharT{}) - read(is, ru{j, 1, width == -1 ? 3u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'M': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - int M; - read(is, ru{M, 1, width == -1 ? 2u : static_cast(width)}); - if (!is.fail()) - min = minutes{M}; -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - min = minutes{tm.tm_min}; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'm': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) -#endif - read(is, rs{m, 1, width == -1 ? 
2u : static_cast(width)}); -#if !ONLY_C_LOCALE - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - m = tm.tm_mon + 1; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'n': - case 't': - if (command) - { - // %n matches a single white space character - // %t matches 0 or 1 white space characters - auto ic = is.peek(); - if (Traits::eq_int_type(ic, Traits::eof())) - { - ios_base::iostate err = ios_base::eofbit; - if (*fmt == 'n') - err |= ios_base::failbit; - is.setstate(err); - break; - } - if (isspace(ic)) - { - (void)is.get(); - } - else if (*fmt == 'n') - is.setstate(ios_base::failbit); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'p': - // Error if haven't yet seen %I - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { - if (I == not_a_hour_12_value) - goto broken; - tm = std::tm{}; - tm.tm_hour = I; - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if (err & ios::failbit) - goto broken; - h = hours{tm.tm_hour}; - I = not_a_hour_12_value; - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#else - if (I == not_a_hour_12_value) - goto broken; - auto nm = detail::ampm_names(); - auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (is.fail()) - goto broken; - h = hours{I}; - if (i == 1) - { - if (h != hours{12}) - h += hours{12}; - } - else if (h == hours{12}) - h = hours{0}; - I = not_a_hour_12_value; -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - - break; - case 'r': - if (command) - { -#if !ONLY_C_LOCALE - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & 
ios::failbit) == 0) - { - h = hours{tm.tm_hour}; - min = minutes{tm.tm_min}; - s = duration_cast(seconds{tm.tm_sec}); - } - is.setstate(err); -#else - using dfs = detail::decimal_format_seconds; - CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; - int H; - int M; - long double S; - read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, - CharT{':'}, rld{S, 1, w}); - if (is.fail() || !(1 <= H && H <= 12)) - goto broken; - ws(is); - auto nm = detail::ampm_names(); - auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first; - if (is.fail()) - goto broken; - h = hours{H}; - if (i == 1) - { - if (h != hours{12}) - h += hours{12}; - } - else if (h == hours{12}) - h = hours{0}; - min = minutes{M}; - s = round(duration{S}); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'R': - if (command) - { - if (modified == CharT{}) - { - int H, M; - read(is, ru{H, 1, 2}, CharT{'\0'}, CharT{':'}, CharT{'\0'}, - ru{M, 1, 2}, CharT{'\0'}); - if (!is.fail()) - { - h = hours{H}; - min = minutes{M}; - } - } - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'S': - if (command) - { - #if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - using dfs = detail::decimal_format_seconds; - CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; - long double S; - read(is, rld{S, 1, width == -1 ? 
w : static_cast(width)}); - if (!is.fail()) - s = round(duration{S}); -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - s = duration_cast(seconds{tm.tm_sec}); - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'T': - if (command) - { - if (modified == CharT{}) - { - using dfs = detail::decimal_format_seconds; - CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width; - int H; - int M; - long double S; - read(is, ru{H, 1, 2}, CharT{':'}, ru{M, 1, 2}, - CharT{':'}, rld{S, 1, w}); - if (!is.fail()) - { - h = hours{H}; - min = minutes{M}; - s = round(duration{S}); - } - } - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'Y': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) -#endif - read(is, rs{Y, 1, width == -1 ? 4u : static_cast(width)}); -#if !ONLY_C_LOCALE - else if (modified == CharT{'E'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - Y = tm.tm_year + 1900; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'y': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) -#endif - read(is, ru{y, 1, width == -1 ? 
2u : static_cast(width)}); -#if !ONLY_C_LOCALE - else - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - Y = tm.tm_year + 1900; - is.setstate(err); - } -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'g': - if (command) - { - if (modified == CharT{}) - read(is, ru{g, 1, width == -1 ? 2u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'G': - if (command) - { - if (modified == CharT{}) - read(is, rs{G, 1, width == -1 ? 4u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'U': - if (command) - { - if (modified == CharT{}) - read(is, ru{U, 1, width == -1 ? 2u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'V': - if (command) - { - if (modified == CharT{}) - read(is, ru{V, 1, width == -1 ? 2u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'W': - if (command) - { - if (modified == CharT{}) - read(is, ru{W, 1, width == -1 ? 2u : static_cast(width)}); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'u': - case 'w': - if (command) - { -#if !ONLY_C_LOCALE - if (modified == CharT{}) - { -#endif - read(is, ru{wd, 1, width == -1 ? 
1u : static_cast(width)}); - if (!is.fail() && *fmt == 'u') - { - if (wd == 7) - wd = 0; - else if (wd == 0) - wd = 7; - } -#if !ONLY_C_LOCALE - } - else if (modified == CharT{'O'}) - { - ios_base::iostate err = ios_base::goodbit; - f.get(is, nullptr, is, err, &tm, command, fmt+1); - if ((err & ios::failbit) == 0) - wd = tm.tm_wday; - is.setstate(err); - } - else - read(is, CharT{'%'}, width, modified, *fmt); -#endif - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'E': - case 'O': - if (command) - { - if (modified == CharT{}) - { - modified = *fmt; - } - else - { - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - } - else - read(is, *fmt); - break; - case '%': - if (command) - { - if (modified == CharT{}) - read(is, *fmt); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - command = fmt; - break; - case 'z': - if (command) - { - int H, M; - if (modified == CharT{}) - { - read(is, rs{H, 2, 2}); - if (!is.fail()) - temp_offset = hours{H}; - if (is.good()) - { - auto ic = is.peek(); - if (!Traits::eq_int_type(ic, Traits::eof())) - { - auto c = static_cast(Traits::to_char_type(ic)); - if ('0' <= c && c <= '9') - { - read(is, ru{M, 2, 2}); - if (!is.fail()) - temp_offset += minutes{ H < 0 ? -M : M }; - } - } - } - } - else - { - read(is, rs{H, 1, 2}); - if (!is.fail()) - temp_offset = hours{H}; - if (is.good()) - { - auto ic = is.peek(); - if (!Traits::eq_int_type(ic, Traits::eof())) - { - auto c = static_cast(Traits::to_char_type(ic)); - if (c == ':') - { - (void)is.get(); - read(is, ru{M, 2, 2}); - if (!is.fail()) - temp_offset += minutes{ H < 0 ? 
-M : M }; - } - } - } - } - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - case 'Z': - if (command) - { - if (modified == CharT{}) - { - if (!temp_abbrev.empty()) - is.setstate(ios::failbit); - else - { - while (is.rdstate() == std::ios::goodbit) - { - auto i = is.rdbuf()->sgetc(); - if (Traits::eq_int_type(i, Traits::eof())) - { - is.setstate(ios::eofbit); - break; - } - auto wc = Traits::to_char_type(i); - auto c = static_cast(wc); - // is c a valid time zone name or abbreviation character? - if (!(CharT{1} < wc && wc < CharT{127}) || !(isalnum(c) || - c == '_' || c == '/' || c == '-' || c == '+')) - break; - temp_abbrev.push_back(c); - is.rdbuf()->sbumpc(); - } - if (temp_abbrev.empty()) - is.setstate(ios::failbit); - } - } - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - else - read(is, *fmt); - break; - default: - if (command) - { - if (width == -1 && modified == CharT{} && '0' <= *fmt && *fmt <= '9') - { - width = static_cast(*fmt) - '0'; - while ('0' <= fmt[1] && fmt[1] <= '9') - width = 10*width + static_cast(*++fmt) - '0'; - } - else - { - if (modified == CharT{}) - read(is, CharT{'%'}, width, *fmt); - else - read(is, CharT{'%'}, width, modified, *fmt); - command = nullptr; - width = -1; - modified = CharT{}; - } - } - else // !command - { - if (isspace(*fmt)) - ws(is); // space matches 0 or more white space characters - else - read(is, *fmt); - } - break; - } - } - // is.rdstate() != ios::goodbit || *fmt == CharT{} - if (is.rdstate() == ios::goodbit && command) - { - if (modified == CharT{}) - read(is, CharT{'%'}, width); - else - read(is, CharT{'%'}, width, modified); - } - if (is.rdstate() != ios::goodbit && *fmt != CharT{} && !is.fail()) - is.setstate(ios::failbit); - if (!is.fail()) - { - if (y != not_a_2digit_year) - { - // Convert y and an optional C to Y - if (!(0 <= y && y <= 99)) - goto broken; - if (C == not_a_century) - { - if (Y 
== not_a_year) - { - if (y >= 69) - C = 19; - else - C = 20; - } - else - { - C = (Y >= 0 ? Y : Y-100) / 100; - } - } - int tY; - if (C >= 0) - tY = 100*C + y; - else - tY = 100*(C+1) - (y == 0 ? 100 : y); - if (Y != not_a_year && Y != tY) - goto broken; - Y = tY; - } - if (g != not_a_2digit_year) - { - // Convert g and an optional C to G - if (!(0 <= g && g <= 99)) - goto broken; - if (C == not_a_century) - { - if (G == not_a_year) - { - if (g >= 69) - C = 19; - else - C = 20; - } - else - { - C = (G >= 0 ? G : G-100) / 100; - } - } - int tG; - if (C >= 0) - tG = 100*C + g; - else - tG = 100*(C+1) - (g == 0 ? 100 : g); - if (G != not_a_year && G != tG) - goto broken; - G = tG; - } - if (G != not_a_year) - { - // Convert G, V and wd to Y, m and d - if (V == not_a_week_num || wd == not_a_weekday) - goto broken; - auto ymd = year_month_day{local_days(year{G-1}/dec/thu[last]) + - (mon-thu) + weeks{V-1} + - (weekday{static_cast(wd)}-mon)}; - if (Y == not_a_year) - Y = static_cast(ymd.year()); - else if (year{Y} != ymd.year()) - goto broken; - if (m == 0) - m = static_cast(static_cast(ymd.month())); - else if (month(static_cast(m)) != ymd.month()) - goto broken; - if (d == 0) - d = static_cast(static_cast(ymd.day())); - else if (day(static_cast(d)) != ymd.day()) - goto broken; - } - if (j != 0 && Y != not_a_year) - { - auto ymd = year_month_day{local_days(year{Y}/1/1) + days{j-1}}; - if (m == 0) - m = static_cast(static_cast(ymd.month())); - else if (month(static_cast(m)) != ymd.month()) - goto broken; - if (d == 0) - d = static_cast(static_cast(ymd.day())); - else if (day(static_cast(d)) != ymd.day()) - goto broken; - } - if (U != not_a_week_num && Y != not_a_year) - { - if (wd == not_a_weekday) - goto broken; - sys_days sd; - if (U == 0) - sd = year{Y-1}/dec/weekday{static_cast(wd)}[last]; - else - sd = sys_days(year{Y}/jan/sun[1]) + weeks{U-1} + - (weekday{static_cast(wd)} - sun); - year_month_day ymd = sd; - if (year{Y} != ymd.year()) - goto broken; - if (m == 0) - 
m = static_cast(static_cast(ymd.month())); - else if (month(static_cast(m)) != ymd.month()) - goto broken; - if (d == 0) - d = static_cast(static_cast(ymd.day())); - else if (day(static_cast(d)) != ymd.day()) - goto broken; - } - if (W != not_a_week_num && Y != not_a_year) - { - if (wd == not_a_weekday) - goto broken; - sys_days sd; - if (W == 0) - sd = year{Y-1}/dec/weekday{static_cast(wd)}[last]; - else - sd = sys_days(year{Y}/jan/mon[1]) + weeks{W-1} + - (weekday{static_cast(wd)} - mon); - year_month_day ymd = sd; - if (year{Y} != ymd.year()) - goto broken; - if (m == 0) - m = static_cast(static_cast(ymd.month())); - else if (month(static_cast(m)) != ymd.month()) - goto broken; - if (d == 0) - d = static_cast(static_cast(ymd.day())); - else if (day(static_cast(d)) != ymd.day()) - goto broken; - } - if (Y < static_cast(year::min()) || Y > static_cast(year::max())) - Y = not_a_year; - auto ymd = year{Y}/m/d; - if (wd != not_a_weekday && ymd.ok()) - { - if (weekday{static_cast(wd)} != weekday(ymd)) - goto broken; - } - fds.ymd = ymd; - fds.tod = time_of_day{h}; - fds.tod.m_ = min; - fds.tod.s_ = detail::decimal_format_seconds{s}; - if (wd != not_a_weekday) - fds.wd = weekday{static_cast(wd)}; - if (abbrev != nullptr) - *abbrev = std::move(temp_abbrev); - if (offset != nullptr) - *offset = temp_offset; - } - return is; - } -broken: - is.setstate(ios_base::failbit); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, year& y, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.year().ok()) - is.setstate(ios::failbit); - if (!is.fail()) - y = fds.ymd.year(); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, month& m, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* 
offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.month().ok()) - is.setstate(ios::failbit); - if (!is.fail()) - m = fds.ymd.month(); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, day& d, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.day().ok()) - is.setstate(ios::failbit); - if (!is.fail()) - d = fds.ymd.day(); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, weekday& wd, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.wd.ok()) - is.setstate(ios::failbit); - if (!is.fail()) - wd = fds.wd; - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, year_month& ym, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.month().ok()) - is.setstate(ios::failbit); - if (!is.fail()) - ym = fds.ymd.year()/fds.ymd.month(); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, month_day& md, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.month().ok() || !fds.ymd.day().ok()) - is.setstate(ios::failbit); - if 
(!is.fail()) - md = fds.ymd.month()/fds.ymd.day(); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - year_month_day& ymd, std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = seconds; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.ok()) - is.setstate(ios::failbit); - if (!is.fail()) - ymd = fds.ymd; - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - sys_time& tp, std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - minutes offset_local{}; - auto offptr = offset ? offset : &offset_local; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offptr); - if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) - is.setstate(ios::failbit); - if (!is.fail()) - tp = round(sys_days(fds.ymd) - *offptr + fds.tod.to_duration()); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - local_time& tp, std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) - is.setstate(ios::failbit); - if (!is.fail()) - tp = round(local_seconds{local_days(fds.ymd)} + fds.tod.to_duration()); - return is; -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - std::chrono::duration& d, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using Duration = std::chrono::duration; - using CT = typename common_type::type; - fields 
fds{}; - from_stream(is, fmt, fds, abbrev, offset); - if (!is.fail()) - d = duration_cast(fds.tod.to_duration()); - return is; -} - -template , - class Alloc = std::allocator> -struct parse_manip -{ - const std::basic_string format_; - Parsable& tp_; - std::basic_string* abbrev_; - std::chrono::minutes* offset_; - -public: - parse_manip(std::basic_string format, Parsable& tp, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) - : format_(std::move(format)) - , tp_(tp) - , abbrev_(abbrev) - , offset_(offset) - {} - -}; - -template -std::basic_istream& -operator>>(std::basic_istream& is, - const parse_manip& x) -{ - return from_stream(is, x.format_.c_str(), x.tp_, x.abbrev_, x.offset_); -} - -template -inline -auto -parse(const std::basic_string& format, Parsable& tp) - -> decltype(from_stream(std::declval&>(), - format.c_str(), tp), - parse_manip{format, tp}) -{ - return {format, tp}; -} - -template -inline -auto -parse(const std::basic_string& format, Parsable& tp, - std::basic_string& abbrev) - -> decltype(from_stream(std::declval&>(), - format.c_str(), tp, &abbrev), - parse_manip{format, tp, &abbrev}) -{ - return {format, tp, &abbrev}; -} - -template -inline -auto -parse(const std::basic_string& format, Parsable& tp, - std::chrono::minutes& offset) - -> decltype(from_stream(std::declval&>(), - format.c_str(), tp, nullptr, &offset), - parse_manip{format, tp, nullptr, &offset}) -{ - return {format, tp, nullptr, &offset}; -} - -template -inline -auto -parse(const std::basic_string& format, Parsable& tp, - std::basic_string& abbrev, std::chrono::minutes& offset) - -> decltype(from_stream(std::declval&>(), - format.c_str(), tp, &abbrev, &offset), - parse_manip{format, tp, &abbrev, &offset}) -{ - return {format, tp, &abbrev, &offset}; -} - -// const CharT* formats - -template -inline -auto -parse(const CharT* format, Parsable& tp) - -> decltype(from_stream(std::declval&>(), format, tp), - parse_manip{format, tp}) -{ - return {format, tp}; 
-} - -template -inline -auto -parse(const CharT* format, Parsable& tp, std::basic_string& abbrev) - -> decltype(from_stream(std::declval&>(), format, - tp, &abbrev), - parse_manip{format, tp, &abbrev}) -{ - return {format, tp, &abbrev}; -} - -template -inline -auto -parse(const CharT* format, Parsable& tp, std::chrono::minutes& offset) - -> decltype(from_stream(std::declval&>(), format, - tp, nullptr, &offset), - parse_manip{format, tp, nullptr, &offset}) -{ - return {format, tp, nullptr, &offset}; -} - -template -inline -auto -parse(const CharT* format, Parsable& tp, - std::basic_string& abbrev, std::chrono::minutes& offset) - -> decltype(from_stream(std::declval&>(), format, - tp, &abbrev, &offset), - parse_manip{format, tp, &abbrev, &offset}) -{ - return {format, tp, &abbrev, &offset}; -} - -// duration streaming - -namespace detail -{ - -#if __cplusplus >= 201402 && (!defined(__EDG_VERSION__) || __EDG_VERSION__ > 411) \ - && (!defined(__SUNPRO_CC) || __SUNPRO_CC > 0x5150) - -template -class string_literal -{ - CharT p_[N]; - -public: - using const_iterator = const CharT*; - - string_literal(string_literal const&) = default; - string_literal& operator=(string_literal const&) = delete; - - template > - CONSTCD14 string_literal(CharT c) NOEXCEPT - : p_{c} - { - } - - CONSTCD14 string_literal(const CharT(&a)[N]) NOEXCEPT - : p_{} - { - for (std::size_t i = 0; i < N; ++i) - p_[i] = a[i]; - } - - template > - CONSTCD14 string_literal(const char(&a)[N]) NOEXCEPT - : p_{} - { - for (std::size_t i = 0; i < N; ++i) - p_[i] = a[i]; - } - - template {}>> - CONSTCD14 string_literal(string_literal const& a) NOEXCEPT - : p_{} - { - for (std::size_t i = 0; i < N; ++i) - p_[i] = a[i]; - } - - template > - CONSTCD14 string_literal(const string_literal& x, - const string_literal& y) NOEXCEPT - : p_{} - { - std::size_t i = 0; - for (; i < N1-1; ++i) - p_[i] = x[i]; - for (std::size_t j = 0; j < N2; ++j, ++i) - p_[i] = y[j]; - } - - CONSTCD14 const CharT* data() const NOEXCEPT 
{return p_;} - CONSTCD14 std::size_t size() const NOEXCEPT {return N-1;} - - CONSTCD14 const_iterator begin() const NOEXCEPT {return p_;} - CONSTCD14 const_iterator end() const NOEXCEPT {return p_ + N-1;} - - CONSTCD14 CharT const& operator[](std::size_t n) const NOEXCEPT - { - return p_[n]; - } - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, const string_literal& s) - { - return os << s.p_; - } -}; - -template -CONSTCD14 -inline -string_literal, - N1 + N2 - 1> -operator+(const string_literal& x, const string_literal& y) NOEXCEPT -{ - using CharT = std::conditional_t; - return string_literal{string_literal{x}, - string_literal{y}}; -} - -template -inline -std::basic_string -operator+(std::basic_string x, - const string_literal& y) NOEXCEPT -{ - x.append(y.data(), y.size()); - return x; -} - -template -CONSTCD14 -inline -string_literal -msl(const CharT(&a)[N]) NOEXCEPT -{ - return string_literal{a}; -} - -template {} || - std::is_same{} || - std::is_same{} || - std::is_same{}>> -CONSTCD14 -inline -string_literal -msl(CharT c) NOEXCEPT -{ - return string_literal{c}; -} - -CONSTCD14 -inline -std::size_t -to_string_len(std::intmax_t i) -{ - std::size_t r = 0; - do - { - i /= 10; - ++r; - } while (i > 0); - return r; -} - -template -CONSTCD14 -inline -std::enable_if_t -< - N < 10, - string_literal -> -msl() NOEXCEPT -{ - return msl(char(N % 10 + '0')); -} - -template -CONSTCD14 -inline -std::enable_if_t -< - 10 <= N, - string_literal -> -msl() NOEXCEPT -{ - return msl() + msl(char(N % 10 + '0')); -} - -template -CONSTCD14 -inline -std::enable_if_t -< - std::ratio::type::den != 1, - string_literal::type::num) + - to_string_len(std::ratio::type::den) + 4> -> -msl(std::ratio) NOEXCEPT -{ - using R = typename std::ratio::type; - return msl(CharT{'['}) + msl() + msl(CharT{'/'}) + - msl() + msl(CharT{']'}); -} - -template -CONSTCD14 -inline -std::enable_if_t -< - std::ratio::type::den == 1, - string_literal::type::num) + 3> -> -msl(std::ratio) 
NOEXCEPT -{ - using R = typename std::ratio::type; - return msl(CharT{'['}) + msl() + msl(CharT{']'}); -} - -template -CONSTCD14 -inline -auto -msl(std::atto) NOEXCEPT -{ - return msl(CharT{'a'}); -} - -template -CONSTCD14 -inline -auto -msl(std::femto) NOEXCEPT -{ - return msl(CharT{'f'}); -} - -template -CONSTCD14 -inline -auto -msl(std::pico) NOEXCEPT -{ - return msl(CharT{'p'}); -} - -template -CONSTCD14 -inline -auto -msl(std::nano) NOEXCEPT -{ - return msl(CharT{'n'}); -} - -template -CONSTCD14 -inline -std::enable_if_t -< - std::is_same{}, - string_literal -> -msl(std::micro) NOEXCEPT -{ - return string_literal{"\xC2\xB5"}; -} - -template -CONSTCD14 -inline -std::enable_if_t -< - !std::is_same{}, - string_literal -> -msl(std::micro) NOEXCEPT -{ - return string_literal{CharT{static_cast('\xB5')}}; -} - -template -CONSTCD14 -inline -auto -msl(std::milli) NOEXCEPT -{ - return msl(CharT{'m'}); -} - -template -CONSTCD14 -inline -auto -msl(std::centi) NOEXCEPT -{ - return msl(CharT{'c'}); -} - -template -CONSTCD14 -inline -auto -msl(std::deci) NOEXCEPT -{ - return msl(CharT{'d'}); -} - -template -CONSTCD14 -inline -auto -msl(std::deca) NOEXCEPT -{ - return string_literal{"da"}; -} - -template -CONSTCD14 -inline -auto -msl(std::hecto) NOEXCEPT -{ - return msl(CharT{'h'}); -} - -template -CONSTCD14 -inline -auto -msl(std::kilo) NOEXCEPT -{ - return msl(CharT{'k'}); -} - -template -CONSTCD14 -inline -auto -msl(std::mega) NOEXCEPT -{ - return msl(CharT{'M'}); -} - -template -CONSTCD14 -inline -auto -msl(std::giga) NOEXCEPT -{ - return msl(CharT{'G'}); -} - -template -CONSTCD14 -inline -auto -msl(std::tera) NOEXCEPT -{ - return msl(CharT{'T'}); -} - -template -CONSTCD14 -inline -auto -msl(std::peta) NOEXCEPT -{ - return msl(CharT{'P'}); -} - -template -CONSTCD14 -inline -auto -msl(std::exa) NOEXCEPT -{ - return msl(CharT{'E'}); -} - -template -CONSTCD14 -auto -get_units(Period p) -{ - return msl(p) + string_literal{"s"}; -} - -template -CONSTCD14 -auto 
-get_units(std::ratio<1>) -{ - return string_literal{"s"}; -} - -template -CONSTCD14 -auto -get_units(std::ratio<60>) -{ - return string_literal{"min"}; -} - -template -CONSTCD14 -auto -get_units(std::ratio<3600>) -{ - return string_literal{"h"}; -} - -#else // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411) - -inline -std::string -to_string(std::uint64_t x) -{ - return std::to_string(x); -} - -template -std::basic_string -to_string(std::uint64_t x) -{ - auto y = std::to_string(x); - return std::basic_string(y.begin(), y.end()); -} - -template -inline -typename std::enable_if -< - std::ratio::type::den != 1, - std::basic_string ->::type -msl(std::ratio) -{ - using R = typename std::ratio::type; - return std::basic_string(1, '[') + to_string(R::num) + CharT{'/'} + - to_string(R::den) + CharT{']'}; -} - -template -inline -typename std::enable_if -< - std::ratio::type::den == 1, - std::basic_string ->::type -msl(std::ratio) -{ - using R = typename std::ratio::type; - return std::basic_string(1, '[') + to_string(R::num) + CharT{']'}; -} - -template -inline -std::basic_string -msl(std::atto) -{ - return {'a'}; -} - -template -inline -std::basic_string -msl(std::femto) -{ - return {'f'}; -} - -template -inline -std::basic_string -msl(std::pico) -{ - return {'p'}; -} - -template -inline -std::basic_string -msl(std::nano) -{ - return {'n'}; -} - -template -inline -typename std::enable_if -< - std::is_same::value, - std::string ->::type -msl(std::micro) -{ - return "\xC2\xB5"; -} - -template -inline -typename std::enable_if -< - !std::is_same::value, - std::basic_string ->::type -msl(std::micro) -{ - return {CharT(static_cast('\xB5'))}; -} - -template -inline -std::basic_string -msl(std::milli) -{ - return {'m'}; -} - -template -inline -std::basic_string -msl(std::centi) -{ - return {'c'}; -} - -template -inline -std::basic_string -msl(std::deci) -{ - return {'d'}; -} - -template -inline -std::basic_string -msl(std::deca) -{ - return {'d', 'a'}; 
-} - -template -inline -std::basic_string -msl(std::hecto) -{ - return {'h'}; -} - -template -inline -std::basic_string -msl(std::kilo) -{ - return {'k'}; -} - -template -inline -std::basic_string -msl(std::mega) -{ - return {'M'}; -} - -template -inline -std::basic_string -msl(std::giga) -{ - return {'G'}; -} - -template -inline -std::basic_string -msl(std::tera) -{ - return {'T'}; -} - -template -inline -std::basic_string -msl(std::peta) -{ - return {'P'}; -} - -template -inline -std::basic_string -msl(std::exa) -{ - return {'E'}; -} - -template -std::basic_string -get_units(Period p) -{ - return msl(p) + CharT{'s'}; -} - -template -std::basic_string -get_units(std::ratio<1>) -{ - return {'s'}; -} - -template -std::basic_string -get_units(std::ratio<60>) -{ - return {'m', 'i', 'n'}; -} - -template -std::basic_string -get_units(std::ratio<3600>) -{ - return {'h'}; -} - -#endif // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411) - -template > -struct make_string; - -template <> -struct make_string -{ - template - static - std::string - from(Rep n) - { - return std::to_string(n); - } -}; - -template -struct make_string -{ - template - static - std::basic_string - from(Rep n) - { - auto s = std::to_string(n); - return std::basic_string(s.begin(), s.end()); - } -}; - -template <> -struct make_string -{ - template - static - std::wstring - from(Rep n) - { - return std::to_wstring(n); - } -}; - -template -struct make_string -{ - template - static - std::basic_string - from(Rep n) - { - auto s = std::to_wstring(n); - return std::basic_string(s.begin(), s.end()); - } -}; - -} // namespace detail - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, - const std::chrono::duration& d) -{ - using namespace detail; - return os << make_string::from(d.count()) + - get_units(typename Period::type{}); -} - -} // namespace date -} // namespace arrow_vendored - - -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - - -#endif // 
DATE_H diff --git a/r/R/inst/include/arrow/vendored/datetime/ios.h b/r/R/inst/include/arrow/vendored/datetime/ios.h deleted file mode 100644 index acad28d13b5..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime/ios.h +++ /dev/null @@ -1,53 +0,0 @@ -// -// ios.h -// DateTimeLib -// -// The MIT License (MIT) -// -// Copyright (c) 2016 Alexander Kormanovsky -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
- -#ifndef ios_hpp -#define ios_hpp - -#if __APPLE__ -# include -# if TARGET_OS_IPHONE -# include - - namespace arrow_vendored - { - namespace date - { - namespace iOSUtils - { - - std::string get_tzdata_path(); - std::string get_current_timezone(); - - } // namespace iOSUtils - } // namespace date - } // namespace arrow_vendored - -# endif // TARGET_OS_IPHONE -#else // !__APPLE__ -# define TARGET_OS_IPHONE 0 -#endif // !__APPLE__ -#endif // ios_hpp diff --git a/r/R/inst/include/arrow/vendored/datetime/tz.h b/r/R/inst/include/arrow/vendored/datetime/tz.h deleted file mode 100644 index 249162b0149..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime/tz.h +++ /dev/null @@ -1,2590 +0,0 @@ -#ifndef TZ_H -#define TZ_H - -// The MIT License (MIT) -// -// Copyright (c) 2015, 2016, 2017 Howard Hinnant -// Copyright (c) 2017 Jiangang Zhuang -// Copyright (c) 2017 Aaron Bishop -// Copyright (c) 2017 Tomasz KamiÅ„ski -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Our apologies. When the previous paragraph was written, lowercase had not yet -// been invented (that would involve another several millennia of evolution). -// We did not mean to shout. - -// Get more recent database at http://www.iana.org/time-zones - -// The notion of "current timezone" is something the operating system is expected to "just -// know". How it knows this is system specific. It's often a value set by the user at OS -// installation time and recorded by the OS somewhere. On Linux and Mac systems the current -// timezone name is obtained by looking at the name or contents of a particular file on -// disk. On Windows the current timezone name comes from the registry. In either method, -// there is no guarantee that the "native" current timezone name obtained will match any -// of the "Standard" names in this library's "database". On Linux, the names usually do -// seem to match so mapping functions to map from native to "Standard" are typically not -// required. On Windows, the names are never "Standard" so mapping is always required. -// Technically any OS may use the mapping process but currently only Windows does use it. 
- -/////////////////////////////////////////////////// - -// Windows does not support OS timezone database -#ifdef _WIN32 -# define USE_OS_TZDB 0 -#else -# define USE_OS_TZDB 1 -#endif -#define HAS_REMOTE_API 0 - -//////////////////////////////////////////////////// - -#ifndef USE_OS_TZDB -# define USE_OS_TZDB 0 -#endif - -#ifndef HAS_REMOTE_API -# if USE_OS_TZDB == 0 -# ifdef _WIN32 -# define HAS_REMOTE_API 0 -# else -# define HAS_REMOTE_API 1 -# endif -# else // HAS_REMOTE_API makes no since when using the OS timezone database -# define HAS_REMOTE_API 0 -# endif -#endif - -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wconstant-logical-operand" -#endif - -static_assert(!(USE_OS_TZDB && HAS_REMOTE_API), - "USE_OS_TZDB and HAS_REMOTE_API can not be used together"); - -#ifdef __clang__ -# pragma clang diagnostic pop -#endif - -#ifndef AUTO_DOWNLOAD -# define AUTO_DOWNLOAD HAS_REMOTE_API -#endif - -static_assert(HAS_REMOTE_API == 0 ? AUTO_DOWNLOAD == 0 : true, - "AUTO_DOWNLOAD can not be turned on without HAS_REMOTE_API"); - -#ifndef USE_SHELL_API -# define USE_SHELL_API 1 -#endif - -#if USE_OS_TZDB -# ifdef _WIN32 -# error "USE_OS_TZDB can not be used on Windows" -# endif -# ifndef MISSING_LEAP_SECONDS -# ifdef __APPLE__ -# define MISSING_LEAP_SECONDS 1 -# else -# define MISSING_LEAP_SECONDS 0 -# endif -# endif -#else -# define MISSING_LEAP_SECONDS 0 -#endif - -#ifndef HAS_DEDUCTION_GUIDES -# if __cplusplus >= 201703 -# define HAS_DEDUCTION_GUIDES 1 -# else -# define HAS_DEDUCTION_GUIDES 0 -# endif -#endif // HAS_DEDUCTION_GUIDES - -#include "date.h" - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#include "tz_private.h" -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -# ifdef DATE_BUILD_DLL -# define DATE_API __declspec(dllexport) -# elif defined(DATE_USE_DLL) -# define DATE_API __declspec(dllimport) 
-# else -# define DATE_API -# endif -#else -# ifdef DATE_BUILD_DLL -# define DATE_API __attribute__ ((visibility ("default"))) -# else -# define DATE_API -# endif -#endif - -namespace arrow_vendored -{ -namespace date -{ - -enum class choose {earliest, latest}; - -namespace detail -{ - struct undocumented; -} - -struct sys_info -{ - sys_seconds begin; - sys_seconds end; - std::chrono::seconds offset; - std::chrono::minutes save; - std::string abbrev; -}; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const sys_info& r) -{ - os << r.begin << '\n'; - os << r.end << '\n'; - os << make_time(r.offset) << "\n"; - os << make_time(r.save) << "\n"; - os << r.abbrev << '\n'; - return os; -} - -struct local_info -{ - enum {unique, nonexistent, ambiguous} result; - sys_info first; - sys_info second; -}; - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const local_info& r) -{ - if (r.result == local_info::nonexistent) - os << "nonexistent between\n"; - else if (r.result == local_info::ambiguous) - os << "ambiguous between\n"; - os << r.first; - if (r.result != local_info::unique) - { - os << "and\n"; - os << r.second; - } - return os; -} - -class nonexistent_local_time - : public std::runtime_error -{ -public: - template - nonexistent_local_time(local_time tp, const local_info& i); - -private: - template - static - std::string - make_msg(local_time tp, const local_info& i); -}; - -template -inline -nonexistent_local_time::nonexistent_local_time(local_time tp, - const local_info& i) - : std::runtime_error(make_msg(tp, i)) -{ -} - -template -std::string -nonexistent_local_time::make_msg(local_time tp, const local_info& i) -{ - assert(i.result == local_info::nonexistent); - std::ostringstream os; - os << tp << " is in a gap between\n" - << local_seconds{i.first.end.time_since_epoch()} + i.first.offset << ' ' - << i.first.abbrev << " and\n" - << local_seconds{i.second.begin.time_since_epoch()} + i.second.offset << ' ' - << i.second.abbrev - 
<< " which are both equivalent to\n" - << i.first.end << " UTC"; - return os.str(); -} - -class ambiguous_local_time - : public std::runtime_error -{ -public: - template - ambiguous_local_time(local_time tp, const local_info& i); - -private: - template - static - std::string - make_msg(local_time tp, const local_info& i); -}; - -template -inline -ambiguous_local_time::ambiguous_local_time(local_time tp, const local_info& i) - : std::runtime_error(make_msg(tp, i)) -{ -} - -template -std::string -ambiguous_local_time::make_msg(local_time tp, const local_info& i) -{ - assert(i.result == local_info::ambiguous); - std::ostringstream os; - os << tp << " is ambiguous. It could be\n" - << tp << ' ' << i.first.abbrev << " == " - << tp - i.first.offset << " UTC or\n" - << tp << ' ' << i.second.abbrev << " == " - << tp - i.second.offset << " UTC"; - return os.str(); -} - -class time_zone; - -#if HAS_STRING_VIEW -DATE_API const time_zone* locate_zone(std::string_view tz_name); -#else -DATE_API const time_zone* locate_zone(const std::string& tz_name); -#endif - -DATE_API const time_zone* current_zone(); - -template -struct zoned_traits -{ -}; - -template <> -struct zoned_traits -{ - static - const time_zone* - default_zone() - { - return date::locate_zone("Etc/UTC"); - } - -#if HAS_STRING_VIEW - - static - const time_zone* - locate_zone(std::string_view name) - { - return date::locate_zone(name); - } - -#else // !HAS_STRING_VIEW - - static - const time_zone* - locate_zone(const std::string& name) - { - return date::locate_zone(name); - } - - static - const time_zone* - locate_zone(const char* name) - { - return date::locate_zone(name); - } - -#endif // !HAS_STRING_VIEW -}; - -template -class zoned_time; - -template -bool -operator==(const zoned_time& x, - const zoned_time& y); - -template -class zoned_time -{ -public: - using duration = typename std::common_type::type; - -private: - TimeZonePtr zone_; - sys_time tp_; - -public: -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - 
template ::default_zone())> -#endif - zoned_time(); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::default_zone())> -#endif - zoned_time(const sys_time& st); - explicit zoned_time(TimeZonePtr z); - -#if HAS_STRING_VIEW - template ::locate_zone(std::string_view())) - >::value - >::type> - explicit zoned_time(std::string_view name); -#else -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())) - >::value - >::type> -#endif - explicit zoned_time(const std::string& name); -#endif - - template , - sys_time>::value - >::type> - zoned_time(const zoned_time& zt) NOEXCEPT; - - zoned_time(TimeZonePtr z, const sys_time& st); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ()->to_sys(local_time{})), - sys_time - >::value - >::type> -#endif - zoned_time(TimeZonePtr z, const local_time& tp); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ()->to_sys(local_time{}, - choose::earliest)), - sys_time - >::value - >::type> -#endif - zoned_time(TimeZonePtr z, const local_time& tp, choose c); - - template , - sys_time>::value - >::type> - zoned_time(TimeZonePtr z, const zoned_time& zt); - - template , - sys_time>::value - >::type> - zoned_time(TimeZonePtr z, const zoned_time& zt, choose); - -#if HAS_STRING_VIEW - - template ::locate_zone(std::string_view())), - sys_time - >::value - >::type> - zoned_time(std::string_view name, const sys_time& st); - - template ::locate_zone(std::string_view())), - local_time - >::value - >::type> - zoned_time(std::string_view name, const local_time& tp); - - template ::locate_zone(std::string_view())), - local_time, - choose - >::value - >::type> - zoned_time(std::string_view name, const local_time& tp, choose c); - - template ::locate_zone(std::string_view())), - zoned_time - >::value - >::type> - zoned_time(std::string_view name, const zoned_time& zt); - - template ::locate_zone(std::string_view())), - zoned_time, - choose - >::value - >::type> - zoned_time(std::string_view name, const 
zoned_time& zt, choose); - -#else // !HAS_STRING_VIEW - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - sys_time - >::value - >::type> -#endif - zoned_time(const std::string& name, const sys_time& st); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - sys_time - >::value - >::type> -#endif - zoned_time(const char* name, const sys_time& st); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - local_time - >::value - >::type> -#endif - zoned_time(const std::string& name, const local_time& tp); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - local_time - >::value - >::type> -#endif - zoned_time(const char* name, const local_time& tp); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - local_time, - choose - >::value - >::type> -#endif - zoned_time(const std::string& name, const local_time& tp, choose c); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - local_time, - choose - >::value - >::type> -#endif - zoned_time(const char* name, const local_time& tp, choose c); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - zoned_time - >::value - >::type> -#endif - zoned_time(const std::string& name, const zoned_time& zt); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - zoned_time - >::value - >::type> -#endif - zoned_time(const char* name, const zoned_time& zt); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - zoned_time, - choose - >::value - >::type> -#endif - zoned_time(const std::string& name, const zoned_time& zt, choose); - -#if !defined(_MSC_VER) || (_MSC_VER > 1900) - template ::locate_zone(std::string())), - zoned_time, - choose - >::value - >::type> -#endif - zoned_time(const char* name, const 
zoned_time& zt, choose); - -#endif // !HAS_STRING_VIEW - - zoned_time& operator=(const sys_time& st); - zoned_time& operator=(const local_time& ut); - - explicit operator sys_time() const; - explicit operator local_time() const; - - TimeZonePtr get_time_zone() const; - local_time get_local_time() const; - sys_time get_sys_time() const; - sys_info get_info() const; - - template - friend - bool - operator==(const zoned_time& x, - const zoned_time& y); - - template - friend - std::basic_ostream& - operator<<(std::basic_ostream& os, - const zoned_time& t); - -private: - template friend class zoned_time; -}; - -using zoned_seconds = zoned_time; - -#if HAS_DEDUCTION_GUIDES - -zoned_time() - -> zoned_time; - -template -zoned_time(sys_time) - -> zoned_time>; - -template -zoned_time(TimeZonePtr) - -> zoned_time; - -template -zoned_time(TimeZonePtr, sys_time) - -> zoned_time, TimeZonePtr>; - -template -zoned_time(TimeZonePtr, local_time, choose = choose::earliest) - -> zoned_time, TimeZonePtr>; - -#if HAS_STRING_VIEW - -zoned_time(std::string_view) - -> zoned_time; - -template -zoned_time(std::string_view, sys_time) - -> zoned_time>; - -template -zoned_time(std::string_view, local_time, choose = choose::earliest) - -> zoned_time>; - -#else // !HAS_STRING_VIEW - -zoned_time(std::string) - -> zoned_time; - -template -zoned_time(std::string, sys_time) - -> zoned_time>; - -template -zoned_time(std::string, local_time, choose = choose::earliest) - -> zoned_time>; - -#endif // !HAS_STRING_VIEW - -template -zoned_time(const char*, sys_time) - -> zoned_time>; - -template -zoned_time(const char*, local_time, choose = choose::earliest) - -> zoned_time>; - -template -zoned_time(TimeZonePtr, zoned_time, choose = choose::earliest) - -> zoned_time; - -#endif // HAS_DEDUCTION_GUIDES - -template -inline -bool -operator==(const zoned_time& x, - const zoned_time& y) -{ - return x.zone_ == y.zone_ && x.tp_ == y.tp_; -} - -template -inline -bool -operator!=(const zoned_time& x, - const 
zoned_time& y) -{ - return !(x == y); -} - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - -namespace detail -{ -# if USE_OS_TZDB - struct transition; - struct expanded_ttinfo; -# else // !USE_OS_TZDB - struct zonelet; - class Rule; -# endif // !USE_OS_TZDB -} - -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - -class time_zone -{ -private: - std::string name_; -#if USE_OS_TZDB - std::vector transitions_; - std::vector ttinfos_; -#else // !USE_OS_TZDB - std::vector zonelets_; -#endif // !USE_OS_TZDB - std::unique_ptr adjusted_; - -public: -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - time_zone(time_zone&&) = default; - time_zone& operator=(time_zone&&) = default; -#else // defined(_MSC_VER) && (_MSC_VER < 1900) - time_zone(time_zone&& src); - time_zone& operator=(time_zone&& src); -#endif // defined(_MSC_VER) && (_MSC_VER < 1900) - - DATE_API explicit time_zone(const std::string& s, detail::undocumented); - - const std::string& name() const NOEXCEPT; - - template sys_info get_info(sys_time st) const; - template local_info get_info(local_time tp) const; - - template - sys_time::type> - to_sys(local_time tp) const; - - template - sys_time::type> - to_sys(local_time tp, choose z) const; - - template - local_time::type> - to_local(sys_time tp) const; - - friend bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT; - friend bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT; - friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone& z); - -#if !USE_OS_TZDB - DATE_API void add(const std::string& s); -#endif // !USE_OS_TZDB - -private: - DATE_API sys_info get_info_impl(sys_seconds tp) const; - DATE_API local_info get_info_impl(local_seconds tp) const; - - template - sys_time::type> - to_sys_impl(local_time tp, choose z, std::false_type) const; - template - sys_time::type> - to_sys_impl(local_time tp, choose, std::true_type) const; - -#if USE_OS_TZDB - DATE_API void init() const; - DATE_API void init_impl(); - DATE_API 
sys_info - load_sys_info(std::vector::const_iterator i) const; - - template - DATE_API void - load_data(std::istream& inf, std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt, - std::int32_t tzh_typecnt, std::int32_t tzh_charcnt); -#else // !USE_OS_TZDB - DATE_API sys_info get_info_impl(sys_seconds tp, int timezone) const; - DATE_API void adjust_infos(const std::vector& rules); - DATE_API void parse_info(std::istream& in); -#endif // !USE_OS_TZDB -}; - -#if defined(_MSC_VER) && (_MSC_VER < 1900) - -inline -time_zone::time_zone(time_zone&& src) - : name_(std::move(src.name_)) - , zonelets_(std::move(src.zonelets_)) - , adjusted_(std::move(src.adjusted_)) - {} - -inline -time_zone& -time_zone::operator=(time_zone&& src) -{ - name_ = std::move(src.name_); - zonelets_ = std::move(src.zonelets_); - adjusted_ = std::move(src.adjusted_); - return *this; -} - -#endif // defined(_MSC_VER) && (_MSC_VER < 1900) - -inline -const std::string& -time_zone::name() const NOEXCEPT -{ - return name_; -} - -template -inline -sys_info -time_zone::get_info(sys_time st) const -{ - using namespace std::chrono; - return get_info_impl(date::floor(st)); -} - -template -inline -local_info -time_zone::get_info(local_time tp) const -{ - using namespace std::chrono; - return get_info_impl(date::floor(tp)); -} - -template -inline -sys_time::type> -time_zone::to_sys(local_time tp) const -{ - return to_sys_impl(tp, choose{}, std::true_type{}); -} - -template -inline -sys_time::type> -time_zone::to_sys(local_time tp, choose z) const -{ - return to_sys_impl(tp, z, std::false_type{}); -} - -template -inline -local_time::type> -time_zone::to_local(sys_time tp) const -{ - using LT = local_time::type>; - auto i = get_info(tp); - return LT{(tp + i.offset).time_since_epoch()}; -} - -inline bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ == y.name_;} -inline bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ < y.name_;} - -inline bool 
operator!=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x == y);} -inline bool operator> (const time_zone& x, const time_zone& y) NOEXCEPT {return y < x;} -inline bool operator<=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(y < x);} -inline bool operator>=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x < y);} - -template -sys_time::type> -time_zone::to_sys_impl(local_time tp, choose z, std::false_type) const -{ - using namespace date; - using namespace std::chrono; - auto i = get_info(tp); - if (i.result == local_info::nonexistent) - { - return i.first.end; - } - else if (i.result == local_info::ambiguous) - { - if (z == choose::latest) - return sys_time{tp.time_since_epoch()} - i.second.offset; - } - return sys_time{tp.time_since_epoch()} - i.first.offset; -} - -template -sys_time::type> -time_zone::to_sys_impl(local_time tp, choose, std::true_type) const -{ - using namespace date; - using namespace std::chrono; - auto i = get_info(tp); - if (i.result == local_info::nonexistent) - throw nonexistent_local_time(tp, i); - else if (i.result == local_info::ambiguous) - throw ambiguous_local_time(tp, i); - return sys_time{tp.time_since_epoch()} - i.first.offset; -} - -#if !USE_OS_TZDB - -class link -{ -private: - std::string name_; - std::string target_; -public: - DATE_API explicit link(const std::string& s); - - const std::string& name() const {return name_;} - const std::string& target() const {return target_;} - - friend bool operator==(const link& x, const link& y) {return x.name_ == y.name_;} - friend bool operator< (const link& x, const link& y) {return x.name_ < y.name_;} - - friend DATE_API std::ostream& operator<<(std::ostream& os, const link& x); -}; - -inline bool operator!=(const link& x, const link& y) {return !(x == y);} -inline bool operator> (const link& x, const link& y) {return y < x;} -inline bool operator<=(const link& x, const link& y) {return !(y < x);} -inline bool operator>=(const link& x, const link& y) 
{return !(x < y);} - -#endif // !USE_OS_TZDB - -#if !MISSING_LEAP_SECONDS - -class leap -{ -private: - sys_seconds date_; - -public: -#if USE_OS_TZDB - DATE_API explicit leap(const sys_seconds& s, detail::undocumented); -#else - DATE_API explicit leap(const std::string& s, detail::undocumented); -#endif - - sys_seconds date() const {return date_;} - - friend bool operator==(const leap& x, const leap& y) {return x.date_ == y.date_;} - friend bool operator< (const leap& x, const leap& y) {return x.date_ < y.date_;} - - template - friend - bool - operator==(const leap& x, const sys_time& y) - { - return x.date_ == y; - } - - template - friend - bool - operator< (const leap& x, const sys_time& y) - { - return x.date_ < y; - } - - template - friend - bool - operator< (const sys_time& x, const leap& y) - { - return x < y.date_; - } - - friend DATE_API std::ostream& operator<<(std::ostream& os, const leap& x); -}; - -inline bool operator!=(const leap& x, const leap& y) {return !(x == y);} -inline bool operator> (const leap& x, const leap& y) {return y < x;} -inline bool operator<=(const leap& x, const leap& y) {return !(y < x);} -inline bool operator>=(const leap& x, const leap& y) {return !(x < y);} - -template -inline -bool -operator==(const sys_time& x, const leap& y) -{ - return y == x; -} - -template -inline -bool -operator!=(const leap& x, const sys_time& y) -{ - return !(x == y); -} - -template -inline -bool -operator!=(const sys_time& x, const leap& y) -{ - return !(x == y); -} - -template -inline -bool -operator> (const leap& x, const sys_time& y) -{ - return y < x; -} - -template -inline -bool -operator> (const sys_time& x, const leap& y) -{ - return y < x; -} - -template -inline -bool -operator<=(const leap& x, const sys_time& y) -{ - return !(y < x); -} - -template -inline -bool -operator<=(const sys_time& x, const leap& y) -{ - return !(y < x); -} - -template -inline -bool -operator>=(const leap& x, const sys_time& y) -{ - return !(x < y); -} - -template 
-inline -bool -operator>=(const sys_time& x, const leap& y) -{ - return !(x < y); -} - -#endif // !MISSING_LEAP_SECONDS - -#ifdef _WIN32 - -namespace detail -{ - -// The time zone mapping is modelled after this data file: -// http://unicode.org/repos/cldr/trunk/common/supplemental/windowsZones.xml -// and the field names match the element names from the mapZone element -// of windowsZones.xml. -// The website displays this file here: -// http://www.unicode.org/cldr/charts/latest/supplemental/zone_tzid.html -// The html view is sorted before being displayed but is otherwise the same -// There is a mapping between the os centric view (in this case windows) -// the html displays uses and the generic view the xml file. -// That mapping is this: -// display column "windows" -> xml field "other". -// display column "region" -> xml field "territory". -// display column "tzid" -> xml field "type". -// This structure uses the generic terminology because it could be -// used to to support other os/native name conversions, not just windows, -// and using the same generic names helps retain the connection to the -// origin of the data that we are using. 
-struct timezone_mapping -{ - timezone_mapping(const char* other, const char* territory, const char* type) - : other(other), territory(territory), type(type) - { - } - timezone_mapping() = default; - std::string other; - std::string territory; - std::string type; -}; - -} // detail - -#endif // _WIN32 - -struct tzdb -{ - std::string version = "unknown"; - std::vector zones; -#if !USE_OS_TZDB - std::vector links; -#endif -#if !MISSING_LEAP_SECONDS - std::vector leaps; -#endif -#if !USE_OS_TZDB - std::vector rules; -#endif -#ifdef _WIN32 - std::vector mappings; -#endif - tzdb* next = nullptr; - - tzdb() = default; -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - tzdb(tzdb&&) = default; - tzdb& operator=(tzdb&&) = default; -#else // defined(_MSC_VER) && (_MSC_VER < 1900) - tzdb(tzdb&& src) - : version(std::move(src.version)) - , zones(std::move(src.zones)) - , links(std::move(src.links)) - , leaps(std::move(src.leaps)) - , rules(std::move(src.rules)) - , mappings(std::move(src.mappings)) - {} - - tzdb& operator=(tzdb&& src) - { - version = std::move(src.version); - zones = std::move(src.zones); - links = std::move(src.links); - leaps = std::move(src.leaps); - rules = std::move(src.rules); - mappings = std::move(src.mappings); - return *this; - } -#endif // defined(_MSC_VER) && (_MSC_VER < 1900) - -#if HAS_STRING_VIEW - const time_zone* locate_zone(std::string_view tz_name) const; -#else - const time_zone* locate_zone(const std::string& tz_name) const; -#endif - const time_zone* current_zone() const; -}; - -using TZ_DB = tzdb; - -DATE_API std::ostream& -operator<<(std::ostream& os, const tzdb& db); - -DATE_API const tzdb& get_tzdb(); - -class tzdb_list -{ - std::atomic head_{nullptr}; - -public: - ~tzdb_list(); - tzdb_list() = default; - tzdb_list(tzdb_list&& x) noexcept; - - const tzdb& front() const noexcept {return *head_;} - tzdb& front() noexcept {return *head_;} - - class const_iterator; - - const_iterator begin() const noexcept; - const_iterator end() const 
noexcept; - - const_iterator cbegin() const noexcept; - const_iterator cend() const noexcept; - - const_iterator erase_after(const_iterator p) noexcept; - - struct undocumented_helper; -private: - void push_front(tzdb* tzdb) noexcept; -}; - -class tzdb_list::const_iterator -{ - tzdb* p_ = nullptr; - - explicit const_iterator(tzdb* p) noexcept : p_{p} {} -public: - const_iterator() = default; - - using iterator_category = std::forward_iterator_tag; - using value_type = tzdb; - using reference = const value_type&; - using pointer = const value_type*; - using difference_type = std::ptrdiff_t; - - reference operator*() const noexcept {return *p_;} - pointer operator->() const noexcept {return p_;} - - const_iterator& operator++() noexcept {p_ = p_->next; return *this;} - const_iterator operator++(int) noexcept {auto t = *this; ++(*this); return t;} - - friend - bool - operator==(const const_iterator& x, const const_iterator& y) noexcept - {return x.p_ == y.p_;} - - friend - bool - operator!=(const const_iterator& x, const const_iterator& y) noexcept - {return !(x == y);} - - friend class tzdb_list; -}; - -inline -tzdb_list::const_iterator -tzdb_list::begin() const noexcept -{ - return const_iterator{head_}; -} - -inline -tzdb_list::const_iterator -tzdb_list::end() const noexcept -{ - return const_iterator{nullptr}; -} - -inline -tzdb_list::const_iterator -tzdb_list::cbegin() const noexcept -{ - return begin(); -} - -inline -tzdb_list::const_iterator -tzdb_list::cend() const noexcept -{ - return end(); -} - -DATE_API tzdb_list& get_tzdb_list(); - -#if !USE_OS_TZDB - -DATE_API const tzdb& reload_tzdb(); -DATE_API void set_install(const std::string& install); - -#endif // !USE_OS_TZDB - -#if HAS_REMOTE_API - -DATE_API std::string remote_version(); -DATE_API bool remote_download(const std::string& version); -DATE_API bool remote_install(const std::string& version); - -#endif - -// zoned_time - -namespace detail -{ - -template -inline -T* -to_raw_pointer(T* p) noexcept -{ - 
return p; -} - -template -inline -auto -to_raw_pointer(Pointer p) noexcept - -> decltype(detail::to_raw_pointer(p.operator->())) -{ - return detail::to_raw_pointer(p.operator->()); -} - -} // namespace detail - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time() - : zone_(zoned_traits::default_zone()) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const sys_time& st) - : zone_(zoned_traits::default_zone()) - , tp_(st) - {} - -template -inline -zoned_time::zoned_time(TimeZonePtr z) - : zone_(std::move(z)) - {assert(detail::to_raw_pointer(zone_) != nullptr);} - -#if HAS_STRING_VIEW - -template -template -inline -zoned_time::zoned_time(std::string_view name) - : zoned_time(zoned_traits::locate_zone(name)) - {} - -#else // !HAS_STRING_VIEW - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name) - : zoned_time(zoned_traits::locate_zone(name)) - {} - -#endif // !HAS_STRING_VIEW - -template -template -inline -zoned_time::zoned_time(const zoned_time& zt) NOEXCEPT - : zone_(zt.zone_) - , tp_(zt.tp_) - {} - -template -inline -zoned_time::zoned_time(TimeZonePtr z, const sys_time& st) - : zone_(std::move(z)) - , tp_(st) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(TimeZonePtr z, const local_time& t) - : zone_(std::move(z)) - , tp_(zone_->to_sys(t)) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(TimeZonePtr z, const local_time& t, - choose c) - : zone_(std::move(z)) - , tp_(zone_->to_sys(t, c)) - {} - -template -template -inline -zoned_time::zoned_time(TimeZonePtr z, - const zoned_time& zt) - : zone_(std::move(z)) - , tp_(zt.tp_) - {} - -template -template -inline -zoned_time::zoned_time(TimeZonePtr z, - const zoned_time& zt, choose) - : 
zoned_time(std::move(z), zt) - {} - -#if HAS_STRING_VIEW - -template -template -inline -zoned_time::zoned_time(std::string_view name, - const sys_time& st) - : zoned_time(zoned_traits::locate_zone(name), st) - {} - -template -template -inline -zoned_time::zoned_time(std::string_view name, - const local_time& t) - : zoned_time(zoned_traits::locate_zone(name), t) - {} - -template -template -inline -zoned_time::zoned_time(std::string_view name, - const local_time& t, choose c) - : zoned_time(zoned_traits::locate_zone(name), t, c) - {} - -template -template -inline -zoned_time::zoned_time(std::string_view name, const zoned_time& zt) - : zoned_time(zoned_traits::locate_zone(name), zt) - {} - -template -template -inline -zoned_time::zoned_time(std::string_view name, - const zoned_time& zt, choose c) - : zoned_time(zoned_traits::locate_zone(name), zt, c) - {} - -#else // !HAS_STRING_VIEW - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name, - const sys_time& st) - : zoned_time(zoned_traits::locate_zone(name), st) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const char* name, - const sys_time& st) - : zoned_time(zoned_traits::locate_zone(name), st) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name, - const local_time& t) - : zoned_time(zoned_traits::locate_zone(name), t) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const char* name, - const local_time& t) - : zoned_time(zoned_traits::locate_zone(name), t) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name, - const local_time& t, choose c) - : zoned_time(zoned_traits::locate_zone(name), t, c) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 
1900) -template -#endif -inline -zoned_time::zoned_time(const char* name, - const local_time& t, choose c) - : zoned_time(zoned_traits::locate_zone(name), t, c) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name, - const zoned_time& zt) - : zoned_time(zoned_traits::locate_zone(name), zt) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const char* name, const zoned_time& zt) - : zoned_time(zoned_traits::locate_zone(name), zt) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const std::string& name, - const zoned_time& zt, choose c) - : zoned_time(zoned_traits::locate_zone(name), zt, c) - {} - -template -#if !defined(_MSC_VER) || (_MSC_VER > 1900) -template -#endif -inline -zoned_time::zoned_time(const char* name, - const zoned_time& zt, choose c) - : zoned_time(zoned_traits::locate_zone(name), zt, c) - {} - -#endif // HAS_STRING_VIEW - -template -inline -zoned_time& -zoned_time::operator=(const sys_time& st) -{ - tp_ = st; - return *this; -} - -template -inline -zoned_time& -zoned_time::operator=(const local_time& ut) -{ - tp_ = zone_->to_sys(ut); - return *this; -} - -template -inline -zoned_time::operator local_time::duration>() const -{ - return get_local_time(); -} - -template -inline -zoned_time::operator sys_time::duration>() const -{ - return get_sys_time(); -} - -template -inline -TimeZonePtr -zoned_time::get_time_zone() const -{ - return zone_; -} - -template -inline -local_time::duration> -zoned_time::get_local_time() const -{ - return zone_->to_local(tp_); -} - -template -inline -sys_time::duration> -zoned_time::get_sys_time() const -{ - return tp_; -} - -template -inline -sys_info -zoned_time::get_info() const -{ - return zone_->get_info(tp_); -} - -// make_zoned_time - -inline -zoned_time -make_zoned() -{ - return zoned_time(); -} - -template 
-inline -zoned_time::type> -make_zoned(const sys_time& tp) -{ - return zoned_time::type>(tp); -} - -template 1900) - , class = typename std::enable_if - < - std::is_class - < - typename std::decay - < - decltype(*detail::to_raw_pointer(std::declval())) - >::type - >{} - >::type -#endif - > -inline -zoned_time -make_zoned(TimeZonePtr z) -{ - return zoned_time(std::move(z)); -} - -inline -zoned_seconds -make_zoned(const std::string& name) -{ - return zoned_seconds(name); -} - -template 1900) - , class = typename std::enable_if - < - std::is_class())>::type>{} - >::type -#endif - > -inline -zoned_time::type, TimeZonePtr> -make_zoned(TimeZonePtr zone, const local_time& tp) -{ - return zoned_time::type, - TimeZonePtr>(std::move(zone), tp); -} - -template 1900) - , class = typename std::enable_if - < - std::is_class())>::type>{} - >::type -#endif - > -inline -zoned_time::type, TimeZonePtr> -make_zoned(TimeZonePtr zone, const local_time& tp, choose c) -{ - return zoned_time::type, - TimeZonePtr>(std::move(zone), tp, c); -} - -template -inline -zoned_time::type> -make_zoned(const std::string& name, const local_time& tp) -{ - return zoned_time::type>(name, tp); -} - -template -inline -zoned_time::type> -make_zoned(const std::string& name, const local_time& tp, choose c) -{ - return zoned_time::type>(name, tp, c); -} - -template -inline -zoned_time -make_zoned(TimeZonePtr zone, const zoned_time& zt) -{ - return zoned_time(std::move(zone), zt); -} - -template -inline -zoned_time -make_zoned(const std::string& name, const zoned_time& zt) -{ - return zoned_time(name, zt); -} - -template -inline -zoned_time -make_zoned(TimeZonePtr zone, const zoned_time& zt, choose c) -{ - return zoned_time(std::move(zone), zt, c); -} - -template -inline -zoned_time -make_zoned(const std::string& name, const zoned_time& zt, choose c) -{ - return zoned_time(name, zt, c); -} - -template 1900) - , class = typename std::enable_if - < - std::is_class())>::type>{} - >::type -#endif - > -inline 
-zoned_time::type, TimeZonePtr> -make_zoned(TimeZonePtr zone, const sys_time& st) -{ - return zoned_time::type, - TimeZonePtr>(std::move(zone), st); -} - -template -inline -zoned_time::type> -make_zoned(const std::string& name, const sys_time& st) -{ - return zoned_time::type>(name, st); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const zoned_time& tp) -{ - using duration = typename zoned_time::duration; - using LT = local_time; - auto const tz = tp.get_time_zone(); - auto const st = tp.get_sys_time(); - auto const info = tz->get_info(st); - return to_stream(os, fmt, LT{(st+info.offset).time_since_epoch()}, - &info.abbrev, &info.offset); -} - -template -inline -std::basic_ostream& -operator<<(std::basic_ostream& os, const zoned_time& t) -{ - const CharT fmt[] = {'%', 'F', ' ', '%', 'T', ' ', '%', 'Z', CharT{}}; - return to_stream(os, fmt, t); -} - -#if !MISSING_LEAP_SECONDS - -class utc_clock -{ -public: - using duration = std::chrono::system_clock::duration; - using rep = duration::rep; - using period = duration::period; - using time_point = std::chrono::time_point; - static CONSTDATA bool is_steady = false; - - static time_point now(); - - template - static - std::chrono::time_point::type> - to_sys(const std::chrono::time_point&); - - template - static - std::chrono::time_point::type> - from_sys(const std::chrono::time_point&); -}; - -template - using utc_time = std::chrono::time_point; - -using utc_seconds = utc_time; - -template -utc_time::type> -utc_clock::from_sys(const sys_time& st) -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - auto const& leaps = get_tzdb().leaps; - auto const lt = std::upper_bound(leaps.begin(), leaps.end(), st); - return utc_time{st.time_since_epoch() + seconds{lt-leaps.begin()}}; -} - -// Return pair -// first is true if ut is during a leap second insertion, otherwise false. 
-// If ut is during a leap second insertion, that leap second is included in the count -template -std::pair -is_leap_second(date::utc_time const& ut) -{ - using namespace date; - using namespace std::chrono; - using duration = typename std::common_type::type; - auto const& leaps = get_tzdb().leaps; - auto tp = sys_time{ut.time_since_epoch()}; - auto const lt = std::upper_bound(leaps.begin(), leaps.end(), tp); - auto ds = seconds{lt-leaps.begin()}; - tp -= ds; - auto ls = false; - if (lt > leaps.begin()) - { - if (tp < lt[-1]) - { - if (tp >= lt[-1].date() - seconds{1}) - ls = true; - else - --ds; - } - } - return {ls, ds}; -} - -template -sys_time::type> -utc_clock::to_sys(const utc_time& ut) -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - auto ls = is_leap_second(ut); - auto tp = sys_time{ut.time_since_epoch() - ls.second}; - if (ls.first) - tp = floor(tp) + seconds{1} - duration{1}; - return tp; -} - -inline -utc_clock::time_point -utc_clock::now() -{ - using namespace std::chrono; - return from_sys(system_clock::now()); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const utc_time& t) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - const string abbrev("UTC"); - CONSTDATA seconds offset{0}; - auto ls = is_leap_second(t); - auto tp = sys_time{t.time_since_epoch() - ls.second}; - auto const sd = floor(tp); - year_month_day ymd = sd; - auto time = make_time(tp - sys_seconds{sd}); - time.seconds() += seconds{ls.first}; - fields fds{ymd, time}; - return to_stream(os, fmt, fds, &abbrev, &offset); -} - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const utc_time& t) -{ - const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; - return to_stream(os, fmt, t); -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - utc_time& tp, std::basic_string* abbrev = nullptr, - 
std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - minutes offset_local{}; - auto offptr = offset ? offset : &offset_local; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offptr); - if (!fds.ymd.ok()) - is.setstate(ios::failbit); - if (!is.fail()) - { - bool is_60_sec = fds.tod.seconds() == seconds{60}; - if (is_60_sec) - fds.tod.seconds() -= seconds{1}; - auto tmp = utc_clock::from_sys(sys_days(fds.ymd) - *offptr + fds.tod.to_duration()); - if (is_60_sec) - tmp += seconds{1}; - if (is_60_sec != is_leap_second(tmp).first || !fds.tod.in_conventional_range()) - { - is.setstate(ios::failbit); - return is; - } - tp = time_point_cast(tmp); - } - return is; -} - -// tai_clock - -class tai_clock -{ -public: - using duration = std::chrono::system_clock::duration; - using rep = duration::rep; - using period = duration::period; - using time_point = std::chrono::time_point; - static const bool is_steady = false; - - static time_point now(); - - template - static - std::chrono::time_point::type> - to_utc(const std::chrono::time_point&) NOEXCEPT; - - template - static - std::chrono::time_point::type> - from_utc(const std::chrono::time_point&) NOEXCEPT; -}; - -template - using tai_time = std::chrono::time_point; - -using tai_seconds = tai_time; - -template -inline -utc_time::type> -tai_clock::to_utc(const tai_time& t) NOEXCEPT -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - return utc_time{t.time_since_epoch()} - - (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1) + seconds{10}); -} - -template -inline -tai_time::type> -tai_clock::from_utc(const utc_time& t) NOEXCEPT -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - return tai_time{t.time_since_epoch()} + - (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1) + seconds{10}); -} - -inline -tai_clock::time_point -tai_clock::now() -{ - using namespace 
std::chrono; - return from_utc(utc_clock::now()); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const tai_time& t) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - const string abbrev("TAI"); - CONSTDATA seconds offset{0}; - auto tp = sys_time{t.time_since_epoch()} - - seconds(sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1)); - auto const sd = floor(tp); - year_month_day ymd = sd; - auto time = make_time(tp - sys_seconds{sd}); - fields fds{ymd, time}; - return to_stream(os, fmt, fds, &abbrev, &offset); -} - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const tai_time& t) -{ - const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; - return to_stream(os, fmt, t); -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - tai_time& tp, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - minutes offset_local{}; - auto offptr = offset ? 
offset : &offset_local; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offptr); - if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) - is.setstate(ios::failbit); - if (!is.fail()) - tp = tai_time{duration_cast( - (sys_days(fds.ymd) + - (sys_days(year{1970}/jan/1) - sys_days(year{1958}/jan/1)) - - *offptr + fds.tod.to_duration()).time_since_epoch())}; - return is; -} - -// gps_clock - -class gps_clock -{ -public: - using duration = std::chrono::system_clock::duration; - using rep = duration::rep; - using period = duration::period; - using time_point = std::chrono::time_point; - static const bool is_steady = false; - - static time_point now(); - - template - static - std::chrono::time_point::type> - to_utc(const std::chrono::time_point&) NOEXCEPT; - - template - static - std::chrono::time_point::type> - from_utc(const std::chrono::time_point&) NOEXCEPT; - -}; - -template - using gps_time = std::chrono::time_point; - -using gps_seconds = gps_time; - -template -inline -utc_time::type> -gps_clock::to_utc(const gps_time& t) NOEXCEPT -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - return utc_time{t.time_since_epoch()} + - (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1) + seconds{9}); -} - -template -inline -gps_time::type> -gps_clock::from_utc(const utc_time& t) NOEXCEPT -{ - using namespace std::chrono; - using duration = typename std::common_type::type; - return gps_time{t.time_since_epoch()} - - (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1) + seconds{9}); -} - -inline -gps_clock::time_point -gps_clock::now() -{ - using namespace std::chrono; - return from_utc(utc_clock::now()); -} - -template -std::basic_ostream& -to_stream(std::basic_ostream& os, const CharT* fmt, - const gps_time& t) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - const string abbrev("GPS"); - CONSTDATA seconds offset{0}; - auto tp = sys_time{t.time_since_epoch()} + - 
seconds(sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1)); - auto const sd = floor(tp); - year_month_day ymd = sd; - auto time = make_time(tp - sys_seconds{sd}); - fields fds{ymd, time}; - return to_stream(os, fmt, fds, &abbrev, &offset); -} - -template -std::basic_ostream& -operator<<(std::basic_ostream& os, const gps_time& t) -{ - const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}}; - return to_stream(os, fmt, t); -} - -template > -std::basic_istream& -from_stream(std::basic_istream& is, const CharT* fmt, - gps_time& tp, - std::basic_string* abbrev = nullptr, - std::chrono::minutes* offset = nullptr) -{ - using namespace std; - using namespace std::chrono; - using CT = typename common_type::type; - minutes offset_local{}; - auto offptr = offset ? offset : &offset_local; - fields fds{}; - from_stream(is, fmt, fds, abbrev, offptr); - if (!fds.ymd.ok() || !fds.tod.in_conventional_range()) - is.setstate(ios::failbit); - if (!is.fail()) - tp = gps_time{duration_cast( - (sys_days(fds.ymd) - - (sys_days(year{1980}/jan/sun[1]) - sys_days(year{1970}/jan/1)) - - *offptr + fds.tod.to_duration()).time_since_epoch())}; - return is; -} - -// clock_time_conversion - -template -struct clock_time_conversion -{}; - -template <> -struct clock_time_conversion -{ - template - sys_time - operator()(const sys_time& st) const - { - return st; - } -}; - -template <> -struct clock_time_conversion -{ - template - utc_time - operator()(const utc_time& ut) const - { - return ut; - } -}; - -template <> -struct clock_time_conversion -{ - template - utc_time::type> - operator()(const sys_time& st) const - { - return utc_clock::from_sys(st); - } -}; - -template <> -struct clock_time_conversion -{ - template - sys_time::type> - operator()(const utc_time& ut) const - { - return utc_clock::to_sys(ut); - } -}; - -template -struct clock_time_conversion -{ - template - std::chrono::time_point - operator()(const std::chrono::time_point& tp) const - { - return tp; - } -}; - -namespace 
ctc_detail -{ - -template - using time_point = std::chrono::time_point; - -using std::declval; -using std::chrono::system_clock; - -//Check if TimePoint is time for given clock, -//if not emits hard error -template -struct return_clock_time -{ - using clock_time_point = time_point; - using type = TimePoint; - - static_assert(std::is_same::value, - "time point with appropariate clock shall be returned"); -}; - -// Check if Clock has to_sys method accepting TimePoint with given duration const& and -// returning sys_time. If so has nested type member equal to return type to_sys. -template -struct return_to_sys -{}; - -template -struct return_to_sys - < - Clock, Duration, - decltype(Clock::to_sys(declval const&>()), void()) - > - : return_clock_time - < - system_clock, - decltype(Clock::to_sys(declval const&>())) - > -{}; - -// Similiar to above -template -struct return_from_sys -{}; - -template -struct return_from_sys - < - Clock, Duration, - decltype(Clock::from_sys(declval const&>()), - void()) - > - : return_clock_time - < - Clock, - decltype(Clock::from_sys(declval const&>())) - > -{}; - -// Similiar to above -template -struct return_to_utc -{}; - -template -struct return_to_utc - < - Clock, Duration, - decltype(Clock::to_utc(declval const&>()), void()) - > - : return_clock_time - < - utc_clock, - decltype(Clock::to_utc(declval const&>()))> -{}; - -// Similiar to above -template -struct return_from_utc -{}; - -template -struct return_from_utc - < - Clock, Duration, - decltype(Clock::from_utc(declval const&>()), - void()) - > - : return_clock_time - < - Clock, - decltype(Clock::from_utc(declval const&>())) - > -{}; - -} // namespace ctc_detail - -template -struct clock_time_conversion -{ - template - typename ctc_detail::return_to_sys::type - operator()(const std::chrono::time_point& tp) const - { - return SrcClock::to_sys(tp); - } -}; - -template -struct clock_time_conversion -{ - template - typename ctc_detail::return_from_sys::type - operator()(const sys_time& 
st) const - { - return DstClock::from_sys(st); - } -}; - -template -struct clock_time_conversion -{ - template - typename ctc_detail::return_to_utc::type - operator()(const std::chrono::time_point& tp) const - { - return SrcClock::to_utc(tp); - } -}; - -template -struct clock_time_conversion -{ - template - typename ctc_detail::return_from_utc::type - operator()(const utc_time& ut) const - { - return DstClock::from_utc(ut); - } -}; - -namespace clock_cast_detail -{ - -template - using time_point = std::chrono::time_point; -using std::chrono::system_clock; - -template -auto -conv_clock(const time_point& t) - -> decltype(std::declval>()(t)) -{ - return clock_time_conversion{}(t); -} - -//direct trait conversion, 1st candidate -template -auto -cc_impl(const time_point& t, const time_point*) - -> decltype(conv_clock(t)) -{ - return conv_clock(t); -} - -//conversion through sys, 2nd candidate -template -auto -cc_impl(const time_point& t, const void*) - -> decltype(conv_clock(conv_clock(t))) -{ - return conv_clock(conv_clock(t)); -} - -//conversion through utc, 2nd candidate -template -auto -cc_impl(const time_point& t, const void*) - -> decltype(0, // MSVC_WORKAROUND - conv_clock(conv_clock(t))) -{ - return conv_clock(conv_clock(t)); -} - -//conversion through sys and utc, 3rd candidate -template -auto -cc_impl(const time_point& t, ...) - -> decltype(conv_clock(conv_clock(conv_clock(t)))) -{ - return conv_clock(conv_clock(conv_clock(t))); -} - -//conversion through utc and sys, 3rd candidate -template -auto -cc_impl(const time_point& t, ...) 
- -> decltype(0, // MSVC_WORKAROUND - conv_clock(conv_clock(conv_clock(t)))) -{ - return conv_clock(conv_clock(conv_clock(t))); -} - -} // namespace clock_cast_detail - -template -auto -clock_cast(const std::chrono::time_point& tp) - -> decltype(clock_cast_detail::cc_impl(tp, &tp)) -{ - return clock_cast_detail::cc_impl(tp, &tp); -} - -// Deprecated API - -template -inline -sys_time::type> -to_sys_time(const utc_time& t) -{ - return utc_clock::to_sys(t); -} - -template -inline -sys_time::type> -to_sys_time(const tai_time& t) -{ - return utc_clock::to_sys(tai_clock::to_utc(t)); -} - -template -inline -sys_time::type> -to_sys_time(const gps_time& t) -{ - return utc_clock::to_sys(gps_clock::to_utc(t)); -} - - -template -inline -utc_time::type> -to_utc_time(const sys_time& t) -{ - return utc_clock::from_sys(t); -} - -template -inline -utc_time::type> -to_utc_time(const tai_time& t) -{ - return tai_clock::to_utc(t); -} - -template -inline -utc_time::type> -to_utc_time(const gps_time& t) -{ - return gps_clock::to_utc(t); -} - - -template -inline -tai_time::type> -to_tai_time(const sys_time& t) -{ - return tai_clock::from_utc(utc_clock::from_sys(t)); -} - -template -inline -tai_time::type> -to_tai_time(const utc_time& t) -{ - return tai_clock::from_utc(t); -} - -template -inline -tai_time::type> -to_tai_time(const gps_time& t) -{ - return tai_clock::from_utc(gps_clock::to_utc(t)); -} - - -template -inline -gps_time::type> -to_gps_time(const sys_time& t) -{ - return gps_clock::from_utc(utc_clock::from_sys(t)); -} - -template -inline -gps_time::type> -to_gps_time(const utc_time& t) -{ - return gps_clock::from_utc(t); -} - -template -inline -gps_time::type> -to_gps_time(const tai_time& t) -{ - return gps_clock::from_utc(tai_clock::to_utc(t)); -} - -#endif // !MISSING_LEAP_SECONDS - -} // namespace date -} // namespace arrow - -#endif // TZ_H diff --git a/r/R/inst/include/arrow/vendored/datetime/tz_private.h b/r/R/inst/include/arrow/vendored/datetime/tz_private.h deleted file 
mode 100644 index f98c3e79a44..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime/tz_private.h +++ /dev/null @@ -1,321 +0,0 @@ -#ifndef TZ_PRIVATE_H -#define TZ_PRIVATE_H - -// The MIT License (MIT) -// -// Copyright (c) 2015, 2016 Howard Hinnant -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Our apologies. When the previous paragraph was written, lowercase had not yet -// been invented (that would involve another several millennia of evolution). -// We did not mean to shout. 
- -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) -#include "tz.h" -#else -#include "date.h" -#include -#endif - -namespace arrow_vendored -{ -namespace date -{ - -namespace detail -{ - -#if !USE_OS_TZDB - -enum class tz {utc, local, standard}; - -//forward declare to avoid warnings in gcc 6.2 -class MonthDayTime; -std::istream& operator>>(std::istream& is, MonthDayTime& x); -std::ostream& operator<<(std::ostream& os, const MonthDayTime& x); - - -class MonthDayTime -{ -private: - struct pair - { -#if defined(_MSC_VER) && (_MSC_VER < 1900) - pair() : month_day_(date::jan / 1), weekday_(0U) {} - - pair(const date::month_day& month_day, const date::weekday& weekday) - : month_day_(month_day), weekday_(weekday) {} -#endif - - date::month_day month_day_; - date::weekday weekday_; - }; - - enum Type {month_day, month_last_dow, lteq, gteq}; - - Type type_{month_day}; - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - union U -#else - struct U -#endif - { - date::month_day month_day_; - date::month_weekday_last month_weekday_last_; - pair month_day_weekday_; - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - U() : month_day_{date::jan/1} {} -#else - U() : - month_day_(date::jan/1), - month_weekday_last_(date::month(0U), date::weekday_last(date::weekday(0U))) - {} - -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) - - U& operator=(const date::month_day& x); - U& operator=(const date::month_weekday_last& x); - U& operator=(const pair& x); - } u; - - std::chrono::hours h_{0}; - std::chrono::minutes m_{0}; - std::chrono::seconds s_{0}; - tz zone_{tz::local}; - -public: - MonthDayTime() = default; - MonthDayTime(local_seconds tp, tz timezone); - MonthDayTime(const date::month_day& md, tz timezone); - - date::day day() const; - date::month month() const; - tz zone() const {return zone_;} - - void canonicalize(date::year y); - - sys_seconds - to_sys(date::year y, std::chrono::seconds offset, std::chrono::seconds save) const; - sys_days to_sys_days(date::year y) const; - - 
sys_seconds to_time_point(date::year y) const; - int compare(date::year y, const MonthDayTime& x, date::year yx, - std::chrono::seconds offset, std::chrono::minutes prev_save) const; - - friend std::istream& operator>>(std::istream& is, MonthDayTime& x); - friend std::ostream& operator<<(std::ostream& os, const MonthDayTime& x); -}; - -// A Rule specifies one or more set of datetimes without using an offset. -// Multiple dates are specified with multiple years. The years in effect -// go from starting_year_ to ending_year_, inclusive. starting_year_ <= -// ending_year_. save_ is in effect for times from the specified time -// onward, including the specified time. When the specified time is -// local, it uses the save_ from the chronologically previous Rule, or if -// there is none, 0. - -//forward declare to avoid warnings in gcc 6.2 -class Rule; -bool operator==(const Rule& x, const Rule& y); -bool operator<(const Rule& x, const Rule& y); -bool operator==(const Rule& x, const date::year& y); -bool operator<(const Rule& x, const date::year& y); -bool operator==(const date::year& x, const Rule& y); -bool operator<(const date::year& x, const Rule& y); -bool operator==(const Rule& x, const std::string& y); -bool operator<(const Rule& x, const std::string& y); -bool operator==(const std::string& x, const Rule& y); -bool operator<(const std::string& x, const Rule& y); -std::ostream& operator<<(std::ostream& os, const Rule& r); - -class Rule -{ -private: - std::string name_; - date::year starting_year_{0}; - date::year ending_year_{0}; - MonthDayTime starting_at_; - std::chrono::minutes save_{0}; - std::string abbrev_; - -public: - Rule() = default; - explicit Rule(const std::string& s); - Rule(const Rule& r, date::year starting_year, date::year ending_year); - - const std::string& name() const {return name_;} - const std::string& abbrev() const {return abbrev_;} - - const MonthDayTime& mdt() const {return starting_at_;} - const date::year& starting_year() const {return 
starting_year_;} - const date::year& ending_year() const {return ending_year_;} - const std::chrono::minutes& save() const {return save_;} - - static void split_overlaps(std::vector& rules); - - friend bool operator==(const Rule& x, const Rule& y); - friend bool operator<(const Rule& x, const Rule& y); - friend bool operator==(const Rule& x, const date::year& y); - friend bool operator<(const Rule& x, const date::year& y); - friend bool operator==(const date::year& x, const Rule& y); - friend bool operator<(const date::year& x, const Rule& y); - friend bool operator==(const Rule& x, const std::string& y); - friend bool operator<(const Rule& x, const std::string& y); - friend bool operator==(const std::string& x, const Rule& y); - friend bool operator<(const std::string& x, const Rule& y); - - friend std::ostream& operator<<(std::ostream& os, const Rule& r); - -private: - date::day day() const; - date::month month() const; - static void split_overlaps(std::vector& rules, std::size_t i, std::size_t& e); - static bool overlaps(const Rule& x, const Rule& y); - static void split(std::vector& rules, std::size_t i, std::size_t k, - std::size_t& e); -}; - -inline bool operator!=(const Rule& x, const Rule& y) {return !(x == y);} -inline bool operator> (const Rule& x, const Rule& y) {return y < x;} -inline bool operator<=(const Rule& x, const Rule& y) {return !(y < x);} -inline bool operator>=(const Rule& x, const Rule& y) {return !(x < y);} - -inline bool operator!=(const Rule& x, const date::year& y) {return !(x == y);} -inline bool operator> (const Rule& x, const date::year& y) {return y < x;} -inline bool operator<=(const Rule& x, const date::year& y) {return !(y < x);} -inline bool operator>=(const Rule& x, const date::year& y) {return !(x < y);} - -inline bool operator!=(const date::year& x, const Rule& y) {return !(x == y);} -inline bool operator> (const date::year& x, const Rule& y) {return y < x;} -inline bool operator<=(const date::year& x, const Rule& y) {return 
!(y < x);} -inline bool operator>=(const date::year& x, const Rule& y) {return !(x < y);} - -inline bool operator!=(const Rule& x, const std::string& y) {return !(x == y);} -inline bool operator> (const Rule& x, const std::string& y) {return y < x;} -inline bool operator<=(const Rule& x, const std::string& y) {return !(y < x);} -inline bool operator>=(const Rule& x, const std::string& y) {return !(x < y);} - -inline bool operator!=(const std::string& x, const Rule& y) {return !(x == y);} -inline bool operator> (const std::string& x, const Rule& y) {return y < x;} -inline bool operator<=(const std::string& x, const Rule& y) {return !(y < x);} -inline bool operator>=(const std::string& x, const Rule& y) {return !(x < y);} - -struct zonelet -{ - enum tag {has_rule, has_save, is_empty}; - - std::chrono::seconds gmtoff_; - tag tag_ = has_rule; - -#if !defined(_MSC_VER) || (_MSC_VER >= 1900) - union U -#else - struct U -#endif - { - std::string rule_; - std::chrono::minutes save_; - - ~U() {} - U() {} - U(const U&) {} - U& operator=(const U&) = delete; - } u; - - std::string format_; - date::year until_year_{0}; - MonthDayTime until_date_; - sys_seconds until_utc_; - local_seconds until_std_; - local_seconds until_loc_; - std::chrono::minutes initial_save_{}; - std::string initial_abbrev_; - std::pair first_rule_{nullptr, date::year::min()}; - std::pair last_rule_{nullptr, date::year::max()}; - - ~zonelet(); - zonelet(); - zonelet(const zonelet& i); - zonelet& operator=(const zonelet&) = delete; -}; - -#else // USE_OS_TZDB - -struct ttinfo -{ - std::int32_t tt_gmtoff; - unsigned char tt_isdst; - unsigned char tt_abbrind; - unsigned char pad[2]; -}; - -static_assert(sizeof(ttinfo) == 8, ""); - -struct expanded_ttinfo -{ - std::chrono::seconds offset; - std::string abbrev; - bool is_dst; -}; - -struct transition -{ - sys_seconds timepoint; - const expanded_ttinfo* info; - - transition(sys_seconds tp, const expanded_ttinfo* i = nullptr) - : timepoint(tp) - , info(i) - {} - 
- friend - std::ostream& - operator<<(std::ostream& os, const transition& t) - { - using namespace date; - using namespace std::chrono; - using date::operator<<; - os << t.timepoint << "Z "; - if (t.info->offset >= seconds{0}) - os << '+'; - os << make_time(t.info->offset); - if (t.info->is_dst > 0) - os << " daylight "; - else - os << " standard "; - os << t.info->abbrev; - return os; - } -}; - -#endif // USE_OS_TZDB - -} // namespace detail - -} // namespace date -} // namespace arrow_vendored - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#include "tz.h" -#endif - -#endif // TZ_PRIVATE_H diff --git a/r/R/inst/include/arrow/vendored/datetime/visibility.h b/r/R/inst/include/arrow/vendored/datetime/visibility.h deleted file mode 100644 index ae031238d85..00000000000 --- a/r/R/inst/include/arrow/vendored/datetime/visibility.h +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#if defined(ARROW_STATIC) -// intentially empty -#elif defined(ARROW_EXPORTING) -#define DATE_BUILD_DLL -#else -#define DATE_USE_DLL -#endif diff --git a/r/R/inst/include/arrow/vendored/xxhash/xxhash.h b/r/R/inst/include/arrow/vendored/xxhash/xxhash.h deleted file mode 100644 index 8c2d5fac1e7..00000000000 --- a/r/R/inst/include/arrow/vendored/xxhash/xxhash.h +++ /dev/null @@ -1,330 +0,0 @@ -// Vendored from git tag v0.6.5 - -/* - xxHash - Extremely Fast Hash algorithm - Header File - Copyright (C) 2012-2016, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - You can contact the author at : - - xxHash source repository : https://github.com/Cyan4973/xxHash -*/ - -/* Notice extracted from xxHash homepage : - -xxHash is an extremely fast Hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. - -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MumurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -A 64-bit version, named XXH64, is available since r35. -It offers much better speed, but for 64-bit applications only. -Name Speed on 64 bits Speed on 32 bits -XXH64 13.8 GB/s 1.9 GB/s -XXH32 6.8 GB/s 6.0 GB/s -*/ - -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 - -#if defined (__cplusplus) -extern "C" { -#endif - - -/* **************************** -* Definitions -******************************/ -#include /* size_t */ -typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; - - -/* **************************** - * API modifier - ******************************/ -/** XXH_INLINE_ALL (and XXH_PRIVATE_API) - * This is useful to include xxhash functions in `static` mode - * in order to inline them, and remove their symbol from the public list. - * Inlining can offer dramatic performance improvement on small keys. - * Methodology : - * #define XXH_INLINE_ALL - * #include "xxhash.h" - * `xxhash.c` is automatically included. - * It's not useful to compile and link it as a separate module. 
- */ -#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) -# ifndef XXH_STATIC_LINKING_ONLY -# define XXH_STATIC_LINKING_ONLY -# endif -# if defined(__GNUC__) -# define XXH_PUBLIC_API static __inline __attribute__((unused)) -# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define XXH_PUBLIC_API static inline -# elif defined(_MSC_VER) -# define XXH_PUBLIC_API static __inline -# else - /* this version may generate warnings for unused static functions */ -# define XXH_PUBLIC_API static -# endif -#else -# define XXH_PUBLIC_API /* do nothing */ -#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - -/*! XXH_NAMESPACE, aka Namespace Emulation : - * - * If you want to include _and expose_ xxHash functions from within your own library, - * but also want to avoid symbol collisions with other libraries which may also include xxHash, - * - * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library - * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). - * - * Note that no change is required within the calling program as long as it includes `xxhash.h` : - * regular symbol name will be automatically translated by this header. 
- */ -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) -# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) -# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) -# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) -# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) -# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) -# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) -# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) -# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) -# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) -# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) -# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) -#endif - - -/* ************************************* -* Version -***************************************/ -#define XXH_VERSION_MAJOR 0 -#define XXH_VERSION_MINOR 6 -#define XXH_VERSION_RELEASE 5 -#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) -XXH_PUBLIC_API unsigned XXH_versionNumber (void); - - -/*-********************************************************************** -* 32-bit hash -************************************************************************/ -typedef unsigned int XXH32_hash_t; - -/*! 
XXH32() : - Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". - The memory between input & input+length must be valid (allocated and read-accessible). - "seed" can be used to alter the result predictably. - Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); - -/*====== Streaming ======*/ -typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); - -XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); - -/* - * Streaming functions generate the xxHash of an input provided in multiple segments. - * Note that, for small input, they are slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * XXH state must first be allocated, using XXH*_createState() . - * - * Start a new hash by initializing state with a seed, using XXH*_reset(). - * - * Then, feed the hash state by calling XXH*_update() as many times as necessary. - * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using XXH*_digest(). - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a digest, - * and generate some new hashes later on, by calling again XXH*_digest(). 
- * - * When done, free XXH state space if it was allocated dynamically. - */ - -/*====== Canonical representation ======*/ - -typedef struct { unsigned char digest[4]; } XXH32_canonical_t; -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); - -/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. - * The canonical representation uses human-readable write convention, aka big-endian (large digits first). - * These functions allow transformation of hash result into and from its canonical format. - * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. - */ - - -#ifndef XXH_NO_LONG_LONG -/*-********************************************************************** -* 64-bit hash -************************************************************************/ -typedef unsigned long long XXH64_hash_t; - -/*! XXH64() : - Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". - "seed" can be used to alter the result predictably. - This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). 
-*/ -XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); - -/*====== Streaming ======*/ -typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); - -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); - -/*====== Canonical representation ======*/ -typedef struct { unsigned char digest[8]; } XXH64_canonical_t; -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); -#endif /* XXH_NO_LONG_LONG */ - - - -#ifdef XXH_STATIC_LINKING_ONLY - -/* ================================================================================================ - This section contains declarations which are not guaranteed to remain stable. - They may change in future versions, becoming incompatible with a different version of the library. - These declarations should only be used with static linking. - Never use them in association with dynamic linking ! -=================================================================================================== */ - -/* These definitions are only present to allow - * static allocation of XXH state, on stack or in a struct for example. - * Never **ever** use members directly. 
*/ - -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - -struct XXH32_state_s { - uint32_t total_len_32; - uint32_t large_len; - uint32_t v1; - uint32_t v2; - uint32_t v3; - uint32_t v4; - uint32_t mem32[4]; - uint32_t memsize; - uint32_t reserved; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH32_state_t */ - -struct XXH64_state_s { - uint64_t total_len; - uint64_t v1; - uint64_t v2; - uint64_t v3; - uint64_t v4; - uint64_t mem64[4]; - uint32_t memsize; - uint32_t reserved[2]; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH64_state_t */ - -# else - -struct XXH32_state_s { - unsigned total_len_32; - unsigned large_len; - unsigned v1; - unsigned v2; - unsigned v3; - unsigned v4; - unsigned mem32[4]; - unsigned memsize; - unsigned reserved; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH32_state_t */ - -# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */ -struct XXH64_state_s { - unsigned long long total_len; - unsigned long long v1; - unsigned long long v2; - unsigned long long v3; - unsigned long long v4; - unsigned long long mem64[4]; - unsigned memsize; - unsigned reserved[2]; /* never read nor write, might be removed in a future version */ -}; /* typedef'd to XXH64_state_t */ -# endif - -# endif - - -#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) -# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */ -#endif - -#endif /* XXH_STATIC_LINKING_ONLY */ - - -#if defined (__cplusplus) -} -#endif - -#endif /* XXHASH_H_5627135585666179 */ diff --git a/r/R/inst/include/arrow/visitor.h b/r/R/inst/include/arrow/visitor.h deleted file mode 100644 index 1b40ce4efba..00000000000 --- a/r/R/inst/include/arrow/visitor.h +++ /dev/null @@ -1,138 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_VISITOR_H -#define ARROW_VISITOR_H - -#include "arrow/status.h" -#include "arrow/type_fwd.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class ARROW_EXPORT ArrayVisitor { - public: - virtual ~ArrayVisitor() = default; - - virtual Status Visit(const NullArray& array); - virtual Status Visit(const BooleanArray& array); - virtual Status Visit(const Int8Array& array); - virtual Status Visit(const Int16Array& array); - virtual Status Visit(const Int32Array& array); - virtual Status Visit(const Int64Array& array); - virtual Status Visit(const UInt8Array& array); - virtual Status Visit(const UInt16Array& array); - virtual Status Visit(const UInt32Array& array); - virtual Status Visit(const UInt64Array& array); - virtual Status Visit(const HalfFloatArray& array); - virtual Status Visit(const FloatArray& array); - virtual Status Visit(const DoubleArray& array); - virtual Status Visit(const StringArray& array); - virtual Status Visit(const BinaryArray& array); - virtual Status Visit(const FixedSizeBinaryArray& array); - virtual Status Visit(const Date32Array& array); - virtual Status Visit(const Date64Array& array); - virtual Status Visit(const Time32Array& array); - virtual Status Visit(const Time64Array& array); - 
virtual Status Visit(const TimestampArray& array); - virtual Status Visit(const DayTimeIntervalArray& array); - virtual Status Visit(const MonthIntervalArray& array); - virtual Status Visit(const DurationArray& array); - virtual Status Visit(const Decimal128Array& array); - virtual Status Visit(const ListArray& array); - virtual Status Visit(const FixedSizeListArray& array); - virtual Status Visit(const StructArray& array); - virtual Status Visit(const UnionArray& array); - virtual Status Visit(const DictionaryArray& array); - virtual Status Visit(const ExtensionArray& array); -}; - -class ARROW_EXPORT TypeVisitor { - public: - virtual ~TypeVisitor() = default; - - virtual Status Visit(const NullType& type); - virtual Status Visit(const BooleanType& type); - virtual Status Visit(const Int8Type& type); - virtual Status Visit(const Int16Type& type); - virtual Status Visit(const Int32Type& type); - virtual Status Visit(const Int64Type& type); - virtual Status Visit(const UInt8Type& type); - virtual Status Visit(const UInt16Type& type); - virtual Status Visit(const UInt32Type& type); - virtual Status Visit(const UInt64Type& type); - virtual Status Visit(const HalfFloatType& type); - virtual Status Visit(const FloatType& type); - virtual Status Visit(const DoubleType& type); - virtual Status Visit(const StringType& type); - virtual Status Visit(const BinaryType& type); - virtual Status Visit(const FixedSizeBinaryType& type); - virtual Status Visit(const Date64Type& type); - virtual Status Visit(const Date32Type& type); - virtual Status Visit(const Time32Type& type); - virtual Status Visit(const Time64Type& type); - virtual Status Visit(const TimestampType& type); - virtual Status Visit(const MonthIntervalType& type); - virtual Status Visit(const DayTimeIntervalType& type); - virtual Status Visit(const DurationType& type); - virtual Status Visit(const Decimal128Type& type); - virtual Status Visit(const ListType& type); - virtual Status Visit(const FixedSizeListType& 
type); - virtual Status Visit(const StructType& type); - virtual Status Visit(const UnionType& type); - virtual Status Visit(const DictionaryType& type); - virtual Status Visit(const ExtensionType& type); -}; - -class ARROW_EXPORT ScalarVisitor { - public: - virtual ~ScalarVisitor() = default; - - virtual Status Visit(const NullScalar& scalar); - virtual Status Visit(const BooleanScalar& scalar); - virtual Status Visit(const Int8Scalar& scalar); - virtual Status Visit(const Int16Scalar& scalar); - virtual Status Visit(const Int32Scalar& scalar); - virtual Status Visit(const Int64Scalar& scalar); - virtual Status Visit(const UInt8Scalar& scalar); - virtual Status Visit(const UInt16Scalar& scalar); - virtual Status Visit(const UInt32Scalar& scalar); - virtual Status Visit(const UInt64Scalar& scalar); - virtual Status Visit(const HalfFloatScalar& scalar); - virtual Status Visit(const FloatScalar& scalar); - virtual Status Visit(const DoubleScalar& scalar); - virtual Status Visit(const StringScalar& scalar); - virtual Status Visit(const BinaryScalar& scalar); - virtual Status Visit(const FixedSizeBinaryScalar& scalar); - virtual Status Visit(const Date64Scalar& scalar); - virtual Status Visit(const Date32Scalar& scalar); - virtual Status Visit(const Time32Scalar& scalar); - virtual Status Visit(const Time64Scalar& scalar); - virtual Status Visit(const TimestampScalar& scalar); - virtual Status Visit(const DayTimeIntervalScalar& scalar); - virtual Status Visit(const MonthIntervalScalar& scalar); - virtual Status Visit(const DurationScalar& scalar); - virtual Status Visit(const Decimal128Scalar& scalar); - virtual Status Visit(const ListScalar& scalar); - virtual Status Visit(const FixedSizeListScalar& scalar); - virtual Status Visit(const StructScalar& scalar); - virtual Status Visit(const DictionaryScalar& scalar); -}; - -} // namespace arrow - -#endif // ARROW_VISITOR_H diff --git a/r/R/inst/include/arrow/visitor_inline.h b/r/R/inst/include/arrow/visitor_inline.h 
deleted file mode 100644 index 01bf4426f24..00000000000 --- a/r/R/inst/include/arrow/visitor_inline.h +++ /dev/null @@ -1,277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Private header, not to be exported - -#ifndef ARROW_VISITOR_INLINE_H -#define ARROW_VISITOR_INLINE_H - -#include "arrow/array.h" -#include "arrow/extension_type.h" -#include "arrow/scalar.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/string_view.h" - -namespace arrow { - -#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION) \ - ACTION(Null); \ - ACTION(Boolean); \ - ACTION(Int8); \ - ACTION(UInt8); \ - ACTION(Int16); \ - ACTION(UInt16); \ - ACTION(Int32); \ - ACTION(UInt32); \ - ACTION(Int64); \ - ACTION(UInt64); \ - ACTION(HalfFloat); \ - ACTION(Float); \ - ACTION(Double); \ - ACTION(String); \ - ACTION(Binary); \ - ACTION(FixedSizeBinary); \ - ACTION(Duration); \ - ACTION(Date32); \ - ACTION(Date64); \ - ACTION(Timestamp); \ - ACTION(Time32); \ - ACTION(Time64); \ - ACTION(Decimal128); \ - ACTION(List); \ - ACTION(FixedSizeList); \ - ACTION(Struct); \ - ACTION(Union); \ - ACTION(Dictionary); \ - ACTION(Extension) - -#define 
TYPE_VISIT_INLINE(TYPE_CLASS) \ - case TYPE_CLASS##Type::type_id: \ - return visitor->Visit(internal::checked_cast(type)); - -template -inline Status VisitTypeInline(const DataType& type, VISITOR* visitor) { - switch (type.id()) { - ARROW_GENERATE_FOR_ALL_TYPES(TYPE_VISIT_INLINE); - case Type::INTERVAL: { - const auto& interval_type = dynamic_cast(type); - if (interval_type.interval_type() == IntervalType::MONTHS) { - return visitor->Visit(internal::checked_cast(type)); - } - if (interval_type.interval_type() == IntervalType::DAY_TIME) { - return visitor->Visit(internal::checked_cast(type)); - } - break; - } - default: - break; - } - return Status::NotImplemented("Type not implemented"); -} - -#undef TYPE_VISIT_INLINE - -#define ARRAY_VISIT_INLINE(TYPE_CLASS) \ - case TYPE_CLASS##Type::type_id: \ - return visitor->Visit( \ - internal::checked_cast::ArrayType&>( \ - array)); - -template -inline Status VisitArrayInline(const Array& array, VISITOR* visitor) { - switch (array.type_id()) { - ARROW_GENERATE_FOR_ALL_TYPES(ARRAY_VISIT_INLINE); - case Type::INTERVAL: { - const auto& interval_type = dynamic_cast(*array.type()); - if (interval_type.interval_type() == IntervalType::MONTHS) { - return visitor->Visit(internal::checked_cast(array)); - } - if (interval_type.interval_type() == IntervalType::DAY_TIME) { - return visitor->Visit(internal::checked_cast(array)); - } - break; - } - - default: - break; - } - return Status::NotImplemented("Type not implemented"); -} - -// Visit an array's data values, in order, without overhead. 
-// -// The Visit function's `visitor` argument should define two public methods: -// - Status VisitNull() -// - Status VisitValue() -// -// The scalar value's type depends on the array data type: -// - the type's `c_type`, if any -// - for boolean arrays, a `bool` -// - for binary, string and fixed-size binary arrays, a `util::string_view` - -template -struct ArrayDataVisitor {}; - -template <> -struct ArrayDataVisitor { - template - static Status Visit(const ArrayData& arr, Visitor* visitor) { - if (arr.null_count != 0) { - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); - internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - const bool is_null = valid_reader.IsNotSet(); - if (is_null) { - ARROW_RETURN_NOT_OK(visitor->VisitNull()); - } else { - ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet())); - } - valid_reader.Next(); - value_reader.Next(); - } - } else { - internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet())); - value_reader.Next(); - } - } - return Status::OK(); - } -}; - -template -struct ArrayDataVisitor> { - template - static Status Visit(const ArrayData& arr, Visitor* visitor) { - using c_type = typename T::c_type; - const c_type* data = arr.GetValues(1); - - if (arr.null_count != 0) { - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - const bool is_null = valid_reader.IsNotSet(); - if (is_null) { - ARROW_RETURN_NOT_OK(visitor->VisitNull()); - } else { - ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i])); - } - valid_reader.Next(); - } - } else { - for (int64_t i = 0; i < arr.length; ++i) { - ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i])); - } - } - return Status::OK(); - } -}; - -template -struct 
ArrayDataVisitor> { - template - static Status Visit(const ArrayData& arr, Visitor* visitor) { - constexpr uint8_t empty_value = 0; - - const int32_t* offsets = arr.GetValues(1); - const uint8_t* data; - if (!arr.buffers[2]) { - data = &empty_value; - } else { - data = arr.GetValues(2); - } - - if (arr.null_count != 0) { - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - const bool is_null = valid_reader.IsNotSet(); - valid_reader.Next(); - if (is_null) { - ARROW_RETURN_NOT_OK(visitor->VisitNull()); - } else { - auto value = util::string_view(reinterpret_cast(data + offsets[i]), - offsets[i + 1] - offsets[i]); - ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); - } - } - } else { - for (int64_t i = 0; i < arr.length; ++i) { - auto value = util::string_view(reinterpret_cast(data + offsets[i]), - offsets[i + 1] - offsets[i]); - ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); - } - } - return Status::OK(); - } -}; - -template -struct ArrayDataVisitor> { - template - static Status Visit(const ArrayData& arr, Visitor* visitor) { - const auto& fw_type = internal::checked_cast(*arr.type); - - const int32_t byte_width = fw_type.byte_width(); - const uint8_t* data = arr.GetValues(1); - - if (arr.null_count != 0) { - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - const bool is_null = valid_reader.IsNotSet(); - valid_reader.Next(); - if (is_null) { - ARROW_RETURN_NOT_OK(visitor->VisitNull()); - } else { - auto value = util::string_view(reinterpret_cast(data), byte_width); - ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); - } - data += byte_width; - } - } else { - for (int64_t i = 0; i < arr.length; ++i) { - auto value = util::string_view(reinterpret_cast(data), byte_width); - ARROW_RETURN_NOT_OK(visitor->VisitValue(value)); - data += byte_width; - } - } - return Status::OK(); - } -}; - -#define 
SCALAR_VISIT_INLINE(TYPE_CLASS) \ - case TYPE_CLASS##Type::type_id: \ - return visitor->Visit(internal::checked_cast(scalar)); - -template -inline Status VisitScalarInline(const Scalar& scalar, VISITOR* visitor) { - switch (scalar.type->id()) { - ARROW_GENERATE_FOR_ALL_TYPES(SCALAR_VISIT_INLINE); - default: - break; - } - return Status::NotImplemented("Scalar visitor for type not implemented ", - scalar.type->ToString()); -} - -#undef TYPE_VISIT_INLINE - -} // namespace arrow - -#endif // ARROW_VISITOR_INLINE_H diff --git a/r/R/inst/include/parquet/api/io.h b/r/R/inst/include/parquet/api/io.h deleted file mode 100644 index f3092a6d7cb..00000000000 --- a/r/R/inst/include/parquet/api/io.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_API_IO_H -#define PARQUET_API_IO_H - -#include "parquet/deprecated_io.h" -#include "parquet/exception.h" - -#endif // PARQUET_API_IO_H diff --git a/r/R/inst/include/parquet/api/reader.h b/r/R/inst/include/parquet/api/reader.h deleted file mode 100644 index b29ca7205c4..00000000000 --- a/r/R/inst/include/parquet/api/reader.h +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_API_READER_H -#define PARQUET_API_READER_H - -// Column reader API -#include "parquet/column_reader.h" -#include "parquet/column_scanner.h" -#include "parquet/exception.h" -#include "parquet/file_reader.h" -#include "parquet/metadata.h" -#include "parquet/platform.h" -#include "parquet/printer.h" -#include "parquet/properties.h" - -// Schemas -#include "parquet/api/schema.h" - -// IO -#include "parquet/api/io.h" - -#endif // PARQUET_API_READER_H diff --git a/r/R/inst/include/parquet/api/schema.h b/r/R/inst/include/parquet/api/schema.h deleted file mode 100644 index 2e6c3b309ff..00000000000 --- a/r/R/inst/include/parquet/api/schema.h +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_API_SCHEMA_H -#define PARQUET_API_SCHEMA_H - -// Schemas -#include "parquet/schema.h" - -#endif // PARQUET_API_SCHEMA_H diff --git a/r/R/inst/include/parquet/api/writer.h b/r/R/inst/include/parquet/api/writer.h deleted file mode 100644 index 3b4e42f7aff..00000000000 --- a/r/R/inst/include/parquet/api/writer.h +++ /dev/null @@ -1,27 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_API_WRITER_H -#define PARQUET_API_WRITER_H - -#include "parquet/api/io.h" -#include "parquet/api/schema.h" -#include "parquet/column_writer.h" -#include "parquet/exception.h" -#include "parquet/file_writer.h" - -#endif // PARQUET_API_WRITER_H diff --git a/r/R/inst/include/parquet/arrow/reader.h b/r/R/inst/include/parquet/arrow/reader.h deleted file mode 100644 index acdda711071..00000000000 --- a/r/R/inst/include/parquet/arrow/reader.h +++ /dev/null @@ -1,356 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_ARROW_READER_H -#define PARQUET_ARROW_READER_H - -#include -#include -#include -#include - -#include "parquet/platform.h" - -#include "arrow/io/interfaces.h" -#include "arrow/util/macros.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class MemoryPool; -class RecordBatchReader; -class Schema; -class Status; -class Table; - -} // namespace arrow - -namespace parquet { - -class FileMetaData; -class ParquetFileReader; -class ReaderProperties; - -namespace arrow { - -class ColumnChunkReader; -class ColumnReader; -class RowGroupReader; - -static constexpr bool DEFAULT_USE_THREADS = false; - -/// EXPERIMENTAL: Properties for configuring FileReader behavior. 
-class PARQUET_EXPORT ArrowReaderProperties { - public: - explicit ArrowReaderProperties(bool use_threads = DEFAULT_USE_THREADS) - : use_threads_(use_threads), read_dict_indices_() {} - - void set_use_threads(bool use_threads) { use_threads_ = use_threads; } - - bool use_threads() const { return use_threads_; } - - void set_read_dictionary(int column_index, bool read_dict) { - if (read_dict) { - read_dict_indices_.insert(column_index); - } else { - read_dict_indices_.erase(column_index); - } - } - bool read_dictionary(int column_index) const { - if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { - return true; - } else { - return false; - } - } - - private: - bool use_threads_; - std::unordered_set read_dict_indices_; -}; - -/// EXPERIMENTAL: Constructs the default ArrowReaderProperties -PARQUET_EXPORT -ArrowReaderProperties default_arrow_reader_properties(); - -// Arrow read adapter class for deserializing Parquet files as Arrow row -// batches. -// -// This interfaces caters for different use cases and thus provides different -// interfaces. In its most simplistic form, we cater for a user that wants to -// read the whole Parquet at once with the FileReader::ReadTable method. -// -// More advanced users that also want to implement parallelism on top of each -// single Parquet files should do this on the RowGroup level. For this, they can -// call FileReader::RowGroup(i)->ReadTable to receive only the specified -// RowGroup as a table. -// -// In the most advanced situation, where a consumer wants to independently read -// RowGroups in parallel and consume each column individually, they can call -// FileReader::RowGroup(i)->Column(j)->Read and receive an arrow::Column -// instance. -// -// TODO(wesm): nested data does not always make sense with this user -// interface unless you are only reading a single leaf node from a branch of -// a table. 
For example: -// -// repeated group data { -// optional group record { -// optional int32 val1; -// optional byte_array val2; -// optional bool val3; -// } -// optional int32 val4; -// } -// -// In the Parquet file, there are 3 leaf nodes: -// -// * data.record.val1 -// * data.record.val2 -// * data.record.val3 -// * data.val4 -// -// When materializing this data in an Arrow array, we would have: -// -// data: list), -// val3: bool, -// >, -// val4: int32 -// >> -// -// However, in the Parquet format, each leaf node has its own repetition and -// definition levels describing the structure of the intermediate nodes in -// this array structure. Thus, we will need to scan the leaf data for a group -// of leaf nodes part of the same type tree to create a single result Arrow -// nested array structure. -// -// This is additionally complicated "chunky" repeated fields or very large byte -// arrays -class PARQUET_EXPORT FileReader { - public: - FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader, - const ArrowReaderProperties& properties = default_arrow_reader_properties()); - - // Since the distribution of columns amongst a Parquet file's row groups may - // be uneven (the number of values in each column chunk can be different), we - // provide a column-oriented read interface. The ColumnReader hides the - // details of paging through the file's row groups and yielding - // fully-materialized arrow::Array instances - // - // Returns error status if the column of interest is not flat. - ::arrow::Status GetColumn(int i, std::unique_ptr* out); - - /// \brief Return arrow schema by apply selection of column indices. - /// \returns error status if passed wrong indices. - ::arrow::Status GetSchema(const std::vector& indices, - std::shared_ptr<::arrow::Schema>* out); - - // Read column as a whole into an Array. 
- ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - - /// \note Deprecated since 0.12 - ARROW_DEPRECATED("Use version with ChunkedArray output") - ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); - - // NOTE: Experimental API - // Reads a specific top level schema field into an Array - // The index i refers the index of the top level schema field, which may - // be nested or flat - e.g. - // - // 0 foo.bar - // foo.bar.baz - // foo.qux - // 1 foo2 - // 2 foo3 - // - // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - - /// \note Deprecated since 0.12 - ARROW_DEPRECATED("Use version with ChunkedArray output") - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); - - /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the - /// ordering in row_group_indices matters. - /// \returns error Status if row_group_indices contains invalid index - ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, - std::shared_ptr<::arrow::RecordBatchReader>* out); - - /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, - /// whose columns are selected by column_indices. The ordering in row_group_indices - /// and column_indices matter. - /// \returns error Status if either row_group_indices or column_indices contains invalid - /// index - ::arrow::Status GetRecordBatchReader(const std::vector& row_group_indices, - const std::vector& column_indices, - std::shared_ptr<::arrow::RecordBatchReader>* out); - - // Read a table of columns into a Table - ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); - - // Read a table of columns into a Table. 
Read only the indicated column - // indices (relative to the schema) - ::arrow::Status ReadTable(const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out); - - ::arrow::Status ReadRowGroup(int i, const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out); - - ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out); - - ::arrow::Status ReadRowGroups(const std::vector& row_groups, - const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out); - - ::arrow::Status ReadRowGroups(const std::vector& row_groups, - std::shared_ptr<::arrow::Table>* out); - - /// \brief Scan file contents with one thread, return number of rows - ::arrow::Status ScanContents(std::vector columns, const int32_t column_batch_size, - int64_t* num_rows); - - /// \brief Return a reader for the RowGroup, this object must not outlive the - /// FileReader. - std::shared_ptr RowGroup(int row_group_index); - - int num_row_groups() const; - - const ParquetFileReader* parquet_reader() const; - - /// Set the number of threads to use during reads of multiple columns. By - /// default only 1 thread is used - /// \deprecated Use set_use_threads instead. - ARROW_DEPRECATED("Use set_use_threads instead") - void set_num_threads(int num_threads); - - /// Set whether to use multiple threads during reads of multiple columns. - /// By default only one thread is used. 
- void set_use_threads(bool use_threads); - - virtual ~FileReader(); - - private: - friend ColumnChunkReader; - friend RowGroupReader; - - class PARQUET_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -class PARQUET_EXPORT RowGroupReader { - public: - std::shared_ptr Column(int column_index); - - ::arrow::Status ReadTable(const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out); - ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out); - - virtual ~RowGroupReader(); - - private: - friend FileReader; - RowGroupReader(FileReader::Impl* reader, int row_group_index); - - FileReader::Impl* impl_; - int row_group_index_; -}; - -class PARQUET_EXPORT ColumnChunkReader { - public: - ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); - - /// \note Deprecated since 0.12 - ARROW_DEPRECATED("Use version with ChunkedArray output") - ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); - - virtual ~ColumnChunkReader(); - - private: - friend RowGroupReader; - ColumnChunkReader(FileReader::Impl* impl, int row_group_index, int column_index); - - FileReader::Impl* impl_; - int column_index_; - int row_group_index_; -}; - -// At this point, the column reader is a stream iterator. It only knows how to -// read the next batch of values for a particular column from the file until it -// runs out. -// -// We also do not expose any internal Parquet details, such as row groups. This -// might change in the future. -class PARQUET_EXPORT ColumnReader { - public: - class PARQUET_NO_EXPORT ColumnReaderImpl; - virtual ~ColumnReader(); - - // Scan the next array of the indicated size. The actual size of the - // returned array may be less than the passed size depending how much data is - // available in the file. - // - // When all the data in the file has been exhausted, the result is set to - // nullptr. - // - // Returns Status::OK on a successful read, including if you have exhausted - // the data available in the file. 
- ::arrow::Status NextBatch(int64_t batch_size, - std::shared_ptr<::arrow::ChunkedArray>* out); - - /// \note Deprecated since 0.12 - ARROW_DEPRECATED("Use version with ChunkedArray output") - ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); - - private: - std::unique_ptr impl_; - explicit ColumnReader(std::unique_ptr impl); - - friend class FileReader; - friend class PrimitiveImpl; - friend class StructImpl; -}; - -// Helper function to create a file reader from an implementation of an Arrow -// random access file -// -// metadata : separately-computed file metadata, can be nullptr -PARQUET_EXPORT -::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - ::arrow::MemoryPool* allocator, - const ReaderProperties& properties, - const std::shared_ptr& metadata, - std::unique_ptr* reader); - -PARQUET_EXPORT -::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - ::arrow::MemoryPool* allocator, - std::unique_ptr* reader); - -PARQUET_EXPORT -::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - ::arrow::MemoryPool* allocator, - const ArrowReaderProperties& properties, - std::unique_ptr* reader); - -} // namespace arrow -} // namespace parquet - -#endif // PARQUET_ARROW_READER_H diff --git a/r/R/inst/include/parquet/arrow/record_reader.h b/r/R/inst/include/parquet/arrow/record_reader.h deleted file mode 100644 index 2ae26a5a47d..00000000000 --- a/r/R/inst/include/parquet/arrow/record_reader.h +++ /dev/null @@ -1,122 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_RECORD_READER_H -#define PARQUET_RECORD_READER_H - -#include -#include -#include - -#include "parquet/platform.h" - -namespace arrow { - -class Array; - -} // namespace arrow - -namespace parquet { - -class ColumnDescriptor; -class PageReader; - -namespace internal { - -/// \brief Stateful column reader that delimits semantic records for both flat -/// and nested columns -/// -/// \note API EXPERIMENTAL -/// \since 1.3.0 -class RecordReader { - public: - // So that we can create subclasses - class RecordReaderImpl; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - const bool read_dictionary = false); - - virtual ~RecordReader(); - - /// \brief Decoded definition levels - const int16_t* def_levels() const; - - /// \brief Decoded repetition levels - const int16_t* rep_levels() const; - - /// \brief Decoded values, including nulls, if any - const uint8_t* values() const; - - /// \brief Attempt to read indicated number of records from column chunk - /// \return number of records read - int64_t ReadRecords(int64_t num_records); - - /// \brief Pre-allocate space for data. 
Results in better flat read performance - void Reserve(int64_t num_values); - - /// \brief Clear consumed values and repetition/definition levels as the - /// result of calling ReadRecords - void Reset(); - - std::shared_ptr ReleaseValues(); - std::shared_ptr ReleaseIsValid(); - - /// \brief Number of values written including nulls (if any) - int64_t values_written() const; - - /// \brief Number of definition / repetition levels (from those that have - /// been decoded) that have been consumed inside the reader. - int64_t levels_position() const; - - /// \brief Number of definition / repetition levels that have been written - /// internally in the reader - int64_t levels_written() const; - - /// \brief Number of nulls in the leaf - int64_t null_count() const; - - /// \brief True if the leaf values are nullable - bool nullable_values() const; - - /// \brief Return true if the record reader has more internal data yet to - /// process - bool HasMoreData() const; - - /// \brief Advance record reader to the next row group - /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader - void SetPageReader(std::unique_ptr reader); - - void DebugPrintState(); - - // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output - std::vector> GetBuilderChunks(); - - private: - std::unique_ptr impl_; - explicit RecordReader(RecordReaderImpl* impl); - - static std::shared_ptr MakeByteArrayRecordReader( - const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, - const bool read_dictionary); -}; - -} // namespace internal -} // namespace parquet - -#endif // PARQUET_RECORD_READER_H diff --git a/r/R/inst/include/parquet/arrow/schema.h b/r/R/inst/include/parquet/arrow/schema.h deleted file mode 100644 index 52fb843e6c6..00000000000 --- a/r/R/inst/include/parquet/arrow/schema.h +++ /dev/null @@ -1,100 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_ARROW_SCHEMA_H -#define PARQUET_ARROW_SCHEMA_H - -#include -#include -#include - -#include "parquet/metadata.h" -#include "parquet/platform.h" -#include "parquet/schema.h" - -namespace arrow { - -class Field; -class Schema; -class Status; - -} // namespace arrow - -namespace parquet { - -class WriterProperties; - -namespace arrow { - -class ArrowWriterProperties; - -PARQUET_EXPORT -::arrow::Status NodeToField(const schema::Node& node, - std::shared_ptr<::arrow::Field>* out); - -/// Convert parquet schema to arrow schema with selected indices -/// \param parquet_schema to be converted -/// \param column_indices indices of leaf nodes in parquet schema tree. Appearing ordering -/// matters for the converted schema. Repeated indices are ignored -/// except for the first one -/// \param key_value_metadata optional metadata, can be nullptr -/// \param out the corresponding arrow schema -/// \return Status::OK() on a successful conversion. 
-PARQUET_EXPORT -::arrow::Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, const std::vector& column_indices, - const std::shared_ptr& key_value_metadata, - std::shared_ptr<::arrow::Schema>* out); - -// Without indices -PARQUET_EXPORT -::arrow::Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, - const std::shared_ptr& key_value_metadata, - std::shared_ptr<::arrow::Schema>* out); - -// Without metadata -::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::vector& column_indices, - std::shared_ptr<::arrow::Schema>* out); - -// Without metadata or indices -::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - std::shared_ptr<::arrow::Schema>* out); - -::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - schema::NodePtr* out); - -::arrow::Status PARQUET_EXPORT -ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::shared_ptr* out); - -::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, - const WriterProperties& properties, - std::shared_ptr* out); - -PARQUET_EXPORT -int32_t DecimalSize(int32_t precision); - -} // namespace arrow - -} // namespace parquet - -#endif // PARQUET_ARROW_SCHEMA_H diff --git a/r/R/inst/include/parquet/arrow/test-util.h b/r/R/inst/include/parquet/arrow/test-util.h deleted file mode 100644 index b99e28f5e03..00000000000 --- a/r/R/inst/include/parquet/arrow/test-util.h +++ /dev/null @@ -1,485 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "arrow/api.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/random.h" -#include "arrow/type_traits.h" -#include "arrow/util/decimal.h" - -#include "parquet/arrow/record_reader.h" - -namespace parquet { - -using internal::RecordReader; - -namespace arrow { - -using ::arrow::Array; -using ::arrow::Status; - -template -struct DecimalWithPrecisionAndScale { - static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value"); - - using type = ::arrow::Decimal128Type; - static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id; - static constexpr int32_t precision = PRECISION; - static constexpr int32_t scale = PRECISION - 1; -}; - -template -using is_arrow_float = std::is_floating_point; - -template -using is_arrow_int = std::is_integral; - -template -using is_arrow_date = std::is_same; - -template -using is_arrow_string = std::is_same; - -template -using is_arrow_binary = std::is_same; - -template -using is_arrow_fixed_size_binary = std::is_same; - -template -using is_arrow_bool = std::is_same; - -template -typename std::enable_if::value, Status>::type NonNullArray( - size_t size, std::shared_ptr* out) { - using c_type = typename ArrowType::c_type; - std::vector values; - ::arrow::random_real(size, 0, static_cast(0), static_cast(1), &values); - 
::arrow::NumericBuilder builder; - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); - return builder.Finish(out); -} - -template -typename std::enable_if< - is_arrow_int::value && !is_arrow_date::value, Status>::type -NonNullArray(size_t size, std::shared_ptr* out) { - std::vector values; - ::arrow::randint(size, 0, 64, &values); - - // Passing data type so this will work with TimestampType too - ::arrow::NumericBuilder builder(std::make_shared(), - ::arrow::default_memory_pool()); - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); - return builder.Finish(out); -} - -template -typename std::enable_if::value, Status>::type NonNullArray( - size_t size, std::shared_ptr* out) { - std::vector values; - ::arrow::randint(size, 0, 64, &values); - for (size_t i = 0; i < size; i++) { - values[i] *= 86400000; - } - - // Passing data type so this will work with TimestampType too - ::arrow::NumericBuilder builder(std::make_shared(), - ::arrow::default_memory_pool()); - builder.AppendValues(values.data(), values.size()); - return builder.Finish(out); -} - -template -typename std::enable_if< - is_arrow_string::value || is_arrow_binary::value, Status>::type -NonNullArray(size_t size, std::shared_ptr* out) { - using BuilderType = typename ::arrow::TypeTraits::BuilderType; - BuilderType builder; - for (size_t i = 0; i < size; i++) { - RETURN_NOT_OK(builder.Append("test-string")); - } - return builder.Finish(out); -} - -template -typename std::enable_if::value, Status>::type -NonNullArray(size_t size, std::shared_ptr* out) { - using BuilderType = typename ::arrow::TypeTraits::BuilderType; - // set byte_width to the length of "fixed": 5 - // todo: find a way to generate test data with more diversity. 
- BuilderType builder(::arrow::fixed_size_binary(5)); - for (size_t i = 0; i < size; i++) { - RETURN_NOT_OK(builder.Append("fixed")); - } - return builder.Finish(out); -} - -static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, - uint8_t* out) { - std::default_random_engine gen(seed); - std::uniform_int_distribution d(0, std::numeric_limits::max()); - const int32_t required_bytes = ::arrow::DecimalSize(precision); - constexpr int32_t byte_width = 16; - std::fill(out, out + byte_width * n, '\0'); - - for (int64_t i = 0; i < n; ++i, out += byte_width) { - std::generate(out, out + required_bytes, - [&d, &gen] { return static_cast(d(gen)); }); - - // sign extend if the sign bit is set for the last byte generated - // 0b10000000 == 0x80 == 128 - if ((out[required_bytes - 1] & '\x80') != 0) { - std::fill(out + required_bytes, out + byte_width, '\xFF'); - } - } -} - -template -typename std::enable_if< - std::is_same>::value, Status>::type -NonNullArray(size_t size, std::shared_ptr* out) { - constexpr int32_t kDecimalPrecision = precision; - constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale::scale; - - const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale); - ::arrow::Decimal128Builder builder(type); - const int32_t byte_width = - static_cast(*type).byte_width(); - - constexpr int32_t seed = 0; - - std::shared_ptr out_buf; - RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width, - &out_buf)); - random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data()); - - RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size)); - return builder.Finish(out); -} - -template -typename std::enable_if::value, Status>::type NonNullArray( - size_t size, std::shared_ptr* out) { - std::vector values; - ::arrow::randint(size, 0, 1, &values); - ::arrow::BooleanBuilder builder; - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size())); - return builder.Finish(out); -} - -// This helper 
function only supports (size/2) nulls. -template -typename std::enable_if::value, Status>::type NullableArray( - size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr* out) { - using c_type = typename ArrowType::c_type; - std::vector values; - ::arrow::random_real(size, seed, static_cast(-1e10), static_cast(1e10), - &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - ::arrow::NumericBuilder builder; - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); - return builder.Finish(out); -} - -// This helper function only supports (size/2) nulls. -template -typename std::enable_if< - is_arrow_int::value && !is_arrow_date::value, Status>::type -NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr* out) { - std::vector values; - - // Seed is random in Arrow right now - (void)seed; - ::arrow::randint(size, 0, 64, &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - // Passing data type so this will work with TimestampType too - ::arrow::NumericBuilder builder(std::make_shared(), - ::arrow::default_memory_pool()); - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); - return builder.Finish(out); -} - -template -typename std::enable_if::value, Status>::type NullableArray( - size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr* out) { - std::vector values; - - // Seed is random in Arrow right now - (void)seed; - ::arrow::randint(size, 0, 64, &values); - for (size_t i = 0; i < size; i++) { - values[i] *= 86400000; - } - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - // Passing data type so this will work with TimestampType too - ::arrow::NumericBuilder builder(std::make_shared(), - ::arrow::default_memory_pool()); - builder.AppendValues(values.data(), 
values.size(), valid_bytes.data()); - return builder.Finish(out); -} - -// This helper function only supports (size/2) nulls yet. -template -typename std::enable_if< - is_arrow_string::value || is_arrow_binary::value, Status>::type -NullableArray(size_t size, size_t num_nulls, uint32_t seed, - std::shared_ptr<::arrow::Array>* out) { - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - using BuilderType = typename ::arrow::TypeTraits::BuilderType; - BuilderType builder; - - const int kBufferSize = 10; - uint8_t buffer[kBufferSize]; - for (size_t i = 0; i < size; i++) { - if (!valid_bytes[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - ::arrow::random_bytes(kBufferSize, seed + static_cast(i), buffer); - RETURN_NOT_OK(builder.Append(buffer, kBufferSize)); - } - } - return builder.Finish(out); -} - -// This helper function only supports (size/2) nulls yet, -// same as NullableArray(..) -template -typename std::enable_if::value, Status>::type -NullableArray(size_t size, size_t num_nulls, uint32_t seed, - std::shared_ptr<::arrow::Array>* out) { - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - using BuilderType = typename ::arrow::TypeTraits::BuilderType; - const int byte_width = 10; - BuilderType builder(::arrow::fixed_size_binary(byte_width)); - - const int kBufferSize = byte_width; - uint8_t buffer[kBufferSize]; - for (size_t i = 0; i < size; i++) { - if (!valid_bytes[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - ::arrow::random_bytes(kBufferSize, seed + static_cast(i), buffer); - RETURN_NOT_OK(builder.Append(buffer)); - } - } - return builder.Finish(out); -} - -template -typename std::enable_if< - std::is_same>::value, Status>::type -NullableArray(size_t size, size_t num_nulls, uint32_t seed, - std::shared_ptr<::arrow::Array>* out) { - std::vector valid_bytes(size, '\1'); - - for (size_t i = 0; i < num_nulls; ++i) { - 
valid_bytes[i * 2] = '\0'; - } - - constexpr int32_t kDecimalPrecision = precision; - constexpr int32_t kDecimalScale = DecimalWithPrecisionAndScale::scale; - const auto type = ::arrow::decimal(kDecimalPrecision, kDecimalScale); - const int32_t byte_width = - static_cast(*type).byte_width(); - - std::shared_ptr<::arrow::Buffer> out_buf; - RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), size * byte_width, - &out_buf)); - - random_decimals(size, seed, precision, out_buf->mutable_data()); - - ::arrow::Decimal128Builder builder(type); - RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data())); - return builder.Finish(out); -} - -// This helper function only supports (size/2) nulls yet. -template -typename std::enable_if::value, Status>::type NullableArray( - size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr* out) { - std::vector values; - - // Seed is random in Arrow right now - (void)seed; - - ::arrow::randint(size, 0, 1, &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - ::arrow::BooleanBuilder builder; - RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data())); - return builder.Finish(out); -} - -/// Wrap an Array into a ListArray by splitting it up into size lists. -/// -/// This helper function only supports (size/2) nulls. 
-Status MakeListArray(const std::shared_ptr& values, int64_t size, - int64_t null_count, bool nullable_values, - std::shared_ptr<::arrow::ListArray>* out) { - // We always include an empty list - int64_t non_null_entries = size - null_count - 1; - int64_t length_per_entry = values->length() / non_null_entries; - - auto offsets = AllocateBuffer(); - RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t))); - int32_t* offsets_ptr = reinterpret_cast(offsets->mutable_data()); - - auto null_bitmap = AllocateBuffer(); - int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size); - RETURN_NOT_OK(null_bitmap->Resize(bitmap_size)); - uint8_t* null_bitmap_ptr = null_bitmap->mutable_data(); - memset(null_bitmap_ptr, 0, bitmap_size); - - int32_t current_offset = 0; - for (int64_t i = 0; i < size; i++) { - offsets_ptr[i] = current_offset; - if (!(((i % 2) == 0) && ((i / 2) < null_count))) { - // Non-null list (list with index 1 is always empty). - ::arrow::BitUtil::SetBit(null_bitmap_ptr, i); - if (i != 1) { - current_offset += static_cast(length_per_entry); - } - } - } - offsets_ptr[size] = static_cast(values->length()); - - auto value_field = ::arrow::field("item", values->type(), nullable_values); - *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets, - values, null_bitmap, null_count); - - return Status::OK(); -} - -// Make an array containing only empty lists, with a null values array -Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { - // Allocate an offsets buffer containing only zeroes - std::shared_ptr offsets_buffer; - const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t); - RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes, - &offsets_buffer)); - memset(offsets_buffer->mutable_data(), 0, offsets_nbytes); - - auto value_field = - ::arrow::field("item", ::arrow::float64(), false /* nullable_values */); - auto list_type = ::arrow::list(value_field); - - std::vector> 
child_buffers = {nullptr /* null bitmap */, - nullptr /* values */}; - auto child_data = - ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers)); - - std::vector> buffers = {nullptr /* bitmap */, offsets_buffer}; - auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers)); - array_data->child_data.push_back(child_data); - - *out_array = ::arrow::MakeArray(array_data); - return Status::OK(); -} - -static inline std::shared_ptr<::arrow::Column> MakeColumn( - const std::string& name, const std::shared_ptr& array, bool nullable) { - auto field = ::arrow::field(name, array->type(), nullable); - return std::make_shared<::arrow::Column>(field, array); -} - -static inline std::shared_ptr<::arrow::Column> MakeColumn( - const std::string& name, const std::vector>& arrays, - bool nullable) { - auto field = ::arrow::field(name, arrays[0]->type(), nullable); - return std::make_shared<::arrow::Column>(field, arrays); -} - -std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr& values, - bool nullable) { - std::shared_ptr<::arrow::Column> column = MakeColumn("col", values, nullable); - std::vector> columns({column}); - std::vector> fields({column->field()}); - auto schema = std::make_shared<::arrow::Schema>(fields); - return ::arrow::Table::Make(schema, columns); -} - -template -void ExpectArray(T* expected, Array* result) { - auto p_array = static_cast<::arrow::PrimitiveArray*>(result); - for (int i = 0; i < result->length(); i++) { - EXPECT_EQ(expected[i], reinterpret_cast(p_array->values()->data())[i]); - } -} - -template -void ExpectArrayT(void* expected, Array* result) { - ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result); - for (int64_t i = 0; i < result->length(); i++) { - EXPECT_EQ(reinterpret_cast(expected)[i], - reinterpret_cast( - p_array->values()->data())[i]); - } -} - -template <> -void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) { - 
::arrow::BooleanBuilder builder; - ARROW_EXPECT_OK( - builder.AppendValues(reinterpret_cast(expected), result->length())); - - std::shared_ptr expected_array; - ARROW_EXPECT_OK(builder.Finish(&expected_array)); - EXPECT_TRUE(result->Equals(*expected_array)); -} - -} // namespace arrow - -} // namespace parquet diff --git a/r/R/inst/include/parquet/arrow/writer.h b/r/R/inst/include/parquet/arrow/writer.h deleted file mode 100644 index 97ed0f7a0ae..00000000000 --- a/r/R/inst/include/parquet/arrow/writer.h +++ /dev/null @@ -1,250 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_ARROW_WRITER_H -#define PARQUET_ARROW_WRITER_H - -#include -#include - -#include "parquet/platform.h" -#include "parquet/properties.h" -#include "parquet/types.h" - -#include "arrow/type.h" - -namespace arrow { - -class Array; -class ChunkedArray; -class MemoryPool; -class Status; -class Table; - -namespace io { - -class OutputStream; - -} // namespace io - -} // namespace arrow - -namespace parquet { - -class FileMetaData; -class ParquetFileWriter; - -namespace arrow { - -class PARQUET_EXPORT ArrowWriterProperties { - public: - class Builder { - public: - Builder() - : write_timestamps_as_int96_(false), - coerce_timestamps_enabled_(false), - coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), - truncated_timestamps_allowed_(false) {} - virtual ~Builder() {} - - Builder* disable_deprecated_int96_timestamps() { - write_timestamps_as_int96_ = false; - return this; - } - - Builder* enable_deprecated_int96_timestamps() { - write_timestamps_as_int96_ = true; - return this; - } - - Builder* coerce_timestamps(::arrow::TimeUnit::type unit) { - coerce_timestamps_enabled_ = true; - coerce_timestamps_unit_ = unit; - return this; - } - - Builder* allow_truncated_timestamps() { - truncated_timestamps_allowed_ = true; - return this; - } - - Builder* disallow_truncated_timestamps() { - truncated_timestamps_allowed_ = false; - return this; - } - - std::shared_ptr build() { - return std::shared_ptr(new ArrowWriterProperties( - write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, - truncated_timestamps_allowed_)); - } - - private: - bool write_timestamps_as_int96_; - - bool coerce_timestamps_enabled_; - ::arrow::TimeUnit::type coerce_timestamps_unit_; - bool truncated_timestamps_allowed_; - }; - - bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } - - bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } - ::arrow::TimeUnit::type coerce_timestamps_unit() const { - return 
coerce_timestamps_unit_; - } - - bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } - - private: - explicit ArrowWriterProperties(bool write_nanos_as_int96, - bool coerce_timestamps_enabled, - ::arrow::TimeUnit::type coerce_timestamps_unit, - bool truncated_timestamps_allowed) - : write_timestamps_as_int96_(write_nanos_as_int96), - coerce_timestamps_enabled_(coerce_timestamps_enabled), - coerce_timestamps_unit_(coerce_timestamps_unit), - truncated_timestamps_allowed_(truncated_timestamps_allowed) {} - - const bool write_timestamps_as_int96_; - const bool coerce_timestamps_enabled_; - const ::arrow::TimeUnit::type coerce_timestamps_unit_; - const bool truncated_timestamps_allowed_; -}; - -std::shared_ptr PARQUET_EXPORT default_arrow_writer_properties(); - -/** - * Iterative API: - * Start a new RowGroup/Chunk with NewRowGroup - * Write column-by-column the whole column chunk - */ -class PARQUET_EXPORT FileWriter { - public: - FileWriter(::arrow::MemoryPool* pool, std::unique_ptr writer, - const std::shared_ptr<::arrow::Schema>& schema, - const std::shared_ptr& arrow_properties = - default_arrow_writer_properties()); - - static ::arrow::Status Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, - const std::shared_ptr<::arrow::io::OutputStream>& sink, - const std::shared_ptr& properties, - std::unique_ptr* writer); - - static ::arrow::Status Open( - const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, - const std::shared_ptr<::arrow::io::OutputStream>& sink, - const std::shared_ptr& properties, - const std::shared_ptr& arrow_properties, - std::unique_ptr* writer); - - /// \brief Write a Table to Parquet. 
- ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size); - - ::arrow::Status NewRowGroup(int64_t chunk_size); - ::arrow::Status WriteColumnChunk(const ::arrow::Array& data); - - /// \brief Write ColumnChunk in row group using slice of a ChunkedArray - ::arrow::Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data, - const int64_t offset, const int64_t size); - ::arrow::Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data); - ::arrow::Status Close(); - - virtual ~FileWriter(); - - ::arrow::MemoryPool* memory_pool() const; - - const std::shared_ptr metadata() const; - - private: - class PARQUET_NO_EXPORT Impl; - std::unique_ptr impl_; - std::shared_ptr<::arrow::Schema> schema_; -}; - -/// \brief Write Parquet file metadata only to indicated Arrow OutputStream -PARQUET_EXPORT -::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); - -/** - * Write a Table to Parquet. - * - * The table shall only consist of columns of primitive type or of primitive lists. 
- */ -::arrow::Status PARQUET_EXPORT WriteTable( - const ::arrow::Table& table, ::arrow::MemoryPool* pool, - const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size, - const std::shared_ptr& properties = default_writer_properties(), - const std::shared_ptr& arrow_properties = - default_arrow_writer_properties()); - -namespace internal { - -/** - * Timestamp conversion constants - */ -constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); - -template -inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { - int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; - (*impala_timestamp).value[2] = (uint32_t)julian_days; - - int64_t last_day_units = time % UnitPerDay; - int64_t* impala_last_day_nanos = reinterpret_cast(impala_timestamp); - *impala_last_day_nanos = last_day_units * NanosecondsPerUnit; -} - -constexpr int64_t kSecondsInNanos = INT64_C(1000000000); - -inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { - ArrowTimestampToImpalaTimestamp(seconds, - impala_timestamp); -} - -constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); - -inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, - Int96* impala_timestamp) { - ArrowTimestampToImpalaTimestamp( - milliseconds, impala_timestamp); -} - -constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); - -inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, - Int96* impala_timestamp) { - ArrowTimestampToImpalaTimestamp( - microseconds, impala_timestamp); -} - -constexpr int64_t kNanosecondsInNanos = INT64_C(1); - -inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, - Int96* impala_timestamp) { - ArrowTimestampToImpalaTimestamp( - nanoseconds, impala_timestamp); -} - -} // namespace internal - -} // namespace arrow - -} // namespace parquet - -#endif // PARQUET_ARROW_WRITER_H diff --git 
a/r/R/inst/include/parquet/bloom_filter.h b/r/R/inst/include/parquet/bloom_filter.h deleted file mode 100644 index 0285b8f9274..00000000000 --- a/r/R/inst/include/parquet/bloom_filter.h +++ /dev/null @@ -1,255 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_BLOOM_FILTER_H -#define PARQUET_BLOOM_FILTER_H - -#include -#include -#include - -#include "arrow/util/logging.h" -#include "parquet/hasher.h" -#include "parquet/platform.h" -#include "parquet/types.h" - -namespace arrow { - -class MemoryPool; - -} // namespace arrow - -namespace parquet { - -// A Bloom filter is a compact structure to indicate whether an item is not in a set or -// probably in a set. The Bloom filter usually consists of a bit set that represents a -// set of elements, a hash strategy and a Bloom filter algorithm. -class PARQUET_EXPORT BloomFilter { - public: - // Maximum Bloom filter size, it sets to HDFS default block size 128MB - // This value will be reconsidered when implementing Bloom filter producer. - static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024; - - /// Determine whether an element exist in set or not. - /// - /// @param hash the element to contain. 
- /// @return false if value is definitely not in set, and true means PROBABLY - /// in set. - virtual bool FindHash(uint64_t hash) const = 0; - - /// Insert element to set represented by Bloom filter bitset. - /// @param hash the hash of value to insert into Bloom filter. - virtual void InsertHash(uint64_t hash) = 0; - - /// Write this Bloom filter to an output stream. A Bloom filter structure should - /// include bitset length, hash strategy, algorithm, and bitset. - /// - /// @param sink the output stream to write - virtual void WriteTo(ArrowOutputStream* sink) const = 0; - - /// Get the number of bytes of bitset - virtual uint32_t GetBitsetSize() const = 0; - - /// Compute hash for 32 bits value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(int32_t value) const = 0; - - /// Compute hash for 64 bits value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(int64_t value) const = 0; - - /// Compute hash for float value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(float value) const = 0; - - /// Compute hash for double value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(double value) const = 0; - - /// Compute hash for Int96 value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(const Int96* value) const = 0; - - /// Compute hash for ByteArray value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(const ByteArray* value) const = 0; - - /// Compute hash for fixed byte array value by using its plain encoding result. - /// - /// @param value the value address. 
- /// @param len the value length. - /// @return hash result. - virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; - - virtual ~BloomFilter() {} - - protected: - // Hash strategy available for Bloom filter. - enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 }; - - // Bloom filter algorithm. - enum class Algorithm : uint32_t { BLOCK = 0 }; -}; - -// The BlockSplitBloomFilter is implemented using block-based Bloom filters from -// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to -// hash the item to a tiny Bloom filter which size fit a single cache line or smaller. -// -// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom -// filter is 32 bytes to take advantage of 32-byte SIMD instructions. -class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { - public: - /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function. - BlockSplitBloomFilter(); - - /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within - /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be - /// rounded up/down to lower/upper bound if num_bytes is out of range and also - /// will be rounded up to a power of 2. - /// - /// @param num_bytes The number of bytes to store Bloom filter bitset. - void Init(uint32_t num_bytes); - - /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying - /// bitset because the given bitset may not satisfy the 32-byte alignment requirement - /// which may lead to segfault when performing SIMD instructions. It is the caller's - /// responsibility to free the bitset passed in. This is used when reconstructing - /// a Bloom filter from a parquet file. - /// - /// @param bitset The given bitset to initialize the Bloom filter. - /// @param num_bytes The number of bytes of given bitset. 
- void Init(const uint8_t* bitset, uint32_t num_bytes); - - // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter. - static constexpr uint32_t kMinimumBloomFilterBytes = 32; - - /// Calculate optimal size according to the number of distinct values and false - /// positive probability. - /// - /// @param ndv The number of distinct values. - /// @param fpp The false positive probability. - /// @return it always return a value between kMinimumBloomFilterBytes and - /// kMaximumBloomFilterBytes, and the return value is always a power of 2 - static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { - DCHECK(fpp > 0.0 && fpp < 1.0); - const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); - uint32_t num_bits; - - // Handle overflow. - if (m < 0 || m > kMaximumBloomFilterBytes << 3) { - num_bits = static_cast(kMaximumBloomFilterBytes << 3); - } else { - num_bits = static_cast(m); - } - - // Round up to lower bound - if (num_bits < kMinimumBloomFilterBytes << 3) { - num_bits = kMinimumBloomFilterBytes << 3; - } - - // Get next power of 2 if bits is not power of 2. 
- if ((num_bits & (num_bits - 1)) != 0) { - num_bits = static_cast(::arrow::BitUtil::NextPower2(num_bits)); - } - - // Round down to upper bound - if (num_bits > kMaximumBloomFilterBytes << 3) { - num_bits = kMaximumBloomFilterBytes << 3; - } - - return num_bits; - } - - bool FindHash(uint64_t hash) const override; - void InsertHash(uint64_t hash) override; - void WriteTo(ArrowOutputStream* sink) const override; - uint32_t GetBitsetSize() const override { return num_bytes_; } - - uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); } - uint64_t Hash(float value) const override { return hasher_->Hash(value); } - uint64_t Hash(double value) const override { return hasher_->Hash(value); } - uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); } - uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); } - uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); } - uint64_t Hash(const FLBA* value, uint32_t len) const override { - return hasher_->Hash(value, len); - } - - /// Deserialize the Bloom filter from an input stream. It is used when reconstructing - /// a Bloom filter from a parquet filter. - /// - /// @param input_stream The input stream from which to construct the Bloom filter - /// @return The BlockSplitBloomFilter. - static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream); - - private: - // Bytes in a tiny Bloom filter block. - static constexpr int kBytesPerFilterBlock = 32; - - // The number of bits to be set in each tiny Bloom filter - static constexpr int kBitsSetPerBlock = 8; - - // A mask structure used to set bits in each tiny Bloom filter. - struct BlockMask { - uint32_t item[kBitsSetPerBlock]; - }; - - // The block-based algorithm needs eight odd SALT values to calculate eight indexes - // of bit to set, one bit in each 32-bit word. 
- static constexpr uint32_t SALT[kBitsSetPerBlock] = { - 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, - 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U}; - - /// Set bits in mask array according to input key. - /// @param key the value to calculate mask values. - /// @param mask the mask array is used to set inside a block - void SetMask(uint32_t key, BlockMask& mask) const; - - // Memory pool to allocate aligned buffer for bitset - ::arrow::MemoryPool* pool_; - - // The underlying buffer of bitset. - std::shared_ptr data_; - - // The number of bytes of Bloom filter bitset. - uint32_t num_bytes_; - - // Hash strategy used in this Bloom filter. - HashStrategy hash_strategy_; - - // Algorithm used in this Bloom filter. - Algorithm algorithm_; - - // The hash pointer points to actual hash class used. - std::unique_ptr hasher_; -}; - -} // namespace parquet - -#endif // PARQUET_BLOOM_FILTER_H diff --git a/r/R/inst/include/parquet/column_page.h b/r/R/inst/include/parquet/column_page.h deleted file mode 100644 index 66a5bf332de..00000000000 --- a/r/R/inst/include/parquet/column_page.h +++ /dev/null @@ -1,173 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// This module defines an abstract interface for iterating through pages in a -// Parquet column chunk within a row group. It could be extended in the future -// to iterate through all data pages in all chunks in a file. - -#ifndef PARQUET_COLUMN_PAGE_H -#define PARQUET_COLUMN_PAGE_H - -#include -#include -#include - -#include "parquet/statistics.h" -#include "parquet/types.h" - -namespace parquet { - -// TODO: Parallel processing is not yet safe because of memory-ownership -// semantics (the PageReader may or may not own the memory referenced by a -// page) -// -// TODO(wesm): In the future Parquet implementations may store the crc code -// in format::PageHeader. parquet-mr currently does not, so we also skip it -// here, both on the read and write path -class Page { - public: - Page(const std::shared_ptr& buffer, PageType::type type) - : buffer_(buffer), type_(type) {} - - PageType::type type() const { return type_; } - - std::shared_ptr buffer() const { return buffer_; } - - // @returns: a pointer to the page's data - const uint8_t* data() const { return buffer_->data(); } - - // @returns: the total size in bytes of the page's data buffer - int32_t size() const { return static_cast(buffer_->size()); } - - private: - std::shared_ptr buffer_; - PageType::type type_; -}; - -/// \brief Base type for DataPageV1 and DataPageV2 including common attributes -class DataPage : public Page { - public: - int32_t num_values() const { return num_values_; } - Encoding::type encoding() const { return encoding_; } - const EncodedStatistics& statistics() const { return statistics_; } - - protected: - DataPage(PageType::type type, const std::shared_ptr& buffer, int32_t num_values, - Encoding::type encoding, - const EncodedStatistics& statistics = EncodedStatistics()) - : Page(buffer, type), - num_values_(num_values), - encoding_(encoding), - statistics_(statistics) {} - - int32_t num_values_; - Encoding::type encoding_; - EncodedStatistics statistics_; -}; - -class DataPageV1 : 
public DataPage { - public: - DataPageV1(const std::shared_ptr& buffer, int32_t num_values, - Encoding::type encoding, Encoding::type definition_level_encoding, - Encoding::type repetition_level_encoding, - const EncodedStatistics& statistics = EncodedStatistics()) - : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, statistics), - definition_level_encoding_(definition_level_encoding), - repetition_level_encoding_(repetition_level_encoding) {} - - Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } - - Encoding::type definition_level_encoding() const { return definition_level_encoding_; } - - private: - Encoding::type definition_level_encoding_; - Encoding::type repetition_level_encoding_; -}; - -class CompressedDataPage : public DataPageV1 { - public: - CompressedDataPage(const std::shared_ptr& buffer, int32_t num_values, - Encoding::type encoding, Encoding::type definition_level_encoding, - Encoding::type repetition_level_encoding, int64_t uncompressed_size, - const EncodedStatistics& statistics = EncodedStatistics()) - : DataPageV1(buffer, num_values, encoding, definition_level_encoding, - repetition_level_encoding, statistics), - uncompressed_size_(uncompressed_size) {} - - int64_t uncompressed_size() const { return uncompressed_size_; } - - private: - int64_t uncompressed_size_; -}; - -class DataPageV2 : public DataPage { - public: - DataPageV2(const std::shared_ptr& buffer, int32_t num_values, int32_t num_nulls, - int32_t num_rows, Encoding::type encoding, - int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, - bool is_compressed = false) - : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding), - num_nulls_(num_nulls), - num_rows_(num_rows), - definition_levels_byte_length_(definition_levels_byte_length), - repetition_levels_byte_length_(repetition_levels_byte_length), - is_compressed_(is_compressed) {} - - int32_t num_nulls() const { return num_nulls_; } - - int32_t 
num_rows() const { return num_rows_; } - - int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } - - int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } - - bool is_compressed() const { return is_compressed_; } - - private: - int32_t num_nulls_; - int32_t num_rows_; - int32_t definition_levels_byte_length_; - int32_t repetition_levels_byte_length_; - bool is_compressed_; - - // TODO(wesm): format::DataPageHeaderV2.statistics -}; - -class DictionaryPage : public Page { - public: - DictionaryPage(const std::shared_ptr& buffer, int32_t num_values, - Encoding::type encoding, bool is_sorted = false) - : Page(buffer, PageType::DICTIONARY_PAGE), - num_values_(num_values), - encoding_(encoding), - is_sorted_(is_sorted) {} - - int32_t num_values() const { return num_values_; } - - Encoding::type encoding() const { return encoding_; } - - bool is_sorted() const { return is_sorted_; } - - private: - int32_t num_values_; - Encoding::type encoding_; - bool is_sorted_; -}; - -} // namespace parquet - -#endif // PARQUET_COLUMN_PAGE_H diff --git a/r/R/inst/include/parquet/column_reader.h b/r/R/inst/include/parquet/column_reader.h deleted file mode 100644 index e7d6afbb467..00000000000 --- a/r/R/inst/include/parquet/column_reader.h +++ /dev/null @@ -1,255 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "parquet/encoding.h" -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace arrow { - -namespace BitUtil { -class BitReader; -} // namespace BitUtil - -namespace util { -class RleDecoder; -} // namespace util - -} // namespace arrow - -namespace parquet { - -class DictionaryPage; -class Page; - -// 16 MB is the default maximum page header size -static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024; - -// 16 KB is the default expected page header size -static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024; - -class PARQUET_EXPORT LevelDecoder { - public: - LevelDecoder(); - ~LevelDecoder(); - - // Initialize the LevelDecoder state with new data - // and return the number of bytes consumed - int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values, - const uint8_t* data); - - // Decodes a batch of levels into an array and returns the number of levels decoded - int Decode(int batch_size, int16_t* levels); - - private: - int bit_width_; - int num_values_remaining_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_; - std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_; -}; - -// Abstract page iterator interface. 
This way, we can feed column pages to the -// ColumnReader through whatever mechanism we choose -class PARQUET_EXPORT PageReader { - public: - virtual ~PageReader() = default; - - static std::unique_ptr Open( - const std::shared_ptr& stream, int64_t total_num_rows, - Compression::type codec, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // @returns: shared_ptr(nullptr) on EOS, std::shared_ptr - // containing new Page otherwise - virtual std::shared_ptr NextPage() = 0; - - virtual void set_max_page_header_size(uint32_t size) = 0; -}; - -class PARQUET_EXPORT ColumnReader { - public: - virtual ~ColumnReader() = default; - - static std::shared_ptr Make( - const ColumnDescriptor* descr, std::unique_ptr pager, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - // Returns true if there are still values in this column. - virtual bool HasNext() = 0; - - virtual Type::type type() const = 0; - - virtual const ColumnDescriptor* descr() const = 0; -}; - -// API to read values from a single column. This is a main client facing API. -template -class TypedColumnReader : public ColumnReader { - public: - typedef typename DType::c_type T; - - // Read a batch of repetition levels, definition levels, and values from the - // column. - // - // Since null values are not stored in the values, the number of values read - // may be less than the number of repetition and definition levels. With - // nested data this is almost certainly true. - // - // Set def_levels or rep_levels to nullptr if you want to skip reading them. - // This is only safe if you know through some other source that there are no - // undefined values. - // - // To fully exhaust a row group, you must read batches until the number of - // values read reaches the number of stored values according to the metadata. 
- // - // This API is the same for both V1 and V2 of the DataPage - // - // @returns: actual number of levels read (see values_read for number of values read) - virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, - T* values, int64_t* values_read) = 0; - - /// Read a batch of repetition levels, definition levels, and values from the - /// column and leave spaces for null entries on the lowest level in the values - /// buffer. - /// - /// In comparision to ReadBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). - /// - /// To fully exhaust a row group, you must read batches until the number of - /// values read reaches the number of stored values according to the metadata. - /// - /// @param batch_size the number of levels to read - /// @param[out] def_levels The Parquet definition levels, output has - /// the length levels_read. - /// @param[out] rep_levels The Parquet repetition levels, output has - /// the length levels_read. - /// @param[out] values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; output has the length - /// values_read. - /// @param[out] valid_bits Memory allocated for a bitmap that indicates if - /// the row is null or on the maximum definition level. For performance - /// reasons the underlying buffer should be able to store 1 bit more than - /// required. If this requires an additional byte, this byte is only read - /// but never written to. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param[out] levels_read The number of repetition/definition levels that were read. 
- /// @param[out] values_read The number of values read, this includes all - /// non-null entries as well as all null-entries on the lowest level - /// (i.e. definition_level == max_definition_level - 1) - /// @param[out] null_count The number of nulls on the lowest levels. - /// (i.e. (values_read - null_count) is total number of non-null entries) - virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, - int16_t* rep_levels, T* values, uint8_t* valid_bits, - int64_t valid_bits_offset, int64_t* levels_read, - int64_t* values_read, int64_t* null_count) = 0; - - // Skip reading levels - // Returns the number of levels skipped - virtual int64_t Skip(int64_t num_rows_to_skip) = 0; -}; - -namespace internal { - -static inline void DefinitionLevelsToBitmap( - const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset) { - // We assume here that valid_bits is large enough to accommodate the - // additional definition levels and the ones that have already been written - ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, - valid_bits_offset + num_def_levels); - - // TODO(itaiin): As an interim solution we are splitting the code path here - // between repeated+flat column reads, and non-repeated+nested reads. 
- // Those paths need to be merged in the future - for (int i = 0; i < num_def_levels; ++i) { - if (def_levels[i] == max_definition_level) { - valid_bits_writer.Set(); - } else if (max_repetition_level > 0) { - // repetition+flat case - if (def_levels[i] == (max_definition_level - 1)) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - continue; - } - } else { - // non-repeated+nested case - if (def_levels[i] < max_definition_level) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - throw ParquetException("definition level exceeds maximum"); - } - } - - valid_bits_writer.Next(); - } - valid_bits_writer.Finish(); - *values_read = valid_bits_writer.position(); -} - -} // namespace internal - -namespace internal { - -// TODO(itaiin): another code path split to merge when the general case is done -static inline bool HasSpacedValues(const ColumnDescriptor* descr) { - if (descr->max_repetition_level() > 0) { - // repeated+flat case - return !descr->schema_node()->is_required(); - } else { - // non-repeated+nested case - // Find if a node forces nulls in the lowest level along the hierarchy - const schema::Node* node = descr->schema_node().get(); - while (node) { - if (node->is_optional()) { - return true; - } - node = node->parent(); - } - return false; - } -} - -} // namespace internal - -using BoolReader = TypedColumnReader; -using Int32Reader = TypedColumnReader; -using Int64Reader = TypedColumnReader; -using Int96Reader = TypedColumnReader; -using FloatReader = TypedColumnReader; -using DoubleReader = TypedColumnReader; -using ByteArrayReader = TypedColumnReader; -using FixedLenByteArrayReader = TypedColumnReader; - -} // namespace parquet diff --git a/r/R/inst/include/parquet/column_scanner.h b/r/R/inst/include/parquet/column_scanner.h deleted file mode 100644 index 9f65d1866b9..00000000000 --- a/r/R/inst/include/parquet/column_scanner.h +++ /dev/null @@ -1,265 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or 
more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_COLUMN_SCANNER_H -#define PARQUET_COLUMN_SCANNER_H - -#include -#include -#include -#include -#include -#include - -#include "parquet/column_reader.h" -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace parquet { - -static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128; - -class PARQUET_EXPORT Scanner { - public: - explicit Scanner(std::shared_ptr reader, - int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : batch_size_(batch_size), - level_offset_(0), - levels_buffered_(0), - value_buffer_(AllocateBuffer(pool)), - value_offset_(0), - values_buffered_(0), - reader_(reader) { - def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0); - rep_levels_.resize(descr()->max_repetition_level() > 0 ? 
batch_size_ : 0); - } - - virtual ~Scanner() {} - - static std::shared_ptr Make( - std::shared_ptr col_reader, - int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0; - - bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); } - - const ColumnDescriptor* descr() const { return reader_->descr(); } - - int64_t batch_size() const { return batch_size_; } - - void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; } - - protected: - int64_t batch_size_; - - std::vector def_levels_; - std::vector rep_levels_; - int level_offset_; - int levels_buffered_; - - std::shared_ptr value_buffer_; - int value_offset_; - int64_t values_buffered_; - - private: - std::shared_ptr reader_; -}; - -template -class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner { - public: - typedef typename DType::c_type T; - - explicit TypedScanner(std::shared_ptr reader, - int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : Scanner(reader, batch_size, pool) { - typed_reader_ = static_cast*>(reader.get()); - int value_byte_size = type_traits::value_byte_size; - PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size)); - values_ = reinterpret_cast(value_buffer_->mutable_data()); - } - - virtual ~TypedScanner() {} - - bool NextLevels(int16_t* def_level, int16_t* rep_level) { - if (level_offset_ == levels_buffered_) { - levels_buffered_ = static_cast( - typed_reader_->ReadBatch(static_cast(batch_size_), def_levels_.data(), - rep_levels_.data(), values_, &values_buffered_)); - - value_offset_ = 0; - level_offset_ = 0; - if (!levels_buffered_) { - return false; - } - } - *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0; - *rep_level = descr()->max_repetition_level() > 0 ? 
rep_levels_[level_offset_] : 0; - level_offset_++; - return true; - } - - bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) { - if (level_offset_ == levels_buffered_) { - if (!HasNext()) { - // Out of data pages - return false; - } - } - - NextLevels(def_level, rep_level); - *is_null = *def_level < descr()->max_definition_level(); - - if (*is_null) { - return true; - } - - if (value_offset_ == values_buffered_) { - throw ParquetException("Value was non-null, but has not been buffered"); - } - *val = values_[value_offset_++]; - return true; - } - - // Returns true if there is a next value - bool NextValue(T* val, bool* is_null) { - if (level_offset_ == levels_buffered_) { - if (!HasNext()) { - // Out of data pages - return false; - } - } - - // Out of values - int16_t def_level = -1; - int16_t rep_level = -1; - NextLevels(&def_level, &rep_level); - *is_null = def_level < descr()->max_definition_level(); - - if (*is_null) { - return true; - } - - if (value_offset_ == values_buffered_) { - throw ParquetException("Value was non-null, but has not been buffered"); - } - *val = values_[value_offset_++]; - return true; - } - - virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) { - T val; - int16_t def_level = -1; - int16_t rep_level = -1; - bool is_null = false; - char buffer[80]; - - if (!Next(&val, &def_level, &rep_level, &is_null)) { - throw ParquetException("No more values buffered"); - } - - if (with_levels) { - out << " D:" << def_level << " R:" << rep_level << " "; - if (!is_null) { - out << "V:"; - } - } - - if (is_null) { - std::string null_fmt = format_fwf(width); - snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL"); - } else { - FormatValue(&val, buffer, sizeof(buffer), width); - } - out << buffer; - } - - private: - // The ownership of this object is expressed through the reader_ variable in the base - TypedColumnReader* typed_reader_; - - inline void FormatValue(void* val, char* buffer, int bufsize, 
int width); - - T* values_; -}; - -template -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast(val)); -} - -template <> -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - std::string result = Int96ToString(*reinterpret_cast(val)); - snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); -} - -template <> -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - std::string result = ByteArrayToString(*reinterpret_cast(val)); - snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); -} - -template <> -inline void TypedScanner::FormatValue(void* val, char* buffer, int bufsize, - int width) { - std::string fmt = format_fwf(width); - std::string result = FixedLenByteArrayToString( - *reinterpret_cast(val), descr()->type_length()); - snprintf(buffer, bufsize, fmt.c_str(), result.c_str()); -} - -typedef TypedScanner BoolScanner; -typedef TypedScanner Int32Scanner; -typedef TypedScanner Int64Scanner; -typedef TypedScanner Int96Scanner; -typedef TypedScanner FloatScanner; -typedef TypedScanner DoubleScanner; -typedef TypedScanner ByteArrayScanner; -typedef TypedScanner FixedLenByteArrayScanner; - -template -int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels, - uint8_t* values, int64_t* values_buffered, - parquet::ColumnReader* reader) { - typedef typename RType::T Type; - auto typed_reader = static_cast(reader); - auto vals = reinterpret_cast(&values[0]); - return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals, - values_buffered); -} - -int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels, - int16_t* rep_levels, uint8_t* values, - int64_t* values_buffered, - parquet::ColumnReader* reader); - -} // namespace parquet 
- -#endif // PARQUET_COLUMN_SCANNER_H diff --git a/r/R/inst/include/parquet/column_writer.h b/r/R/inst/include/parquet/column_writer.h deleted file mode 100644 index 023b96585eb..00000000000 --- a/r/R/inst/include/parquet/column_writer.h +++ /dev/null @@ -1,192 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/memory_pool.h" - -#include "parquet/column_page.h" -#include "parquet/encoding.h" -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace arrow { - -namespace BitUtil { -class BitWriter; -} // namespace BitUtil - -namespace util { -class RleEncoder; -} // namespace util - -} // namespace arrow - -namespace parquet { - -class ColumnChunkMetaDataBuilder; -class WriterProperties; - -class PARQUET_EXPORT LevelEncoder { - public: - LevelEncoder(); - ~LevelEncoder(); - - static int MaxBufferSize(Encoding::type encoding, int16_t max_level, - int num_buffered_values); - - // Initialize the LevelEncoder. 
- void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values, - uint8_t* data, int data_size); - - // Encodes a batch of levels from an array and returns the number of levels encoded - int Encode(int batch_size, const int16_t* levels); - - int32_t len() { - if (encoding_ != Encoding::RLE) { - throw ParquetException("Only implemented for RLE encoding"); - } - return rle_length_; - } - - private: - int bit_width_; - int rle_length_; - Encoding::type encoding_; - std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_; - std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_; -}; - -class PARQUET_EXPORT PageWriter { - public: - virtual ~PageWriter() {} - - static std::unique_ptr Open( - const std::shared_ptr& sink, Compression::type codec, - ColumnChunkMetaDataBuilder* metadata, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), - bool buffered_row_group = false); - - // The Column Writer decides if dictionary encoding is used if set and - // if the dictionary encoding has fallen back to default encoding on reaching dictionary - // page limit - virtual void Close(bool has_dictionary, bool fallback) = 0; - - virtual int64_t WriteDataPage(const CompressedDataPage& page) = 0; - - virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0; - - virtual bool has_compressor() = 0; - - virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0; -}; - -static constexpr int WRITE_BATCH_SIZE = 1000; -class PARQUET_EXPORT ColumnWriter { - public: - virtual ~ColumnWriter() = default; - - static std::shared_ptr Make(ColumnChunkMetaDataBuilder*, - std::unique_ptr, - const WriterProperties* properties); - - /// \brief Closes the ColumnWriter, commits any buffered values to pages. 
- /// \return Total size of the column in bytes - virtual int64_t Close() = 0; - - /// \brief The physical Parquet type of the column - virtual Type::type type() const = 0; - - /// \brief The schema for the column - virtual const ColumnDescriptor* descr() const = 0; - - /// \brief The number of rows written so far - virtual int64_t rows_written() const = 0; - - /// \brief The total size of the compressed pages + page headers. Some values - /// might be still buffered an not written to a page yet - virtual int64_t total_compressed_bytes() const = 0; - - /// \brief The total number of bytes written as serialized data and - /// dictionary pages to the ColumnChunk so far - virtual int64_t total_bytes_written() const = 0; - - /// \brief The file-level writer properties - virtual const WriterProperties* properties() = 0; -}; - -// API to write values to a single column. This is the main client facing API. -template -class TypedColumnWriter : public ColumnWriter { - public: - using T = typename DType::c_type; - - // Write a batch of repetition levels, definition levels, and values to the - // column. - virtual void WriteBatch(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const T* values) = 0; - - /// Write a batch of repetition levels, definition levels, and values to the - /// column. - /// - /// In comparision to WriteBatch the length of repetition and definition levels - /// is the same as of the number of values read for max_definition_level == 1. - /// In the case of max_definition_level > 1, the repetition and definition - /// levels are larger than the values but the values include the null entries - /// with definition_level == (max_definition_level - 1). Thus we have to differentiate - /// in the parameters of this function if the input has the length of num_values or the - /// _number of rows in the lowest nesting level_. 
- /// - /// In the case that the most inner node in the Parquet is required, the _number of rows - /// in the lowest nesting level_ is equal to the number of non-null values. If the - /// inner-most schema node is optional, the _number of rows in the lowest nesting level_ - /// also includes all values with definition_level == (max_definition_level - 1). - /// - /// @param num_values number of levels to write. - /// @param def_levels The Parquet definiton levels, length is num_values - /// @param rep_levels The Parquet repetition levels, length is num_values - /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting - /// level. The length is number of rows in the lowest nesting level. - /// @param valid_bits_offset The offset in bits of the valid_bits where the - /// first relevant bit resides. - /// @param values The values in the lowest nested level including - /// spacing for nulls on the lowest levels; input has the length - /// of the number of rows on the lowest nesting level. 
- virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, - const int16_t* rep_levels, const uint8_t* valid_bits, - int64_t valid_bits_offset, const T* values) = 0; - - // Estimated size of the values that are not written to a page yet - virtual int64_t EstimatedBufferedValueBytes() const = 0; -}; - -using BoolWriter = TypedColumnWriter; -using Int32Writer = TypedColumnWriter; -using Int64Writer = TypedColumnWriter; -using Int96Writer = TypedColumnWriter; -using FloatWriter = TypedColumnWriter; -using DoubleWriter = TypedColumnWriter; -using ByteArrayWriter = TypedColumnWriter; -using FixedLenByteArrayWriter = TypedColumnWriter; - -} // namespace parquet diff --git a/r/R/inst/include/parquet/deprecated_io.h b/r/R/inst/include/parquet/deprecated_io.h deleted file mode 100644 index 8dfdeda5d24..00000000000 --- a/r/R/inst/include/parquet/deprecated_io.h +++ /dev/null @@ -1,135 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// DEPRECATED IO INTERFACES: We have transitioned to using the Apache -// Arrow file input and output abstract interfaces defined in -// arrow/io/interfaces.h. 
These legacy interfaces are being preserved -// through a wrapper layer for one to two releases - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "arrow/buffer.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" -#include "arrow/memory_pool.h" - -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/types.h" - -namespace parquet { - -class PARQUET_EXPORT FileInterface { - public: - virtual ~FileInterface() = default; - - // Close the file - virtual void Close() = 0; - - // Return the current position in the file relative to the start - virtual int64_t Tell() = 0; -}; - -/// It is the responsibility of implementations to mind threadsafety of shared -/// resources -class PARQUET_EXPORT RandomAccessSource : virtual public FileInterface { - public: - virtual ~RandomAccessSource() = default; - - virtual int64_t Size() const = 0; - - // Returns bytes read - virtual int64_t Read(int64_t nbytes, uint8_t* out) = 0; - - virtual std::shared_ptr Read(int64_t nbytes) = 0; - - virtual std::shared_ptr ReadAt(int64_t position, int64_t nbytes) = 0; - - /// Returns bytes read - virtual int64_t ReadAt(int64_t position, int64_t nbytes, uint8_t* out) = 0; -}; - -class PARQUET_EXPORT OutputStream : virtual public FileInterface { - public: - virtual ~OutputStream() = default; - - // Copy bytes into the output stream - virtual void Write(const uint8_t* data, int64_t length) = 0; -}; - -// ---------------------------------------------------------------------- -// Wrapper classes - -class PARQUET_EXPORT ParquetInputWrapper : public ::arrow::io::RandomAccessFile { - public: - explicit ParquetInputWrapper(std::unique_ptr source); - explicit ParquetInputWrapper(RandomAccessSource* source); - - ~ParquetInputWrapper() override; - - // FileInterface - ::arrow::Status Close() override; - ::arrow::Status Tell(int64_t* position) const override; - bool closed() const override; - - // Seekable - ::arrow::Status 
Seek(int64_t position) override; - - // InputStream / RandomAccessFile - ::arrow::Status Read(int64_t nbytes, int64_t* bytes_read, void* out) override; - ::arrow::Status Read(int64_t nbytes, std::shared_ptr* out) override; - ::arrow::Status ReadAt(int64_t position, int64_t nbytes, - std::shared_ptr* out) override; - ::arrow::Status GetSize(int64_t* size) override; - - private: - std::unique_ptr owned_source_; - RandomAccessSource* source_; - bool closed_; -}; - -class PARQUET_EXPORT ParquetOutputWrapper : public ::arrow::io::OutputStream { - public: - explicit ParquetOutputWrapper(const std::shared_ptr<::parquet::OutputStream>& sink); - explicit ParquetOutputWrapper(std::unique_ptr<::parquet::OutputStream> sink); - explicit ParquetOutputWrapper(::parquet::OutputStream* sink); - - ~ParquetOutputWrapper() override; - - // FileInterface - ::arrow::Status Close() override; - ::arrow::Status Tell(int64_t* position) const override; - bool closed() const override; - - // Writable - ::arrow::Status Write(const void* data, int64_t nbytes) override; - - private: - std::unique_ptr<::parquet::OutputStream> owned_sink_; - std::shared_ptr<::parquet::OutputStream> shared_sink_; - ::parquet::OutputStream* sink_; - bool closed_; -}; - -} // namespace parquet diff --git a/r/R/inst/include/parquet/encoding.h b/r/R/inst/include/parquet/encoding.h deleted file mode 100644 index 28a9b98716f..00000000000 --- a/r/R/inst/include/parquet/encoding.h +++ /dev/null @@ -1,358 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/types.h" - -namespace parquet { - -class ColumnDescriptor; - -// Untyped base for all encoders -class Encoder { - public: - virtual ~Encoder() = default; - - virtual int64_t EstimatedDataEncodedSize() = 0; - virtual std::shared_ptr FlushValues() = 0; - virtual Encoding::type encoding() const = 0; - - virtual ::arrow::MemoryPool* memory_pool() const = 0; -}; - -// Base class for value encoders. Since encoders may or not have state (e.g., -// dictionary encoding) we use a class instance to maintain any state. 
-// -// TODO(wesm): Encode interface API is temporary -template -class TypedEncoder : virtual public Encoder { - public: - typedef typename DType::c_type T; - - virtual void Put(const T* src, int num_values) = 0; - - virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( - this->memory_pool(), num_values * sizeof(T), &buffer)); - int32_t num_valid_values = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - T* data = reinterpret_cast(buffer->mutable_data()); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - data[num_valid_values++] = src[i]; - } - valid_bits_reader.Next(); - } - Put(data, num_valid_values); - } -}; - -// Base class for dictionary encoders -template -class DictEncoder : virtual public TypedEncoder { - public: - /// Writes out any buffered indices to buffer preceded by the bit width of this data. - /// Returns the number of bytes written. - /// If the supplied buffer is not big enough, returns -1. - /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize() - /// to size buffer. - virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0; - - virtual int dict_encoded_size() = 0; - // virtual int dict_encoded_size() { return dict_encoded_size_; } - - virtual int bit_width() const = 0; - - /// Writes out the encoded dictionary to buffer. buffer must be preallocated to - /// dict_encoded_size() bytes. - virtual void WriteDict(uint8_t* buffer) = 0; - - virtual int num_entries() const = 0; -}; - -// ---------------------------------------------------------------------- -// Value decoding - -class Decoder { - public: - virtual ~Decoder() = default; - - // Sets the data for a new page. This will be called multiple times on the same - // decoder and should reset all internal state. 
- virtual void SetData(int num_values, const uint8_t* data, int len) = 0; - - // Returns the number of values left (for the last call to SetData()). This is - // the number of values left in this page. - virtual int values_left() const = 0; - virtual Encoding::type encoding() const = 0; -}; - -template -class TypedDecoder : virtual public Decoder { - public: - using T = typename DType::c_type; - - // Subclasses should override the ones they support. In each of these functions, - // the decoder would decode put to 'max_values', storing the result in 'buffer'. - // The function returns the number of values decoded, which should be max_values - // except for end of the current data page. - virtual int Decode(T* buffer, int max_values) = 0; - - // Decode the values in this data page but leave spaces for null entries. - // - // num_values is the size of the def_levels and buffer arrays including the number of - // null values. - virtual int DecodeSpaced(T* buffer, int num_values, int null_count, - const uint8_t* valid_bits, int64_t valid_bits_offset) { - int values_to_read = num_values - null_count; - int values_read = Decode(buffer, values_to_read); - if (values_read != values_to_read) { - throw ParquetException("Number of values / definition_levels read did not match"); - } - - // Depending on the number of nulls, some of the value slots in buffer may - // be uninitialized, and this will cause valgrind warnings / potentially UB - memset(static_cast(buffer + values_read), 0, - (num_values - values_read) * sizeof(T)); - - // Add spacing for null entries. As we have filled the buffer from the front, - // we need to add the spacing from the back. 
- int values_to_move = values_read; - for (int i = num_values - 1; i >= 0; i--) { - if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - buffer[i] = buffer[--values_to_move]; - } - } - return num_values; - } -}; - -template -class DictDecoder : virtual public TypedDecoder { - public: - virtual void SetDict(TypedDecoder* dictionary) = 0; -}; - -// ---------------------------------------------------------------------- -// TypedEncoder specializations, traits, and factory functions - -class BooleanEncoder : virtual public TypedEncoder { - public: - using TypedEncoder::Put; - virtual void Put(const std::vector& src, int num_values) = 0; -}; - -using Int32Encoder = TypedEncoder; -using Int64Encoder = TypedEncoder; -using Int96Encoder = TypedEncoder; -using FloatEncoder = TypedEncoder; -using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder {}; -class FLBAEncoder : virtual public TypedEncoder {}; - -class BooleanDecoder : virtual public TypedDecoder { - public: - using TypedDecoder::Decode; - virtual int Decode(uint8_t* buffer, int max_values) = 0; -}; - -using Int32Decoder = TypedDecoder; -using Int64Decoder = TypedDecoder; -using Int96Decoder = TypedDecoder; -using FloatDecoder = TypedDecoder; -using DoubleDecoder = TypedDecoder; - -class ByteArrayDecoder : virtual public TypedDecoder { - public: - using TypedDecoder::DecodeSpaced; - - class WrappedBuilderInterface { - public: - virtual void Reserve(int64_t values) = 0; - virtual void Append(const uint8_t* value, uint32_t length) = 0; - virtual void AppendNull() = 0; - virtual ~WrappedBuilderInterface() = default; - }; - - template - class WrappedBuilder : public WrappedBuilderInterface { - public: - explicit WrappedBuilder(Builder* builder) : builder_(builder) {} - - void Reserve(int64_t values) override { - PARQUET_THROW_NOT_OK(builder_->Reserve(values)); - } - void Append(const uint8_t* value, uint32_t length) override { - PARQUET_THROW_NOT_OK(builder_->Append(value, 
length)); - } - - void AppendNull() override { PARQUET_THROW_NOT_OK(builder_->AppendNull()); } - - private: - Builder* builder_; - }; - - template - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, Builder* builder) { - int result = 0; - WrappedBuilder wrapped_builder(builder); - PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, &wrapped_builder, &result)); - return result; - } - - template - int DecodeArrowNonNull(int num_values, Builder* builder) { - int result = 0; - WrappedBuilder wrapped_builder(builder); - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, &wrapped_builder, &result)); - return result; - } - - private: - virtual ::arrow::Status DecodeArrow(int num_values, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, - WrappedBuilderInterface* builder, - int* values_decoded) = 0; - - virtual ::arrow::Status DecodeArrowNonNull(int num_values, - WrappedBuilderInterface* builder, - int* values_decoded) = 0; -}; - -class FLBADecoder : virtual public TypedDecoder { - public: - using TypedDecoder::DecodeSpaced; - - // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if - // there is value in adding specialized read methods for - // FIXED_LEN_BYTE_ARRAY. 
If only Decimal data can occur with this data type - // then perhaps not -}; - -template -struct EncodingTraits {}; - -template <> -struct EncodingTraits { - using Encoder = BooleanEncoder; - using Decoder = BooleanDecoder; -}; - -template <> -struct EncodingTraits { - using Encoder = Int32Encoder; - using Decoder = Int32Decoder; -}; - -template <> -struct EncodingTraits { - using Encoder = Int64Encoder; - using Decoder = Int64Decoder; -}; - -template <> -struct EncodingTraits { - using Encoder = Int96Encoder; - using Decoder = Int96Decoder; -}; - -template <> -struct EncodingTraits { - using Encoder = FloatEncoder; - using Decoder = FloatDecoder; -}; - -template <> -struct EncodingTraits { - using Encoder = DoubleEncoder; - using Decoder = DoubleDecoder; -}; - -template <> -struct EncodingTraits { - using Encoder = ByteArrayEncoder; - using Decoder = ByteArrayDecoder; -}; - -template <> -struct EncodingTraits { - using Encoder = FLBAEncoder; - using Decoder = FLBADecoder; -}; - -PARQUET_EXPORT -std::unique_ptr MakeEncoder( - Type::type type_num, Encoding::type encoding, bool use_dictionary = false, - const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - -template -std::unique_ptr::Encoder> MakeTypedEncoder( - Encoding::type encoding, bool use_dictionary = false, - const ColumnDescriptor* descr = NULLPTR, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - using OutType = typename EncodingTraits::Encoder; - std::unique_ptr base = - MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool); - return std::unique_ptr(dynamic_cast(base.release())); -} - -PARQUET_EXPORT -std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encoding, - const ColumnDescriptor* descr = NULLPTR); - -namespace detail { - -PARQUET_EXPORT -std::unique_ptr MakeDictDecoder(Type::type type_num, - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool); - -} // namespace detail - -template -std::unique_ptr> 
MakeDictDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - using OutType = DictDecoder; - auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); - return std::unique_ptr(dynamic_cast(decoder.release())); -} - -template -std::unique_ptr::Decoder> MakeTypedDecoder( - Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) { - using OutType = typename EncodingTraits::Decoder; - std::unique_ptr base = MakeDecoder(DType::type_num, encoding, descr); - return std::unique_ptr(dynamic_cast(base.release())); -} - -} // namespace parquet diff --git a/r/R/inst/include/parquet/encryption_internal.h b/r/R/inst/include/parquet/encryption_internal.h deleted file mode 100644 index af668dc4136..00000000000 --- a/r/R/inst/include/parquet/encryption_internal.h +++ /dev/null @@ -1,114 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_ENCRYPTION_INTERNAL_H -#define PARQUET_ENCRYPTION_INTERNAL_H - -#include -#include -#include - -#include "parquet/properties.h" -#include "parquet/types.h" - -using parquet::ParquetCipher; - -namespace parquet { -namespace encryption { - -constexpr int kGcmTagLength = 16; -constexpr int kNonceLength = 12; - -// Module types -constexpr int8_t kFooter = 0; -constexpr int8_t kColumnMetaData = 1; -constexpr int8_t kDataPage = 2; -constexpr int8_t kDictionaryPage = 3; -constexpr int8_t kDataPageHeader = 4; -constexpr int8_t kDictionaryPageHeader = 5; -constexpr int8_t kColumnIndex = 6; -constexpr int8_t kOffsetIndex = 7; - -/// Performs AES encryption operations with GCM or CTR ciphers. -class AesEncryptor { - public: - static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_encryptors); - - ~AesEncryptor(); - - /// Size difference between plaintext and ciphertext, for this cipher. - int CiphertextSizeDelta(); - - /// Encrypts plaintext with the key and aad. Key length is passed only for validation. - /// If different from value in constructor, exception will be thrown. - int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* ciphertext); - - /// Encrypts plaintext footer, in order to compute footer signature (tag). - int SignedFooterEncrypt(const uint8_t* footer, int footer_len, uint8_t* key, - int key_len, uint8_t* aad, int aad_len, uint8_t* nonce, - uint8_t* encrypted_footer); - - void WipeOut(); - - private: - /// Can serve one key length only. Possible values: 16, 24, 32 bytes. - explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata); - // PIMPL Idiom - class AesEncryptorImpl; - std::unique_ptr impl_; -}; - -/// Performs AES decryption operations with GCM or CTR ciphers. 
-class AesDecryptor { - public: - static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata, - std::shared_ptr> all_decryptors); - - ~AesDecryptor(); - void WipeOut(); - - /// Size difference between plaintext and ciphertext, for this cipher. - int CiphertextSizeDelta(); - - /// Decrypts ciphertext with the key and aad. Key length is passed only for - /// validation. If different from value in constructor, exception will be thrown. - int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* key, int key_len, - uint8_t* aad, int aad_len, uint8_t* plaintext); - - private: - /// Can serve one key length only. Possible values: 16, 24, 32 bytes. - explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata); - // PIMPL Idiom - class AesDecryptorImpl; - std::unique_ptr impl_; -}; - -std::string CreateModuleAad(const std::string& file_aad, int8_t module_type, - int16_t row_group_ordinal, int16_t column_ordinal, - int16_t page_ordinal); - -std::string CreateFooterAad(const std::string& aad_prefix_bytes); - -// Update last two bytes of page (or page header) module AAD -void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal); - -} // namespace encryption -} // namespace parquet - -#endif // PARQUET_ENCRYPTION_INTERNAL_H diff --git a/r/R/inst/include/parquet/exception.h b/r/R/inst/include/parquet/exception.h deleted file mode 100644 index 7db3ab756f0..00000000000 --- a/r/R/inst/include/parquet/exception.h +++ /dev/null @@ -1,91 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_EXCEPTION_H -#define PARQUET_EXCEPTION_H - -#include -#include -#include - -#include "arrow/status.h" -#include "parquet/platform.h" - -// PARQUET-1085 -#if !defined(ARROW_UNUSED) -#define ARROW_UNUSED(x) UNUSED(x) -#endif - -#define PARQUET_CATCH_NOT_OK(s) \ - try { \ - (s); \ - } catch (const ::parquet::ParquetException& e) { \ - return ::arrow::Status::IOError(e.what()); \ - } - -#define PARQUET_IGNORE_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - ARROW_UNUSED(_s); \ - } while (0) - -#define PARQUET_THROW_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (!_s.ok()) { \ - std::stringstream ss; \ - ss << "Arrow error: " << _s.ToString(); \ - throw ::parquet::ParquetException(ss.str()); \ - } \ - } while (0) - -namespace parquet { - -class ParquetException : public std::exception { - public: - PARQUET_NORETURN static void EofException(const std::string& msg = "") { - std::stringstream ss; - ss << "Unexpected end of stream"; - if (!msg.empty()) { - ss << ": " << msg; - } - throw ParquetException(ss.str()); - } - - PARQUET_NORETURN static void NYI(const std::string& msg = "") { - std::stringstream ss; - ss << "Not yet implemented: " << msg << "."; - throw ParquetException(ss.str()); - } - - explicit ParquetException(const char* msg) : msg_(msg) {} - - explicit ParquetException(const std::string& msg) : msg_(msg) {} - - explicit ParquetException(const char* msg, std::exception&) : msg_(msg) {} - - ~ParquetException() throw() override {} - - const char* what() const throw() override { return msg_.c_str(); } - - private: 
- std::string msg_; -}; - -} // namespace parquet - -#endif // PARQUET_EXCEPTION_H diff --git a/r/R/inst/include/parquet/file_reader.h b/r/R/inst/include/parquet/file_reader.h deleted file mode 100644 index 214cf112600..00000000000 --- a/r/R/inst/include/parquet/file_reader.h +++ /dev/null @@ -1,141 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_FILE_READER_H -#define PARQUET_FILE_READER_H - -#include -#include -#include -#include - -#include "parquet/metadata.h" // IWYU pragma:: keep -#include "parquet/platform.h" -#include "parquet/properties.h" - -namespace parquet { - -class ColumnReader; -class FileMetaData; -class PageReader; -class RandomAccessSource; -class RowGroupMetaData; - -class PARQUET_EXPORT RowGroupReader { - public: - // Forward declare a virtual class 'Contents' to aid dependency injection and more - // easily create test fixtures - // An implementation of the Contents class is defined in the .cc file - struct Contents { - virtual ~Contents() {} - virtual std::unique_ptr GetColumnPageReader(int i) = 0; - virtual const RowGroupMetaData* metadata() const = 0; - virtual const ReaderProperties* properties() const = 0; - }; - - explicit RowGroupReader(std::unique_ptr contents); - - // Returns the rowgroup metadata - const RowGroupMetaData* metadata() const; - - // Construct a ColumnReader for the indicated row group-relative - // column. Ownership is shared with the RowGroupReader. 
- std::shared_ptr Column(int i); - - std::unique_ptr GetColumnPageReader(int i); - - private: - // Holds a pointer to an instance of Contents implementation - std::unique_ptr contents_; -}; - -class PARQUET_EXPORT ParquetFileReader { - public: - // Declare a virtual class 'Contents' to aid dependency injection and more - // easily create test fixtures - // An implementation of the Contents class is defined in the .cc file - struct PARQUET_EXPORT Contents { - static std::unique_ptr Open( - const std::shared_ptr<::arrow::io::RandomAccessFile>& source, - const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = NULLPTR); - - virtual ~Contents() = default; - // Perform any cleanup associated with the file contents - virtual void Close() = 0; - virtual std::shared_ptr GetRowGroup(int i) = 0; - virtual std::shared_ptr metadata() const = 0; - }; - - ParquetFileReader(); - ~ParquetFileReader(); - - // Create a reader from some implementation of parquet-cpp's generic file - // input interface - // - // If you cannot provide exclusive access to your file resource, create a - // subclass of RandomAccessSource that wraps the shared resource - ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version") - static std::unique_ptr Open( - std::unique_ptr source, - const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = NULLPTR); - - // Create a file reader instance from an Arrow file object. Thread-safety is - // the responsibility of the file implementation - static std::unique_ptr Open( - const std::shared_ptr<::arrow::io::RandomAccessFile>& source, - const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = NULLPTR); - - // API Convenience to open a serialized Parquet file on disk, using Arrow IO - // interfaces. 
- static std::unique_ptr OpenFile( - const std::string& path, bool memory_map = true, - const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = NULLPTR); - - void Open(std::unique_ptr contents); - void Close(); - - // The RowGroupReader is owned by the FileReader - std::shared_ptr RowGroup(int i); - - // Returns the file metadata. Only one instance is ever created - std::shared_ptr metadata() const; - - private: - // Holds a pointer to an instance of Contents implementation - std::unique_ptr contents_; -}; - -// Read only Parquet file metadata -std::shared_ptr PARQUET_EXPORT -ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); - -/// \brief Scan all values in file. Useful for performance testing -/// \param[in] columns the column numbers to scan. If empty scans all -/// \param[in] column_batch_size number of values to read at a time when scanning column -/// \param[in] reader a ParquetFileReader instance -/// \return number of semantic rows in file -PARQUET_EXPORT -int64_t ScanFileContents(std::vector columns, const int32_t column_batch_size, - ParquetFileReader* reader); - -} // namespace parquet - -#endif // PARQUET_FILE_READER_H diff --git a/r/R/inst/include/parquet/file_writer.h b/r/R/inst/include/parquet/file_writer.h deleted file mode 100644 index cd512cf817d..00000000000 --- a/r/R/inst/include/parquet/file_writer.h +++ /dev/null @@ -1,237 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_FILE_WRITER_H -#define PARQUET_FILE_WRITER_H - -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/metadata.h" -#include "parquet/platform.h" -#include "parquet/properties.h" -#include "parquet/schema.h" - -namespace arrow { - -class MemoryPool; - -namespace io { - -class OutputStream; - -} // namespace io -} // namespace arrow - -namespace parquet { - -class ColumnWriter; -class OutputStream; - -class PARQUET_EXPORT RowGroupWriter { - public: - // Forward declare a virtual class 'Contents' to aid dependency injection and more - // easily create test fixtures - // An implementation of the Contents class is defined in the .cc file - struct Contents { - virtual ~Contents() = default; - virtual int num_columns() const = 0; - virtual int64_t num_rows() const = 0; - - // to be used only with ParquetFileWriter::AppendRowGroup - virtual ColumnWriter* NextColumn() = 0; - // to be used only with ParquetFileWriter::AppendBufferedRowGroup - virtual ColumnWriter* column(int i) = 0; - - virtual int current_column() const = 0; - virtual void Close() = 0; - - // total bytes written by the page writer - virtual int64_t total_bytes_written() const = 0; - // total bytes still compressed but not written - virtual int64_t total_compressed_bytes() const = 0; - }; - - explicit RowGroupWriter(std::unique_ptr contents); - - /// Construct a ColumnWriter for the indicated row group-relative column. - /// - /// To be used only with ParquetFileWriter::AppendRowGroup - /// Ownership is solely within the RowGroupWriter. 
The ColumnWriter is only - /// valid until the next call to NextColumn or Close. As the contents are - /// directly written to the sink, once a new column is started, the contents - /// of the previous one cannot be modified anymore. - ColumnWriter* NextColumn(); - /// Index of currently written column - int current_column(); - void Close(); - - int num_columns() const; - - /// Construct a ColumnWriter for the indicated row group column. - /// - /// To be used only with ParquetFileWriter::AppendBufferedRowGroup - /// Ownership is solely within the RowGroupWriter. The ColumnWriter is - /// valid until Close. The contents are buffered in memory and written to sink - /// on Close - ColumnWriter* column(int i); - - /** - * Number of rows that shall be written as part of this RowGroup. - */ - int64_t num_rows() const; - - int64_t total_bytes_written() const; - int64_t total_compressed_bytes() const; - - private: - // Holds a pointer to an instance of Contents implementation - std::unique_ptr contents_; -}; - -ARROW_DEPRECATED("Use version with arrow::io::OutputStream*") -PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); - -PARQUET_EXPORT -void WriteFileMetaData(const FileMetaData& file_metadata, - ::arrow::io::OutputStream* sink); - -class PARQUET_EXPORT ParquetFileWriter { - public: - // Forward declare a virtual class 'Contents' to aid dependency injection and more - // easily create test fixtures - // An implementation of the Contents class is defined in the .cc file - struct Contents { - Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema, - const std::shared_ptr& key_value_metadata) - : schema_(), key_value_metadata_(key_value_metadata) { - schema_.Init(schema); - } - virtual ~Contents() {} - // Perform any cleanup associated with the file contents - virtual void Close() = 0; - - /// \note Deprecated since 1.3.0 - RowGroupWriter* AppendRowGroup(int64_t num_rows); - - virtual RowGroupWriter* 
AppendRowGroup() = 0; - virtual RowGroupWriter* AppendBufferedRowGroup() = 0; - - virtual int64_t num_rows() const = 0; - virtual int num_columns() const = 0; - virtual int num_row_groups() const = 0; - - virtual const std::shared_ptr& properties() const = 0; - - const std::shared_ptr& key_value_metadata() const { - return key_value_metadata_; - } - - // Return const-pointer to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const { return &schema_; } - - SchemaDescriptor schema_; - - /// This should be the only place this is stored. Everything else is a const reference - std::shared_ptr key_value_metadata_; - - const std::shared_ptr metadata() const { return file_metadata_; } - std::shared_ptr file_metadata_; - }; - - ParquetFileWriter(); - ~ParquetFileWriter(); - - static std::unique_ptr Open( - const std::shared_ptr<::arrow::io::OutputStream>& sink, - const std::shared_ptr& schema, - const std::shared_ptr& properties = default_writer_properties(), - const std::shared_ptr& key_value_metadata = NULLPTR); - - ARROW_DEPRECATED("Use version with arrow::io::OutputStream") - static std::unique_ptr Open( - const std::shared_ptr& sink, - const std::shared_ptr& schema, - const std::shared_ptr& properties = default_writer_properties(), - const std::shared_ptr& key_value_metadata = NULLPTR); - - void Open(std::unique_ptr contents); - void Close(); - - // Construct a RowGroupWriter for the indicated number of rows. - // - // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid - // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. - // @param num_rows The number of rows that are stored in the new RowGroup - // - // \deprecated Since 1.3.0 - RowGroupWriter* AppendRowGroup(int64_t num_rows); - - /// Construct a RowGroupWriter with an arbitrary number of rows. - /// - /// Ownership is solely within the ParquetFileWriter. 
The RowGroupWriter is only valid - /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. - RowGroupWriter* AppendRowGroup(); - - /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. - /// Use this if you want to write a RowGroup based on a certain size - /// - /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid - /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. - RowGroupWriter* AppendBufferedRowGroup(); - - /// Number of columns. - /// - /// This number is fixed during the lifetime of the writer as it is determined via - /// the schema. - int num_columns() const; - - /// Number of rows in the yet started RowGroups. - /// - /// Changes on the addition of a new RowGroup. - int64_t num_rows() const; - - /// Number of started RowGroups. - int num_row_groups() const; - - /// Configuration passed to the writer, e.g. the used Parquet format version. - const std::shared_ptr& properties() const; - - /// Returns the file schema descriptor - const SchemaDescriptor* schema() const; - - /// Returns a column descriptor in schema - const ColumnDescriptor* descr(int i) const; - - /// Returns the file custom metadata - const std::shared_ptr& key_value_metadata() const; - - /// Returns the file metadata, only available after calling Close(). - const std::shared_ptr metadata() const; - - private: - // Holds a pointer to an instance of Contents implementation - std::unique_ptr contents_; - std::shared_ptr file_metadata_; -}; - -} // namespace parquet - -#endif // PARQUET_FILE_WRITER_H diff --git a/r/R/inst/include/parquet/hasher.h b/r/R/inst/include/parquet/hasher.h deleted file mode 100644 index 233262ebdd6..00000000000 --- a/r/R/inst/include/parquet/hasher.h +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_HASHER_H -#define PARQUET_HASHER_H - -#include -#include "parquet/types.h" - -namespace parquet { -// Abstract class for hash -class Hasher { - public: - /// Compute hash for 32 bits value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(int32_t value) const = 0; - - /// Compute hash for 64 bits value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(int64_t value) const = 0; - - /// Compute hash for float value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(float value) const = 0; - - /// Compute hash for double value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(double value) const = 0; - - /// Compute hash for Int96 value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. - virtual uint64_t Hash(const Int96* value) const = 0; - - /// Compute hash for ByteArray value by using its plain encoding result. - /// - /// @param value the value to hash. - /// @return hash result. 
- virtual uint64_t Hash(const ByteArray* value) const = 0; - - /// Compute hash for fixed byte array value by using its plain encoding result. - /// - /// @param value the value address. - /// @param len the value length. - virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; - - virtual ~Hasher() = default; -}; - -} // namespace parquet - -#endif // PARQUET_HASHER_H diff --git a/r/R/inst/include/parquet/metadata.h b/r/R/inst/include/parquet/metadata.h deleted file mode 100644 index 4a7ae447bdd..00000000000 --- a/r/R/inst/include/parquet/metadata.h +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_FILE_METADATA_H -#define PARQUET_FILE_METADATA_H - -#include -#include -#include -#include - -#include "arrow/util/key_value_metadata.h" -#include "arrow/util/macros.h" - -#include "parquet/platform.h" -#include "parquet/properties.h" -#include "parquet/types.h" - -namespace parquet { - -class ColumnDescriptor; -class EncodedStatistics; -class Statistics; -class SchemaDescriptor; - -namespace schema { - -class ColumnPath; - -} // namespace schema - -using KeyValueMetadata = ::arrow::KeyValueMetadata; - -class PARQUET_EXPORT ApplicationVersion { - public: - // Known Versions with Issues - static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); - static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); - static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); - static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); - // Regular expression for the version format - // major . minor . patch unknown - prerelease.x + build info - // Eg: 1.5.0ab-cdh5.5.0+cd - static constexpr char const* VERSION_FORMAT = - "^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$"; - // Regular expression for the application format - // application_name version VERSION_FORMAT (build build_name) - // Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd) - static constexpr char const* APPLICATION_FORMAT = - "(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)"; - - // Application that wrote the file. e.g. "IMPALA" - std::string application_; - // Build name - std::string build_; - - // Version of the application that wrote the file, expressed as - // (..). Unmatched parts default to 0. 
- // "1.2.3" => {1, 2, 3} - // "1.2" => {0, 0, 0} - // "1.2-cdh5" => {0, 0, 0} - // TODO (majetideepak): Implement support for pre_release - struct { - int major; - int minor; - int patch; - std::string unknown; - std::string pre_release; - std::string build_info; - } version; - - ApplicationVersion() {} - explicit ApplicationVersion(const std::string& created_by); - ApplicationVersion(const std::string& application, int major, int minor, int patch); - - // Returns true if version is strictly less than other_version - bool VersionLt(const ApplicationVersion& other_version) const; - - // Returns true if version is strictly less than other_version - bool VersionEq(const ApplicationVersion& other_version) const; - - // Checks if the Version has the correct statistics for a given column - bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics, - SortOrder::type sort_order = SortOrder::SIGNED) const; -}; - -class PARQUET_EXPORT ColumnChunkMetaData { - public: - // API convenience to get a MetaData accessor - static std::unique_ptr Make( - const void* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); - - ~ColumnChunkMetaData(); - - // column chunk - int64_t file_offset() const; - - // parameter is only used when a dataset is spread across multiple files - const std::string& file_path() const; - - // column metadata - Type::type type() const; - int64_t num_values() const; - std::shared_ptr path_in_schema() const; - bool is_stats_set() const; - std::shared_ptr statistics() const; - Compression::type compression() const; - const std::vector& encodings() const; - bool has_dictionary_page() const; - int64_t dictionary_page_offset() const; - int64_t data_page_offset() const; - bool has_index_page() const; - int64_t index_page_offset() const; - int64_t total_compressed_size() const; - int64_t total_uncompressed_size() const; - - private: - explicit ColumnChunkMetaData(const void* metadata, const 
ColumnDescriptor* descr, - const ApplicationVersion* writer_version = NULLPTR); - // PIMPL Idiom - class ColumnChunkMetaDataImpl; - std::unique_ptr impl_; -}; - -class PARQUET_EXPORT RowGroupMetaData { - public: - // API convenience to get a MetaData accessor - static std::unique_ptr Make( - const void* metadata, const SchemaDescriptor* schema, - const ApplicationVersion* writer_version = NULLPTR); - - ~RowGroupMetaData(); - - // row-group metadata - int num_columns() const; - int64_t num_rows() const; - int64_t total_byte_size() const; - // Return const-pointer to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const; - std::unique_ptr ColumnChunk(int i) const; - - private: - explicit RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema, - const ApplicationVersion* writer_version = NULLPTR); - // PIMPL Idiom - class RowGroupMetaDataImpl; - std::unique_ptr impl_; -}; - -class FileMetaDataBuilder; - -class PARQUET_EXPORT FileMetaData { - public: - // API convenience to get a MetaData accessor - static std::shared_ptr Make(const void* serialized_metadata, - uint32_t* metadata_len); - - ~FileMetaData(); - - // file metadata - uint32_t size() const; - int num_columns() const; - int64_t num_rows() const; - int num_row_groups() const; - ParquetVersion::type version() const; - const std::string& created_by() const; - int num_schema_elements() const; - std::unique_ptr RowGroup(int i) const; - - const ApplicationVersion& writer_version() const; - - void WriteTo(::arrow::io::OutputStream* dst) const; - - // Return const-pointer to make it clear that this object is not to be copied - const SchemaDescriptor* schema() const; - - std::shared_ptr key_value_metadata() const; - - // Set file_path ColumnChunk fields to a particular value - void set_file_path(const std::string& path); - - private: - friend FileMetaDataBuilder; - explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len); - - // PIMPL Idiom - 
FileMetaData(); - class FileMetaDataImpl; - std::unique_ptr impl_; -}; - -// Builder API -class PARQUET_EXPORT ColumnChunkMetaDataBuilder { - public: - // API convenience to get a MetaData reader - static std::unique_ptr Make( - const std::shared_ptr& props, const ColumnDescriptor* column); - - static std::unique_ptr Make( - const std::shared_ptr& props, const ColumnDescriptor* column, - void* contents); - - ~ColumnChunkMetaDataBuilder(); - - // column chunk - // Used when a dataset is spread across multiple files - void set_file_path(const std::string& path); - // column metadata - void SetStatistics(const EncodedStatistics& stats); - // get the column descriptor - const ColumnDescriptor* descr() const; - // commit the metadata - void Finish(int64_t num_values, int64_t dictonary_page_offset, - int64_t index_page_offset, int64_t data_page_offset, - int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, - bool dictionary_fallback); - - // The metadata contents, suitable for passing to ColumnChunkMetaData::Make - const void* contents() const; - - // For writing metadata at end of column chunk - void WriteTo(::arrow::io::OutputStream* sink); - - private: - explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, - const ColumnDescriptor* column); - explicit ColumnChunkMetaDataBuilder(const std::shared_ptr& props, - const ColumnDescriptor* column, void* contents); - // PIMPL Idiom - class ColumnChunkMetaDataBuilderImpl; - std::unique_ptr impl_; -}; - -class PARQUET_EXPORT RowGroupMetaDataBuilder { - public: - // API convenience to get a MetaData reader - static std::unique_ptr Make( - const std::shared_ptr& props, const SchemaDescriptor* schema_, - void* contents); - - ~RowGroupMetaDataBuilder(); - - ColumnChunkMetaDataBuilder* NextColumnChunk(); - int num_columns(); - int64_t num_rows(); - int current_column() const; - - void set_num_rows(int64_t num_rows); - - // commit the metadata - void Finish(int64_t total_bytes_written); - - private: - 
explicit RowGroupMetaDataBuilder(const std::shared_ptr& props, - const SchemaDescriptor* schema_, void* contents); - // PIMPL Idiom - class RowGroupMetaDataBuilderImpl; - std::unique_ptr impl_; -}; - -class PARQUET_EXPORT FileMetaDataBuilder { - public: - // API convenience to get a MetaData reader - static std::unique_ptr Make( - const SchemaDescriptor* schema, const std::shared_ptr& props, - const std::shared_ptr& key_value_metadata = NULLPTR); - - ~FileMetaDataBuilder(); - - // The prior RowGroupMetaDataBuilder (if any) is destroyed - RowGroupMetaDataBuilder* AppendRowGroup(); - - // Complete the Thrift structure - std::unique_ptr Finish(); - - private: - explicit FileMetaDataBuilder( - const SchemaDescriptor* schema, const std::shared_ptr& props, - const std::shared_ptr& key_value_metadata = NULLPTR); - // PIMPL Idiom - class FileMetaDataBuilderImpl; - std::unique_ptr impl_; -}; - -PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver); - -} // namespace parquet - -#endif // PARQUET_FILE_METADATA_H diff --git a/r/R/inst/include/parquet/murmur3.h b/r/R/inst/include/parquet/murmur3.h deleted file mode 100644 index d12ae0238b0..00000000000 --- a/r/R/inst/include/parquet/murmur3.h +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef PARQUET_MURMURHASH3_H_ -#define PARQUET_MURMURHASH3_H_ - -#include - -#include "parquet/hasher.h" -#include "parquet/platform.h" -#include "parquet/types.h" - -namespace parquet { - -/// Source: -/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class) -class PARQUET_EXPORT MurmurHash3 : public Hasher { - public: - MurmurHash3() : seed_(DEFAULT_SEED) {} - uint64_t Hash(int32_t value) const override; - uint64_t Hash(int64_t value) const override; - uint64_t Hash(float value) const override; - uint64_t Hash(double value) const override; - uint64_t Hash(const Int96* value) const override; - uint64_t Hash(const ByteArray* value) const override; - uint64_t Hash(const FLBA* val, uint32_t len) const override; - - private: - // Default seed for hash which comes from Bloom filter in parquet-mr, it is generated - // by System.nanoTime() of java. - static constexpr int DEFAULT_SEED = 1361930890; - - uint32_t seed_; -}; - -} // namespace parquet - -#endif // PARQUET_MURMURHASH3_H_ diff --git a/r/R/inst/include/parquet/platform.h b/r/R/inst/include/parquet/platform.h deleted file mode 100644 index 25d8dd4d94d..00000000000 --- a/r/R/inst/include/parquet/platform.h +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/buffer.h" // IWYU pragma: export -#include "arrow/io/interfaces.h" // IWYU pragma: export -#include "arrow/io/memory.h" // IWYU pragma: export -#include "arrow/memory_pool.h" // IWYU pragma: export -#include "arrow/status.h" // IWYU pragma: export -#include "arrow/util/bit-util.h" // IWYU pragma: export -#include "arrow/util/macros.h" // IWYU pragma: export -#include "arrow/util/string_view.h" // IWYU pragma: export - -#if defined(_WIN32) || defined(__CYGWIN__) - -#if defined(_MSC_VER) -#pragma warning(push) -// Disable warning for STL types usage in DLL interface -// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports -#pragma warning(disable : 4275 4251) -// Disable diamond inheritance warnings -#pragma warning(disable : 4250) -// Disable macro redefinition warnings -#pragma warning(disable : 4005) -// Disable extern before exported template warnings -#pragma warning(disable : 4910) -#else -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -#ifdef PARQUET_STATIC -#define PARQUET_EXPORT -#elif defined(PARQUET_EXPORTING) -#define PARQUET_EXPORT __declspec(dllexport) -#else -#define PARQUET_EXPORT __declspec(dllimport) -#endif - -#define PARQUET_NO_EXPORT - -#else // Not Windows -#ifndef PARQUET_EXPORT -#define PARQUET_EXPORT 
__attribute__((visibility("default"))) -#endif -#ifndef PARQUET_NO_EXPORT -#define PARQUET_NO_EXPORT __attribute__((visibility("hidden"))) -#endif -#endif // Non-Windows - -// This is a complicated topic, some reading on it: -// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ -#if defined(_MSC_VER) || defined(__clang__) -#define PARQUET_TEMPLATE_CLASS_EXPORT -#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT -#else -#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT -#define PARQUET_TEMPLATE_EXPORT -#endif - -#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN - -#define PARQUET_NORETURN ARROW_NORETURN -#define PARQUET_DEPRECATED ARROW_DEPRECATED - -// If ARROW_VALGRIND set when compiling unit tests, also define -// PARQUET_VALGRIND -#ifdef ARROW_VALGRIND -#define PARQUET_VALGRIND -#endif - -namespace parquet { - -namespace BitUtil = ::arrow::BitUtil; - -using Buffer = ::arrow::Buffer; -using MemoryPool = ::arrow::MemoryPool; -using MutableBuffer = ::arrow::MutableBuffer; -using ResizableBuffer = ::arrow::ResizableBuffer; -using ResizableBuffer = ::arrow::ResizableBuffer; -using ArrowInputFile = ::arrow::io::RandomAccessFile; -using ArrowInputStream = ::arrow::io::InputStream; -using ArrowOutputStream = ::arrow::io::OutputStream; -using string_view = ::arrow::util::string_view; - -constexpr int64_t kDefaultOutputStreamSize = 1024; - -PARQUET_EXPORT -std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream( - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - -PARQUET_EXPORT -std::shared_ptr AllocateBuffer( - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0); - -} // namespace parquet diff --git a/r/R/inst/include/parquet/printer.h b/r/R/inst/include/parquet/printer.h deleted file mode 100644 index 751b8a44d07..00000000000 --- a/r/R/inst/include/parquet/printer.h +++ /dev/null @@ -1,49 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more 
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_FILE_PRINTER_H -#define PARQUET_FILE_PRINTER_H - -#include -#include - -#include "parquet/platform.h" - -namespace parquet { - -class ParquetFileReader; - -class PARQUET_EXPORT ParquetFilePrinter { - private: - ParquetFileReader* fileReader; - - public: - explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {} - ~ParquetFilePrinter() {} - - void DebugPrint(std::ostream& stream, std::list selected_columns, - bool print_values = false, bool format_dump = false, - bool print_key_value_metadata = false, - const char* filename = "No Name"); - - void JSONPrint(std::ostream& stream, std::list selected_columns, - const char* filename = "No Name"); -}; - -} // namespace parquet - -#endif // PARQUET_FILE_PRINTER_H diff --git a/r/R/inst/include/parquet/properties.h b/r/R/inst/include/parquet/properties.h deleted file mode 100644 index 7277f3a61e6..00000000000 --- a/r/R/inst/include/parquet/properties.h +++ /dev/null @@ -1,428 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_COLUMN_PROPERTIES_H -#define PARQUET_COLUMN_PROPERTIES_H - -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/parquet_version.h" -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace parquet { - -struct ParquetVersion { - enum type { PARQUET_1_0, PARQUET_2_0 }; -}; - -static int64_t DEFAULT_BUFFER_SIZE = 0; -static bool DEFAULT_USE_BUFFERED_STREAM = false; - -class PARQUET_EXPORT ReaderProperties { - public: - explicit ReaderProperties(::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : pool_(pool) { - buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM; - buffer_size_ = DEFAULT_BUFFER_SIZE; - } - - ::arrow::MemoryPool* memory_pool() const { return pool_; } - - std::shared_ptr GetStream(std::shared_ptr source, - int64_t start, int64_t num_bytes); - - bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; } - - void enable_buffered_stream() { buffered_stream_enabled_ = true; } - - void disable_buffered_stream() { buffered_stream_enabled_ = false; } - - void set_buffer_size(int64_t buf_size) { buffer_size_ = buf_size; } - - int64_t buffer_size() const { return buffer_size_; } - - private: - ::arrow::MemoryPool* pool_; - int64_t buffer_size_; - bool buffered_stream_enabled_; -}; - -ReaderProperties PARQUET_EXPORT default_reader_properties(); - -static 
constexpr int64_t DEFAULT_PAGE_SIZE = 1024 * 1024; -static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true; -static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = DEFAULT_PAGE_SIZE; -static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; -static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024; -static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; -static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; -static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN; -static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION = - ParquetVersion::PARQUET_1_0; -static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; -static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; - -class PARQUET_EXPORT ColumnProperties { - public: - ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, - Compression::type codec = DEFAULT_COMPRESSION_TYPE, - bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, - bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, - size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE) - : encoding_(encoding), - codec_(codec), - dictionary_enabled_(dictionary_enabled), - statistics_enabled_(statistics_enabled), - max_stats_size_(max_stats_size) {} - - void set_encoding(Encoding::type encoding) { encoding_ = encoding; } - - void set_compression(Compression::type codec) { codec_ = codec; } - - void set_dictionary_enabled(bool dictionary_enabled) { - dictionary_enabled_ = dictionary_enabled; - } - - void set_statistics_enabled(bool statistics_enabled) { - statistics_enabled_ = statistics_enabled; - } - - void set_max_statistics_size(size_t max_stats_size) { - max_stats_size_ = max_stats_size; - } - - Encoding::type encoding() const { return encoding_; } - - Compression::type compression() const { return codec_; } - - bool dictionary_enabled() const { return dictionary_enabled_; } - - bool statistics_enabled() const { return statistics_enabled_; } - - size_t 
max_statistics_size() const { return max_stats_size_; } - - private: - Encoding::type encoding_; - Compression::type codec_; - bool dictionary_enabled_; - bool statistics_enabled_; - size_t max_stats_size_; -}; - -class PARQUET_EXPORT WriterProperties { - public: - class Builder { - public: - Builder() - : pool_(::arrow::default_memory_pool()), - dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT), - write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), - max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), - pagesize_(DEFAULT_PAGE_SIZE), - version_(DEFAULT_WRITER_VERSION), - created_by_(DEFAULT_CREATED_BY) {} - virtual ~Builder() {} - - Builder* memory_pool(::arrow::MemoryPool* pool) { - pool_ = pool; - return this; - } - - Builder* enable_dictionary() { - default_column_properties_.set_dictionary_enabled(true); - return this; - } - - Builder* disable_dictionary() { - default_column_properties_.set_dictionary_enabled(false); - return this; - } - - Builder* enable_dictionary(const std::string& path) { - dictionary_enabled_[path] = true; - return this; - } - - Builder* enable_dictionary(const std::shared_ptr& path) { - return this->enable_dictionary(path->ToDotString()); - } - - Builder* disable_dictionary(const std::string& path) { - dictionary_enabled_[path] = false; - return this; - } - - Builder* disable_dictionary(const std::shared_ptr& path) { - return this->disable_dictionary(path->ToDotString()); - } - - Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) { - dictionary_pagesize_limit_ = dictionary_psize_limit; - return this; - } - - Builder* write_batch_size(int64_t write_batch_size) { - write_batch_size_ = write_batch_size; - return this; - } - - Builder* max_row_group_length(int64_t max_row_group_length) { - max_row_group_length_ = max_row_group_length; - return this; - } - - Builder* data_pagesize(int64_t pg_size) { - pagesize_ = pg_size; - return this; - } - - Builder* version(ParquetVersion::type version) { - version_ = version; - 
return this; - } - - Builder* created_by(const std::string& created_by) { - created_by_ = created_by; - return this; - } - - /** - * Define the encoding that is used when we don't utilise dictionary encoding. - * - * This either apply if dictionary encoding is disabled or if we fallback - * as the dictionary grew too large. - */ - Builder* encoding(Encoding::type encoding_type) { - if (encoding_type == Encoding::PLAIN_DICTIONARY || - encoding_type == Encoding::RLE_DICTIONARY) { - throw ParquetException("Can't use dictionary encoding as fallback encoding"); - } - - default_column_properties_.set_encoding(encoding_type); - return this; - } - - /** - * Define the encoding that is used when we don't utilise dictionary encoding. - * - * This either apply if dictionary encoding is disabled or if we fallback - * as the dictionary grew too large. - */ - Builder* encoding(const std::string& path, Encoding::type encoding_type) { - if (encoding_type == Encoding::PLAIN_DICTIONARY || - encoding_type == Encoding::RLE_DICTIONARY) { - throw ParquetException("Can't use dictionary encoding as fallback encoding"); - } - - encodings_[path] = encoding_type; - return this; - } - - /** - * Define the encoding that is used when we don't utilise dictionary encoding. - * - * This either apply if dictionary encoding is disabled or if we fallback - * as the dictionary grew too large. 
- */ - Builder* encoding(const std::shared_ptr& path, - Encoding::type encoding_type) { - return this->encoding(path->ToDotString(), encoding_type); - } - - Builder* compression(Compression::type codec) { - default_column_properties_.set_compression(codec); - return this; - } - - Builder* max_statistics_size(size_t max_stats_sz) { - default_column_properties_.set_max_statistics_size(max_stats_sz); - return this; - } - - Builder* compression(const std::string& path, Compression::type codec) { - codecs_[path] = codec; - return this; - } - - Builder* compression(const std::shared_ptr& path, - Compression::type codec) { - return this->compression(path->ToDotString(), codec); - } - - Builder* enable_statistics() { - default_column_properties_.set_statistics_enabled(true); - return this; - } - - Builder* disable_statistics() { - default_column_properties_.set_statistics_enabled(false); - return this; - } - - Builder* enable_statistics(const std::string& path) { - statistics_enabled_[path] = true; - return this; - } - - Builder* enable_statistics(const std::shared_ptr& path) { - return this->enable_statistics(path->ToDotString()); - } - - Builder* disable_statistics(const std::string& path) { - statistics_enabled_[path] = false; - return this; - } - - Builder* disable_statistics(const std::shared_ptr& path) { - return this->disable_statistics(path->ToDotString()); - } - - std::shared_ptr build() { - std::unordered_map column_properties; - auto get = [&](const std::string& key) -> ColumnProperties& { - auto it = column_properties.find(key); - if (it == column_properties.end()) - return column_properties[key] = default_column_properties_; - else - return it->second; - }; - - for (const auto& item : encodings_) get(item.first).set_encoding(item.second); - for (const auto& item : codecs_) get(item.first).set_compression(item.second); - for (const auto& item : dictionary_enabled_) - get(item.first).set_dictionary_enabled(item.second); - for (const auto& item : 
statistics_enabled_) - get(item.first).set_statistics_enabled(item.second); - - return std::shared_ptr( - new WriterProperties(pool_, dictionary_pagesize_limit_, write_batch_size_, - max_row_group_length_, pagesize_, version_, created_by_, - default_column_properties_, column_properties)); - } - - private: - ::arrow::MemoryPool* pool_; - int64_t dictionary_pagesize_limit_; - int64_t write_batch_size_; - int64_t max_row_group_length_; - int64_t pagesize_; - ParquetVersion::type version_; - std::string created_by_; - - // Settings used for each column unless overridden in any of the maps below - ColumnProperties default_column_properties_; - std::unordered_map encodings_; - std::unordered_map codecs_; - std::unordered_map dictionary_enabled_; - std::unordered_map statistics_enabled_; - }; - - inline ::arrow::MemoryPool* memory_pool() const { return pool_; } - - inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; } - - inline int64_t write_batch_size() const { return write_batch_size_; } - - inline int64_t max_row_group_length() const { return max_row_group_length_; } - - inline int64_t data_pagesize() const { return pagesize_; } - - inline ParquetVersion::type version() const { return parquet_version_; } - - inline std::string created_by() const { return parquet_created_by_; } - - inline Encoding::type dictionary_index_encoding() const { - if (parquet_version_ == ParquetVersion::PARQUET_1_0) { - return Encoding::PLAIN_DICTIONARY; - } else { - return Encoding::RLE_DICTIONARY; - } - } - - inline Encoding::type dictionary_page_encoding() const { - if (parquet_version_ == ParquetVersion::PARQUET_1_0) { - return Encoding::PLAIN_DICTIONARY; - } else { - return Encoding::PLAIN; - } - } - - const ColumnProperties& column_properties( - const std::shared_ptr& path) const { - auto it = column_properties_.find(path->ToDotString()); - if (it != column_properties_.end()) return it->second; - return default_column_properties_; - } - - 
Encoding::type encoding(const std::shared_ptr& path) const { - return column_properties(path).encoding(); - } - - Compression::type compression(const std::shared_ptr& path) const { - return column_properties(path).compression(); - } - - bool dictionary_enabled(const std::shared_ptr& path) const { - return column_properties(path).dictionary_enabled(); - } - - bool statistics_enabled(const std::shared_ptr& path) const { - return column_properties(path).statistics_enabled(); - } - - size_t max_statistics_size(const std::shared_ptr& path) const { - return column_properties(path).max_statistics_size(); - } - - private: - explicit WriterProperties( - ::arrow::MemoryPool* pool, int64_t dictionary_pagesize_limit, - int64_t write_batch_size, int64_t max_row_group_length, int64_t pagesize, - ParquetVersion::type version, const std::string& created_by, - const ColumnProperties& default_column_properties, - const std::unordered_map& column_properties) - : pool_(pool), - dictionary_pagesize_limit_(dictionary_pagesize_limit), - write_batch_size_(write_batch_size), - max_row_group_length_(max_row_group_length), - pagesize_(pagesize), - parquet_version_(version), - parquet_created_by_(created_by), - default_column_properties_(default_column_properties), - column_properties_(column_properties) {} - - ::arrow::MemoryPool* pool_; - int64_t dictionary_pagesize_limit_; - int64_t write_batch_size_; - int64_t max_row_group_length_; - int64_t pagesize_; - ParquetVersion::type parquet_version_; - std::string parquet_created_by_; - ColumnProperties default_column_properties_; - std::unordered_map column_properties_; -}; - -std::shared_ptr PARQUET_EXPORT default_writer_properties(); - -} // namespace parquet - -#endif // PARQUET_COLUMN_PROPERTIES_H diff --git a/r/R/inst/include/parquet/schema-internal.h b/r/R/inst/include/parquet/schema-internal.h deleted file mode 100644 index 42eac097ade..00000000000 --- a/r/R/inst/include/parquet/schema-internal.h +++ /dev/null @@ -1,139 +0,0 @@ -// 
Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module contains the logical parquet-cpp types (independent of Thrift -// structures), schema nodes, and related type tools - -#ifndef PARQUET_SCHEMA_INTERNAL_H -#define PARQUET_SCHEMA_INTERNAL_H - -#include -#include -#include -#include -#include - -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace parquet { - -namespace format { -class SchemaElement; -} - -namespace schema { - -inline bool str_endswith_tuple(const std::string& str) { - if (str.size() >= 6) { - return str.substr(str.size() - 6, 6) == "_tuple"; - } - return false; -} - -// Special case mentioned in the format spec: -// If the name is array or ends in _tuple, this should be a list of struct -// even for single child elements. -inline bool HasStructListName(const GroupNode& node) { - return (node.name() == "array" || str_endswith_tuple(node.name())); -} - -// TODO(itaiin): This aux. 
function is to be deleted once repeated structs are supported -inline bool IsSimpleStruct(const Node* node) { - if (!node->is_group()) return false; - if (node->is_repeated()) return false; - if (node->logical_type() == LogicalType::LIST) return false; - // Special case mentioned in the format spec: - // If the name is array or ends in _tuple, this should be a list of struct - // even for single child elements. - auto group = static_cast(node); - if (group->field_count() == 1 && HasStructListName(*group)) return false; - - return true; -} - -// Coalesce a list of schema fields indices which are the roots of the -// columns referred by a list of column indices -inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr, - const std::vector& column_indices, - std::vector* out) { - const GroupNode* group = descr.group_node(); - std::unordered_set already_added; - out->clear(); - for (auto& column_idx : column_indices) { - auto field_node = descr.GetColumnRoot(column_idx); - auto field_idx = group->FieldIndex(*field_node); - if (field_idx < 0) { - return false; - } - auto insertion = already_added.insert(field_idx); - if (insertion.second) { - out->push_back(field_idx); - } - } - - return true; -} - -// ---------------------------------------------------------------------- -// Conversion from Parquet Thrift metadata - -std::shared_ptr FromParquet( - const std::vector& schema); - -class FlatSchemaConverter { - public: - FlatSchemaConverter(const format::SchemaElement* elements, int length) - : elements_(elements), length_(length), pos_(0), current_id_(0) {} - - std::unique_ptr Convert(); - - private: - const format::SchemaElement* elements_; - int length_; - int pos_; - int current_id_; - - int next_id() { return current_id_++; } - - const format::SchemaElement& Next(); - - std::unique_ptr NextNode(); -}; - -// ---------------------------------------------------------------------- -// Conversion to Parquet Thrift metadata - -void ToParquet(const GroupNode* 
schema, std::vector* out); - -// Converts nested parquet schema back to a flat vector of Thrift structs -class SchemaFlattener { - public: - SchemaFlattener(const GroupNode* schema, std::vector* out); - - void Flatten(); - - private: - const GroupNode* root_; - std::vector* elements_; -}; - -} // namespace schema -} // namespace parquet - -#endif // PARQUET_SCHEMA_INTERNAL_H diff --git a/r/R/inst/include/parquet/schema.h b/r/R/inst/include/parquet/schema.h deleted file mode 100644 index e35d6599fe0..00000000000 --- a/r/R/inst/include/parquet/schema.h +++ /dev/null @@ -1,470 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module contains the logical parquet-cpp types (independent of Thrift -// structures), schema nodes, and related type tools - -#ifndef PARQUET_SCHEMA_TYPES_H -#define PARQUET_SCHEMA_TYPES_H - -#include -#include -#include -#include -#include -#include - -#include "arrow/util/macros.h" - -#include "parquet/platform.h" -#include "parquet/types.h" - -namespace parquet { - -class SchemaDescriptor; - -namespace schema { - -class Node; - -// List encodings: using the terminology from Impala to define different styles -// of representing logical lists (a.k.a. 
ARRAY types) in Parquet schemas. Since -// the converted type named in the Parquet metadata is ConvertedType::LIST we -// use that terminology here. It also helps distinguish from the *_ARRAY -// primitive types. -// -// One-level encoding: Only allows required lists with required cells -// repeated value_type name -// -// Two-level encoding: Enables optional lists with only required cells -// group list -// repeated value_type item -// -// Three-level encoding: Enables optional lists with optional cells -// group bag -// repeated group list -// value_type item -// -// 2- and 1-level encoding are respectively equivalent to 3-level encoding with -// the non-repeated nodes set to required. -// -// The "official" encoding recommended in the Parquet spec is the 3-level, and -// we use that as the default when creating list types. For semantic completeness -// we allow the other two. Since all types of encodings will occur "in the -// wild" we need to be able to interpret the associated definition levels in -// the context of the actual encoding used in the file. -// -// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated -// SchemaElement, which could make things challenging if we are trying to infer -// that a sequence of nodes semantically represents an array according to one -// of these encodings (versus a struct containing an array). We should refuse -// the temptation to guess, as they say. 
-struct ListEncoding { - enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL }; -}; - -class PARQUET_EXPORT ColumnPath { - public: - ColumnPath() : path_() {} - explicit ColumnPath(const std::vector& path) : path_(path) {} - explicit ColumnPath(std::vector&& path) : path_(path) {} - - static std::shared_ptr FromDotString(const std::string& dotstring); - static std::shared_ptr FromNode(const Node& node); - - std::shared_ptr extend(const std::string& node_name) const; - std::string ToDotString() const; - const std::vector& ToDotVector() const; - - protected: - std::vector path_; -}; - -class GroupNode; - -// Base class for logical schema types. A type has a name, repetition level, -// and optionally a logical type (ConvertedType in Parquet metadata parlance) -class PARQUET_EXPORT Node { - public: - enum type { PRIMITIVE, GROUP }; - - Node(Node::type type, const std::string& name, Repetition::type repetition, - LogicalType::type logical_type = LogicalType::NONE, int id = -1) - : type_(type), - name_(name), - repetition_(repetition), - logical_type_(logical_type), - id_(id), - parent_(NULLPTR) {} - - Node(Node::type type, const std::string& name, Repetition::type repetition, - std::shared_ptr logical_annotation, int id = -1) - : type_(type), - name_(name), - repetition_(repetition), - logical_annotation_(logical_annotation), - id_(id), - parent_(NULLPTR) {} - - virtual ~Node() {} - - bool is_primitive() const { return type_ == Node::PRIMITIVE; } - - bool is_group() const { return type_ == Node::GROUP; } - - bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } - - bool is_repeated() const { return repetition_ == Repetition::REPEATED; } - - bool is_required() const { return repetition_ == Repetition::REQUIRED; } - - virtual bool Equals(const Node* other) const = 0; - - const std::string& name() const { return name_; } - - Node::type node_type() const { return type_; } - - Repetition::type repetition() const { return repetition_; } - - LogicalType::type 
logical_type() const { return logical_type_; } - - const std::shared_ptr& logical_annotation() const { - return logical_annotation_; - } - - int id() const { return id_; } - - const Node* parent() const { return parent_; } - - const std::shared_ptr path() const; - - virtual void ToParquet(void* element) const = 0; - - // Node::Visitor abstract class for walking schemas with the visitor pattern - class Visitor { - public: - virtual ~Visitor() {} - - virtual void Visit(Node* node) = 0; - }; - class ConstVisitor { - public: - virtual ~ConstVisitor() {} - - virtual void Visit(const Node* node) = 0; - }; - - virtual void Visit(Visitor* visitor) = 0; - virtual void VisitConst(ConstVisitor* visitor) const = 0; - - protected: - friend class GroupNode; - - Node::type type_; - std::string name_; - Repetition::type repetition_; - LogicalType::type logical_type_; - std::shared_ptr logical_annotation_; - int id_; - // Nodes should not be shared, they have a single parent. - const Node* parent_; - - bool EqualsInternal(const Node* other) const; - void SetParent(const Node* p_parent); - - private: - PARQUET_DISALLOW_COPY_AND_ASSIGN(Node); -}; - -// Save our breath all over the place with these typedefs -typedef std::shared_ptr NodePtr; -typedef std::vector NodeVector; - -// A type that is one of the primitive Parquet storage types. 
In addition to -// the other type metadata (name, repetition level, logical type), also has the -// physical storage type and their type-specific metadata (byte width, decimal -// parameters) -class PARQUET_EXPORT PrimitiveNode : public Node { - public: - static std::unique_ptr FromParquet(const void* opaque_element, int id); - - static inline NodePtr Make(const std::string& name, Repetition::type repetition, - Type::type type, - LogicalType::type logical_type = LogicalType::NONE, - int length = -1, int precision = -1, int scale = -1) { - return NodePtr(new PrimitiveNode(name, repetition, type, logical_type, length, - precision, scale)); - } - - static inline NodePtr Make(const std::string& name, Repetition::type repetition, - std::shared_ptr logical_annotation, - Type::type primitive_type, int primitive_length = -1) { - return NodePtr(new PrimitiveNode(name, repetition, logical_annotation, primitive_type, - primitive_length)); - } - - bool Equals(const Node* other) const override; - - Type::type physical_type() const { return physical_type_; } - - ColumnOrder column_order() const { return column_order_; } - - void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; } - - int32_t type_length() const { return type_length_; } - - const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; } - - void ToParquet(void* element) const override; - void Visit(Visitor* visitor) override; - void VisitConst(ConstVisitor* visitor) const override; - - private: - PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type, - LogicalType::type logical_type = LogicalType::NONE, int length = -1, - int precision = -1, int scale = -1, int id = -1); - - PrimitiveNode(const std::string& name, Repetition::type repetition, - std::shared_ptr logical_annotation, - Type::type primitive_type, int primitive_length = -1, int id = -1); - - Type::type physical_type_; - int32_t type_length_; - DecimalMetadata decimal_metadata_; - 
ColumnOrder column_order_; - - // For FIXED_LEN_BYTE_ARRAY - void SetTypeLength(int32_t length) { type_length_ = length; } - - bool EqualsInternal(const PrimitiveNode* other) const; - - FRIEND_TEST(TestPrimitiveNode, Attrs); - FRIEND_TEST(TestPrimitiveNode, Equals); - FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping); - FRIEND_TEST(TestPrimitiveNode, FromParquet); -}; - -class PARQUET_EXPORT GroupNode : public Node { - public: - static std::unique_ptr FromParquet(const void* opaque_element, int id, - const NodeVector& fields); - - static inline NodePtr Make(const std::string& name, Repetition::type repetition, - const NodeVector& fields, - LogicalType::type logical_type = LogicalType::NONE) { - return NodePtr(new GroupNode(name, repetition, fields, logical_type)); - } - - static inline NodePtr Make( - const std::string& name, Repetition::type repetition, const NodeVector& fields, - std::shared_ptr logical_annotation) { - return NodePtr(new GroupNode(name, repetition, fields, logical_annotation)); - } - - bool Equals(const Node* other) const override; - - NodePtr field(int i) const { return fields_[i]; } - // Get the index of a field by its name, or negative value if not found. - // If several fields share the same name, it is unspecified which one - // is returned. - int FieldIndex(const std::string& name) const; - // Get the index of a field by its node, or negative value if not found. 
- int FieldIndex(const Node& node) const; - - int field_count() const { return static_cast(fields_.size()); } - - void ToParquet(void* element) const override; - void Visit(Visitor* visitor) override; - void VisitConst(ConstVisitor* visitor) const override; - - private: - GroupNode(const std::string& name, Repetition::type repetition, - const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE, - int id = -1); - - GroupNode(const std::string& name, Repetition::type repetition, - const NodeVector& fields, - std::shared_ptr logical_annotation, int id = -1); - - NodeVector fields_; - bool EqualsInternal(const GroupNode* other) const; - - // Mapping between field name to the field index - std::unordered_multimap field_name_to_idx_; - - FRIEND_TEST(TestGroupNode, Attrs); - FRIEND_TEST(TestGroupNode, Equals); - FRIEND_TEST(TestGroupNode, FieldIndex); - FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName); -}; - -// ---------------------------------------------------------------------- -// Convenience primitive type factory functions - -#define PRIMITIVE_FACTORY(FuncName, TYPE) \ - static inline NodePtr FuncName(const std::string& name, \ - Repetition::type repetition = Repetition::OPTIONAL) { \ - return PrimitiveNode::Make(name, repetition, Type::TYPE); \ - } - -PRIMITIVE_FACTORY(Boolean, BOOLEAN); -PRIMITIVE_FACTORY(Int32, INT32); -PRIMITIVE_FACTORY(Int64, INT64); -PRIMITIVE_FACTORY(Int96, INT96); -PRIMITIVE_FACTORY(Float, FLOAT); -PRIMITIVE_FACTORY(Double, DOUBLE); -PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY); - -void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream, - int indent_width = 2); - -} // namespace schema - -// The ColumnDescriptor encapsulates information necessary to interpret -// primitive column data in the context of a particular schema. 
We have to -// examine the node structure of a column's path to the root in the schema tree -// to be able to reassemble the nested structure from the repetition and -// definition levels. -class PARQUET_EXPORT ColumnDescriptor { - public: - ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, - int16_t max_repetition_level, - const SchemaDescriptor* schema_descr = NULLPTR); - - bool Equals(const ColumnDescriptor& other) const; - - int16_t max_definition_level() const { return max_definition_level_; } - - int16_t max_repetition_level() const { return max_repetition_level_; } - - Type::type physical_type() const { return primitive_node_->physical_type(); } - - LogicalType::type logical_type() const { return primitive_node_->logical_type(); } - - const std::shared_ptr& logical_annotation() const { - return primitive_node_->logical_annotation(); - } - - ColumnOrder column_order() const { return primitive_node_->column_order(); } - - SortOrder::type sort_order() const { - auto la = logical_annotation(); - auto pt = physical_type(); - return la ? 
GetSortOrder(la, pt) : GetSortOrder(logical_type(), pt); - } - - const std::string& name() const { return primitive_node_->name(); } - - const std::shared_ptr path() const; - - const schema::NodePtr& schema_node() const { return node_; } - - std::string ToString() const; - - int type_length() const; - - int type_precision() const; - - int type_scale() const; - - private: - schema::NodePtr node_; - const schema::PrimitiveNode* primitive_node_; - - int16_t max_definition_level_; - int16_t max_repetition_level_; -}; - -// Container for the converted Parquet schema with a computed information from -// the schema analysis needed for file reading -// -// * Column index to Node -// * Max repetition / definition levels for each primitive node -// -// The ColumnDescriptor objects produced by this class can be used to assist in -// the reconstruction of fully materialized data structures from the -// repetition-definition level encoding of nested data -// -// TODO(wesm): this object can be recomputed from a Schema -class PARQUET_EXPORT SchemaDescriptor { - public: - SchemaDescriptor() {} - ~SchemaDescriptor() {} - - // Analyze the schema - void Init(std::unique_ptr schema); - void Init(const schema::NodePtr& schema); - - const ColumnDescriptor* Column(int i) const; - - // Get the index of a column by its dotstring path, or negative value if not found. - // If several columns share the same dotstring path, it is unspecified which one - // is returned. - int ColumnIndex(const std::string& node_path) const; - // Get the index of a column by its node, or negative value if not found. 
- int ColumnIndex(const schema::Node& node) const; - - bool Equals(const SchemaDescriptor& other) const; - - // The number of physical columns appearing in the file - int num_columns() const { return static_cast(leaves_.size()); } - - const schema::NodePtr& schema_root() const { return schema_; } - - const schema::GroupNode* group_node() const { return group_node_; } - - // Returns the root (child of the schema root) node of the leaf(column) node - const schema::Node* GetColumnRoot(int i) const; - - const std::string& name() const { return group_node_->name(); } - - std::string ToString() const; - - void updateColumnOrders(const std::vector& column_orders); - - private: - friend class ColumnDescriptor; - - // Root Node - schema::NodePtr schema_; - // Root Node - const schema::GroupNode* group_node_; - - void BuildTree(const schema::NodePtr& node, int16_t max_def_level, - int16_t max_rep_level, const schema::NodePtr& base); - - // Result of leaf node / tree analysis - std::vector leaves_; - - // Mapping between leaf nodes and root group of leaf (first node - // below the schema's root group) - // - // For example, the leaf `a.b.c.d` would have a link back to `a` - // - // -- a <------ - // -- -- b | - // -- -- -- c | - // -- -- -- -- d - std::unordered_map leaf_to_base_; - - // Mapping between ColumnPath DotString to the leaf index - std::unordered_multimap leaf_to_idx_; -}; - -} // namespace parquet - -#endif // PARQUET_SCHEMA_TYPES_H diff --git a/r/R/inst/include/parquet/statistics.h b/r/R/inst/include/parquet/statistics.h deleted file mode 100644 index 2dc78da4c3c..00000000000 --- a/r/R/inst/include/parquet/statistics.h +++ /dev/null @@ -1,307 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include - -#include "parquet/platform.h" -#include "parquet/schema.h" -#include "parquet/types.h" - -namespace parquet { - -// ---------------------------------------------------------------------- -// Value comparator interfaces - -/// \brief Base class for value comparators. Generally used with -/// TypedComparator -class PARQUET_EXPORT Comparator { - public: - virtual ~Comparator() {} - - /// \brief Create a comparator explicitly from physical type and - /// sort order - /// \param[in] physical_type the physical type for the typed - /// comparator - /// \param[in] sort_order either SortOrder::SIGNED or - /// SortOrder::UNSIGNED - /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only - static std::shared_ptr Make(Type::type physical_type, - SortOrder::type sort_order, - int type_length = -1); - - /// \brief Create typed comparator inferring default sort order from - /// ColumnDescriptor - /// \param[in] descr the Parquet column schema - static std::shared_ptr Make(const ColumnDescriptor* descr); -}; - -/// \brief Interface for comparison of physical types according to the -/// semantics of a particular logical type. 
-template -class TypedComparator : public Comparator { - public: - using T = typename DType::c_type; - - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(Type::type physical_type, - SortOrder::type sort_order, - int type_length = -1) { - return std::static_pointer_cast>( - Comparator::Make(physical_type, sort_order, type_length)); - } - - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(const ColumnDescriptor* descr) { - return std::static_pointer_cast>(Comparator::Make(descr)); - } - - /// \brief Scalar comparison of two elements, return true if first - /// is strictly less than the second - virtual bool Compare(const T& a, const T& b) = 0; - - /// \brief Compute maximum and minimum elements in a batch of - /// elements without any nulls - virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0; - - /// \brief Compute maximum and minimum elements in a batch of - /// elements with accompanying bitmap indicating which elements are - /// included (bit set) and excluded (bit not set) - /// - /// \param[in] values the sequence of values - /// \param[in] length the length of the sequence - /// \param[in] valid_bits a bitmap indicating which elements are - /// included (1) or excluded (0) - /// \param[in] valid_bits_offset the bit offset into the bitmap of - /// the first element in the sequence - /// \param[out] out_min the returned minimum element - /// \param[out] out_max the returned maximum element - virtual void GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits, - int64_t valid_bits_offset, T* out_min, T* out_max) = 0; -}; - -// ---------------------------------------------------------------------- - -/// \brief Structure represented encoded statistics to be written to -/// and from Parquet serialized metadata -class PARQUET_EXPORT EncodedStatistics { - std::shared_ptr max_, min_; - bool is_signed_ = false; - - public: - EncodedStatistics() - : 
max_(std::make_shared()), min_(std::make_shared()) {} - - const std::string& max() const { return *max_; } - const std::string& min() const { return *min_; } - - int64_t null_count = 0; - int64_t distinct_count = 0; - - bool has_min = false; - bool has_max = false; - bool has_null_count = false; - bool has_distinct_count = false; - - // From parquet-mr - // Don't write stats larger than the max size rather than truncating. The - // rationale is that some engines may use the minimum value in the page as - // the true minimum for aggregations and there is no way to mark that a - // value has been truncated and is a lower bound and not in the page. - void ApplyStatSizeLimits(size_t length) { - if (max_->length() > length) { - has_max = false; - } - if (min_->length() > length) { - has_min = false; - } - } - - inline bool is_set() const { - return has_min || has_max || has_null_count || has_distinct_count; - } - - inline bool is_signed() const { return is_signed_; } - - inline void set_is_signed(bool is_signed) { is_signed_ = is_signed; } - - inline EncodedStatistics& set_max(const std::string& value) { - *max_ = value; - has_max = true; - return *this; - } - - inline EncodedStatistics& set_min(const std::string& value) { - *min_ = value; - has_min = true; - return *this; - } - - inline EncodedStatistics& set_null_count(int64_t value) { - null_count = value; - has_null_count = true; - return *this; - } - - inline EncodedStatistics& set_distinct_count(int64_t value) { - distinct_count = value; - has_distinct_count = true; - return *this; - } -}; - -/// \brief Base type for computing column statistics while writing a file -class PARQUET_EXPORT Statistics { - public: - virtual ~Statistics() {} - - /// \brief Create a new statistics instance given a column schema - /// definition - /// \param[in] descr the column schema - /// \param[in] pool a memory pool to use for any memory allocations, optional - static std::shared_ptr Make( - const ColumnDescriptor* descr, - 
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - /// \brief Create a new statistics instance given a column schema - /// definition and pre-existing state - /// \param[in] descr the column schema - /// \param[in] encoded_min the encoded minimum value - /// \param[in] encoded_max the encoded maximum value - /// \param[in] num_values total number of values - /// \param[in] null_count number of null values - /// \param[in] distinct_count number of distinct values - /// \param[in] has_min_max whether the min/max statistics are set - /// \param[in] pool a memory pool to use for any memory allocations, optional - static std::shared_ptr Make( - const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - /// \brief The number of null values, may not be set - virtual int64_t null_count() const = 0; - - /// \brief The number of distinct values, may not be set - virtual int64_t distinct_count() const = 0; - - /// \brief The total number of values in the column - virtual int64_t num_values() const = 0; - - /// \brief Return true if the min and max statistics are set. 
Obtain - /// with TypedStatistics::min and max - virtual bool HasMinMax() const = 0; - - /// \brief Reset state of object to initial (no data observed) state - virtual void Reset() = 0; - - /// \brief Plain-encoded minimum value - virtual std::string EncodeMin() = 0; - - /// \brief Plain-encoded maximum value - virtual std::string EncodeMax() = 0; - - /// \brief The finalized encoded form of the statistics for transport - virtual EncodedStatistics Encode() = 0; - - /// \brief The physical type of the column schema - virtual Type::type physical_type() const = 0; - - protected: - static std::shared_ptr Make(Type::type physical_type, const void* min, - const void* max, int64_t num_values, - int64_t null_count, int64_t distinct_count); -}; - -/// \brief A typed implementation of Statistics -template -class TypedStatistics : public Statistics { - public: - using T = typename DType::c_type; - - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, pool)); - } - - /// \brief Create Statistics initialized to a particular state - /// \param[in] min the minimum value - /// \param[in] max the minimum value - /// \param[in] num_values number of values - /// \param[in] null_count number of null values - /// \param[in] distinct_count number of distinct values - static std::shared_ptr> Make(const T& min, const T& max, - int64_t num_values, - int64_t null_count, - int64_t distinct_count) { - return std::static_pointer_cast>(Statistics::Make( - DType::type_num, &min, &max, num_values, null_count, distinct_count)); - } - - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - ::arrow::MemoryPool* 
pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, pool)); - } - - /// \brief The current minimum value - virtual const T& min() const = 0; - - /// \brief The current maximum value - virtual const T& max() const = 0; - - /// \brief Update state with state of another Statistics object - virtual void Merge(const TypedStatistics& other) = 0; - - /// \brief Batch statistics update - virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0; - - /// \brief Batch statistics update with supplied validity bitmap - virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, - int64_t valid_bits_spaced, int64_t num_not_null, - int64_t num_null) = 0; - - /// \brief Set min and max values to particular values - virtual void SetMinMax(const T& min, const T& max) = 0; -}; - -#ifndef ARROW_NO_DEPRECATED_API -// TODO(wesm): Remove after Arrow 0.14.0 -using RowGroupStatistics = Statistics; -#endif - -using BoolStatistics = TypedStatistics; -using Int32Statistics = TypedStatistics; -using Int64Statistics = TypedStatistics; -using FloatStatistics = TypedStatistics; -using DoubleStatistics = TypedStatistics; -using ByteArrayStatistics = TypedStatistics; -using FLBAStatistics = TypedStatistics; - -} // namespace parquet diff --git a/r/R/inst/include/parquet/test-util.h b/r/R/inst/include/parquet/test-util.h deleted file mode 100644 index c49dcda181b..00000000000 --- a/r/R/inst/include/parquet/test-util.h +++ /dev/null @@ -1,710 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This module defines an abstract interface for iterating through pages in a -// Parquet column chunk within a row group. It could be extended in the future -// to iterate through all data pages in all chunks in a file. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "arrow/testing/util.h" - -#include "parquet/column_page.h" -#include "parquet/column_reader.h" -#include "parquet/column_writer.h" -#include "parquet/encoding.h" -#include "parquet/platform.h" - -namespace parquet { - -static constexpr int FLBA_LENGTH = 12; - -inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) { - return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH); -} - -namespace test { - -typedef ::testing::Types - ParquetTypes; - -class ParquetTestException : public parquet::ParquetException { - using ParquetException::ParquetException; -}; - -const char* get_data_dir(); -std::string get_bad_data_dir(); - -std::string get_data_file(const std::string& filename, bool is_good = true); - -template -static inline void assert_vector_equal(const std::vector& left, - const std::vector& right) { - ASSERT_EQ(left.size(), right.size()); - - for (size_t i = 0; i < left.size(); ++i) { - ASSERT_EQ(left[i], right[i]) << i; - } -} - -template -static inline bool vector_equal(const std::vector& left, const std::vector& right) { - if (left.size() != right.size()) { - return false; - } - - for (size_t i = 0; i < left.size(); ++i) { - if (left[i] != right[i]) { - std::cerr << "index " << i << " left was 
" << left[i] << " right was " << right[i] - << std::endl; - return false; - } - } - - return true; -} - -template -static std::vector slice(const std::vector& values, int start, int end) { - if (end < start) { - return std::vector(0); - } - - std::vector out(end - start); - for (int i = start; i < end; ++i) { - out[i - start] = values[i]; - } - return out; -} - -void random_bytes(int n, uint32_t seed, std::vector* out); -void random_bools(int n, double p, uint32_t seed, bool* out); - -template -inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) { - std::default_random_engine gen(seed); - std::uniform_int_distribution d(min_value, max_value); - for (int i = 0; i < n; ++i) { - out[i] = d(gen); - } -} - -template <> -inline void random_numbers(int n, uint32_t seed, float min_value, float max_value, - float* out) { - std::default_random_engine gen(seed); - std::uniform_real_distribution d(min_value, max_value); - for (int i = 0; i < n; ++i) { - out[i] = d(gen); - } -} - -template <> -inline void random_numbers(int n, uint32_t seed, double min_value, double max_value, - double* out) { - std::default_random_engine gen(seed); - std::uniform_real_distribution d(min_value, max_value); - for (int i = 0; i < n; ++i) { - out[i] = d(gen); - } -} - -void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, - Int96* out); - -void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out); - -void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size, - int max_size); - -void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size); - -template -std::shared_ptr EncodeValues(Encoding::type encoding, bool use_dictionary, - const Sequence& values, int length, - const ColumnDescriptor* descr) { - auto encoder = MakeTypedEncoder(encoding, use_dictionary, descr); - encoder->Put(values, length); - return encoder->FlushValues(); -} - -template -static 
void InitValues(int num_values, std::vector& values, - std::vector& buffer) { - random_numbers(num_values, 0, std::numeric_limits::min(), - std::numeric_limits::max(), values.data()); -} - -template -static void InitDictValues(int num_values, int num_dicts, std::vector& values, - std::vector& buffer) { - int repeat_factor = num_values / num_dicts; - InitValues(num_dicts, values, buffer); - // add some repeated values - for (int j = 1; j < repeat_factor; ++j) { - for (int i = 0; i < num_dicts; ++i) { - std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T)); - } - } - // computed only dict_per_page * repeat_factor - 1 values < num_values - // compute remaining - for (int i = num_dicts * repeat_factor; i < num_values; ++i) { - std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T)); - } -} - -class MockPageReader : public PageReader { - public: - explicit MockPageReader(const std::vector>& pages) - : pages_(pages), page_index_(0) {} - - std::shared_ptr NextPage() override { - if (page_index_ == static_cast(pages_.size())) { - // EOS to consumer - return std::shared_ptr(nullptr); - } - return pages_[page_index_++]; - } - - // No-op - void set_max_page_header_size(uint32_t size) override {} - - private: - std::vector> pages_; - int page_index_; -}; - -// TODO(wesm): this is only used for testing for now. 
Refactor to form part of -// primary file write path -template -class DataPageBuilder { - public: - typedef typename Type::c_type T; - - // This class writes data and metadata to the passed inputs - explicit DataPageBuilder(ArrowOutputStream* sink) - : sink_(sink), - num_values_(0), - encoding_(Encoding::PLAIN), - definition_level_encoding_(Encoding::RLE), - repetition_level_encoding_(Encoding::RLE), - have_def_levels_(false), - have_rep_levels_(false), - have_values_(false) {} - - void AppendDefLevels(const std::vector& levels, int16_t max_level, - Encoding::type encoding = Encoding::RLE) { - AppendLevels(levels, max_level, encoding); - - num_values_ = std::max(static_cast(levels.size()), num_values_); - definition_level_encoding_ = encoding; - have_def_levels_ = true; - } - - void AppendRepLevels(const std::vector& levels, int16_t max_level, - Encoding::type encoding = Encoding::RLE) { - AppendLevels(levels, max_level, encoding); - - num_values_ = std::max(static_cast(levels.size()), num_values_); - repetition_level_encoding_ = encoding; - have_rep_levels_ = true; - } - - void AppendValues(const ColumnDescriptor* d, const std::vector& values, - Encoding::type encoding = Encoding::PLAIN) { - std::shared_ptr values_sink = EncodeValues( - encoding, false, values.data(), static_cast(values.size()), d); - PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size())); - - num_values_ = std::max(static_cast(values.size()), num_values_); - encoding_ = encoding; - have_values_ = true; - } - - int32_t num_values() const { return num_values_; } - - Encoding::type encoding() const { return encoding_; } - - Encoding::type rep_level_encoding() const { return repetition_level_encoding_; } - - Encoding::type def_level_encoding() const { return definition_level_encoding_; } - - private: - ArrowOutputStream* sink_; - - int32_t num_values_; - Encoding::type encoding_; - Encoding::type definition_level_encoding_; - Encoding::type repetition_level_encoding_; - - bool 
have_def_levels_; - bool have_rep_levels_; - bool have_values_; - - // Used internally for both repetition and definition levels - void AppendLevels(const std::vector& levels, int16_t max_level, - Encoding::type encoding) { - if (encoding != Encoding::RLE) { - ParquetException::NYI("only rle encoding currently implemented"); - } - - // TODO: compute a more precise maximum size for the encoded levels - std::vector encode_buffer(levels.size() * 2); - - // We encode into separate memory from the output stream because the - // RLE-encoded bytes have to be preceded in the stream by their absolute - // size. - LevelEncoder encoder; - encoder.Init(encoding, max_level, static_cast(levels.size()), - encode_buffer.data(), static_cast(encode_buffer.size())); - - encoder.Encode(static_cast(levels.size()), levels.data()); - - int32_t rle_bytes = encoder.len(); - PARQUET_THROW_NOT_OK( - sink_->Write(reinterpret_cast(&rle_bytes), sizeof(int32_t))); - PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes)); - } -}; - -template <> -inline void DataPageBuilder::AppendValues(const ColumnDescriptor* d, - const std::vector& values, - Encoding::type encoding) { - if (encoding != Encoding::PLAIN) { - ParquetException::NYI("only plain encoding currently implemented"); - } - - auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, d); - dynamic_cast(encoder.get()) - ->Put(values, static_cast(values.size())); - std::shared_ptr buffer = encoder->FlushValues(); - PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size())); - - num_values_ = std::max(static_cast(values.size()), num_values_); - encoding_ = encoding; - have_values_ = true; -} - -template -static std::shared_ptr MakeDataPage( - const ColumnDescriptor* d, const std::vector& values, - int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size, - const std::vector& def_levels, int16_t max_def_level, - const std::vector& rep_levels, int16_t max_rep_level) { - int num_values = 0; - - auto 
page_stream = CreateOutputStream(); - test::DataPageBuilder page_builder(page_stream.get()); - - if (!rep_levels.empty()) { - page_builder.AppendRepLevels(rep_levels, max_rep_level); - } - if (!def_levels.empty()) { - page_builder.AppendDefLevels(def_levels, max_def_level); - } - - if (encoding == Encoding::PLAIN) { - page_builder.AppendValues(d, values, encoding); - num_values = page_builder.num_values(); - } else { // DICTIONARY PAGES - PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size)); - num_values = std::max(page_builder.num_values(), num_vals); - } - - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(page_stream->Finish(&buffer)); - - return std::make_shared(buffer, num_values, encoding, - page_builder.def_level_encoding(), - page_builder.rep_level_encoding()); -} - -template -class DictionaryPageBuilder { - public: - typedef typename TYPE::c_type TC; - static constexpr int TN = TYPE::type_num; - using SpecializedEncoder = typename EncodingTraits::Encoder; - - // This class writes data and metadata to the passed inputs - explicit DictionaryPageBuilder(const ColumnDescriptor* d) - : num_dict_values_(0), have_values_(false) { - auto encoder = MakeTypedEncoder(Encoding::PLAIN, true, d); - dict_traits_ = dynamic_cast*>(encoder.get()); - encoder_.reset(dynamic_cast(encoder.release())); - } - - ~DictionaryPageBuilder() {} - - std::shared_ptr AppendValues(const std::vector& values) { - int num_values = static_cast(values.size()); - // Dictionary encoding - encoder_->Put(values.data(), num_values); - num_dict_values_ = dict_traits_->num_entries(); - have_values_ = true; - return encoder_->FlushValues(); - } - - std::shared_ptr WriteDict() { - std::shared_ptr dict_buffer = - AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size()); - dict_traits_->WriteDict(dict_buffer->mutable_data()); - return dict_buffer; - } - - int32_t num_values() const { return num_dict_values_; } - - private: - DictEncoder* dict_traits_; - std::unique_ptr 
encoder_; - int32_t num_dict_values_; - bool have_values_; -}; - -template <> -inline DictionaryPageBuilder::DictionaryPageBuilder( - const ColumnDescriptor* d) { - ParquetException::NYI("only plain encoding currently implemented for boolean"); -} - -template <> -inline std::shared_ptr DictionaryPageBuilder::WriteDict() { - ParquetException::NYI("only plain encoding currently implemented for boolean"); - return nullptr; -} - -template <> -inline std::shared_ptr DictionaryPageBuilder::AppendValues( - const std::vector& values) { - ParquetException::NYI("only plain encoding currently implemented for boolean"); - return nullptr; -} - -template -inline static std::shared_ptr MakeDictPage( - const ColumnDescriptor* d, const std::vector& values, - const std::vector& values_per_page, Encoding::type encoding, - std::vector>& rle_indices) { - test::DictionaryPageBuilder page_builder(d); - int num_pages = static_cast(values_per_page.size()); - int value_start = 0; - - for (int i = 0; i < num_pages; i++) { - rle_indices.push_back(page_builder.AppendValues( - slice(values, value_start, value_start + values_per_page[i]))); - value_start += values_per_page[i]; - } - - auto buffer = page_builder.WriteDict(); - - return std::make_shared(buffer, page_builder.num_values(), - Encoding::PLAIN); -} - -// Given def/rep levels and values create multiple dict pages -template -inline static void PaginateDict(const ColumnDescriptor* d, - const std::vector& values, - const std::vector& def_levels, - int16_t max_def_level, - const std::vector& rep_levels, - int16_t max_rep_level, int num_levels_per_page, - const std::vector& values_per_page, - std::vector>& pages, - Encoding::type encoding = Encoding::RLE_DICTIONARY) { - int num_pages = static_cast(values_per_page.size()); - std::vector> rle_indices; - std::shared_ptr dict_page = - MakeDictPage(d, values, values_per_page, encoding, rle_indices); - pages.push_back(dict_page); - int def_level_start = 0; - int def_level_end = 0; - int 
rep_level_start = 0; - int rep_level_end = 0; - for (int i = 0; i < num_pages; i++) { - if (max_def_level > 0) { - def_level_start = i * num_levels_per_page; - def_level_end = (i + 1) * num_levels_per_page; - } - if (max_rep_level > 0) { - rep_level_start = i * num_levels_per_page; - rep_level_end = (i + 1) * num_levels_per_page; - } - std::shared_ptr data_page = MakeDataPage( - d, {}, values_per_page[i], encoding, rle_indices[i]->data(), - static_cast(rle_indices[i]->size()), - slice(def_levels, def_level_start, def_level_end), max_def_level, - slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); - pages.push_back(data_page); - } -} - -// Given def/rep levels and values create multiple plain pages -template -static inline void PaginatePlain(const ColumnDescriptor* d, - const std::vector& values, - const std::vector& def_levels, - int16_t max_def_level, - const std::vector& rep_levels, - int16_t max_rep_level, int num_levels_per_page, - const std::vector& values_per_page, - std::vector>& pages, - Encoding::type encoding = Encoding::PLAIN) { - int num_pages = static_cast(values_per_page.size()); - int def_level_start = 0; - int def_level_end = 0; - int rep_level_start = 0; - int rep_level_end = 0; - int value_start = 0; - for (int i = 0; i < num_pages; i++) { - if (max_def_level > 0) { - def_level_start = i * num_levels_per_page; - def_level_end = (i + 1) * num_levels_per_page; - } - if (max_rep_level > 0) { - rep_level_start = i * num_levels_per_page; - rep_level_end = (i + 1) * num_levels_per_page; - } - std::shared_ptr page = MakeDataPage( - d, slice(values, value_start, value_start + values_per_page[i]), - values_per_page[i], encoding, nullptr, 0, - slice(def_levels, def_level_start, def_level_end), max_def_level, - slice(rep_levels, rep_level_start, rep_level_end), max_rep_level); - pages.push_back(page); - value_start += values_per_page[i]; - } -} - -// Generates pages from randomly generated data -template -static inline int MakePages(const 
ColumnDescriptor* d, int num_pages, int levels_per_page, - std::vector& def_levels, - std::vector& rep_levels, - std::vector& values, - std::vector& buffer, - std::vector>& pages, - Encoding::type encoding = Encoding::PLAIN) { - int num_levels = levels_per_page * num_pages; - int num_values = 0; - uint32_t seed = 0; - int16_t zero = 0; - int16_t max_def_level = d->max_definition_level(); - int16_t max_rep_level = d->max_repetition_level(); - std::vector values_per_page(num_pages, levels_per_page); - // Create definition levels - if (max_def_level > 0) { - def_levels.resize(num_levels); - random_numbers(num_levels, seed, zero, max_def_level, def_levels.data()); - for (int p = 0; p < num_pages; p++) { - int num_values_per_page = 0; - for (int i = 0; i < levels_per_page; i++) { - if (def_levels[i + p * levels_per_page] == max_def_level) { - num_values_per_page++; - num_values++; - } - } - values_per_page[p] = num_values_per_page; - } - } else { - num_values = num_levels; - } - // Create repitition levels - if (max_rep_level > 0) { - rep_levels.resize(num_levels); - random_numbers(num_levels, seed, zero, max_rep_level, rep_levels.data()); - } - // Create values - values.resize(num_values); - if (encoding == Encoding::PLAIN) { - InitValues(num_values, values, buffer); - PaginatePlain(d, values, def_levels, max_def_level, rep_levels, max_rep_level, - levels_per_page, values_per_page, pages); - } else if (encoding == Encoding::RLE_DICTIONARY || - encoding == Encoding::PLAIN_DICTIONARY) { - // Calls InitValues and repeats the data - InitDictValues(num_values, levels_per_page, values, buffer); - PaginateDict(d, values, def_levels, max_def_level, rep_levels, max_rep_level, - levels_per_page, values_per_page, pages); - } - - return num_values; -} - -// ---------------------------------------------------------------------- -// Test data generation - -template <> -void inline InitValues(int num_values, std::vector& values, - std::vector& buffer) { - values = {}; - 
::arrow::random_is_valid(num_values, 1., &values, - static_cast(::arrow::random_seed())); -} - -template <> -inline void InitValues(int num_values, std::vector& values, - std::vector& buffer) { - int max_byte_array_len = 12; - int num_bytes = static_cast(max_byte_array_len + sizeof(uint32_t)); - size_t nbytes = num_values * num_bytes; - buffer.resize(nbytes); - random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len); -} - -inline void InitWideByteArrayValues(int num_values, std::vector& values, - std::vector& buffer, int min_len, - int max_len) { - int num_bytes = static_cast(max_len + sizeof(uint32_t)); - size_t nbytes = num_values * num_bytes; - buffer.resize(nbytes); - random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len); -} - -template <> -inline void InitValues(int num_values, std::vector& values, - std::vector& buffer) { - size_t nbytes = num_values * FLBA_LENGTH; - buffer.resize(nbytes); - random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data()); -} - -template <> -inline void InitValues(int num_values, std::vector& values, - std::vector& buffer) { - random_Int96_numbers(num_values, 0, std::numeric_limits::min(), - std::numeric_limits::max(), values.data()); -} - -inline std::string TestColumnName(int i) { - std::stringstream col_name; - col_name << "column_" << i; - return col_name.str(); -} - -// This class lives here because of its dependency on the InitValues specializations. 
-template -class PrimitiveTypedTest : public ::testing::Test { - public: - typedef typename TestType::c_type T; - - void SetUpSchema(Repetition::type repetition, int num_columns = 1) { - std::vector fields; - - for (int i = 0; i < num_columns; ++i) { - std::string name = TestColumnName(i); - fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num, - LogicalType::NONE, FLBA_LENGTH)); - } - node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields); - schema_.Init(node_); - } - - void GenerateData(int64_t num_values); - void SetupValuesOut(int64_t num_values); - void SyncValuesOut(); - - protected: - schema::NodePtr node_; - SchemaDescriptor schema_; - - // Input buffers - std::vector values_; - - std::vector def_levels_; - - std::vector buffer_; - // Pointer to the values, needed as we cannot use std::vector::data() - T* values_ptr_; - std::vector bool_buffer_; - - // Output buffers - std::vector values_out_; - std::vector bool_buffer_out_; - T* values_out_ptr_; -}; - -template -inline void PrimitiveTypedTest::SyncValuesOut() {} - -template <> -inline void PrimitiveTypedTest::SyncValuesOut() { - std::vector::const_iterator source_iterator = bool_buffer_out_.begin(); - std::vector::iterator destination_iterator = values_out_.begin(); - while (source_iterator != bool_buffer_out_.end()) { - *destination_iterator++ = *source_iterator++ != 0; - } -} - -template -inline void PrimitiveTypedTest::SetupValuesOut(int64_t num_values) { - values_out_.clear(); - values_out_.resize(num_values); - values_out_ptr_ = values_out_.data(); -} - -template <> -inline void PrimitiveTypedTest::SetupValuesOut(int64_t num_values) { - values_out_.clear(); - values_out_.resize(num_values); - - bool_buffer_out_.clear(); - bool_buffer_out_.resize(num_values); - // Write once to all values so we can copy it without getting Valgrind errors - // about uninitialised values. 
- std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true); - values_out_ptr_ = reinterpret_cast(bool_buffer_out_.data()); -} - -template -inline void PrimitiveTypedTest::GenerateData(int64_t num_values) { - def_levels_.resize(num_values); - values_.resize(num_values); - - InitValues(static_cast(num_values), values_, buffer_); - values_ptr_ = values_.data(); - - std::fill(def_levels_.begin(), def_levels_.end(), 1); -} - -template <> -inline void PrimitiveTypedTest::GenerateData(int64_t num_values) { - def_levels_.resize(num_values); - values_.resize(num_values); - - InitValues(static_cast(num_values), values_, buffer_); - bool_buffer_.resize(num_values); - std::copy(values_.begin(), values_.end(), bool_buffer_.begin()); - values_ptr_ = reinterpret_cast(bool_buffer_.data()); - - std::fill(def_levels_.begin(), def_levels_.end(), 1); -} - -} // namespace test -} // namespace parquet diff --git a/r/R/inst/include/parquet/thrift.h b/r/R/inst/include/parquet/thrift.h deleted file mode 100644 index ffefd12900c..00000000000 --- a/r/R/inst/include/parquet/thrift.h +++ /dev/null @@ -1,214 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include "arrow/util/windows_compatibility.h" - -#include -// Check if thrift version < 0.11.0 -// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp -#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR) -#include -#else -#include -#endif -#include - -// TCompactProtocol requires some #defines to work right. -#define SIGNED_RIGHT_SHIFT_IS 1 -#define ARITHMETIC_RIGHT_SHIFT 1 -#include -#include -#include - -#include -#include -#include - -#include "arrow/util/logging.h" -#include "parquet/exception.h" -#include "parquet/platform.h" -#include "parquet/statistics.h" - -#include "parquet/parquet_types.h" // IYWU pragma: export - -namespace parquet { - -// Check if thrift version < 0.11.0 -// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp -#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR) -using ::boost::shared_ptr; -#else -using ::std::shared_ptr; -#endif - -// ---------------------------------------------------------------------- -// Convert Thrift enums to / from parquet enums - -static inline Type::type FromThrift(format::Type::type type) { - return static_cast(type); -} - -static inline LogicalType::type FromThrift(format::ConvertedType::type type) { - // item 0 is NONE - return static_cast(static_cast(type) + 1); -} - -static inline Repetition::type FromThrift(format::FieldRepetitionType::type type) { - return static_cast(type); -} - -static inline Encoding::type FromThrift(format::Encoding::type type) { - return static_cast(type); -} - -static inline Compression::type FromThrift(format::CompressionCodec::type type) { - return static_cast(type); -} - -static inline format::Type::type ToThrift(Type::type type) { - return static_cast(type); -} - -static inline format::ConvertedType::type ToThrift(LogicalType::type type) { - // item 0 is NONE - DCHECK_NE(type, LogicalType::NONE); - return static_cast(static_cast(type) - 1); -} - -static inline 
format::FieldRepetitionType::type ToThrift(Repetition::type type) { - return static_cast(type); -} - -static inline format::Encoding::type ToThrift(Encoding::type type) { - return static_cast(type); -} - -static inline format::CompressionCodec::type ToThrift(Compression::type type) { - return static_cast(type); -} - -static inline format::Statistics ToThrift(const EncodedStatistics& stats) { - format::Statistics statistics; - if (stats.has_min) { - statistics.__set_min_value(stats.min()); - // If the order is SIGNED, then the old min value must be set too. - // This for backward compatibility - if (stats.is_signed()) { - statistics.__set_min(stats.min()); - } - } - if (stats.has_max) { - statistics.__set_max_value(stats.max()); - // If the order is SIGNED, then the old max value must be set too. - // This for backward compatibility - if (stats.is_signed()) { - statistics.__set_max(stats.max()); - } - } - if (stats.has_null_count) { - statistics.__set_null_count(stats.null_count); - } - if (stats.has_distinct_count) { - statistics.__set_distinct_count(stats.distinct_count); - } - - return statistics; -} - -// ---------------------------------------------------------------------- -// Thrift struct serialization / deserialization utilities - -using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; - -// Deserialize a thrift message from buf/len. buf/len must at least contain -// all the bytes needed to store the thrift message. On return, len will be -// set to the actual length of the header. -template -inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg) { - // Deserialize msg bytes into c++ thrift msg using memory transport. 
- shared_ptr tmem_transport( - new ThriftBuffer(const_cast(buf), *len)); - apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; - shared_ptr tproto = // - tproto_factory.getProtocol(tmem_transport); - try { - deserialized_msg->read(tproto.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't deserialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - uint32_t bytes_left = tmem_transport->available_read(); - *len = *len - bytes_left; -} - -/// Utility class to serialize thrift objects to a binary format. This object -/// should be reused if possible to reuse the underlying memory. -/// Note: thrift will encode NULLs into the serialized buffer so it is not valid -/// to treat it as a string. -class ThriftSerializer { - public: - explicit ThriftSerializer(int initial_buffer_size = 1024) - : mem_buffer_(new ThriftBuffer(initial_buffer_size)) { - apache::thrift::protocol::TCompactProtocolFactoryT factory; - protocol_ = factory.getProtocol(mem_buffer_); - } - - /// Serialize obj into a memory buffer. The result is returned in buffer/len. The - /// memory returned is owned by this object and will be invalid when another object - /// is serialized. 
- template - void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) { - SerializeObject(obj); - mem_buffer_->getBuffer(buffer, len); - } - - template - void SerializeToString(const T* obj, std::string* result) { - SerializeObject(obj); - *result = mem_buffer_->getBufferAsString(); - } - - template - int64_t Serialize(const T* obj, ArrowOutputStream* out) { - uint8_t* out_buffer; - uint32_t out_length; - SerializeToBuffer(obj, &out_length, &out_buffer); - PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length)); - return static_cast(out_length); - } - - private: - template - void SerializeObject(const T* obj) { - try { - mem_buffer_->resetBuffer(); - obj->write(protocol_.get()); - } catch (std::exception& e) { - std::stringstream ss; - ss << "Couldn't serialize thrift: " << e.what() << "\n"; - throw ParquetException(ss.str()); - } - } - - shared_ptr mem_buffer_; - shared_ptr protocol_; -}; - -} // namespace parquet diff --git a/r/R/inst/include/parquet/types.h b/r/R/inst/include/parquet/types.h deleted file mode 100644 index 779ea6b9b5b..00000000000 --- a/r/R/inst/include/parquet/types.h +++ /dev/null @@ -1,662 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_TYPES_H -#define PARQUET_TYPES_H - -#include -#include -#include -#include -#include -#include -#include - -#include "parquet/platform.h" - -namespace arrow { -namespace util { - -class Codec; - -} // namespace util -} // namespace arrow - -namespace parquet { - -// ---------------------------------------------------------------------- -// Metadata enums to match Thrift metadata -// -// The reason we maintain our own enums is to avoid transitive dependency on -// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the -// public API. After building parquet-cpp, you should not need to include -// Thrift headers in your application. This means some boilerplate to convert -// between our types and Parquet's Thrift types. -// -// We can also add special values like NONE to distinguish between metadata -// values being set and not set. As an example consider ConvertedType and -// CompressionCodec - -// Mirrors parquet::Type -struct Type { - enum type { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - INT96 = 3, - FLOAT = 4, - DOUBLE = 5, - BYTE_ARRAY = 6, - FIXED_LEN_BYTE_ARRAY = 7, - // Should always be last element. - UNDEFINED = 8 - }; -}; - -// Mirrors parquet::ConvertedType -struct LogicalType { - enum type { - NONE, - UTF8, - MAP, - MAP_KEY_VALUE, - LIST, - ENUM, - DECIMAL, - DATE, - TIME_MILLIS, - TIME_MICROS, - TIMESTAMP_MILLIS, - TIMESTAMP_MICROS, - UINT_8, - UINT_16, - UINT_32, - UINT_64, - INT_8, - INT_16, - INT_32, - INT_64, - JSON, - BSON, - INTERVAL, - NA = 25, - // Should always be last element. - UNDEFINED = 26 - }; -}; - -namespace format { - -class LogicalType; - -} - -// Mirrors parquet::FieldRepetitionType -struct Repetition { - enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 }; -}; - -// Reference: -// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/ -// format/converter/ParquetMetadataConverter.java -// Sort order for page and column statistics. 
Types are associated with sort -// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are -// aggregated using a sort order. As of parquet-format version 2.3.1, the -// order used to aggregate stats is always SIGNED and is not stored in the -// Parquet file. These stats are discarded for types that need unsigned. -// See PARQUET-686. -struct SortOrder { - enum type { SIGNED, UNSIGNED, UNKNOWN }; -}; - -namespace schema { - -struct DecimalMetadata { - bool isset; - int32_t scale; - int32_t precision; -}; - -} // namespace schema - -/// \brief Implementation of parquet.thrift LogicalType annotations. -class PARQUET_EXPORT LogicalAnnotation { - public: - struct Type { - enum type { - UNKNOWN = 0, - STRING = 1, - MAP, - LIST, - ENUM, - DECIMAL, - DATE, - TIME, - TIMESTAMP, - INTERVAL, - INT, - NIL, // Thrift NullType - JSON, - BSON, - UUID, - NONE - }; - }; - - struct TimeUnit { - enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS }; - }; - - /// \brief If possible, return an annotation equivalent to the given legacy converted - /// type (and decimal metadata if applicable). - static std::shared_ptr FromConvertedType( - const parquet::LogicalType::type converted_type, - const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1, - -1}); - - /// \brief Return the annotation represented by the Thrift intermediary object. - static std::shared_ptr FromThrift( - const parquet::format::LogicalType& thrift_logical_type); - - /// \brief Return the explicitly requested annotation type. 
- static std::shared_ptr String(); - static std::shared_ptr Map(); - static std::shared_ptr List(); - static std::shared_ptr Enum(); - static std::shared_ptr Decimal(int32_t precision, - int32_t scale = 0); - static std::shared_ptr Date(); - static std::shared_ptr Time( - bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); - static std::shared_ptr Timestamp( - bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); - static std::shared_ptr Interval(); - static std::shared_ptr Int(int bit_width, bool is_signed); - static std::shared_ptr Null(); - static std::shared_ptr JSON(); - static std::shared_ptr BSON(); - static std::shared_ptr UUID(); - static std::shared_ptr None(); - static std::shared_ptr Unknown(); - - /// \brief Return true if this annotation is consistent with the given underlying - /// physical type. - bool is_applicable(parquet::Type::type primitive_type, - int32_t primitive_length = -1) const; - - /// \brief Return true if this annotation is equivalent to the given legacy converted - /// type (and decimal metadata if applicable). - bool is_compatible(parquet::LogicalType::type converted_type, - parquet::schema::DecimalMetadata converted_decimal_metadata = { - false, -1, -1}) const; - - /// \brief If possible, return the legacy converted type (and decimal metadata if - /// applicable) equivalent to this annotation. - parquet::LogicalType::type ToConvertedType( - parquet::schema::DecimalMetadata* out_decimal_metadata) const; - - /// \brief Return a printable representation of this annotation. - std::string ToString() const; - - /// \brief Return a JSON representation of this annotation. - std::string ToJSON() const; - - /// \brief Return a serializable Thrift object for this annotation. - parquet::format::LogicalType ToThrift() const; - - /// \brief Return true if the given annotation is equivalent to this annotation. 
- bool Equals(const LogicalAnnotation& other) const; - - /// \brief Return the enumerated type of this annotation. - LogicalAnnotation::Type::type type() const; - - /// \brief Return the appropriate sort order for this annotation. - SortOrder::type sort_order() const; - - // Type checks ... - bool is_string() const; - bool is_map() const; - bool is_list() const; - bool is_enum() const; - bool is_decimal() const; - bool is_date() const; - bool is_time() const; - bool is_timestamp() const; - bool is_interval() const; - bool is_int() const; - bool is_null() const; - bool is_JSON() const; - bool is_BSON() const; - bool is_UUID() const; - bool is_none() const; - /// \brief Return true if this annotation is of a known type. - bool is_valid() const; - bool is_invalid() const; - /// \brief Return true if this annotation is suitable for a schema GroupNode. - bool is_nested() const; - bool is_nonnested() const; - /// \brief Return true if this annotation is included in the Thrift output for its node. - bool is_serialized() const; - - LogicalAnnotation(const LogicalAnnotation&) = delete; - LogicalAnnotation& operator=(const LogicalAnnotation&) = delete; - virtual ~LogicalAnnotation() noexcept; - - protected: - LogicalAnnotation(); - - class Impl; - std::unique_ptr impl_; -}; - -/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. -class PARQUET_EXPORT StringAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - StringAnnotation() = default; -}; - -/// \brief Allowed for group nodes only. -class PARQUET_EXPORT MapAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - MapAnnotation() = default; -}; - -/// \brief Allowed for group nodes only. -class PARQUET_EXPORT ListAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - ListAnnotation() = default; -}; - -/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8. 
-class PARQUET_EXPORT EnumAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - EnumAnnotation() = default; -}; - -/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY, -/// depending on the precision. -class PARQUET_EXPORT DecimalAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(int32_t precision, - int32_t scale = 0); - int32_t precision() const; - int32_t scale() const; - - private: - DecimalAnnotation() = default; -}; - -/// \brief Allowed for physical type INT32. -class PARQUET_EXPORT DateAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - DateAnnotation() = default; -}; - -/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS). -class PARQUET_EXPORT TimeAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make( - bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); - bool is_adjusted_to_utc() const; - LogicalAnnotation::TimeUnit::unit time_unit() const; - - private: - TimeAnnotation() = default; -}; - -/// \brief Allowed for physical type INT64. -class PARQUET_EXPORT TimestampAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make( - bool is_adjusted_to_utc, LogicalAnnotation::TimeUnit::unit time_unit); - bool is_adjusted_to_utc() const; - LogicalAnnotation::TimeUnit::unit time_unit() const; - - private: - TimestampAnnotation() = default; -}; - -/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12 -class PARQUET_EXPORT IntervalAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - IntervalAnnotation() = default; -}; - -/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64 -/// (for bit width 64). 
-class PARQUET_EXPORT IntAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(int bit_width, bool is_signed); - int bit_width() const; - bool is_signed() const; - - private: - IntAnnotation() = default; -}; - -/// \brief Allowed for any physical type. -class PARQUET_EXPORT NullAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - NullAnnotation() = default; -}; - -/// \brief Allowed for physical type BYTE_ARRAY. -class PARQUET_EXPORT JSONAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - JSONAnnotation() = default; -}; - -/// \brief Allowed for physical type BYTE_ARRAY. -class PARQUET_EXPORT BSONAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - BSONAnnotation() = default; -}; - -/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16, -/// must encode raw UUID bytes. -class PARQUET_EXPORT UUIDAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - UUIDAnnotation() = default; -}; - -/// \brief Allowed for any physical type. -class PARQUET_EXPORT NoAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - NoAnnotation() = default; -}; - -/// \brief Allowed for any type. -class PARQUET_EXPORT UnknownAnnotation : public LogicalAnnotation { - public: - static std::shared_ptr Make(); - - private: - UnknownAnnotation() = default; -}; - -// Data encodings. 
Mirrors parquet::Encoding -struct Encoding { - enum type { - PLAIN = 0, - PLAIN_DICTIONARY = 2, - RLE = 3, - BIT_PACKED = 4, - DELTA_BINARY_PACKED = 5, - DELTA_LENGTH_BYTE_ARRAY = 6, - DELTA_BYTE_ARRAY = 7, - RLE_DICTIONARY = 8 - }; -}; - -// Compression, mirrors parquet::CompressionCodec -struct Compression { - enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD }; -}; - -PARQUET_EXPORT -std::unique_ptr<::arrow::util::Codec> GetCodecFromArrow(Compression::type codec); - -struct Encryption { - enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 }; -}; - -// parquet::PageType -struct PageType { - enum type { DATA_PAGE, INDEX_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2 }; -}; - -class ColumnOrder { - public: - enum type { UNDEFINED, TYPE_DEFINED_ORDER }; - explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {} - // Default to Type Defined Order - ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {} - ColumnOrder::type get_order() { return column_order_; } - - static ColumnOrder undefined_; - static ColumnOrder type_defined_; - - private: - ColumnOrder::type column_order_; -}; - -// ---------------------------------------------------------------------- - -struct ByteArray { - ByteArray() : len(0), ptr(NULLPTR) {} - ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} - uint32_t len; - const uint8_t* ptr; -}; - -inline bool operator==(const ByteArray& left, const ByteArray& right) { - return left.len == right.len && - (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0); -} - -inline bool operator!=(const ByteArray& left, const ByteArray& right) { - return !(left == right); -} - -struct FixedLenByteArray { - FixedLenByteArray() : ptr(NULLPTR) {} - explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} - const uint8_t* ptr; -}; - -using FLBA = FixedLenByteArray; - -// Julian day at unix epoch. 
-// -// The Julian Day Number (JDN) is the integer assigned to a whole solar day in -// the Julian day count starting from noon Universal time, with Julian day -// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC, -// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian -// calendar), -constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588); -constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24); -constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000); -constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000); -constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000); - -MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; }; -STRUCT_END(Int96, 12); - -inline bool operator==(const Int96& left, const Int96& right) { - return std::equal(left.value, left.value + 3, right.value); -} - -inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); } - -static inline std::string ByteArrayToString(const ByteArray& a) { - return std::string(reinterpret_cast(a.ptr), a.len); -} - -static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { - std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); -} - -static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) { - int64_t days_since_epoch = i96.value[2] - kJulianToUnixEpochDays; - int64_t nanoseconds = 0; - - memcpy(&nanoseconds, &i96.value, sizeof(int64_t)); - return days_since_epoch * kNanosecondsPerDay + nanoseconds; -} - -static inline std::string Int96ToString(const Int96& a) { - std::ostringstream result; - std::copy(a.value, a.value + 3, std::ostream_iterator(result, " ")); - return result.str(); -} - -static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) { - std::ostringstream result; - std::copy(a.ptr, a.ptr + len, std::ostream_iterator(result, " ")); - return result.str(); -} - -template -struct type_traits 
{}; - -template <> -struct type_traits { - using value_type = bool; - - static constexpr int value_byte_size = 1; - static constexpr const char* printf_code = "d"; -}; - -template <> -struct type_traits { - using value_type = int32_t; - - static constexpr int value_byte_size = 4; - static constexpr const char* printf_code = "d"; -}; - -template <> -struct type_traits { - using value_type = int64_t; - - static constexpr int value_byte_size = 8; - static constexpr const char* printf_code = "ld"; -}; - -template <> -struct type_traits { - using value_type = Int96; - - static constexpr int value_byte_size = 12; - static constexpr const char* printf_code = "s"; -}; - -template <> -struct type_traits { - using value_type = float; - - static constexpr int value_byte_size = 4; - static constexpr const char* printf_code = "f"; -}; - -template <> -struct type_traits { - using value_type = double; - - static constexpr int value_byte_size = 8; - static constexpr const char* printf_code = "lf"; -}; - -template <> -struct type_traits { - using value_type = ByteArray; - - static constexpr int value_byte_size = sizeof(ByteArray); - static constexpr const char* printf_code = "s"; -}; - -template <> -struct type_traits { - using value_type = FixedLenByteArray; - - static constexpr int value_byte_size = sizeof(FixedLenByteArray); - static constexpr const char* printf_code = "s"; -}; - -template -struct DataType { - using c_type = typename type_traits::value_type; - static constexpr Type::type type_num = TYPE; -}; - -using BooleanType = DataType; -using Int32Type = DataType; -using Int64Type = DataType; -using Int96Type = DataType; -using FloatType = DataType; -using DoubleType = DataType; -using ByteArrayType = DataType; -using FLBAType = DataType; - -template -inline std::string format_fwf(int width) { - std::stringstream ss; - ss << "%-" << width << type_traits::printf_code; - return ss.str(); -} - -PARQUET_EXPORT std::string CompressionToString(Compression::type t); - 
-PARQUET_EXPORT std::string EncodingToString(Encoding::type t); - -PARQUET_EXPORT std::string LogicalTypeToString(LogicalType::type t); - -PARQUET_EXPORT std::string TypeToString(Type::type t); - -PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, - const std::string& val); - -/// \deprecated Since 1.5.0 -ARROW_DEPRECATED("Use std::string instead of char* as input") -PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val); - -PARQUET_EXPORT int GetTypeByteSize(Type::type t); - -PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive); - -PARQUET_EXPORT SortOrder::type GetSortOrder(LogicalType::type converted, - Type::type primitive); - -PARQUET_EXPORT SortOrder::type GetSortOrder( - const std::shared_ptr& annotation, Type::type primitive); - -} // namespace parquet - -#endif // PARQUET_TYPES_H diff --git a/r/R/inst/include/parquet/windows_compatibility.h b/r/R/inst/include/parquet/windows_compatibility.h deleted file mode 100644 index 31ca04c8b66..00000000000 --- a/r/R/inst/include/parquet/windows_compatibility.h +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include "arrow/util/windows_compatibility.h" - -#ifdef _WIN32 - -// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from -// above, so we undefine it -#ifdef OPTIONAL -#undef OPTIONAL -#endif - -#endif diff --git a/r/configure b/r/configure index 10045d3c5ec..11bd4bcd1be 100755 --- a/r/configure +++ b/r/configure @@ -43,11 +43,9 @@ fi # Note that cflags may be empty in case of success if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then echo "Found INCLUDE_DIR and/or LIB_DIR!" - PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS" PKG_LIBS="-L$LIB_DIR $PKG_LIBS" elif [ "$PKGCONFIG_CFLAGS" ] || [ "$PKGCONFIG_LIBS" ]; then echo "Found pkg-config cflags and libs!" - PKG_CFLAGS=${PKGCONFIG_CFLAGS} PKG_LIBS=${PKGCONFIG_LIBS} elif [[ "$OSTYPE" == "darwin"* ]]; then if [ "$(command -v brew)" ]; then @@ -56,22 +54,21 @@ elif [[ "$OSTYPE" == "darwin"* ]]; then curl -sfL "https://jeroen.github.io/autobrew/$PKG_BREW_NAME" > autobrew source autobrew fi - PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include" PKG_LIBS="-L$BREWDIR/opt/$PKG_BREW_NAME/lib $PKG_LIBS" fi -PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_PARQUET" - -# For debugging -echo "PKG_CFLAGS=$PKG_CFLAGS" -echo "PKG_LIBS=$PKG_LIBS" - # Find compiler CXXCPP=$("${R_HOME}"/bin/R CMD config CXXCPP) CXX11FLAGS=$("${R_HOME}"/bin/R CMD config CXX11FLAGS) CXX11STD=$("${R_HOME}"/bin/R CMD config CXX11STD) CPPFLAGS=$("${R_HOME}"/bin/R CMD config CPPFLAGS) +PKG_CFLAGS="-I../inst/include -DARROW_R_WITH_PARQUET" + +# For debugging +echo "PKG_CFLAGS=$PKG_CFLAGS" +echo "PKG_LIBS=$PKG_LIBS" + # If libarrow uses the old GLIBCXX ABI, so we have to use it too if [ "$ARROW_USE_OLD_CXXABI" ]; then $PKG_CFLAGS="$PKG_CFLAGS -D_GLIBCXX_USE_CXX11_ABI=0" diff --git a/r/man/table.Rd b/r/man/table.Rd index 4d93ff385b5..fbf9632a03a 100644 --- a/r/man/table.Rd +++ b/r/man/table.Rd @@ -9,7 +9,10 @@ table(..., schema = NULL) \arguments{ \item{...}{arrays, chunked arrays, or R vectors} -\item{schema}{NULL or a schema} +\item{schema}{a 
schema. The default (\code{NULL}) infers the schema from the \code{...}} +} +\value{ +an arrow::Table } \description{ Create an arrow::Table from a data frame From 1ca6bca7899b207fb6ccf598ecaffe4dd1d3e420 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Wed, 5 Jun 2019 16:16:18 +0200 Subject: [PATCH 4/4] use a manual library.dynam() and library.dynam.unload() instead of useDynLib --- r/R/zzz.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/r/R/zzz.R b/r/R/zzz.R index be952a6b4f7..21bc504360e 100644 --- a/r/R/zzz.R +++ b/r/R/zzz.R @@ -15,6 +15,19 @@ # specific language governing permissions and limitations # under the License. -#' @useDynLib arrow, .registration = TRUE #' @importFrom Rcpp sourceCpp NULL + +NAMESPACE <- environment() + +.onLoad <- function(libname, pkgname) { + dll <- library.dynam("arrow", pkgname, libname) + + for(routine in getDLLRegisteredRoutines(dll)$.Call) { + assign(routine$name, routine, envir = NAMESPACE) + } +} + +.onUnload <- function(libpath) { + library.dynam.unload("arrow", libpath) +}