From 59dcbde116210d9f756ead81cc6c01163c188cdd Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 1 Sep 2021 08:48:50 -0500 Subject: [PATCH 01/93] working implementation but lacks case-insensitivity and more unit tests --- cpp/src/arrow/util/CMakeLists.txt | 52 ++--- cpp/src/arrow/util/value_parsing.h | 110 ++++++++++ cpp/src/arrow/util/value_parsing_test.cc | 245 ++++++++++++----------- 3 files changed, 262 insertions(+), 145 deletions(-) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 660fb2657b6..03fd0ef4a81 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -40,33 +40,33 @@ endif() add_arrow_test(utility-test SOURCES - align_util_test.cc - async_generator_test.cc - bit_block_counter_test.cc - bit_util_test.cc - cache_test.cc - checked_cast_test.cc - compression_test.cc - decimal_test.cc - formatting_util_test.cc - key_value_metadata_test.cc - hashing_test.cc - int_util_test.cc + # align_util_test.cc + # async_generator_test.cc + # bit_block_counter_test.cc + # bit_util_test.cc + # cache_test.cc + # checked_cast_test.cc + # compression_test.cc + # decimal_test.cc + # formatting_util_test.cc + # key_value_metadata_test.cc + # hashing_test.cc + # int_util_test.cc ${IO_UTIL_TEST_SOURCES} - iterator_test.cc - logging_test.cc - queue_test.cc - range_test.cc - reflection_test.cc - rle_encoding_test.cc - stl_util_test.cc - string_test.cc - tdigest_test.cc - test_common.cc - time_test.cc - trie_test.cc - uri_test.cc - utf8_util_test.cc + # iterator_test.cc + # logging_test.cc + # queue_test.cc + # range_test.cc + # reflection_test.cc + # rle_encoding_test.cc + # stl_util_test.cc + # string_test.cc + # tdigest_test.cc + # test_common.cc + # time_test.cc + # trie_test.cc + # uri_test.cc + # utf8_util_test.cc value_parsing_test.cc variant_test.cc) diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index 02e6fa42e01..aa5342ee4c4 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "arrow/type.h" #include "arrow/type_traits.h" @@ -273,6 +274,100 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { #undef PARSE_UNSIGNED_ITERATION #undef PARSE_UNSIGNED_ITERATION_LAST + +#define PARSE_HEX_ITERATION(C_TYPE) \ + if (length > 0) { \ + char val = *s; \ + const char* pos = std::lower_bound(kAsciiTable, kAsciiTableEnd, val); \ + s++; \ + result = static_cast(result << 4); \ + length--; \ + if (ARROW_PREDICT_FALSE(pos == kAsciiTableEnd || *pos != val)) { \ + /* Non-digit */ \ + return false; \ + } \ + result = static_cast(result | (pos - kAsciiTable)); \ + } else { \ + break; \ + } + +inline bool ParseHex(const char* s, size_t length, uint8_t* out) { + const char* kAsciiTable = "0123456789ABCDEF"; + const char* kAsciiTableEnd = kAsciiTable + 16; + uint8_t result = 0; + + do { + PARSE_HEX_ITERATION(uint8_t); + PARSE_HEX_ITERATION(uint8_t); + } while (false); + *out = result; + return true; +} + +inline bool ParseHex(const char* s, size_t length, uint16_t* out) { + const char* kAsciiTable = "0123456789ABCDEF"; + const char* kAsciiTableEnd = kAsciiTable + 16; + uint16_t result = 0; + do { + PARSE_HEX_ITERATION(uint16_t); + PARSE_HEX_ITERATION(uint16_t); + PARSE_HEX_ITERATION(uint16_t); + PARSE_HEX_ITERATION(uint16_t); + } while (false); + *out = result; + return true; +} + +inline bool ParseHex(const char* s, size_t length, 
uint32_t* out) { + const char* kAsciiTable = "0123456789ABCDEF"; + const char* kAsciiTableEnd = kAsciiTable + 16; + uint32_t result = 0; + do { + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + PARSE_HEX_ITERATION(uint32_t); + } while (false); + *out = result; + return true; +} + +inline bool ParseHex(const char* s, size_t length, uint64_t* out) { + const char* kAsciiTable = "0123456789ABCDEF"; + const char* kAsciiTableEnd = kAsciiTable + 16; + uint64_t result = 0; + do { + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + PARSE_HEX_ITERATION(uint64_t); + } while (false); + *out = result; + return true; +} + +#undef PARSE_HEX_ITERATION + template struct StringToUnsignedIntConverterMixin { using value_type = typename ARROW_TYPE::c_type; @@ -336,6 +431,21 @@ struct StringToSignedIntConverterMixin { return false; } } + + // If its starts with 0x then its hex + if (*s == '0' && *(s + 1) == 'x'){ + length -= 2; + s += 2; + // lets make sure that the length of the string is not too big + if (!ARROW_PREDICT_TRUE(sizeof(unsigned_value)*2 >= length)) { + return false; + } + if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { + return false; + } + *out = static_cast(unsigned_value); + return true; + } // Skip leading zeros while (length > 0 && *s == '0') { length--; diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index b5dc5619ded..ea81585001b 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -54,125 +54,125 @@ void AssertConversionFails(const std::string& s) { AssertConversionFails(*type, s); } -TEST(StringConversion, ToBoolean) { - AssertConversion("true", true); - AssertConversion("tRuE", true); - AssertConversion("FAlse", false); - AssertConversion("false", false); - AssertConversion("1", true); - AssertConversion("0", false); - - AssertConversionFails(""); -} - -TEST(StringConversion, ToFloat) { - AssertConversion("1.5", 1.5f); - AssertConversion("0", 0.0f); - // XXX ASSERT_EQ doesn't distinguish signed zeros - AssertConversion("-0.0", -0.0f); - AssertConversion("-1e20", -1e20f); - - AssertConversionFails(""); - AssertConversionFails("e"); -} - -TEST(StringConversion, ToDouble) { - AssertConversion("1.5", 1.5); - AssertConversion("0", 0); - // XXX ASSERT_EQ doesn't distinguish signed zeros - AssertConversion("-0.0", -0.0); - AssertConversion("-1e100", -1e100); - - AssertConversionFails(""); - AssertConversionFails("e"); -} - -#if !defined(_WIN32) || defined(NDEBUG) - -TEST(StringConversion, ToFloatLocale) { - // French locale uses the comma as decimal point - LocaleGuard locale_guard("fr_FR.UTF-8"); - - AssertConversion("1.5", 1.5f); -} - -TEST(StringConversion, ToDoubleLocale) { - // French locale uses the comma as decimal point - LocaleGuard locale_guard("fr_FR.UTF-8"); - - AssertConversion("1.5", 1.5f); -} - -#endif // _WIN32 - -TEST(StringConversion, 
ToInt8) { - AssertConversion("0", 0); - AssertConversion("127", 127); - AssertConversion("0127", 127); - AssertConversion("-128", -128); - AssertConversion("-00128", -128); - - // Non-representable values - AssertConversionFails("128"); - AssertConversionFails("-129"); - - AssertConversionFails(""); - AssertConversionFails("-"); - AssertConversionFails("0.0"); - AssertConversionFails("e"); -} - -TEST(StringConversion, ToUInt8) { - AssertConversion("0", 0); - AssertConversion("26", 26); - AssertConversion("255", 255); - AssertConversion("0255", 255); - - // Non-representable values - AssertConversionFails("-1"); - AssertConversionFails("256"); - AssertConversionFails("260"); - AssertConversionFails("1234"); - - AssertConversionFails(""); - AssertConversionFails("-"); - AssertConversionFails("0.0"); - AssertConversionFails("e"); -} - -TEST(StringConversion, ToInt16) { - AssertConversion("0", 0); - AssertConversion("32767", 32767); - AssertConversion("032767", 32767); - AssertConversion("-32768", -32768); - AssertConversion("-0032768", -32768); - - // Non-representable values - AssertConversionFails("32768"); - AssertConversionFails("-32769"); - - AssertConversionFails(""); - AssertConversionFails("-"); - AssertConversionFails("0.0"); - AssertConversionFails("e"); -} - -TEST(StringConversion, ToUInt16) { - AssertConversion("0", 0); - AssertConversion("6660", 6660); - AssertConversion("65535", 65535); - AssertConversion("065535", 65535); - - // Non-representable values - AssertConversionFails("-1"); - AssertConversionFails("65536"); - AssertConversionFails("123456"); - - AssertConversionFails(""); - AssertConversionFails("-"); - AssertConversionFails("0.0"); - AssertConversionFails("e"); -} +// TEST(StringConversion, ToBoolean) { +// AssertConversion("true", true); +// AssertConversion("tRuE", true); +// AssertConversion("FAlse", false); +// AssertConversion("false", false); +// AssertConversion("1", true); +// AssertConversion("0", false); + +// AssertConversionFails(""); +// } + +// TEST(StringConversion, ToFloat) { +// AssertConversion("1.5", 1.5f); +// AssertConversion("0", 0.0f); +// // XXX ASSERT_EQ doesn't distinguish signed zeros +// AssertConversion("-0.0", -0.0f); +// AssertConversion("-1e20", -1e20f); + +// AssertConversionFails(""); +// AssertConversionFails("e"); +// } + +// TEST(StringConversion, ToDouble) { +// AssertConversion("1.5", 1.5); +// AssertConversion("0", 0); +// // XXX ASSERT_EQ doesn't distinguish signed zeros +// AssertConversion("-0.0", -0.0); +// AssertConversion("-1e100", -1e100); + +// AssertConversionFails(""); +// AssertConversionFails("e"); +// } + +// #if !defined(_WIN32) || defined(NDEBUG) + +// TEST(StringConversion, ToFloatLocale) { +// // French locale uses the comma as decimal point +// LocaleGuard locale_guard("fr_FR.UTF-8"); + +// AssertConversion("1.5", 1.5f); +// } + +// TEST(StringConversion, ToDoubleLocale) { +// // French locale uses the comma as decimal point +// LocaleGuard locale_guard("fr_FR.UTF-8"); + +// AssertConversion("1.5", 1.5f); +// } + +// #endif // _WIN32 + +// TEST(StringConversion, ToInt8) { +// AssertConversion("0", 0); +// AssertConversion("127", 127); +// AssertConversion("0127", 127); +// AssertConversion("-128", -128); +// AssertConversion("-00128", -128); + +// // Non-representable values +// AssertConversionFails("128"); +// AssertConversionFails("-129"); + +// AssertConversionFails(""); +// AssertConversionFails("-"); +// AssertConversionFails("0.0"); +// AssertConversionFails("e"); +// } + +// TEST(StringConversion, 
ToUInt8) { +// AssertConversion("0", 0); +// AssertConversion("26", 26); +// AssertConversion("255", 255); +// AssertConversion("0255", 255); + +// // Non-representable values +// AssertConversionFails("-1"); +// AssertConversionFails("256"); +// AssertConversionFails("260"); +// AssertConversionFails("1234"); + +// AssertConversionFails(""); +// AssertConversionFails("-"); +// AssertConversionFails("0.0"); +// AssertConversionFails("e"); +// } + +// TEST(StringConversion, ToInt16) { +// AssertConversion("0", 0); +// AssertConversion("32767", 32767); +// AssertConversion("032767", 32767); +// AssertConversion("-32768", -32768); +// AssertConversion("-0032768", -32768); + +// // Non-representable values +// AssertConversionFails("32768"); +// AssertConversionFails("-32769"); + +// AssertConversionFails(""); +// AssertConversionFails("-"); +// AssertConversionFails("0.0"); +// AssertConversionFails("e"); +// } + +// TEST(StringConversion, ToUInt16) { +// AssertConversion("0", 0); +// AssertConversion("6660", 6660); +// AssertConversion("65535", 65535); +// AssertConversion("065535", 65535); + +// // Non-representable values +// AssertConversionFails("-1"); +// AssertConversionFails("65536"); +// AssertConversionFails("123456"); + +// AssertConversionFails(""); +// AssertConversionFails("-"); +// AssertConversionFails("0.0"); +// AssertConversionFails("e"); +// } TEST(StringConversion, ToInt32) { AssertConversion("0", 0); @@ -189,6 +189,13 @@ TEST(StringConversion, ToInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + } TEST(StringConversion, ToUInt32) { From 4cb862b02c39aa0e9278f6e79ac39158f20f65cf Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 1 Sep 2021 19:52:09 -0500 Subject: [PATCH 02/93] different algorithm. 
Added more tests and benchmarks --- cpp/src/arrow/util/value_parsing.h | 56 ++-- cpp/src/arrow/util/value_parsing_benchmark.cc | 51 +++ cpp/src/arrow/util/value_parsing_test.cc | 300 +++++++++++------- 3 files changed, 264 insertions(+), 143 deletions(-) diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index aa5342ee4c4..435081047f8 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -27,7 +27,6 @@ #include #include #include -#include #include "arrow/type.h" #include "arrow/type_traits.h" @@ -274,26 +273,28 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { #undef PARSE_UNSIGNED_ITERATION #undef PARSE_UNSIGNED_ITERATION_LAST - -#define PARSE_HEX_ITERATION(C_TYPE) \ - if (length > 0) { \ - char val = *s; \ - const char* pos = std::lower_bound(kAsciiTable, kAsciiTableEnd, val); \ - s++; \ - result = static_cast(result << 4); \ - length--; \ - if (ARROW_PREDICT_FALSE(pos == kAsciiTableEnd || *pos != val)) { \ - /* Non-digit */ \ - return false; \ - } \ - result = static_cast(result | (pos - kAsciiTable)); \ - } else { \ - break; \ +#define PARSE_HEX_ITERATION(C_TYPE) \ + if (length > 0) { \ + char val = *s; \ + s++; \ + result = static_cast(result << 4); \ + length--; \ + if (val >= '0' && val <= '9'){ \ + result = static_cast(result | (val -'0')); \ + } else if (val >= 'A' && val <= 'F'){ \ + result = static_cast(result | (val -'A' + 10)); \ + } else if (val >= 'a' && val <= 'f'){ \ + result = static_cast(result | (val -'a' + 10)); \ + } else { \ + /* Non-digit */ \ + return false; \ + } \ + } else { \ + break; \ } + inline bool ParseHex(const char* s, size_t length, uint8_t* out) { - const char* kAsciiTable = "0123456789ABCDEF"; - const char* kAsciiTableEnd = kAsciiTable + 16; uint8_t result = 0; do { @@ -305,8 +306,6 @@ inline bool ParseHex(const char* s, size_t length, uint8_t* out) { } inline bool ParseHex(const char* s, size_t length, uint16_t* out) { - const char* kAsciiTable = "0123456789ABCDEF"; - const char* kAsciiTableEnd = kAsciiTable + 16; uint16_t result = 0; do { PARSE_HEX_ITERATION(uint16_t); @@ -319,8 +318,6 @@ inline bool ParseHex(const char* s, size_t length, uint16_t* out) { } inline bool ParseHex(const char* s, size_t length, uint32_t* out) { - const char* kAsciiTable = "0123456789ABCDEF"; - const char* kAsciiTableEnd = kAsciiTable + 16; uint32_t result = 0; do { PARSE_HEX_ITERATION(uint32_t); @@ -338,8 +335,6 @@ inline bool ParseHex(const char* s, size_t length, uint32_t* out) { } inline bool ParseHex(const char* s, size_t length, uint64_t* out) { - const char* kAsciiTable = "0123456789ABCDEF"; - const char* kAsciiTableEnd = kAsciiTable + 16; uint64_t result = 0; do { PARSE_HEX_ITERATION(uint64_t); @@ -376,6 +371,19 @@ struct StringToUnsignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } + // If its starts with 0x then its hex + if (*s == '0' && *(s + 1) == 'x'){ + length -= 2; + s += 2; + // lets make sure that the length of the string is not too big + if (!ARROW_PREDICT_TRUE(sizeof(value_type)*2 >= length)) { + return false; + } + if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { + return false; + } + return true; + } // Skip leading zeros while (length > 0 && *s == '0') { length--; diff --git a/cpp/src/arrow/util/value_parsing_benchmark.cc b/cpp/src/arrow/util/value_parsing_benchmark.cc index c113c245fff..0fd0d1f8ec3 100644 --- a/cpp/src/arrow/util/value_parsing_benchmark.cc +++ b/cpp/src/arrow/util/value_parsing_benchmark.cc @@ -56,6 
+56,29 @@ static std::vector MakeIntStrings(int32_t num_items) { return strings; } +template +static std::vector MakeHexStrings(int32_t num_items) { + int32_t num_bytes = sizeof(c_int); + const char* kAsciiTable = "0123456789ABCDEF"; + std::vector large_hex_chars(num_bytes*2 + 2); + large_hex_chars[0]='0'; + large_hex_chars[1]='x'; + for (int32_t i = 0; i < num_bytes*2; ++i) { + large_hex_chars[i + 2] = kAsciiTable[i]; + } + std::string large_hex(&large_hex_chars[0], large_hex_chars.size()); + + std::vector base_strings = {"0x0", + "0xA5", + "0x5E", + large_hex}; + std::vector strings; + for (int32_t i = 0; i < num_items; ++i) { + strings.push_back(base_strings[i % base_strings.size()]); + } + return strings; +} + static std::vector MakeFloatStrings(int32_t num_items) { std::vector base_strings = {"0.0", "5", "-12.3", "98765430000", "3456.789", "0.0012345", @@ -123,6 +146,25 @@ static void IntegerParsing(benchmark::State& state) { // NOLINT non-const refer state.SetItemsProcessed(state.iterations() * strings.size()); } +template +static void HexParsing(benchmark::State& state) { // NOLINT non-const reference + auto strings = MakeHexStrings(1000); + + while (state.KeepRunning()) { + C_TYPE total = 0; + for (const auto& s : strings) { + C_TYPE value; + if (!ParseValue(s.data(), s.length(), &value)) { + std::cerr << "Conversion failed for '" << s << "'"; + std::abort(); + } + total = static_cast(total + value); + } + benchmark::DoNotOptimize(total); + } + state.SetItemsProcessed(state.iterations() * strings.size()); +} + template static void FloatParsing(benchmark::State& state) { // NOLINT non-const reference auto strings = MakeFloatStrings(1000); @@ -230,6 +272,15 @@ BENCHMARK_TEMPLATE(IntegerParsing, UInt16Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt32Type); BENCHMARK_TEMPLATE(IntegerParsing, UInt64Type); +BENCHMARK_TEMPLATE(HexParsing, Int8Type); +BENCHMARK_TEMPLATE(HexParsing, Int16Type); +BENCHMARK_TEMPLATE(HexParsing, Int32Type); +BENCHMARK_TEMPLATE(HexParsing, Int64Type); +BENCHMARK_TEMPLATE(HexParsing, UInt8Type); +BENCHMARK_TEMPLATE(HexParsing, UInt16Type); +BENCHMARK_TEMPLATE(HexParsing, UInt32Type); +BENCHMARK_TEMPLATE(HexParsing, UInt64Type); + BENCHMARK_TEMPLATE(FloatParsing, FloatType); BENCHMARK_TEMPLATE(FloatParsing, DoubleType); diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index ea81585001b..edc5223f0cd 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -54,125 +54,157 @@ void AssertConversionFails(const std::string& s) { AssertConversionFails(*type, s); } -// TEST(StringConversion, ToBoolean) { -// AssertConversion("true", true); -// AssertConversion("tRuE", true); -// AssertConversion("FAlse", false); -// AssertConversion("false", false); -// AssertConversion("1", true); -// AssertConversion("0", false); - -// AssertConversionFails(""); -// } - -// TEST(StringConversion, ToFloat) { -// AssertConversion("1.5", 1.5f); -// AssertConversion("0", 0.0f); -// // XXX ASSERT_EQ doesn't distinguish signed zeros -// AssertConversion("-0.0", -0.0f); -// AssertConversion("-1e20", -1e20f); - -// AssertConversionFails(""); -// AssertConversionFails("e"); -// } - -// TEST(StringConversion, ToDouble) { -// AssertConversion("1.5", 1.5); -// AssertConversion("0", 0); -// // XXX ASSERT_EQ doesn't distinguish signed zeros -// AssertConversion("-0.0", -0.0); -// AssertConversion("-1e100", -1e100); - -// AssertConversionFails(""); -// AssertConversionFails("e"); -// } - -// #if 
!defined(_WIN32) || defined(NDEBUG) - -// TEST(StringConversion, ToFloatLocale) { -// // French locale uses the comma as decimal point -// LocaleGuard locale_guard("fr_FR.UTF-8"); - -// AssertConversion("1.5", 1.5f); -// } - -// TEST(StringConversion, ToDoubleLocale) { -// // French locale uses the comma as decimal point -// LocaleGuard locale_guard("fr_FR.UTF-8"); - -// AssertConversion("1.5", 1.5f); -// } - -// #endif // _WIN32 - -// TEST(StringConversion, ToInt8) { -// AssertConversion("0", 0); -// AssertConversion("127", 127); -// AssertConversion("0127", 127); -// AssertConversion("-128", -128); -// AssertConversion("-00128", -128); - -// // Non-representable values -// AssertConversionFails("128"); -// AssertConversionFails("-129"); - -// AssertConversionFails(""); -// AssertConversionFails("-"); -// AssertConversionFails("0.0"); -// AssertConversionFails("e"); -// } - -// TEST(StringConversion, ToUInt8) { -// AssertConversion("0", 0); -// AssertConversion("26", 26); -// AssertConversion("255", 255); -// AssertConversion("0255", 255); - -// // Non-representable values -// AssertConversionFails("-1"); -// AssertConversionFails("256"); -// AssertConversionFails("260"); -// AssertConversionFails("1234"); - -// AssertConversionFails(""); -// AssertConversionFails("-"); -// AssertConversionFails("0.0"); -// AssertConversionFails("e"); -// } - -// TEST(StringConversion, ToInt16) { -// AssertConversion("0", 0); -// AssertConversion("32767", 32767); -// AssertConversion("032767", 32767); -// AssertConversion("-32768", -32768); -// AssertConversion("-0032768", -32768); - -// // Non-representable values -// AssertConversionFails("32768"); -// AssertConversionFails("-32769"); - -// AssertConversionFails(""); -// AssertConversionFails("-"); -// AssertConversionFails("0.0"); -// AssertConversionFails("e"); -// } - -// TEST(StringConversion, ToUInt16) { -// AssertConversion("0", 0); -// AssertConversion("6660", 6660); -// AssertConversion("65535", 65535); -// AssertConversion("065535", 65535); - -// // Non-representable values -// AssertConversionFails("-1"); -// AssertConversionFails("65536"); -// AssertConversionFails("123456"); - -// AssertConversionFails(""); -// AssertConversionFails("-"); -// AssertConversionFails("0.0"); -// AssertConversionFails("e"); -// } +TEST(StringConversion, ToBoolean) { + AssertConversion("true", true); + AssertConversion("tRuE", true); + AssertConversion("FAlse", false); + AssertConversion("false", false); + AssertConversion("1", true); + AssertConversion("0", false); + + AssertConversionFails(""); +} + +TEST(StringConversion, ToFloat) { + AssertConversion("1.5", 1.5f); + AssertConversion("0", 0.0f); + // XXX ASSERT_EQ doesn't distinguish signed zeros + AssertConversion("-0.0", -0.0f); + AssertConversion("-1e20", -1e20f); + + AssertConversionFails(""); + AssertConversionFails("e"); +} + +TEST(StringConversion, ToDouble) { + AssertConversion("1.5", 1.5); + AssertConversion("0", 0); + // XXX ASSERT_EQ doesn't distinguish signed zeros + AssertConversion("-0.0", -0.0); + AssertConversion("-1e100", -1e100); + + AssertConversionFails(""); + AssertConversionFails("e"); +} + +#if !defined(_WIN32) || defined(NDEBUG) + +TEST(StringConversion, ToFloatLocale) { + // French locale uses the comma as decimal point + LocaleGuard locale_guard("fr_FR.UTF-8"); + + AssertConversion("1.5", 1.5f); +} + +TEST(StringConversion, ToDoubleLocale) { + // French locale uses the comma as decimal point + LocaleGuard locale_guard("fr_FR.UTF-8"); + + AssertConversion("1.5", 1.5f); +} + +#endif // 
_WIN32 + +TEST(StringConversion, ToInt8) { + AssertConversion("0", 0); + AssertConversion("127", 127); + AssertConversion("0127", 127); + AssertConversion("-128", -128); + AssertConversion("-00128", -128); + + // Non-representable values + AssertConversionFails("128"); + AssertConversionFails("-129"); + + AssertConversionFails(""); + AssertConversionFails("-"); + AssertConversionFails("0.0"); + AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); +} + +TEST(StringConversion, ToUInt8) { + AssertConversion("0", 0); + AssertConversion("26", 26); + AssertConversion("255", 255); + AssertConversion("0255", 255); + + // Non-representable values + AssertConversionFails("-1"); + AssertConversionFails("256"); + AssertConversionFails("260"); + AssertConversionFails("1234"); + + AssertConversionFails(""); + AssertConversionFails("-"); + AssertConversionFails("0.0"); + AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1A", 26); + AssertConversion("0xb", 11); + AssertConversion("0x7F", 127); + AssertConversionFails("0x100"); + AssertConversionFails("0x1g"); +} + +TEST(StringConversion, ToInt16) { + AssertConversion("0", 0); + AssertConversion("32767", 32767); + AssertConversion("032767", 32767); + AssertConversion("-32768", -32768); + AssertConversion("-0032768", -32768); + + // Non-representable values + AssertConversionFails("32768"); + AssertConversionFails("-32769"); + + AssertConversionFails(""); + AssertConversionFails("-"); + AssertConversionFails("0.0"); + AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); +} + +TEST(StringConversion, ToUInt16) { + AssertConversion("0", 0); + AssertConversion("6660", 6660); + AssertConversion("65535", 65535); + AssertConversion("065535", 65535); + + // Non-representable values + AssertConversionFails("-1"); + AssertConversionFails("65536"); + AssertConversionFails("123456"); + + AssertConversionFails(""); + AssertConversionFails("-"); + AssertConversionFails("0.0"); + AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x1aA", 426); + AssertConversion("0xb", 11); + AssertConversion("0x7ffF", 32767); + AssertConversionFails("0x10000"); + AssertConversionFails("0x1g"); +} TEST(StringConversion, ToInt32) { AssertConversion("0", 0); @@ -195,6 +227,10 @@ TEST(StringConversion, ToInt32) { AssertConversion("0x123ABC", 1194684); AssertConversion("0xA4B35", 674613); AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversionFails("0x23512ak"); } @@ -213,6 +249,16 @@ TEST(StringConversion, ToUInt32) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x123ABC", 1194684); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFF", 2147483647); + AssertConversion("0x123abc", 1194684); + AssertConversion("0xA4b35", 674613); + AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToInt64) { @@ -230,6 +276,14 @@ TEST(StringConversion, ToInt64) { 
AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToUInt64) { @@ -244,6 +298,14 @@ TEST(StringConversion, ToUInt64) { AssertConversionFails("-"); AssertConversionFails("0.0"); AssertConversionFails("e"); + + // Hex + AssertConversion("0x0", 0); + AssertConversion("0x5415a123ABC123cb", 6058926048274359243); + AssertConversion("0xA4B35", 674613); + AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversionFails("0x12345678901234567"); + AssertConversionFails("0x23512ak"); } TEST(StringConversion, ToDate32) { From d03b7eb8f1674be4b14722d2ae48b0dee4f48c62 Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 1 Sep 2021 19:58:29 -0500 Subject: [PATCH 03/93] uncommented tests --- cpp/src/arrow/util/CMakeLists.txt | 52 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 03fd0ef4a81..660fb2657b6 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -40,33 +40,33 @@ endif() add_arrow_test(utility-test SOURCES - # align_util_test.cc - # async_generator_test.cc - # bit_block_counter_test.cc - # bit_util_test.cc - # cache_test.cc - # checked_cast_test.cc - # compression_test.cc - # decimal_test.cc - # formatting_util_test.cc - # key_value_metadata_test.cc - # hashing_test.cc - # int_util_test.cc + align_util_test.cc + async_generator_test.cc + bit_block_counter_test.cc + bit_util_test.cc + cache_test.cc + checked_cast_test.cc + compression_test.cc + decimal_test.cc + formatting_util_test.cc + key_value_metadata_test.cc + hashing_test.cc + int_util_test.cc ${IO_UTIL_TEST_SOURCES} - # iterator_test.cc - # logging_test.cc - # queue_test.cc - # range_test.cc - # reflection_test.cc - # rle_encoding_test.cc - # stl_util_test.cc - # string_test.cc - # tdigest_test.cc - # test_common.cc - # time_test.cc - # trie_test.cc - # uri_test.cc - # utf8_util_test.cc + iterator_test.cc + logging_test.cc + queue_test.cc + range_test.cc + reflection_test.cc + rle_encoding_test.cc + stl_util_test.cc + string_test.cc + tdigest_test.cc + test_common.cc + time_test.cc + trie_test.cc + uri_test.cc + utf8_util_test.cc value_parsing_test.cc variant_test.cc) From 69972dd4bdac67357c0d90238e0d0487e170e1f2 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Wed, 1 Sep 2021 21:24:08 -0700 Subject: [PATCH 04/93] ARROW-13792 [Java]: The toString representation is incorrect for unsigned integer vectors When adding a byte `0xff` to a UInt1Vector, the toString method produces `[-1]`. Since the vector contains unsinged integers, the correct result should be `[255]`. 
Closes #11029 from liyafan82/fly_0830_uin Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../org/apache/arrow/vector/UInt1Vector.java | 6 +++- .../org/apache/arrow/vector/UInt2Vector.java | 7 ++++ .../org/apache/arrow/vector/UInt4Vector.java | 6 ++++ .../org/apache/arrow/vector/UInt8Vector.java | 6 ++++ .../arrow/vector/util/ValueVectorUtility.java | 21 ++++++++++-- .../apache/arrow/vector/TestValueVector.java | 32 +++++++++++++++++++ 6 files changed, 74 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java index b735f5fbeb4..bd9a732c108 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt1Vector.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; /** * UInt1Vector implements a fixed width (1 bytes) vector of @@ -328,7 +329,10 @@ public long getValueAsLong(int index) { return this.get(index) & PROMOTION_MASK; } - + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } private class TransferImpl implements TransferPair { UInt1Vector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java index 917700e09c6..5c29ab6b321 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt2Vector.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; /** * UInt2Vector implements a fixed width (2 bytes) vector of @@ -305,6 +306,12 @@ public long getValueAsLong(int index) { return this.get(index); } + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> + v.isNull(i) ? 
"null" : Integer.toString(v.get(i) & 0x0000ffff)); + } + private class TransferImpl implements TransferPair { UInt2Vector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java index c5045e6a510..cc954d67ddd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt4Vector.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; /** * UInt4Vector implements a fixed width (4 bytes) vector of @@ -300,6 +301,11 @@ public long getValueAsLong(int index) { return this.get(index) & PROMOTION_MASK; } + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } + private class TransferImpl implements TransferPair { UInt4Vector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java b/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java index 3aa4451711d..98eaf25a6e2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/UInt8Vector.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.util.ValueVectorUtility; /** * UInt8Vector implements a fixed width vector (8 bytes) of @@ -296,6 +297,11 @@ public long getValueAsLong(int index) { return this.get(index); } + @Override + public String toString() { + return ValueVectorUtility.getToString(this, 0, getValueCount(), (v, i) -> v.getObjectNoOverflow(i)); + } + private class TransferImpl implements TransferPair { UInt8Vector to; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java index 60553b4e342..ceb7081e1ea 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ValueVectorUtility.java @@ -19,6 +19,8 @@ import static org.apache.arrow.vector.validate.ValidateUtil.validateOrThrow; +import java.util.function.BiFunction; + import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.ValueVector; @@ -37,7 +39,7 @@ private ValueVectorUtility() { /** * Get the toString() representation of vector suitable for debugging. - * Note since vectors may have millions of values, this method only show max 20 values. + * Note since vectors may have millions of values, this method only shows max 20 values. * Examples as below (v represents value): *
* vector with 0 value:
 * []
 * vector with 5 values:
 * [v0, v1, v2, v3, v4]
 * vector with 100 values:
 * [v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, ..., v90, v91, v92, v93, v94, v95, v96, v97, v98, v99]
 *
  • */ - public static String getToString(ValueVector vector, int start, int end) { + public static String getToString(V vector, int start, int end) { + return getToString(vector, start, end, (v, i) -> v.getObject(i)); + } + + /** + * Get the toString() representation of vector suitable for debugging. + * Note since vectors may have millions of values, this method only shows at most 20 values. + * @param vector the vector for which to get toString representation. + * @param start the starting index, inclusive. + * @param end the end index, exclusive. + * @param valueToString the function to transform individual elements to strings. + */ + public static String getToString( + V vector, int start, int end, BiFunction valueToString) { Preconditions.checkNotNull(vector); final int length = end - start; Preconditions.checkArgument(length >= 0); @@ -77,7 +92,7 @@ public static String getToString(ValueVector vector, int start, int end) { i = end - window - 1; skipComma = true; } else { - sb.append(vector.getObject(i)); + sb.append(valueToString.apply(vector, i)); } if (i == end - 1) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 2d0f8dab37d..52655bf8fcc 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2791,6 +2791,38 @@ public void testToString() { } } + @Test + public void testUInt1VectorToString() { + try (final UInt1Vector uInt1Vector = new UInt1Vector("uInt1Vector", allocator)) { + setVector(uInt1Vector, (byte) 0xff); + assertEquals("[255]", uInt1Vector.toString()); + } + } + + @Test + public void testUInt2VectorToString() { + try (final UInt2Vector uInt2Vector = new UInt2Vector("uInt2Vector", allocator)) { + setVector(uInt2Vector, (char) 0xffff); + assertEquals("[65535]", uInt2Vector.toString()); + } + } + + @Test + public void testUInt4VectorToString() { + try (final UInt4Vector uInt4Vector = new UInt4Vector("uInt4Vector", allocator)) { + setVector(uInt4Vector, 0xffffffff); + assertEquals("[4294967295]", uInt4Vector.toString()); + } + } + + @Test + public void testUInt8VectorToString() { + try (final UInt8Vector uInt8Vector = new UInt8Vector("uInt8Vector", allocator)) { + setVector(uInt8Vector, 0xffffffffffffffffL); + assertEquals("[18446744073709551615]", uInt8Vector.toString()); + } + } + @Test public void testUnloadVariableWidthVector() { try (final VarCharVector varCharVector = new VarCharVector("var char", allocator)) { From b76caf4964dc96d20b72b21291052f459fa9c68a Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Wed, 1 Sep 2021 22:05:13 -0700 Subject: [PATCH 05/93] ARROW-13544 [Java]: Remove APIs that have been deprecated for long (Changes to Vectors) See https://issues.apache.org/jira/browse/ARROW-13544 According to the discussion in #10864 (comment), we want to split the task into multiple parts. 
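As one concrete example of the removals in this part, callers of the deprecated `UnionVector(String, BufferAllocator, CallBack)` overload now pass the `FieldType` explicitly; a sketch of the caller-side migration (class name and setup are illustrative):

```java
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.complex.UnionVector;

public class UnionVectorMigration {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator()) {
      // Before: new UnionVector("union", allocator, /* call-back */ null);  (removed)
      // After: the FieldType argument is explicit; null keeps the old default.
      try (UnionVector v = new UnionVector("union", allocator,
          /* field type */ null, /* call-back */ null)) {
        v.allocateNew();
      }
    }
  }
}
```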
This PR is for the changes related to vectors Closes #10910 from liyafan82/fly_0810_depv Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../main/codegen/templates/UnionVector.java | 7 +- .../arrow/vector/BaseVariableWidthVector.java | 11 -- .../apache/arrow/vector/BitVectorHelper.java | 10 -- .../vector/complex/FixedSizeListVector.java | 17 +-- .../arrow/vector/complex/LargeListVector.java | 2 +- .../arrow/vector/complex/ListVector.java | 23 +--- .../apache/arrow/vector/types/pojo/Field.java | 20 ---- .../vector/util/ByteFunctionHelpers.java | 112 ------------------ .../arrow/vector/TestBitVectorHelper.java | 8 +- .../arrow/vector/TestDictionaryVector.java | 10 +- .../arrow/vector/TestFixedSizeListVector.java | 6 +- .../apache/arrow/vector/TestUnionVector.java | 30 +++-- .../apache/arrow/vector/TestValueVector.java | 7 +- .../arrow/vector/TestVectorReAlloc.java | 2 +- .../apache/arrow/vector/TestVectorReset.java | 2 +- .../compare/TestRangeEqualsVisitor.java | 11 +- .../vector/compare/TestTypeEqualsVisitor.java | 6 +- .../complex/writer/TestComplexWriter.java | 2 +- .../apache/arrow/vector/pojo/TestConvert.java | 2 +- 19 files changed, 55 insertions(+), 233 deletions(-) delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index bd5202977b8..8e5d76f39b8 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -114,11 +114,6 @@ public static UnionVector empty(String name, BufferAllocator allocator) { return new UnionVector(name, allocator, fieldType, null); } - @Deprecated - public UnionVector(String name, BufferAllocator allocator, CallBack callBack) { - this(name, allocator, null, callBack); - } - public UnionVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { super(name, allocator, callBack); this.fieldType = fieldType; @@ -522,7 +517,7 @@ private class TransferImpl implements TransferPair { private final UnionVector to; public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { - to = new UnionVector(name, allocator, callBack); + to = new UnionVector(name, allocator, /* field type */ null, callBack); internalStructVectorTransferPair = internalStruct.makeTransferPair(to.internalStruct); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 7fd19196733..cb36481ef5e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -1215,17 +1215,6 @@ protected final void setBytes(int index, byte[] value, int start, int length) { valueBuffer.setBytes(startOffset, value, start, length); } - /** - * Gets the starting offset of a record, given its index. - * This method is deprecated. Please use {@link BaseVariableWidthVector#getStartOffset(int)} instead. - * @param index index of the record. - * @return the starting offset of the record. 
- */ - @Deprecated - protected final int getstartOffset(int index) { - return getStartOffset(index); - } - public final int getStartOffset(int index) { return offsetBuffer.getInt((long) index * OFFSET_WIDTH); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java index ec73382a0ef..3745c5a75c3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -88,16 +88,6 @@ public static void setBit(ArrowBuf validityBuffer, long index) { validityBuffer.setByte(byteIndex, currentByte); } - /** - * Set the bit at provided index to 1. - * - * @deprecated Please use {@link BitVectorHelper#setBit(ArrowBuf, long)} instead.. - */ - @Deprecated - public static void setValidityBitToOne(ArrowBuf validityBuffer, int index) { - setBit(validityBuffer, index); - } - /** * Set the bit at provided index to 0. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java index c22cba43c56..231ac470f6a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/FixedSizeListVector.java @@ -50,7 +50,6 @@ import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -77,20 +76,6 @@ public static FixedSizeListVector empty(String name, int size, BufferAllocator a private int valueCount; private int validityAllocationSizeInBytes; - /** - * Creates a new instance. - * - * @deprecated use FieldType or static constructor instead. - */ - @Deprecated - public FixedSizeListVector(String name, - BufferAllocator allocator, - int listSize, - DictionaryEncoding dictionary, - CallBack schemaChangeCallback) { - this(name, allocator, new FieldType(true, new ArrowType.FixedSizeList(listSize), dictionary), schemaChangeCallback); - } - /** * Creates a new instance. 
* @@ -407,7 +392,7 @@ public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { @Override public UnionVector promoteToUnion() { - UnionVector vector = new UnionVector(name, allocator, null); + UnionVector vector = new UnionVector(name, allocator, /* field type */ null, /* call-back */ null); this.vector.clear(); this.vector = vector; invalidateReader(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java index 31e30cc44d4..2aef50babd1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/LargeListVector.java @@ -541,7 +541,7 @@ protected void replaceDataVector(FieldVector v) { @Override public UnionVector promoteToUnion() { - UnionVector vector = new UnionVector("$data$", allocator, callBack); + UnionVector vector = new UnionVector("$data$", allocator, /* field type */ null, callBack); replaceDataVector(vector); invalidateReader(); if (callBack != null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 7e969263cb9..77d2c684619 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -50,7 +50,6 @@ import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.DictionaryEncoding; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.util.CallBack; @@ -84,26 +83,6 @@ public static ListVector empty(String name, BufferAllocator allocator) { */ private int lastSet; - /** - * Creates a ListVector. - * - * @deprecated Use FieldType or static constructor instead. - */ - @Deprecated - public ListVector(String name, BufferAllocator allocator, CallBack callBack) { - this(name, allocator, FieldType.nullable(ArrowType.List.INSTANCE), callBack); - } - - /** - * Creates a ListVector. - * - * @deprecated Use FieldType or static constructor instead. - */ - @Deprecated - public ListVector(String name, BufferAllocator allocator, DictionaryEncoding dictionary, CallBack callBack) { - this(name, allocator, new FieldType(true, ArrowType.List.INSTANCE, dictionary, null), callBack); - } - /** * Constructs a new instance. * @@ -680,7 +659,7 @@ public ArrowBuf[] getBuffers(boolean clear) { @Override public UnionVector promoteToUnion() { - UnionVector vector = new UnionVector("$data$", allocator, callBack); + UnionVector vector = new UnionVector("$data$", allocator, /* field type*/ null, callBack); replaceDataVector(vector); invalidateReader(); if (callBack != null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index d4fdb9a15ec..2eeb3bea449 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -94,26 +94,6 @@ private Field(String name, FieldType fieldType, List children, TypeLayout this.children = children == null ? 
Collections.emptyList() : children.stream().collect(Collectors.toList()); } - /** - * Creates a new field. - * - * @deprecated Use FieldType or static constructor instead. - */ - @Deprecated - public Field(String name, boolean nullable, ArrowType type, List children) { - this(name, new FieldType(nullable, type, null, null), children); - } - - /** - * Creates a new field. - * - * @deprecated Use FieldType or static constructor instead. - */ - @Deprecated - public Field(String name, boolean nullable, ArrowType type, DictionaryEncoding dictionary, List children) { - this(name, new FieldType(nullable, type, dictionary, null), children); - } - public Field(String name, FieldType fieldType, List children) { this(name, fieldType, children, fieldType == null ? null : TypeLayout.getTypeLayout(fieldType.getType())); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java deleted file mode 100644 index 2faa1ff74c7..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.arrow.vector.util; - -import org.apache.arrow.memory.ArrowBuf; - -/** - * Helper class for comparing byte buffers. - * - * @deprecated This class will be removed. Please use org.apache.arrow.memory.util.ByteFunctionHelpers instead. - */ -@Deprecated -public class ByteFunctionHelpers { - static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ByteFunctionHelpers.class); - - private ByteFunctionHelpers() {} - - /** - * Helper function to check for equality of bytes in two ArrowBufs. - * - * @param left Left ArrowBuf for comparison - * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right Right ArrowBuf for comparison - * @param rStart start offset in the buffer - * @param rEnd end offset in the buffer - * @return 1 if equals, 0 otherwise - */ - @Deprecated - public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd) { - return org.apache.arrow.memory.util.ByteFunctionHelpers.equal(left, lStart, lEnd, right, rStart, rEnd); - } - - /** - * Helper function to compare a set of bytes in two ArrowBufs. 
- * Function will check data before completing in the case that - * - * @param left Left ArrowBuf to compare - * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right Right ArrowBuf to compare - * @param rStart start offset in the buffer - * @param rEnd end offset in the buffer - * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise - */ - @Deprecated - public static final int compare( - final ArrowBuf left, - int lStart, - int lEnd, - final ArrowBuf right, - int rStart, - int rEnd) { - return org.apache.arrow.memory.util.ByteFunctionHelpers.compare(left, lStart, lEnd, right, rStart, rEnd); - } - - /** - * Helper function to compare a set of bytes in ArrowBuf to a ByteArray. - * - * @param left Left ArrowBuf for comparison purposes - * @param lStart start offset in the buffer - * @param lEnd end offset in the buffer - * @param right second input to be compared - * @param rStart start offset in the byte array - * @param rEnd end offset in the byte array - * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise - */ - @Deprecated - public static final int compare( - final ArrowBuf left, - int lStart, - int lEnd, - final byte[] right, - int rStart, - final int rEnd) { - return org.apache.arrow.memory.util.ByteFunctionHelpers.compare(left, lStart, lEnd, right, rStart, rEnd); - } - - /** - * Compares the two specified {@code long} values, treating them as unsigned values between - * {@code 0} and {@code 2^64 - 1} inclusive. - * - * @param a the first unsigned {@code long} to compare - * @param b the second unsigned {@code long} to compare - * @return a negative value if {@code a} is less than {@code b}; a positive value if {@code a} is - * greater than {@code b}; or zero if they are equal - */ - @Deprecated - public static int unsignedLongCompare(long a, long b) { - return org.apache.arrow.memory.util.ByteFunctionHelpers.unsignedLongCompare(a, b); - } - - @Deprecated - public static int unsignedIntCompare(int a, int b) { - return org.apache.arrow.memory.util.ByteFunctionHelpers.unsignedIntCompare(a, b); - } -} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java index 4b48876ff16..9c7e1979d2b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestBitVectorHelper.java @@ -164,8 +164,8 @@ public void testConcatBits() { final int maxCount = 100; for (int i = 0; i < maxCount; i++) { if (i % 3 == 0) { - BitVectorHelper.setValidityBitToOne(buf1, i); - BitVectorHelper.setValidityBitToOne(buf2, i); + BitVectorHelper.setBit(buf1, i); + BitVectorHelper.setBit(buf2, i); } } @@ -199,8 +199,8 @@ public void testConcatBitsInPlace() { final int maxCount = 100; for (int i = 0; i < maxCount; i++) { if (i % 3 == 0) { - BitVectorHelper.setValidityBitToOne(buf1, i); - BitVectorHelper.setValidityBitToOne(buf2, i); + BitVectorHelper.setBit(buf1, i); + BitVectorHelper.setBit(buf2, i); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java index 165cb7bad3e..bc6cddf3674 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDictionaryVector.java @@ -306,8 +306,9 @@ public void testEncodeBinaryVector() { @Test public 
void testEncodeUnion() { // Create a new value vector - try (final UnionVector vector = new UnionVector("vector", allocator, null); - final UnionVector dictionaryVector = new UnionVector("dict", allocator, null);) { + try (final UnionVector vector = new UnionVector("vector", allocator, /* field type */ null, /* call-back */ null); + final UnionVector dictionaryVector = + new UnionVector("dict", allocator, /* field type */ null, /* call-back */ null);) { final NullableUInt4Holder uintHolder1 = new NullableUInt4Holder(); uintHolder1.value = 10; @@ -494,8 +495,9 @@ public void testStructEquals() { @Test public void testUnionEquals() { - try (final UnionVector vector1 = new UnionVector("union", allocator, null); - final UnionVector vector2 = new UnionVector("union", allocator, null);) { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = + new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); uInt4Holder.value = 10; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java index 365789e04c8..9d7e413a739 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeListVector.java @@ -165,8 +165,10 @@ public void testNestedInList() { @Test public void testTransferPair() { - try (FixedSizeListVector from = new FixedSizeListVector("from", allocator, 2, null, null); - FixedSizeListVector to = new FixedSizeListVector("to", allocator, 2, null, null)) { + try (FixedSizeListVector from = new FixedSizeListVector( + "from", allocator, new FieldType(true, new ArrowType.FixedSizeList(2), null), null); + FixedSizeListVector to = new FixedSizeListVector( + "to", allocator, new FieldType(true, new ArrowType.FixedSizeList(2), null), null)) { Float4Vector nested = (Float4Vector) from.addOrGetVector(FieldType.nullable(MinorType.FLOAT4.getType())) .getVector(); from.allocateNew(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index 962c233889d..f04998915b6 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -69,7 +69,8 @@ public void testUnionVector() throws Exception { uInt4Holder.value = 100; uInt4Holder.isSet = 1; - try (UnionVector unionVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector unionVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { unionVector.allocateNew(); // write some data @@ -96,7 +97,8 @@ public void testUnionVector() throws Exception { @Test public void testUnionVectorMapValue() throws Exception { - try (UnionVector unionVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector unionVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { unionVector.allocateNew(); UnionWriter writer = (UnionWriter) unionVector.getWriter(); @@ -157,7 +159,8 @@ public void testUnionVectorMapValue() throws Exception { @Test public void testTransfer() throws Exception { - try (UnionVector srcVector = new UnionVector(EMPTY_SCHEMA_PATH, 
allocator, null)) { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { srcVector.allocateNew(); // write some data @@ -171,7 +174,8 @@ public void testTransfer() throws Exception { srcVector.setSafe(5, newBitHolder(false)); srcVector.setValueCount(6); - try (UnionVector destVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector destVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { TransferPair pair = srcVector.makeTransferPair(destVector); // Creating the transfer should transfer the type of the field at least. @@ -206,7 +210,8 @@ public void testTransfer() throws Exception { @Test public void testSplitAndTransfer() throws Exception { - try (UnionVector sourceVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector sourceVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { sourceVector.allocateNew(); @@ -256,7 +261,8 @@ public void testSplitAndTransfer() throws Exception { assertEquals(false, sourceVector.isNull(9)); assertEquals(50, sourceVector.getObject(9)); - try (UnionVector toVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector toVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); @@ -286,7 +292,8 @@ public void testSplitAndTransfer() throws Exception { @Test public void testSplitAndTransferWithMixedVectors() throws Exception { - try (UnionVector sourceVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector sourceVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { sourceVector.allocateNew(); @@ -345,7 +352,8 @@ public void testSplitAndTransferWithMixedVectors() throws Exception { assertEquals(false, sourceVector.isNull(9)); assertEquals(30.5f, sourceVector.getObject(9)); - try (UnionVector toVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector toVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { final TransferPair transferPair = sourceVector.makeTransferPair(toVector); @@ -410,7 +418,8 @@ public void testGetFieldTypeInfo() throws Exception { @Test public void testGetBufferAddress() throws Exception { - try (UnionVector vector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector vector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { boolean error = false; vector.allocateNew(); @@ -467,7 +476,8 @@ public void testGetBufferAddress() throws Exception { @Test public void testSetGetNull() { - try (UnionVector srcVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { srcVector.allocateNew(); final NullableIntHolder holder = new NullableIntHolder(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 52655bf8fcc..572c3d594c5 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -2651,8 +2651,8 @@ public void 
testStructVectorEqualsWithDiffChild() { @Test public void testUnionVectorEquals() { - try (final UnionVector vector1 = new UnionVector("union", allocator, null); - final UnionVector vector2 = new UnionVector("union", allocator, null);) { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); uInt4Holder.value = 10; @@ -2721,7 +2721,8 @@ public void testVariableWidthVectorNullHashCode() { @Test public void testUnionNullHashCode() { - try (UnionVector srcVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { + try (UnionVector srcVector = + new UnionVector(EMPTY_SCHEMA_PATH, allocator, /* field type */ null, /* call-back */ null)) { srcVector.allocateNew(); final NullableIntHolder holder = new NullableIntHolder(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java index fae50c9dffc..18bb2c95738 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReAlloc.java @@ -421,7 +421,7 @@ public void testFixedSizeListVectorClearAndSet() { @Test public void testUnionVectorClearAndSet() { - try (final UnionVector vector = new UnionVector("", allocator, null)) { + try (final UnionVector vector = new UnionVector("", allocator, /* field type */ null, /* call-back */ null)) { vector.allocateNewSafe(); // Initial allocation NullableIntHolder holder = new NullableIntHolder(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java index adb51960ecd..71009a33375 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorReset.java @@ -153,7 +153,7 @@ public void testStructTypeReset() { @Test public void testUnionTypeReset() { - try (final UnionVector vector = new UnionVector("Union", allocator, null); + try (final UnionVector vector = new UnionVector("Union", allocator, /* field type */ null, /* call-back */ null); final IntVector dataVector = new IntVector("Int", allocator) ) { vector.getBufferSize(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 9121e82fcaf..4495881ad78 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -270,8 +270,9 @@ public void testStructVectorRangeEquals() { @Test public void testUnionVectorRangeEquals() { - try (final UnionVector vector1 = new UnionVector("union", allocator, null); - final UnionVector vector2 = new UnionVector("union", allocator, null);) { + try (final UnionVector vector1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector vector2 = + new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); uInt4Holder.value = 10; @@ -547,9 +548,9 @@ public void testStructVectorApproxEquals() { @Test public void 
testUnionVectorApproxEquals() { - try (final UnionVector right = new UnionVector("union", allocator, null); - final UnionVector left1 = new UnionVector("union", allocator, null); - final UnionVector left2 = new UnionVector("union", allocator, null);) { + try (final UnionVector right = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null);) { final NullableFloat4Holder float4Holder = new NullableFloat4Holder(); float4Holder.value = 1.01f; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java index 4195c8811ee..c0a3bd89dc1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -127,9 +127,9 @@ public void testStructTypeEquals() { @Test public void testUnionTypeEquals() { - try (final UnionVector right = new UnionVector("union", allocator, null); - final UnionVector left1 = new UnionVector("union", allocator, null); - final UnionVector left2 = new UnionVector("union", allocator, null)) { + try (final UnionVector right = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left1 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); + final UnionVector left2 = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null)) { right.addVector(new IntVector("int", allocator)); left1.addVector(new IntVector("int", allocator)); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index d44ada2f30f..d4cf6ea8937 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -604,7 +604,7 @@ private void checkListMap(ListVector listVector) { @Test public void simpleUnion() { - UnionVector vector = new UnionVector("union", allocator, null); + UnionVector vector = new UnionVector("union", allocator, /* field type */ null, /* call-back */ null); UnionWriter unionWriter = new UnionWriter(vector); unionWriter.allocate(); for (int i = 0; i < COUNT; i++) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 8c291e01a36..5cc0d080053 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -144,7 +144,7 @@ public void nestedSchema() { Collections2.asImmutableList( new Field("child5.1", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, null)), null), new Field("child5.2", FieldType.nullable(new FloatingPoint(DOUBLE)), Collections.emptyList()), - new Field("child5.3", true, new Timestamp(TimeUnit.MILLISECOND, "UTC"), null) + new Field("child5.3", FieldType.nullable(new Timestamp(TimeUnit.MILLISECOND, "UTC")), null) ))); Schema initialSchema = new Schema(children); run(initialSchema); From 
111f0c700995d6cc9903cec67f38e250e727138c Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Wed, 1 Sep 2021 22:06:31 -0700 Subject: [PATCH 06/93] ARROW-13823 [Java]: Exclude .factorypath Exclude .factorypath files generated by Eclipse IDE for configuring annotation processing from Git commit and RAT plugin scans. Closes #11042 from laurentgo/laurentgo/exclude-factorypath Authored-by: Laurent Goujon Signed-off-by: Micah Kornfield --- java/.gitignore | 1 + java/pom.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/java/.gitignore b/java/.gitignore index 03f5bf76e60..bc6ce4f6c32 100644 --- a/java/.gitignore +++ b/java/.gitignore @@ -2,6 +2,7 @@ .buildpath .classpath .checkstyle +.factorypath .settings/ .idea/ TAGS diff --git a/java/pom.xml b/java/pom.xml index 8752abe1fc4..1b7970ea6e8 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -128,6 +128,7 @@ **/TAGS **/*.checkstyle **/.classpath + **/.factorypath **/.settings/** .*/** **/*.patch From 09497a976604c1960c5934e8f05dd8203700efd6 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Wed, 1 Sep 2021 22:10:25 -0700 Subject: [PATCH 07/93] ARROW-13544 [Java]: Remove APIs that have been deprecated for long (Changes to JDBC) See https://issues.apache.org/jira/browse/ARROW-13544 According to the discussion in #10864 (comment), we want to split the task into multiple parts. This PR is for the changes related to the JDBC adapter Closes #10912 from liyafan82/fly_0811_depj Authored-by: liyafan82 Signed-off-by: Micah Kornfield --- .../arrow/adapter/jdbc/JdbcToArrow.java | 170 ----------------- .../adapter/jdbc/AbstractJdbcToArrowTest.java | 178 ++++++++++++++++++ .../adapter/jdbc/h2/JdbcAliasToArrowTest.java | 4 +- .../adapter/jdbc/h2/JdbcToArrowArrayTest.java | 10 +- .../jdbc/h2/JdbcToArrowCharSetTest.java | 17 +- .../jdbc/h2/JdbcToArrowDataTypesTest.java | 17 +- .../adapter/jdbc/h2/JdbcToArrowNullTest.java | 17 +- .../h2/JdbcToArrowOptionalColumnsTest.java | 3 +- .../adapter/jdbc/h2/JdbcToArrowTest.java | 16 +- .../jdbc/h2/JdbcToArrowTimeZoneTest.java | 11 +- 10 files changed, 224 insertions(+), 219 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java index c65523d837f..daee64d9308 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java @@ -18,17 +18,11 @@ package org.apache.arrow.adapter.jdbc; import java.io.IOException; -import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; -import java.sql.Statement; -import java.util.Calendar; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.util.Preconditions; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.util.ValueVectorUtility; /** * Utility class to convert JDBC objects to columnar Arrow format objects. @@ -64,170 +58,6 @@ */ public class JdbcToArrow { - /** - * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. - * This method uses the default Calendar instance with default TimeZone and Locale as returned by the JVM. - * If you wish to use specific TimeZone or Locale for any Date, Time and Timestamp datasets, you may want use - * overloaded API that taken Calendar object instance. - * - * @param connection Database connection to be used. 
This method will not close the passed connection object. Since - * the caller has passed the connection object it's the responsibility of the caller to close or - * return the connection to the pool. - * @param query The DB Query to fetch the data. - * @param allocator Memory allocator - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as - * ResultSet and Statement objects. - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(Connection connection, String query, BufferAllocator allocator) - throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); - - JdbcToArrowConfig config = - new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar()); - return sqlToArrow(connection, query, config); - } - - /** - * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. - * - * @param connection Database connection to be used. This method will not close the passed connection object. Since - * the caller has passed the connection object it's the responsibility of the caller to close or - * return the connection to the pool. - * @param query The DB Query to fetch the data. - * @param allocator Memory allocator - * @param calendar Calendar object to use to handle Date, Time and Timestamp datasets. - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as - * ResultSet and Statement objects. - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow( - Connection connection, - String query, - BufferAllocator allocator, - Calendar calendar) throws SQLException, IOException { - - Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); - Preconditions.checkNotNull(calendar, "Calendar object can not be null"); - - return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar)); - } - - /** - * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. - * - * @param connection Database connection to be used. This method will not close the passed connection object. - * Since the caller has passed the connection object it's the responsibility of the caller - * to close or return the connection to the pool. - * @param query The DB Query to fetch the data. - * @param config Configuration - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as - * ResultSet and Statement objects. - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(Connection connection, String query, JdbcToArrowConfig config) - throws SQLException, IOException { - Preconditions.checkNotNull(connection, "JDBC connection object can not be null"); - Preconditions.checkArgument(query != null && query.length() > 0, "SQL query can not be null or empty"); - - try (Statement stmt = connection.createStatement()) { - return sqlToArrow(stmt.executeQuery(query), config); - } - } - - /** - * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. This - * method uses the default RootAllocator and Calendar object. 
- * - * @param resultSet ResultSet to use to fetch the data from underlying database - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException on error - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); - - return sqlToArrow(resultSet, JdbcToArrowUtils.getUtcCalendar()); - } - - /** - * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. - * - * @param resultSet ResultSet to use to fetch the data from underlying database - * @param allocator Memory allocator - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException on error - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BufferAllocator allocator) - throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); - - JdbcToArrowConfig config = - new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar()); - return sqlToArrow(resultSet, config); - } - - /** - * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. - * - * @param resultSet ResultSet to use to fetch the data from underlying database - * @param calendar Calendar instance to use for Date, Time and Timestamp datasets, or null if none. - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException on error - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); - return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar)); - } - - /** - * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. - * - * @param resultSet ResultSet to use to fetch the data from underlying database - * @param allocator Memory allocator to use. - * @param calendar Calendar instance to use for Date, Time and Timestamp datasets, or null if none. - * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException on error - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow( - ResultSet resultSet, - BufferAllocator allocator, - Calendar calendar) - throws SQLException, IOException { - Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); - - return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar)); - } - - /** - * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. - * - * @param resultSet ResultSet to use to fetch the data from underlying database - * @param config Configuration of the conversion from JDBC to Arrow. 
- * @return Arrow Data Objects {@link VectorSchemaRoot} - * @throws SQLException on error - */ - @Deprecated - public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, JdbcToArrowConfig config) - throws SQLException, IOException { - Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); - Preconditions.checkNotNull(config, "The configuration cannot be null"); - - VectorSchemaRoot root = VectorSchemaRoot.create( - JdbcToArrowUtils.jdbcToArrowSchema(resultSet.getMetaData(), config), config.getAllocator()); - if (config.getTargetBatchSize() != JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) { - ValueVectorUtility.preAllocate(root, config.getTargetBatchSize()); - } - JdbcToArrowUtils.jdbcToArrowVectors(resultSet, root, config); - return root; - } - /*----------------------------------------------------------------* | | | Partial Convert API | diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java index 616363ecf64..6bbe0987b35 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/AbstractJdbcToArrowTest.java @@ -20,10 +20,16 @@ import java.io.IOException; import java.sql.Connection; import java.sql.DriverManager; +import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; +import java.util.Calendar; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.util.ValueVectorUtility; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -140,4 +146,176 @@ public static Object[][] prepareTestData(String[] testFiles, @SuppressWarnings(" */ public abstract void testDataSets(VectorSchemaRoot root); + /** + * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. + * This method uses the default Calendar instance with default TimeZone and Locale as returned by the JVM. + * If you wish to use specific TimeZone or Locale for any Date, Time and Timestamp datasets, you may want use + * overloaded API that taken Calendar object instance. + * + * This method is for test only. + * + * @param connection Database connection to be used. This method will not close the passed connection object. Since + * the caller has passed the connection object it's the responsibility of the caller to close or + * return the connection to the pool. + * @param query The DB Query to fetch the data. + * @param allocator Memory allocator + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as + * ResultSet and Statement objects. + */ + public static VectorSchemaRoot sqlToArrow(Connection connection, String query, BufferAllocator allocator) + throws SQLException, IOException { + Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); + + JdbcToArrowConfig config = + new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar()); + return sqlToArrow(connection, query, config); + } + + /** + * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. 
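+   * It was moved here from {@code JdbcToArrow} when the deprecated public
+   * overloads were removed, so the existing tests keep exercising the same
+   * conversion path.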
+ * + * @param connection Database connection to be used. This method will not close the passed connection object. Since + * the caller has passed the connection object it's the responsibility of the caller to close or + * return the connection to the pool. + * @param query The DB Query to fetch the data. + * @param allocator Memory allocator + * @param calendar Calendar object to use to handle Date, Time and Timestamp datasets. + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as + * ResultSet and Statement objects. + */ + public static VectorSchemaRoot sqlToArrow( + Connection connection, + String query, + BufferAllocator allocator, + Calendar calendar) throws SQLException, IOException { + + Preconditions.checkNotNull(allocator, "Memory allocator object can not be null"); + Preconditions.checkNotNull(calendar, "Calendar object can not be null"); + + return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar)); + } + + /** + * For the given SQL query, execute and fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. + * + * @param connection Database connection to be used. This method will not close the passed connection object. + * Since the caller has passed the connection object it's the responsibility of the caller + * to close or return the connection to the pool. + * @param query The DB Query to fetch the data. + * @param config Configuration + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException Propagate any SQL Exceptions to the caller after closing any resources opened such as + * ResultSet and Statement objects. + */ + public static VectorSchemaRoot sqlToArrow(Connection connection, String query, JdbcToArrowConfig config) + throws SQLException, IOException { + Preconditions.checkNotNull(connection, "JDBC connection object can not be null"); + Preconditions.checkArgument(query != null && query.length() > 0, "SQL query can not be null or empty"); + + try (Statement stmt = connection.createStatement()) { + return sqlToArrow(stmt.executeQuery(query), config); + } + } + + /** + * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. This + * method uses the default RootAllocator and Calendar object. + * + * This method is for test only. + * + * @param resultSet ResultSet to use to fetch the data from underlying database + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException on error + */ + public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLException, IOException { + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + + return sqlToArrow(resultSet, JdbcToArrowUtils.getUtcCalendar()); + } + + /** + * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. 
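+   * Date, Time and Timestamp values are converted with the UTC calendar
+   * returned by {@code JdbcToArrowUtils.getUtcCalendar()}; use an overload
+   * that takes a {@code Calendar} to override that.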
+ * + * @param resultSet ResultSet to use to fetch the data from underlying database + * @param allocator Memory allocator + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException on error + */ + public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BufferAllocator allocator) + throws SQLException, IOException { + Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); + + JdbcToArrowConfig config = + new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar()); + return sqlToArrow(resultSet, config); + } + + /** + * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. + * + * @param resultSet ResultSet to use to fetch the data from underlying database + * @param calendar Calendar instance to use for Date, Time and Timestamp datasets, or null if none. + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException on error + */ + public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws SQLException, IOException { + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar)); + } + + /** + * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. + * + * @param resultSet ResultSet to use to fetch the data from underlying database + * @param allocator Memory allocator to use. + * @param calendar Calendar instance to use for Date, Time and Timestamp datasets, or null if none. + * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException on error + */ + public static VectorSchemaRoot sqlToArrow( + ResultSet resultSet, + BufferAllocator allocator, + Calendar calendar) + throws SQLException, IOException { + Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null"); + + return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar)); + } + + /** + * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects. + * + * This method is for test only. + * + * @param resultSet ResultSet to use to fetch the data from underlying database + * @param config Configuration of the conversion from JDBC to Arrow. 
+ * @return Arrow Data Objects {@link VectorSchemaRoot} + * @throws SQLException on error + */ + public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, JdbcToArrowConfig config) + throws SQLException, IOException { + Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); + Preconditions.checkNotNull(config, "The configuration cannot be null"); + + VectorSchemaRoot root = VectorSchemaRoot.create( + JdbcToArrowUtils.jdbcToArrowSchema(resultSet.getMetaData(), config), config.getAllocator()); + if (config.getTargetBatchSize() != JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) { + ValueVectorUtility.preAllocate(root, config.getTargetBatchSize()); + } + JdbcToArrowUtils.jdbcToArrowVectors(resultSet, root, config); + return root; + } + } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java index f44818a9f09..a6e6b22fcb4 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcAliasToArrowTest.java @@ -17,6 +17,7 @@ package org.apache.arrow.adapter.jdbc.h2; +import static org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest.sqlToArrow; import static org.junit.Assert.assertEquals; import java.sql.Connection; @@ -28,7 +29,6 @@ import java.sql.Statement; import java.util.List; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.pojo.Field; @@ -106,7 +106,7 @@ public void testJdbcAliasToArrow() throws Exception { try (ResultSet resultSet = conn.createStatement().executeQuery(QUERY)) { final VectorSchemaRoot vector = - JdbcToArrow.sqlToArrow(resultSet, new RootAllocator(Integer.MAX_VALUE)); + sqlToArrow(resultSet, new RootAllocator(Integer.MAX_VALUE)); assertEquals(rowCount, vector.getRowCount()); Schema vectorSchema = vector.getSchema(); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java index 31f7db549e9..b7dc1ee58a5 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowArrayTest.java @@ -17,7 +17,10 @@ package org.apache.arrow.adapter.jdbc.h2; -import static org.junit.Assert.*; +import static org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest.sqlToArrow; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import java.sql.Array; import java.sql.Connection; @@ -32,7 +35,6 @@ import java.util.Map; import org.apache.arrow.adapter.jdbc.JdbcFieldInfo; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils; @@ -160,7 +162,7 @@ public void testJdbcToArrow() throws Exception { final JdbcToArrowConfig config = builder.build(); try (ResultSet resultSet = conn.createStatement().executeQuery(QUERY)) { - final VectorSchemaRoot vector = JdbcToArrow.sqlToArrow(resultSet, config); + final VectorSchemaRoot vector = sqlToArrow(resultSet, config); assertEquals(rowCount, 
vector.getRowCount()); @@ -204,7 +206,7 @@ public void testJdbcToArrowWithNulls() throws Exception { final JdbcToArrowConfig config = builder.build(); try (ResultSet resultSet = conn.createStatement().executeQuery(QUERY)) { - final VectorSchemaRoot vector = JdbcToArrow.sqlToArrow(resultSet, config); + final VectorSchemaRoot vector = sqlToArrow(resultSet, config); assertEquals(rowCount, vector.getRowCount()); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java index b2ac349b596..b548c9169af 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowCharSetTest.java @@ -31,7 +31,6 @@ import java.util.Collection; import org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; @@ -109,20 +108,20 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn.createStatement().executeQuery(table.getQuery()), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn, table.getQuery(), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java index 2be6a83c342..40db5c23579 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowDataTypesTest.java @@ -41,7 +41,6 @@ import java.util.Collection; import 
org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; @@ -143,19 +142,19 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); + testDataSets(sqlToArrow( conn.createStatement().executeQuery(table.getQuery()), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn, table.getQuery(), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java index fd373091f93..71cc700568f 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowNullTest.java @@ -49,7 +49,6 @@ import java.util.Collection; import org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; @@ -120,19 +119,19 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn, table.getQuery(), new 
RootAllocator(Integer.MAX_VALUE))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); + testDataSets(sqlToArrow( conn.createStatement().executeQuery(table.getQuery()), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn, table.getQuery(), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java index 4ab9017e247..84960dc8880 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowOptionalColumnsTest.java @@ -26,7 +26,6 @@ import java.util.Collection; import org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; import org.apache.arrow.adapter.jdbc.Table; import org.apache.arrow.memory.RootAllocator; @@ -72,7 +71,7 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); } /** diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java index 8c5a17c37f7..5957eee742b 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTest.java @@ -135,20 +135,20 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new 
RootAllocator(Integer.MAX_VALUE), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()))); + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn.createStatement().executeQuery(table.getQuery()), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn, table.getQuery(), new JdbcToArrowConfigBuilder(new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance()).build())); diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java index 7062fa6aec1..f5ddbdb9bf0 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/h2/JdbcToArrowTimeZoneTest.java @@ -30,7 +30,6 @@ import java.util.TimeZone; import org.apache.arrow.adapter.jdbc.AbstractJdbcToArrowTest; -import org.apache.arrow.adapter.jdbc.JdbcToArrow; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfig; import org.apache.arrow.adapter.jdbc.JdbcToArrowConfigBuilder; import org.apache.arrow.adapter.jdbc.JdbcToArrowTestHelper; @@ -105,18 +104,18 @@ public static Collection getTestData() throws SQLException, ClassNotFo */ @Test public void testJdbcToArrowValues() throws SQLException, IOException { - testDataSets(JdbcToArrow.sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), + testDataSets(sqlToArrow(conn, table.getQuery(), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance(TimeZone.getTimeZone(table.getTimezone())))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance(TimeZone.getTimeZone(table.getTimezone())))); - testDataSets(JdbcToArrow.sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), + testDataSets(sqlToArrow(conn.createStatement().executeQuery(table.getQuery()), Calendar.getInstance(TimeZone.getTimeZone(table.getTimezone())))); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn.createStatement().executeQuery(table.getQuery()), new JdbcToArrowConfigBuilder( new RootAllocator(Integer.MAX_VALUE), Calendar.getInstance(TimeZone.getTimeZone(table.getTimezone()))).build())); - testDataSets(JdbcToArrow.sqlToArrow( + testDataSets(sqlToArrow( conn, table.getQuery(), new JdbcToArrowConfigBuilder( From e380c1a08bba100671d5d6032e22ccfaf73a2c05 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 2 Sep 2021 07:14:10 +0000 Subject: [PATCH 08/93] ARROW-13812: [C++] Fix Valgrind error in Grouper.BooleanKey test Essentially, this failure boils down to: when generating the array of uniques for booleans, we pack 8 bytes at a time into one byte. 
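A minimal sketch of that packing step, assuming bit-by-bit packing; the PackEight helper below is hypothetical, not the kernel's actual code:

#include <cstdint>

// Hypothetical sketch (PackEight is not a real Arrow function): bit-pack eight
// byte-sized boolean values into a single output byte. A partially filled
// trailing group still reads all eight input bytes, so a single uninitialized
// trailing byte is enough to taint the packed result.
uint8_t PackEight(const uint8_t* bytes) {
  uint8_t out = 0;
  for (int i = 0; i < 8; ++i) {
    out = static_cast<uint8_t>(out | ((bytes[i] & 1u) << i));
  }
  return out;
}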
The bytes are packed from what turns out to be a scratch array allocated by TempVectorStack, which does not initialize its memory. So when we have a non-multiple-of-8 number of bytes, we may end up packing initialized bytes and uninitialized bytes together into a single garbage byte, resulting in Valgrind complaining. Closes #11041 from lidavidm/arrow-13812 Authored-by: David Li Signed-off-by: Yibo Cai --- cpp/src/arrow/compute/exec/util.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index f5c55afe0f5..63f3315f7e0 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -70,6 +70,8 @@ class TempVectorStack { top_ = 0; buffer_size_ = size; ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. + std::memset(buffer->mutable_data(), 0xFF, size); buffer_ = std::move(buffer); return Status::OK(); } From bbecb6ae5242cf97061d5daa73ef127d49b105b6 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Thu, 2 Sep 2021 10:04:39 +0000 Subject: [PATCH 09/93] ARROW-13067: [C++][Compute] Implement integer to decimal cast Closes #11045 from cyb70289/13067-int2dec Authored-by: Yibo Cai Signed-off-by: Yibo Cai --- .../compute/kernels/scalar_arithmetic_test.cc | 14 ++++- .../compute/kernels/scalar_cast_numeric.cc | 59 +++++++++++++++++++ .../arrow/compute/kernels/scalar_cast_test.cc | 34 ++++++++++- 3 files changed, 104 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 2939e47666e..ce3588fb432 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -1365,6 +1365,18 @@ TEST(TestBinaryDecimalArithmetic, DispatchBest) { } } + // decimal, integer + for (std::string name : {"add", "subtract", "multiply", "divide"}) { + for (std::string suffix : {"", "_checked"}) { + name += suffix; + + CheckDispatchBest(name, {int64(), decimal128(1, 0)}, + {decimal128(1, 0), decimal128(1, 0)}); + CheckDispatchBest(name, {decimal128(1, 0), int64()}, + {decimal128(1, 0), decimal128(1, 0)}); + } + } + // decimal, decimal for (std::string name : {"add", "subtract"}) { for (std::string suffix : {"", "_checked"}) { @@ -1410,8 +1422,6 @@ TEST(TestBinaryDecimalArithmetic, DispatchBest) { {decimal256(6, 4), decimal256(6, 4)}); } } - - // TODO(ARROW-13067): add 'integer, decimal' tests } // reference result from bc (precsion=100, scale=40) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index cd89a57ed77..e9cf9284ceb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -391,6 +391,53 @@ struct CastFunctor + OutValue Call(KernelContext*, IntegerType val, Status* st) const { + auto maybe_decimal = OutValue(val).Rescale(0, out_scale_); + if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) { + return maybe_decimal.MoveValueUnsafe(); + } + *st = maybe_decimal.status(); + return OutValue{}; + } + + int32_t out_scale_; +}; + +template +struct CastFunctor::value && is_integer_type::value>> { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& out_type = checked_cast(*out->type()); + const auto out_scale = out_type.scale(); + const auto out_precision = out_type.precision(); + + // verify 
precision and scale + if (out_scale < 0) { + return Status::Invalid("Scale must be non-negative"); + } + // maximal number of decimal digits for int8/16/32/64 + constexpr std::array decimal_digits{3, 5, 10, 19}; + using ctype = typename I::c_type; + static_assert(sizeof(ctype) <= 8, ""); + const int precision = decimal_digits[BitUtil::Log2(sizeof(ctype))] + out_scale; + if (out_precision < precision) { + return Status::Invalid( + "Precision is not great enough for the result. " + "It should be at least ", + precision); + } + + applicator::ScalarUnaryNotNullStateful kernel( + IntegerToDecimal{out_scale}); + return kernel.Exec(ctx, batch, out); + } +}; + // ---------------------------------------------------------------------- // Decimal to decimal @@ -641,6 +688,12 @@ std::shared_ptr GetCastToDecimal128() { DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, CastFunctor::Exec)); + // Cast from integer + for (const std::shared_ptr& in_ty : IntTypes()) { + auto exec = GenerateInteger(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + // Cast from other decimal auto exec = CastFunctor::Exec; // We resolve the output type of this kernel from the CastOptions @@ -664,6 +717,12 @@ std::shared_ptr GetCastToDecimal256() { DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty, CastFunctor::Exec)); + // Cast from integer + for (const std::shared_ptr& in_ty : IntTypes()) { + auto exec = GenerateInteger(in_ty->id()); + DCHECK_OK(func->AddKernel(in_ty->id(), {in_ty}, sig_out_ty, std::move(exec))); + } + // Cast from other decimal auto exec = CastFunctor::Exec; DCHECK_OK( diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 90d41894578..fc7e42aca6f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -71,9 +71,11 @@ static std::vector> kNumericTypes = { uint8(), int8(), uint16(), int16(), uint32(), int32(), uint64(), int64(), float32(), float64()}; -static std::vector> kDictionaryIndexTypes = { +static std::vector> kIntegerTypes = { int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()}; +static std::vector> kDictionaryIndexTypes = kIntegerTypes; + static std::vector> kBaseBinaryTypes = { binary(), utf8(), large_binary(), large_utf8()}; @@ -587,6 +589,36 @@ TEST(Cast, Decimal256ToInt) { CheckCast(negative_scale, ArrayFromJSON(int64(), "[1234567890000, -120000]"), options); } +TEST(Cast, IntegerToDecimal) { + for (auto decimal_type : {decimal128(21, 2), decimal256(21, 2)}) { + for (auto integer_type : kIntegerTypes) { + CheckCast( + ArrayFromJSON(integer_type, "[0, 7, null, 100, 99]"), + ArrayFromJSON(decimal_type, R"(["0.00", "7.00", null, "100.00", "99.00"])")); + } + } + + // extreme value + for (auto decimal_type : {decimal128(19, 0), decimal256(19, 0)}) { + CheckCast(ArrayFromJSON(int64(), "[-9223372036854775808, 9223372036854775807]"), + ArrayFromJSON(decimal_type, + R"(["-9223372036854775808", "9223372036854775807"])")); + CheckCast(ArrayFromJSON(uint64(), "[0, 18446744073709551615]"), + ArrayFromJSON(decimal_type, R"(["0", "18446744073709551615"])")); + } + + // insufficient output precision + { + CastOptions options; + + options.to_type = decimal128(5, 3); + CheckCastFails(ArrayFromJSON(int8(), "[0]"), options); + + options.to_type = decimal256(76, 67); + CheckCastFails(ArrayFromJSON(int32(), "[0]"), options); + } +} + TEST(Cast, Decimal128ToDecimal128) { CastOptions 
options; From 495c734969f4af9288a8e745c29fb1d707a46baa Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 2 Sep 2021 13:03:21 +0200 Subject: [PATCH 10/93] ARROW-13846: [C++] Fix crashes on invalid IPC file Should fix the following issues found by OSS-Fuzz: * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37927 * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37915 * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37888 Also add the IPC integration reference files to the fuzzing corpus, this may help find more issues. Closes #11059 from pitrou/ARROW-13846-ipc-fuzz-crashes Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../fuzzing/generate_corpuses.sh | 9 ++- cpp/src/arrow/array/array_test.cc | 69 +++++++++++++++++++ cpp/src/arrow/array/util.cc | 26 ++++--- cpp/src/arrow/type.cc | 2 + testing | 2 +- 5 files changed, 96 insertions(+), 12 deletions(-) diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh index f0d8e162375..e3f00e64782 100755 --- a/cpp/build-support/fuzzing/generate_corpuses.sh +++ b/cpp/build-support/fuzzing/generate_corpuses.sh @@ -27,15 +27,21 @@ fi set -ex CORPUS_DIR=/tmp/corpus -ARROW_CPP=$(cd $(dirname $BASH_SOURCE)/../..; pwd) +ARROW_ROOT=$(cd $(dirname $BASH_SOURCE)/../../..; pwd) +ARROW_CPP=$ARROW_ROOT/cpp OUT=$1 # NOTE: name of seed corpus output file should be "-seed_corpus.zip" # where "" is the exact name of the fuzz target executable the # seed corpus is generated for. +IPC_INTEGRATION_FILES=$(find ${ARROW_ROOT}/testing/data/arrow-ipc-stream/integration -name "*.stream") + rm -rf ${CORPUS_DIR} ${OUT}/arrow-ipc-generate-fuzz-corpus -stream ${CORPUS_DIR} +# Several IPC integration files can have the same name, make sure +# they all appear in the corpus by numbering the duplicates. 
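+# (With GNU cp, --backup=numbered renames a clashing copy to name.stream.~1~,
+# name.stream.~2~, and so on, instead of overwriting it.)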
+cp --backup=numbered ${IPC_INTEGRATION_FILES} ${CORPUS_DIR} ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-stream-fuzz_seed_corpus.zip rm -rf ${CORPUS_DIR} @@ -48,5 +54,6 @@ ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc rm -rf ${CORPUS_DIR} ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR} +# Add Parquet testing examples cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR} ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index cc45a369400..d9617c4e603 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3214,4 +3214,73 @@ TEST(TestSwapEndianArrayData, MonthDayNanoInterval) { ASSERT_OK(swap_array->ValidateFull()); } +DataTypeVector SwappableTypes() { + return DataTypeVector{int8(), + int16(), + int32(), + int64(), + uint8(), + uint16(), + uint32(), + uint64(), + decimal128(19, 4), + decimal256(37, 8), + timestamp(TimeUnit::MICRO, ""), + time32(TimeUnit::SECOND), + time64(TimeUnit::NANO), + date32(), + date64(), + day_time_interval(), + month_interval(), + month_day_nano_interval(), + binary(), + utf8(), + large_binary(), + large_utf8(), + list(int16()), + large_list(int16()), + dictionary(int16(), utf8())}; +} + +TEST(TestSwapEndianArrayData, RandomData) { + random::RandomArrayGenerator rng(42); + + for (const auto& type : SwappableTypes()) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + auto arr = rng.ArrayOf(*field("", type), /*size=*/31); + ASSERT_OK_AND_ASSIGN(auto swapped_data, + ::arrow::internal::SwapEndianArrayData(arr->data())); + auto swapped = MakeArray(swapped_data); + ASSERT_OK_AND_ASSIGN(auto roundtripped_data, + ::arrow::internal::SwapEndianArrayData(swapped_data)); + auto roundtripped = MakeArray(roundtripped_data); + ASSERT_OK(roundtripped->ValidateFull()); + + AssertArraysEqual(*arr, *roundtripped, /*verbose=*/true); + if (type->id() == Type::INT8 || type->id() == Type::UINT8) { + AssertArraysEqual(*arr, *swapped, /*verbose=*/true); + } else { + // Random generated data is unlikely to be made of byte-palindromes + ASSERT_FALSE(arr->Equals(*swapped)); + } + } +} + +TEST(TestSwapEndianArrayData, InvalidLength) { + // IPC-incoming data may be invalid, SwapEndianArrayData shouldn't crash + // by accessing memory out of bounds. + random::RandomArrayGenerator rng(42); + + for (const auto& type : SwappableTypes()) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + ASSERT_OK_AND_ASSIGN(auto arr, MakeArrayOfNull(type, 0)); + auto data = arr->data(); + // Fake length + data->length = 123456789; + ASSERT_OK_AND_ASSIGN(auto swapped_data, ::arrow::internal::SwapEndianArrayData(data)); + auto swapped = MakeArray(swapped_data); + ASSERT_RAISES(Invalid, swapped->Validate()); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index f12281155b8..232947d2c88 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -78,11 +78,16 @@ class ArrayDataWrapper { class ArrayDataEndianSwapper { public: - ArrayDataEndianSwapper(const std::shared_ptr& data, int64_t length) - : data_(data), length_(length) { + explicit ArrayDataEndianSwapper(const std::shared_ptr& data) : data_(data) { out_ = data->Copy(); } + // WARNING: this facility can be called on invalid Array data by the IPC reader. 
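+  // IPC metadata is untrusted input, so a corrupt stream can advertise an
+  // arbitrary length (see TestSwapEndianArrayData.InvalidLength in array_test.cc).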
+ // Do not rely on the advertised ArrayData length, instead use the physical + // buffer sizes to avoid accessing memory out of bounds. + // + // (If this guarantee turns out to be difficult to maintain, we should call + // Validate() instead) Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); RETURN_NOT_OK(SwapChildren(type.fields())); @@ -111,6 +116,7 @@ class ArrayDataEndianSwapper { auto in_data = reinterpret_cast(in_buffer->data()); ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); auto out_data = reinterpret_cast(out_buffer->mutable_data()); + // NOTE: data_->length not trusted (see warning above) int64_t length = in_buffer->size() / sizeof(T); for (int64_t i = 0; i < length; i++) { out_data[i] = BitUtil::ByteSwap(in_data[i]); @@ -146,8 +152,8 @@ class ArrayDataEndianSwapper { auto data = reinterpret_cast(data_->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); auto new_data = reinterpret_cast(new_buffer->mutable_data()); - int64_t length = length_; - length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2); + // NOTE: data_->length not trusted (see warning above) + const int64_t length = data_->buffers[1]->size() / Decimal128Type::kByteWidth; for (int64_t i = 0; i < length; i++) { uint64_t tmp; auto idx = i * 2; @@ -169,8 +175,8 @@ class ArrayDataEndianSwapper { auto data = reinterpret_cast(data_->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); auto new_data = reinterpret_cast(new_buffer->mutable_data()); - int64_t length = length_; - length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4); + // NOTE: data_->length not trusted (see warning above) + const int64_t length = data_->buffers[1]->size() / Decimal256Type::kByteWidth; for (int64_t i = 0; i < length; i++) { uint64_t tmp0, tmp1, tmp2; auto idx = i * 4; @@ -206,9 +212,10 @@ class ArrayDataEndianSwapper { auto data = reinterpret_cast(data_->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); auto new_data = reinterpret_cast(new_buffer->mutable_data()); - int64_t length = data_->length; + // NOTE: data_->length not trusted (see warning above) + const int64_t length = data_->buffers[1]->size() / sizeof(MonthDayNanos); for (int64_t i = 0; i < length; i++) { - MonthDayNanoIntervalType::MonthDayNanos tmp = data[i]; + MonthDayNanos tmp = data[i]; #if ARROW_LITTLE_ENDIAN tmp.months = BitUtil::FromBigEndian(tmp.months); tmp.days = BitUtil::FromBigEndian(tmp.days); @@ -279,7 +286,6 @@ class ArrayDataEndianSwapper { } const std::shared_ptr& data_; - int64_t length_; std::shared_ptr out_; }; @@ -292,7 +298,7 @@ Result> SwapEndianArrayData( if (data->offset != 0) { return Status::Invalid("Unsupported data format: data.offset != 0"); } - ArrayDataEndianSwapper swapper(data, data->length); + ArrayDataEndianSwapper swapper(data); RETURN_NOT_OK(swapper.SwapType(*data->type)); return std::move(swapper.out_); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d2adbf04b15..572286799a6 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -88,6 +88,8 @@ constexpr Type::type MonthIntervalType::type_id; constexpr Type::type DayTimeIntervalType::type_id; +constexpr Type::type MonthDayNanoIntervalType::type_id; + constexpr Type::type DurationType::type_id; constexpr Type::type DictionaryType::type_id; diff --git a/testing b/testing index 6d98243093c..896d05d3516 160000 --- a/testing +++ b/testing @@ 
-1 +1 @@ -Subproject commit 6d98243093c0b36442da94de7010f3eacc2a9909 +Subproject commit 896d05d35163168831876b0f3e76977f6f20d4f4 From 425b1cb256040411aca7202faf7124910ac3fdd8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 2 Sep 2021 13:49:40 +0200 Subject: [PATCH 11/93] ARROW-13850: [C++] Fix crashes on invalid Parquet data Add validation to detect invalid DELTA_BINARY_PACKED data. This should fix the following issues: * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37431 * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37432 * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=37421 Closes #11060 from pitrou/ARROW-13850-parquet-fuzz Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 24 +++++++++++++++++++----- testing | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e3460144fc1..2639c3dd4aa 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2105,22 +2105,33 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder(sizeof(T) * 8); + void InitHeader() { - if (!decoder_.GetVlqInt(&values_per_block_)) ParquetException::EofException(); - if (!decoder_.GetVlqInt(&mini_blocks_per_block_)) ParquetException::EofException(); - if (!decoder_.GetVlqInt(&total_value_count_)) { + if (!decoder_.GetVlqInt(&values_per_block_) || + !decoder_.GetVlqInt(&mini_blocks_per_block_) || + !decoder_.GetVlqInt(&total_value_count_) || + !decoder_.GetZigZagVlqInt(&last_value_)) { ParquetException::EofException(); } - if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException(); - delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_); + if (values_per_block_ == 0) { + throw ParquetException("cannot have zero value per block"); + } + if (mini_blocks_per_block_ == 0) { + throw ParquetException("cannot have zero miniblock per block"); + } values_per_mini_block_ = values_per_block_ / mini_blocks_per_block_; + if (values_per_mini_block_ == 0) { + throw ParquetException("cannot have zero value per miniblock"); + } if (values_per_mini_block_ % 32 != 0) { throw ParquetException( "the number of values in a miniblock must be multiple of 32, but it's " + std::to_string(values_per_mini_block_)); } + delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_); block_initialized_ = false; values_current_mini_block_ = 0; } @@ -2134,6 +2145,9 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder(1, bit_width_data + i)) { ParquetException::EofException(); } + if (bit_width_data[i] > kMaxDeltaBitWidth) { + throw ParquetException("delta bit width larger than integer bit width"); + } } mini_block_idx_ = 0; delta_bit_width_ = bit_width_data[0]; diff --git a/testing b/testing index 896d05d3516..2c29a733ac2 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 896d05d35163168831876b0f3e76977f6f20d4f4 +Subproject commit 2c29a733ac2c8492d5df3b74ea5ab1a32f892f60 From f0879a511216725431e77576781645b70c095d98 Mon Sep 17 00:00:00 2001 From: Romain Francois Date: Thu, 2 Sep 2021 09:06:56 -0400 Subject: [PATCH 12/93] ARROW-13164: [R] altrep vectors from Array with nulls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #10730 from romainfrancois/ARROW-13164_altrep_with_nulls Lead-authored-by: Romain Francois Co-authored-by: Romain François Signed-off-by: Neal Richardson --- r/R/arrowExports.R | 8 +- r/data-raw/codegen.R | 2 +- 
r/src/altrep.cpp | 519 +++++++++++++++++++++++++++------ r/src/array_to_vector.cpp | 16 +- r/src/arrowExports.cpp | 30 +- r/src/arrow_types.h | 9 +- r/tests/testthat/test-altrep.R | 179 +++++++++--- 7 files changed, 595 insertions(+), 168 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 2237f818ee0..72a5e455858 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1,11 +1,7 @@ # Generated by using data-raw/codegen.R -> do not edit by hand -is_altrep_int_nonull <- function(x) { - .Call(`_arrow_is_altrep_int_nonull`, x) -} - -is_altrep_dbl_nonull <- function(x) { - .Call(`_arrow_is_altrep_dbl_nonull`, x) +is_altrep <- function(x) { + .Call(`_arrow_is_altrep`, x) } Array__Slice1 <- function(array, offset) { diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index bb0e92eb640..7bdd8486d39 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -216,7 +216,7 @@ glue::glue('\n R_useDynamicSymbols(dll, FALSE); #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP) - arrow::r::Init_Altrep_classes(dll); + arrow::r::altrep::Init_Altrep_classes(dll); #endif } diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index b07cbe70ed3..ec68ade1ba9 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -19,9 +19,16 @@ #if defined(ARROW_R_WITH_ARROW) +#include +#include +#include + #include #if defined(HAS_ALTREP) +// defined in array_to_vector.cpp +SEXP Array__as_vector(const std::shared_ptr& array); + #if R_VERSION < R_Version(3, 6, 0) // workaround because R's not so conveniently uses `class` @@ -43,144 +50,484 @@ extern "C" { #include #endif -#include +#include "./r_task_group.h" namespace arrow { namespace r { +namespace altrep { + +template +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, c_type* buf); + +template <> +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, double* buf) { + return REAL_GET_REGION(data2, i, n, buf); +} + +template <> +R_xlen_t Standard_Get_region(SEXP data2, R_xlen_t i, R_xlen_t n, int* buf) { + return INTEGER_GET_REGION(data2, i, n, buf); +} + +// altrep R vector shadowing an Array. +// +// This tries as much as possible to directly use the data +// from the Array and minimize data copies. +// +// Both slots of the altrep object (data1 and data2) are used: +// +// data1: always used, stores an R external pointer to a +// shared pointer of the Array +// data2: starts as NULL, and becomes a standard R vector with the same +// data if necessary (if materialization is needed) template -struct ArrayNoNull { - using data_type = typename std::conditional::type; +struct AltrepArrayPrimitive { static void DeleteArray(std::shared_ptr* ptr) { delete ptr; } using Pointer = cpp11::external_pointer, DeleteArray>; - // altrep object around an Array with no nulls - // data1: an external pointer to a shared pointer to the Array - // data2: not used + using c_type = typename std::conditional::type; - static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr& array) { - // we don't need the whole r6 object, just an external pointer - // that retain the array - Pointer xp(new std::shared_ptr(array)); - - SEXP res = R_new_altrep(class_t, xp, R_NilValue); - MARK_NOT_MUTABLE(res); + // singleton altrep class description + static R_altrep_class_t class_t; - return res; + // the altrep R object + SEXP alt_; + + // This constructor is used to create the altrep object from + // an Array. 
Used by MakeAltrepArrayPrimitive() which is used
+  // in array_to_vector.cpp.
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)) {
+    // force duplicate on modify
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
+  // This constructor is used when R calls altrep methods.
+  //
+  // For example in the Length() method below:
+  //
+  // template <typename AltrepClass>
+  // R_xlen_t Length(SEXP alt) {
+  //   return AltrepClass(alt).Length();
+  // }
+  explicit AltrepArrayPrimitive(SEXP alt) : alt_(alt) {}
+
+  // The arrow::Array that is being wrapped by the altrep object;
+  // this is only valid before data2 has been materialized.
+  const std::shared_ptr<Array>& array() const { return *Pointer(R_altrep_data1(alt_)); }
+
+  R_xlen_t Length() { return array()->length(); }
+
+  // Does the data2 slot of the altrep object contain a
+  // standard R vector with the same data as the array?
+  bool IsMaterialized() const { return !Rf_isNull(R_altrep_data2(alt_)); }
+
+  // Force materialization. After calling this, the data2 slot of the altrep
+  // object contains a standard R vector with the same data, with
+  // R sentinels where the Array has nulls.
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array()->length();
+
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, size));
+
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<c_type*>(DATAPTR(copy)));
+
+      // store as data2; this is now considered materialized
+      R_set_altrep_data2(alt_, copy);
+      MARK_NOT_MUTABLE(copy);
+
+      UNPROTECT(1);
+    }
   }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
+  // Duplication is done by first materializing the vector and
+  // then making a lazy duplicate of data2.
+  SEXP Duplicate(Rboolean /* deep */) {
+    Materialize();
+    return Rf_lazy_duplicate(R_altrep_data2(alt_));
   }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  // What gets printed on .Internal(inspect())
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    const auto& array_ = array();
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(R_altrep_data1(alt_), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(R_altrep_data2(alt_), pre, deep + 1, pvec);
+    }
+
+    return TRUE;
+  }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
+  // R calls this to get a pointer to the start of the vector data,
+  // but only if this is possible without allocating (in the R sense).
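+  // (If NULL is returned here, R falls back to the other methods, e.g.
+  // Dataptr(), Elt() or Get_region(), to access the values.)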
+  //
+  // For this implementation we can return the data in these cases:
+  // - data2 has been created, and so the R sentinels are in place where the array has
+  //   nulls
+  // - the Array has no nulls, so we can directly return the start of its data
+  //
+  // Otherwise, i.e. if the array has nulls and data2 has not been generated: give up.
+  const void* Dataptr_or_null() {
+    if (IsMaterialized()) {
+      return DATAPTR_RO(R_altrep_data2(alt_));
+    }
+
+    const auto& array_ = array();
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(array_->data()->template GetValues<c_type>(1));
+    }
+
+    return NULL;
+  }
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+  // R calls this to get a pointer to the start of the data; R allocations are allowed.
+  //
+  // If the object hasn't been materialized and the array has no
+  // nulls, we can directly point to the array data.
+  //
+  // Otherwise, the object is materialized and DATAPTR(data2) is returned.
+  void* Dataptr(Rboolean writeable) {
+    if (!IsMaterialized()) {
+      const auto& array_ = array();
+
+      if (array_->null_count() == 0) {
+        return reinterpret_cast<void*>(
+            const_cast<c_type*>(array_->data()->template GetValues<c_type>(1)));
+      }
+    }
+
+    // Otherwise we have to materialize and hand out the pointer to data2.
+    //
+    // NOTE: this returns the DATAPTR() of data2 even in the case writeable = TRUE,
+    // which is risky because C(++) clients of this object might
+    // modify data2, and therefore make it diverge from the data of the Array;
+    // but the object was marked as immutable on creation, so doing this is
+    // disregarding the R API.
+    //
+    // Simply calling stop() when `writeable = TRUE` would be too strong, e.g. it
+    // fails identical(), which calls DATAPTR() even though DATAPTR_RO() would
+    // be enough.
+    Materialize();
+    return DATAPTR(R_altrep_data2(alt_));
+  }
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+  // Does the Array have no nulls?
+  int No_NA() const { return array()->null_count() == 0; }
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+  int Is_sorted() const { return UNKNOWN_SORTEDNESS; }
 
-    UNPROTECT(1);
-    return copy;
+  // The value at position i
+  c_type Elt(R_xlen_t i) {
+    const auto& array_ = array();
+    return array_->IsNull(i) ? cpp11::na<c_type>()
+                             : array_->data()->template GetValues<c_type>(1)[i];
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  // R calls this when it wants data from position `i` to `i + n` copied into `buf`.
+  // The returned value is the number of values that were really copied
+  // (this can be lower than n).
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, c_type* buf) {
+    // If we have data2, we can just copy the region into buf
+    // using the standard Get_region for this R type.
+    if (IsMaterialized()) {
+      return Standard_Get_region<c_type>(R_altrep_data2(alt_), i, n, buf);
+    }
+
+    // The vector was not materialized, i.e. we don't have data2.
+    //
+    // In that case, we copy the data from the Array, and then
+    // do a second pass to force the R sentinels where the
+    // array has nulls.
+    //
+    // This only materializes the region into buf, not the entire vector.
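+    // (Array::Slice() below is zero-copy: it returns a view with an adjusted
+    // offset and length over the same buffers, so only the memcpy touches data.)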
+    auto slice = array()->Slice(i, n);
+    R_xlen_t ncopy = slice->length();
+
+    // first copy the data buffer
+    memcpy(buf, slice->data()->template GetValues<c_type>(1), ncopy * sizeof(c_type));
+
+    // then set the R NA sentinels if needed
+    if (slice->null_count() > 0) {
+      internal::BitmapReader bitmap_reader(slice->null_bitmap()->data(), slice->offset(),
+                                           ncopy);
+
+      for (R_xlen_t j = 0; j < ncopy; j++, bitmap_reader.Next()) {
+        if (bitmap_reader.IsNotSet()) {
+          buf[j] = cpp11::na<c_type>();
+        }
+      }
+    }
+
+    return ncopy;
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  // This cannot keep the external pointer to an Arrow object through
+  // R serialization, so return the materialized data2.
+  SEXP Serialized_state() {
+    Materialize();
+    return R_altrep_data2(alt_);
+  }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+  static SEXP Unserialize(SEXP /* class_ */, SEXP state) { return state; }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+  SEXP Coerce(int type) {
+    // Just let R handle it for now
+    return NULL;
   }
 };
+template <int sexp_type>
+R_altrep_class_t AltrepArrayPrimitive<sexp_type>::class_t;
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+// The methods below are how R interacts with the altrep objects.
+//
+// They all use the same pattern: create a C++ object of the
+// class parameter, and then call the method.
+template <typename AltrepClass>
+R_xlen_t Length(SEXP alt) {
+  return AltrepClass(alt).Length();
+}
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP, double>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP, double>::No_NA);
-  }
+template <typename AltrepClass>
+Rboolean Inspect(SEXP alt, int pre, int deep, int pvec,
+                 void (*inspect_subtree)(SEXP, int, int, int)) {
+  return AltrepClass(alt).Inspect(pre, deep, pvec, inspect_subtree);
+}
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP, double>::Make(class_t, array);
-  }
-};
+template <typename AltrepClass>
+const void* Dataptr_or_null(SEXP alt) {
+  return AltrepClass(alt).Dataptr_or_null();
+}
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+template <typename AltrepClass>
+void* Dataptr(SEXP alt, Rboolean writeable) {
+  return AltrepClass(alt).Dataptr(writeable);
+}
+
+template <typename AltrepClass>
+SEXP Duplicate(SEXP alt, Rboolean deep) {
+  return AltrepClass(alt).Duplicate(deep);
+}
+
+template <typename AltrepClass>
+auto Elt(SEXP alt, R_xlen_t i) -> decltype(AltrepClass(alt).Elt(i)) {
+  return AltrepClass(alt).Elt(i);
+}
+
+template <typename AltrepClass>
+int No_NA(SEXP alt) {
+  return AltrepClass(alt).No_NA();
+}
+
+template <typename AltrepClass>
+int Is_sorted(SEXP alt) {
+  return AltrepClass(alt).Is_sorted();
+}
+
+template <typename AltrepClass>
+R_xlen_t Get_region(SEXP alt, R_xlen_t i, R_xlen_t n, typename AltrepClass::c_type* buf) {
+  return AltrepClass(alt).Get_region(i, n, buf);
+}
+
+template <typename AltrepClass>
+SEXP Serialized_state(SEXP alt) {
+  return AltrepClass(alt).Serialized_state();
+}
+
+template <typename AltrepClass>
+SEXP Unserialize(SEXP class_, SEXP state) {
+  return AltrepClass::Unserialize(class_, state);
+}
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP, int>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP, int>::No_NA);
+template <typename AltrepClass>
+SEXP Coerce(SEXP alt, int type) {
+  return AltrepClass(alt).Coerce(type);
+}
+
+static 
std::shared_ptr<arrow::compute::ScalarAggregateOptions> NaRmOptions(
+    const std::shared_ptr<Array>& array, bool na_rm) {
+  auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+      arrow::compute::ScalarAggregateOptions::Defaults());
+  options->min_count = 0;
+  options->skip_nulls = na_rm;
+  return options;
+}
+
+template <int sexp_type, bool Min>
+SEXP MinMax(SEXP alt, Rboolean narm) {
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  using scalar_type =
+      typename std::conditional<sexp_type == INTSXP, Int32Scalar, DoubleScalar>::type;
+
+  AltrepArrayPrimitive<sexp_type> alt_(alt);
+
+  const auto& array = alt_.array();
+  bool na_rm = narm == TRUE;
+  auto n = array->length();
+  auto null_count = array->null_count();
+  if ((na_rm || n == 0) && null_count == n) {
+    return Rf_ScalarReal(Min ? R_PosInf : R_NegInf);
+  }
+  if (!na_rm && null_count > 0) {
+    return cpp11::as_sexp(cpp11::na<data_type>());
+  }
+
+  auto options = NaRmOptions(array, na_rm);
+
+  const auto& minmax =
+      ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+  const auto& minmax_scalar =
+      internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+  const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+      *ValueOrStop(minmax_scalar.field(Min ? "min" : "max")));
+  return cpp11::as_sexp(result_scalar.value);
+}
+
+template <int sexp_type>
+SEXP Min(SEXP alt, Rboolean narm) {
+  return MinMax<sexp_type, true>(alt, narm);
+}
+
+template <int sexp_type>
+SEXP Max(SEXP alt, Rboolean narm) {
+  return MinMax<sexp_type, false>(alt, narm);
+}
+
+template <int sexp_type>
+static SEXP Sum(SEXP alt, Rboolean narm) {
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+
+  AltrepArrayPrimitive<sexp_type> alt_(alt);
+
+  const auto& array = alt_.array();
+  bool na_rm = narm == TRUE;
+  auto null_count = array->null_count();
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP, int>::Make(class_t, array);
+  if (!na_rm && null_count > 0) {
+    return cpp11::as_sexp(cpp11::na<data_type>());
   }
-};
 
+  auto options = NaRmOptions(array, na_rm);
+
+  const auto& sum =
+      ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+  if (sexp_type == INTSXP) {
+    // When calling the "sum" function on an int32 array, we get an Int64 scalar;
+    // in case of overflow, make it a double like R does.
+    int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+    if (value <= INT32_MIN || value > INT32_MAX) {
+      return Rf_ScalarReal(static_cast<double>(value));
+    } else {
+      return Rf_ScalarInteger(static_cast<int>(value));
+    }
+  } else {
+    return Rf_ScalarReal(
+        internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+  }
+}
 
-R_altrep_class_t Int32ArrayNoNull::class_t;
-R_altrep_class_t DoubleArrayNoNull::class_t;
+// initialize altrep, altvec, altreal, and altinteger methods
+template <typename AltrepClass>
+void InitAltrepMethods(R_altrep_class_t class_t, DllInfo* dll) {
+  R_set_altrep_Length_method(class_t, Length<AltrepClass>);
+  R_set_altrep_Inspect_method(class_t, Inspect<AltrepClass>);
+  R_set_altrep_Duplicate_method(class_t, Duplicate<AltrepClass>);
+  R_set_altrep_Serialized_state_method(class_t, Serialized_state<AltrepClass>);
+  R_set_altrep_Unserialize_method(class_t, Unserialize<AltrepClass>);
+  R_set_altrep_Coerce_method(class_t, Coerce<AltrepClass>);
+}
 
-void Init_Altrep_classes(DllInfo* dll) {
-  DoubleArrayNoNull::Init(dll);
-  Int32ArrayNoNull::Init(dll);
+template <typename AltrepClass>
+void InitAltvecMethods(R_altrep_class_t class_t, DllInfo* dll) {
+  R_set_altvec_Dataptr_method(class_t, Dataptr<AltrepClass>);
+  R_set_altvec_Dataptr_or_null_method(class_t, Dataptr_or_null<AltrepClass>);
+}
+
+template <typename AltrepClass>
+void InitAltRealMethods(R_altrep_class_t class_t, DllInfo* dll) {
+  R_set_altreal_No_NA_method(class_t, No_NA<AltrepClass>);
+  R_set_altreal_Is_sorted_method(class_t, Is_sorted<AltrepClass>);
+
+  R_set_altreal_Sum_method(class_t, Sum<REALSXP>);
+  R_set_altreal_Min_method(class_t, Min<REALSXP>);
+  R_set_altreal_Max_method(class_t, Max<REALSXP>);
+
+  R_set_altreal_Elt_method(class_t, Elt<AltrepClass>);
+  R_set_altreal_Get_region_method(class_t, Get_region<AltrepClass>);
+}
 
-SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array) {
-  return DoubleArrayNoNull::Make(array);
+template <typename AltrepClass>
+void InitAltIntegerMethods(R_altrep_class_t class_t, DllInfo* dll) {
+  R_set_altinteger_No_NA_method(class_t, No_NA<AltrepClass>);
+  R_set_altinteger_Is_sorted_method(class_t, Is_sorted<AltrepClass>);
+
+  R_set_altinteger_Sum_method(class_t, Sum<INTSXP>);
+  R_set_altinteger_Min_method(class_t, Min<INTSXP>);
+  R_set_altinteger_Max_method(class_t, Max<INTSXP>);
+
+  R_set_altinteger_Elt_method(class_t, Elt<AltrepClass>);
+  R_set_altinteger_Get_region_method(class_t, Get_region<AltrepClass>);
+}
 
-SEXP MakeInt32ArrayNoNull(const std::shared_ptr<Array>& array) {
-  return Int32ArrayNoNull::Make(array);
+template <typename AltrepClass>
+void InitAltRealClass(DllInfo* dll, const char* name) {
+  AltrepClass::class_t = R_make_altreal_class(name, "arrow", dll);
+  InitAltrepMethods<AltrepClass>(AltrepClass::class_t, dll);
+  InitAltvecMethods<AltrepClass>(AltrepClass::class_t, dll);
+  InitAltRealMethods<AltrepClass>(AltrepClass::class_t, dll);
 }
 
-} // namespace r
-} // namespace arrow
+template <typename AltrepClass>
+void InitAltIntegerClass(DllInfo* dll, const char* name) {
+  AltrepClass::class_t = R_make_altinteger_class(name, "arrow", dll);
+  InitAltrepMethods<AltrepClass>(AltrepClass::class_t, dll);
+  InitAltvecMethods<AltrepClass>(AltrepClass::class_t, dll);
+  InitAltIntegerMethods<AltrepClass>(AltrepClass::class_t, dll);
+}
 
-#endif
+// initialize the altrep classes
+void Init_Altrep_classes(DllInfo* dll) {
+  InitAltRealClass<AltrepArrayPrimitive<REALSXP>>(dll, "array_dbl_vector");
+  InitAltIntegerClass<AltrepArrayPrimitive<INTSXP>>(dll, "array_int_vector");
+}
 
-// [[arrow::export]]
-bool is_altrep_int_nonull(SEXP x) {
-#if defined(HAS_ALTREP)
-  return R_altrep_inherits(x, arrow::r::Int32ArrayNoNull::class_t);
-#else
-  return false;
-#endif
+// return an altrep R vector that shadows the array if possible
+SEXP MakeAltrepArrayPrimitive(const std::shared_ptr<Array>& array) {
+  switch (array->type()->id()) {
+    case arrow::Type::DOUBLE:
+      return altrep::AltrepArrayPrimitive<REALSXP>(array).alt_;
+
+    case arrow::Type::INT32:
+      return altrep::AltrepArrayPrimitive<INTSXP>(array).alt_;
+
+    default:
+      break;
+  }
+
+  return R_NilValue;
 }
 
+}  // namespace altrep
+}  // namespace r
+}  // namespace arrow
+
+#endif  // HAS_ALTREP
+
 // [[arrow::export]]
-bool is_altrep_dbl_nonull(SEXP x) {
+bool is_altrep(SEXP x) {
 #if defined(HAS_ALTREP)
-  return R_altrep_inherits(x, arrow::r::DoubleArrayNoNull::class_t);
+  return ALTREP(x);
 #else
   return false;
 #endif
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index ae6c97e169c..772e816a131 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -69,15 +69,13 @@ class Converter {
     // special case when there is only one array
     if (chunked_array_->num_chunks() == 1) {
       const auto& array = chunked_array_->chunk(0);
-      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
-          array->null_count() == 0) {
-        switch (array->type()->id()) {
-          case arrow::Type::DOUBLE:
-            return arrow::r::MakeDoubleArrayNoNull(array);
-          case arrow::Type::INT32:
-            return arrow::r::MakeInt32ArrayNoNull(array);
-          default:
-            break;
+      // Use altrep if:
+      // - the arrow.use_altrep option is set to TRUE or unset (implicit TRUE)
+      // - the array has at least one element
+      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
+        SEXP alt = altrep::MakeAltrepArrayPrimitive(array);
+        if (!Rf_isNull(alt)) {
+          return alt;
         }
       }
     }
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 5ef39215c73..d99abf2605d 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -6,31 +6,16 @@
 // altrep.cpp
 #if defined(ARROW_R_WITH_ARROW)
-bool 
is_altrep_int_nonull(SEXP x); -extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ +bool is_altrep(SEXP x); +extern "C" SEXP _arrow_is_altrep(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input::type x(x_sexp); - return cpp11::as_sexp(is_altrep_int_nonull(x)); + return cpp11::as_sexp(is_altrep(x)); END_CPP11 } #else -extern "C" SEXP _arrow_is_altrep_int_nonull(SEXP x_sexp){ - Rf_error("Cannot call is_altrep_int_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); -} -#endif - -// altrep.cpp -#if defined(ARROW_R_WITH_ARROW) -bool is_altrep_dbl_nonull(SEXP x); -extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ -BEGIN_CPP11 - arrow::r::Input::type x(x_sexp); - return cpp11::as_sexp(is_altrep_dbl_nonull(x)); -END_CPP11 -} -#else -extern "C" SEXP _arrow_is_altrep_dbl_nonull(SEXP x_sexp){ - Rf_error("Cannot call is_altrep_dbl_nonull(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_is_altrep(SEXP x_sexp){ + Rf_error("Cannot call is_altrep(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -7040,8 +7025,7 @@ static const R_CallMethodDef CallEntries[] = { { "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, - { "_arrow_is_altrep_int_nonull", (DL_FUNC) &_arrow_is_altrep_int_nonull, 1}, - { "_arrow_is_altrep_dbl_nonull", (DL_FUNC) &_arrow_is_altrep_dbl_nonull, 1}, + { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, @@ -7492,7 +7476,7 @@ extern "C" void R_init_arrow(DllInfo* dll){ R_useDynamicSymbols(dll, FALSE); #if defined(ARROW_R_WITH_ARROW) && defined(HAS_ALTREP) - arrow::r::Init_Altrep_classes(dll); + arrow::r::altrep::Init_Altrep_classes(dll); #endif } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 4ecb99174b5..9419d956877 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -101,6 +101,7 @@ auto ValueOrStop(R&& result) -> decltype(std::forward(result).ValueOrDie()) { } namespace r { +class RTasks; std::shared_ptr InferArrowType(SEXP x); std::shared_ptr vec_to_arrow__reuse_memory(SEXP x); @@ -174,9 +175,13 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, std::shared_ptr& schema); #if defined(HAS_ALTREP) + +namespace altrep { + void Init_Altrep_classes(DllInfo* dll); -SEXP MakeInt32ArrayNoNull(const std::shared_ptr& array); -SEXP MakeDoubleArrayNoNull(const std::shared_ptr& array); +SEXP MakeAltrepArrayPrimitive(const std::shared_ptr& array); + +} // namespace altrep #endif } // namespace r diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 42784b61442..8cb989b1d4c 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -26,30 +26,30 @@ test_that("altrep vectors from int32 and dbl arrays with no nulls", { c_int <- ChunkedArray$create(1:1000) c_dbl <- ChunkedArray$create(as.numeric(1:1000)) - expect_true(is_altrep_int_nonull(as.vector(v_int))) - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(v_int))) + 
expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) expect_equal(c_int$num_chunks, 1L) - expect_true(is_altrep_int_nonull(as.vector(c_int))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) + expect_true(is_altrep(as.vector(c_int))) + expect_true(is_altrep(as.vector(c_int$Slice(1)))) expect_equal(c_dbl$num_chunks, 1L) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_dbl$Slice(1)))) withr::local_options(list(arrow.use_altrep = NULL)) - expect_true(is_altrep_int_nonull(as.vector(v_int))) - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(v_int))) + expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) withr::local_options(list(arrow.use_altrep = FALSE)) - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) + expect_false(is_altrep(as.vector(v_int))) + expect_false(is_altrep(as.vector(v_int$Slice(1)))) + expect_false(is_altrep(as.vector(v_dbl))) + expect_false(is_altrep(as.vector(v_dbl$Slice(1)))) }) test_that("altrep vectors from int32 and dbl arrays with nulls", { @@ -59,31 +59,30 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", { c_int <- ChunkedArray$create(c(1L, NA, 3L)) c_dbl <- ChunkedArray$create(c(1, NA, 3)) - # cannot be altrep because one NA - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1)))) - expect_false(is_altrep_int_nonull(as.vector(c_int))) - expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1)))) - expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1)))) - - # but then, no NA beyond, so can be altrep again - expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2)))) - expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2)))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2)))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2)))) + expect_true(is_altrep(as.vector(v_int))) + expect_true(is_altrep(as.vector(v_int$Slice(1)))) + expect_true(is_altrep(as.vector(v_dbl))) + expect_true(is_altrep(as.vector(v_dbl$Slice(1)))) + expect_true(is_altrep(as.vector(c_int))) + expect_true(is_altrep(as.vector(c_int$Slice(1)))) + expect_true(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_dbl$Slice(1)))) + + expect_true(is_altrep(as.vector(v_int$Slice(2)))) + expect_true(is_altrep(as.vector(v_dbl$Slice(2)))) + expect_true(is_altrep(as.vector(c_int$Slice(2)))) + expect_true(is_altrep(as.vector(c_dbl$Slice(2)))) # chunked array with 2 chunks cannot be altrep c_int <- ChunkedArray$create(0L, c(1L, NA, 3L)) c_dbl <- ChunkedArray$create(0, c(1, NA, 3)) expect_equal(c_int$num_chunks, 2L) expect_equal(c_dbl$num_chunks, 2L) - expect_false(is_altrep_int_nonull(as.vector(c_int))) - 
expect_false(is_altrep_dbl_nonull(as.vector(c_dbl))) - expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3)))) - expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3)))) + + expect_false(is_altrep(as.vector(c_int))) + expect_false(is_altrep(as.vector(c_dbl))) + expect_true(is_altrep(as.vector(c_int$Slice(3)))) + expect_true(is_altrep(as.vector(c_dbl$Slice(3)))) }) test_that("empty vectors are not altrep", { @@ -91,8 +90,8 @@ test_that("empty vectors are not altrep", { v_int <- Array$create(integer()) v_dbl <- Array$create(numeric()) - expect_false(is_altrep_int_nonull(as.vector(v_int))) - expect_false(is_altrep_dbl_nonull(as.vector(v_dbl))) + expect_false(is_altrep(as.vector(v_int))) + expect_false(is_altrep(as.vector(v_dbl))) }) test_that("as.data.frame(, ) can create altrep vectors", { @@ -100,11 +99,109 @@ test_that("as.data.frame(
    , ) can create altrep vectors", { table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3)) df_table <- as.data.frame(table) - expect_true(is_altrep_int_nonull(df_table$int)) - expect_true(is_altrep_dbl_nonull(df_table$dbl)) + expect_true(is_altrep(df_table$int)) + expect_true(is_altrep(df_table$dbl)) batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3)) df_batch <- as.data.frame(batch) - expect_true(is_altrep_int_nonull(df_batch$int)) - expect_true(is_altrep_dbl_nonull(df_batch$dbl)) + expect_true(is_altrep(df_batch$int)) + expect_true(is_altrep(df_batch$dbl)) +}) + +expect_altrep_rountrip <- function(x, fn, ...) { + alt <- Array$create(x)$as_vector() + + expect_true(is_altrep(alt)) + expect_identical(fn(x, ...), fn(alt, ...)) + expect_true(is_altrep(alt)) +} + +test_that("altrep min/max/sum identical to R versions for double", { + x <- c(1, 2, 3) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1, 2, NA_real_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_real_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep min/max/sum identical to R versions for int", { + x <- c(1L, 2L, 3L) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- c(1L, 2L, NA_integer_) + expect_altrep_rountrip(x, min, na.rm = TRUE) + expect_altrep_rountrip(x, max, na.rm = TRUE) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + x <- rep(NA_integer_, 3) + expect_warning( + expect_altrep_rountrip(x, min, na.rm = TRUE), + "no non-missing arguments to min" + ) + expect_warning( + expect_altrep_rountrip(x, max, na.rm = TRUE), + "no non-missing arguments to max" + ) + expect_altrep_rountrip(x, sum, na.rm = TRUE) + + expect_altrep_rountrip(x, min) + expect_altrep_rountrip(x, max) + expect_altrep_rountrip(x, sum) + + # sum(x) is INT_MIN -> convert to double. 
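+  # ((-2^31 + 1L) + (-1L) is exactly -2^31, the bit pattern R reserves for
+  # NA_integer_, so the altrep Sum method must return a double instead.)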
+ x <- as.integer(c(-2^31 + 1L, -1L)) + expect_altrep_rountrip(x, sum) +}) + +test_that("altrep vectors handle serialization", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + + expect_identical(ints, unserialize(serialize(Array$create(ints)$as_vector(), NULL))) + expect_identical(dbls, unserialize(serialize(Array$create(dbls)$as_vector(), NULL))) +}) + +test_that("altrep vectors handle coercion", { + ints <- c(1L, 2L, NA_integer_) + dbls <- c(1, 2, NA_real_) + + expect_identical(ints, as.integer(Array$create(dbls)$as_vector())) + expect_identical(dbls, as.numeric(Array$create(ints)$as_vector())) }) From 8c70a5f5178c5b74cc181dc8bdd4b03ba14f36d9 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 2 Sep 2021 09:07:27 -0400 Subject: [PATCH 13/93] ARROW-13459: [C++][Docs]Missing param docs for RecordBatch::SetColumn Signed-off-by: Junwang Zhao Closes #11056 from zhjwpku/docs/missing_param_for_setcolumn Authored-by: Junwang Zhao Signed-off-by: David Li --- cpp/src/arrow/record_batch.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 92ffa8b87fb..3173eee1000 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -130,6 +130,10 @@ class ARROW_EXPORT RecordBatch { int i, std::string field_name, const std::shared_ptr& column) const; /// \brief Replace a column in the record batch, producing a new RecordBatch + /// + /// \param[in] i field index, does boundscheck + /// \param[in] field field to be replaced + /// \param[in] column column to be replaced virtual Result> SetColumn( int i, const std::shared_ptr& field, const std::shared_ptr& column) const = 0; From a1d207ec03f468a6a3d47ba17eaefc1d1ca912ad Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 3 Sep 2021 04:45:46 +0900 Subject: [PATCH 14/93] ARROW-13831: [GLib][Ruby] Add support for writing by Arrow Dataset Closes #11055 from kou/ruby-table-save-dataset Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .../arrow-dataset-glib/arrow-dataset-glib.h | 2 + .../arrow-dataset-glib/arrow-dataset-glib.hpp | 1 + c_glib/arrow-dataset-glib/dataset-factory.cpp | 68 +++ c_glib/arrow-dataset-glib/dataset.cpp | 403 +++++++++++++++- c_glib/arrow-dataset-glib/dataset.h | 25 + c_glib/arrow-dataset-glib/dataset.hpp | 10 +- c_glib/arrow-dataset-glib/enums.c.template | 52 +++ c_glib/arrow-dataset-glib/enums.h.template | 41 ++ c_glib/arrow-dataset-glib/file-format.cpp | 380 +++++++++++++-- c_glib/arrow-dataset-glib/file-format.h | 59 ++- c_glib/arrow-dataset-glib/file-format.hpp | 18 +- c_glib/arrow-dataset-glib/meson.build | 22 +- c_glib/arrow-dataset-glib/partitioning.cpp | 440 ++++++++++++++++++ c_glib/arrow-dataset-glib/partitioning.h | 110 +++++ c_glib/arrow-dataset-glib/partitioning.hpp | 31 ++ c_glib/arrow-dataset-glib/scanner.cpp | 19 + c_glib/arrow-dataset-glib/scanner.h | 4 + c_glib/arrow-glib/compute.cpp | 4 +- c_glib/arrow-glib/input-stream.cpp | 124 ++++- c_glib/arrow-glib/input-stream.h | 68 ++- c_glib/arrow-glib/input-stream.hpp | 11 +- .../arrow-dataset-glib-docs.xml | 8 +- .../test/dataset/test-file-system-dataset.rb | 64 ++- c_glib/test/dataset/test-file-writer.rb | 65 +++ .../test/dataset/test-partitioning-options.rb | 46 ++ c_glib/test/dataset/test-partitioning.rb | 34 ++ c_glib/test/dataset/test-scanner.rb | 7 + c_glib/test/helper/buildable.rb | 7 + c_glib/test/helper/readable.rb | 47 ++ c_glib/test/run-test.rb | 2 + c_glib/test/test-file-input-stream.rb | 102 ++++ .../lib/arrow-dataset/arrow-table-savable.rb | 68 +++ 
.../lib/arrow-dataset/file-format.rb | 14 + .../lib/arrow-dataset/loader.rb | 1 + .../test/test-arrow-table.rb | 5 +- ruby/red-arrow/lib/arrow/file-system.rb | 34 ++ ruby/red-arrow/lib/arrow/loader.rb | 1 + ruby/red-arrow/lib/arrow/table-saver.rb | 32 +- 38 files changed, 2290 insertions(+), 139 deletions(-) create mode 100644 c_glib/arrow-dataset-glib/enums.c.template create mode 100644 c_glib/arrow-dataset-glib/enums.h.template create mode 100644 c_glib/arrow-dataset-glib/partitioning.cpp create mode 100644 c_glib/arrow-dataset-glib/partitioning.h create mode 100644 c_glib/arrow-dataset-glib/partitioning.hpp create mode 100644 c_glib/test/dataset/test-file-writer.rb create mode 100644 c_glib/test/dataset/test-partitioning-options.rb create mode 100644 c_glib/test/dataset/test-partitioning.rb create mode 100644 c_glib/test/helper/readable.rb create mode 100644 c_glib/test/test-file-input-stream.rb create mode 100644 ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb create mode 100644 ruby/red-arrow/lib/arrow/file-system.rb diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index 03e56516112..58f4e216cc7 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -23,6 +23,8 @@ #include #include +#include #include #include +#include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index 65341b9b77e..8e996506884 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -25,4 +25,5 @@ #include #include #include +#include #include diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp index 146db69adfc..433e58b2031 100644 --- a/c_glib/arrow-dataset-glib/dataset-factory.cpp +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -23,6 +23,7 @@ #include #include #include +#include G_BEGIN_DECLS @@ -142,6 +143,7 @@ gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { GADatasetFileFormat *format; GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; GList *files; arrow::dataset::FileSystemFactoryOptions options; } GADatasetFileSystemDatasetFactoryPrivate; @@ -149,6 +151,8 @@ typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { enum { PROP_FORMAT = 1, PROP_FILE_SYSTEM, + PROP_PARTITIONING, + PROP_PARTITION_BASE_DIR, }; G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, @@ -175,6 +179,11 @@ gadataset_file_system_dataset_factory_dispose(GObject *object) priv->file_system = NULL; } + if (priv->partitioning) { + g_object_unref(priv->partitioning); + priv->partitioning = NULL; + } + if (priv->files) { g_list_free_full(priv->files, g_object_unref); priv->files = NULL; @@ -205,6 +214,29 @@ gadataset_file_system_dataset_factory_set_property(GObject *object, case PROP_FORMAT: priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); break; + case PROP_PARTITIONING: + { + auto partitioning = g_value_get_object(value); + if (partitioning == priv->partitioning) { + break; + } + auto old_partitioning = priv->partitioning; + if (partitioning) { + g_object_ref(partitioning); + priv->partitioning = GADATASET_PARTITIONING(partitioning); + priv->options.partitioning = + gadataset_partitioning_get_raw(priv->partitioning); + } else { + priv->options.partitioning = 
arrow::dataset::Partitioning::Default(); + } + if (old_partitioning) { + g_object_unref(old_partitioning); + } + } + break; + case PROP_PARTITION_BASE_DIR: + priv->options.partition_base_dir = g_value_get_string(value); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -226,6 +258,12 @@ gadataset_file_system_dataset_factory_get_property(GObject *object, case PROP_FILE_SYSTEM: g_value_set_object(value, priv->file_system); break; + case PROP_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; + case PROP_PARTITION_BASE_DIR: + g_value_set_string(value, priv->options.partition_base_dir.c_str()); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -279,6 +317,35 @@ gadataset_file_system_dataset_factory_class_init( GARROW_TYPE_FILE_SYSTEM, static_cast(G_PARAM_READABLE)); g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); + + /** + * GADatasetFileSystemDatasetFactory:partitioning: + * + * Partitioning used by #GADatasetFileSystemDataset. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "Partitioning used by GADatasetFileSystemDataset", + GADATASET_TYPE_PARTITIONING, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); + + /** + * GADatasetFileSystemDatasetFactory:partition-base-dir: + * + * Partition base directory used by #GADatasetFileSystemDataset. + * + * Since: 6.0.0 + */ + spec = g_param_spec_string("partition-base-dir", + "Partition base directory", + "Partition base directory " + "used by GADatasetFileSystemDataset", + NULL, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITION_BASE_DIR, spec); } /** @@ -454,6 +521,7 @@ gadataset_file_system_dataset_factory_finish( "dataset", &arrow_dataset, "file-system", priv->file_system, "format", priv->format, + "partitioning", priv->partitioning, NULL)); } diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp index 3bd62f99ef3..8613bedad42 100644 --- a/c_glib/arrow-dataset-glib/dataset.cpp +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -18,11 +18,14 @@ */ #include +#include #include #include #include -#include +#include +#include +#include G_BEGIN_DECLS @@ -36,13 +39,8 @@ G_BEGIN_DECLS * * #GADatasetFileSystemDataset is a class for file system dataset. * - * #GADatasetFileFormat is a base class for file formats. - * - * #GADatasetCSVFileFormat is a class for CSV file format. - * - * #GADatasetIPCFileFormat is a class for IPC file format. - * - * #GADatasetParquetFileFormat is a class for Apache Parquet file format. + * #GADatasetFileSystemDatasetWriteOptions is a class for options to + * write a dataset to file system dataset. 
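+ *
+ * A minimal usage sketch (`scanner`, `format_options` and `file_system`
+ * are assumed to be created elsewhere; error handling omitted):
+ *
+ * |[<!-- language="C" -->
+ * GADatasetFileSystemDatasetWriteOptions *options =
+ *   gadataset_file_system_dataset_write_options_new();
+ * g_object_set(options,
+ *              "file-write-options", format_options,
+ *              "file-system", file_system,
+ *              "base-dir", "/tmp/dataset",
+ *              "base-name-template", "part{i}.arrow",
+ *              NULL);
+ * gadataset_file_system_dataset_write_scanner(scanner, options, NULL);
+ * g_object_unref(options);
+ * ]|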
* * Since: 5.0.0 */ @@ -190,14 +188,326 @@ gadataset_dataset_get_type_name(GADatasetDataset *dataset) } +typedef struct GADatasetFileSystemDatasetWriteOptionsPrivate_ { + arrow::dataset::FileSystemDatasetWriteOptions options; + GADatasetFileWriteOptions *file_write_options; + GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; +} GADatasetFileSystemDatasetWriteOptionsPrivate; + +enum { + PROP_FILE_WRITE_OPTIONS = 1, + PROP_FILE_SYSTEM, + PROP_BASE_DIR, + PROP_PARTITIONING, + PROP_MAX_PARTITIONS, + PROP_BASE_NAME_TEMPLATE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetWriteOptions, + gadataset_file_system_dataset_write_options, + G_TYPE_OBJECT) + +#define GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_write_options_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS(obj))) + +static void +gadataset_file_system_dataset_write_options_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + priv->options.~FileSystemDatasetWriteOptions(); + G_OBJECT_CLASS(gadataset_file_system_dataset_write_options_parent_class)-> + finalize(object); +} + +static void +gadataset_file_system_dataset_write_options_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + if (priv->file_write_options) { + g_object_unref(priv->file_write_options); + priv->file_write_options = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->partitioning) { + g_object_unref(priv->partitioning); + priv->partitioning = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_write_options_parent_class)-> + dispose(object); +} + +static void +gadataset_file_system_dataset_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_WRITE_OPTIONS: + { + auto file_write_options = g_value_get_object(value); + if (file_write_options == priv->file_write_options) { + break; + } + auto old_file_write_options = priv->file_write_options; + if (file_write_options) { + g_object_ref(file_write_options); + priv->file_write_options = + GADATASET_FILE_WRITE_OPTIONS(file_write_options); + priv->options.file_write_options = + gadataset_file_write_options_get_raw(priv->file_write_options); + } else { + priv->options.file_write_options = nullptr; + } + if (old_file_write_options) { + g_object_unref(old_file_write_options); + } + } + break; + case PROP_FILE_SYSTEM: + { + auto file_system = g_value_get_object(value); + if (file_system == priv->file_system) { + break; + } + auto old_file_system = priv->file_system; + if (file_system) { + g_object_ref(file_system); + priv->file_system = GARROW_FILE_SYSTEM(file_system); + priv->options.filesystem = garrow_file_system_get_raw(priv->file_system); + } else { + priv->options.filesystem = nullptr; + } + if (old_file_system) { + g_object_unref(old_file_system); + } + } + break; + case PROP_BASE_DIR: + priv->options.base_dir = g_value_get_string(value); + break; + case PROP_PARTITIONING: + { + auto partitioning = g_value_get_object(value); + if (partitioning == priv->partitioning) { + break; + } + auto old_partitioning = priv->partitioning; + if (partitioning) { + g_object_ref(partitioning); + priv->partitioning = GADATASET_PARTITIONING(partitioning); + 
priv->options.partitioning = + gadataset_partitioning_get_raw(priv->partitioning); + } else { + priv->options.partitioning = arrow::dataset::Partitioning::Default(); + } + if (old_partitioning) { + g_object_unref(old_partitioning); + } + } + break; + case PROP_MAX_PARTITIONS: + priv->options.max_partitions = g_value_get_uint(value); + break; + case PROP_BASE_NAME_TEMPLATE: + priv->options.basename_template = g_value_get_string(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_write_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FILE_WRITE_OPTIONS: + g_value_set_object(value, priv->file_write_options); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + case PROP_BASE_DIR: + g_value_set_string(value, priv->options.base_dir.c_str()); + break; + case PROP_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; + case PROP_MAX_PARTITIONS: + g_value_set_uint(value, priv->options.max_partitions); + break; + case PROP_BASE_NAME_TEMPLATE: + g_value_set_string(value, priv->options.basename_template.c_str()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_write_options_init( + GADatasetFileSystemDatasetWriteOptions *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(object); + new(&(priv->options)) arrow::dataset::FileSystemDatasetWriteOptions; + priv->options.partitioning = arrow::dataset::Partitioning::Default(); +} + +static void +gadataset_file_system_dataset_write_options_class_init( + GADatasetFileSystemDatasetWriteOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = + gadataset_file_system_dataset_write_options_finalize; + gobject_class->dispose = + gadataset_file_system_dataset_write_options_dispose; + gobject_class->set_property = + gadataset_file_system_dataset_write_options_set_property; + gobject_class->get_property = + gadataset_file_system_dataset_write_options_get_property; + + arrow::dataset::FileSystemDatasetWriteOptions default_options; + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetWriteOptions:file_write_options: + * + * Options for individual fragment writing. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("file-write-options", + "File write options", + "Options for individual fragment writing", + GADATASET_TYPE_FILE_WRITE_OPTIONS, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_FILE_WRITE_OPTIONS, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:file_system: + * + * #GArrowFileSystem into which a dataset will be written. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "GArrowFileSystem into which " + "a dataset will be written", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:base_dir: + * + * Root directory into which the dataset will be written. 
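+ * Partition directories are created under this directory as needed.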
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_string("base-dir", + "Base directory", + "Root directory into which " + "the dataset will be written", + NULL, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_BASE_DIR, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:partitioning: + * + * #GADatasetPartitioning used to generate fragment paths. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "GADatasetPartitioning used to " + "generate fragment paths", + GADATASET_TYPE_PARTITIONING, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:max-partitions: + * + * Maximum number of partitions any batch may be written into. + * + * Since: 6.0.0 + */ + spec = g_param_spec_uint("max-partitions", + "Max partitions", + "Maximum number of partitions " + "any batch may be written into", + 0, + G_MAXINT, + default_options.max_partitions, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_MAX_PARTITIONS, spec); + + /** + * GADatasetFileSystemDatasetWriteOptions:base-name-template: + * + * Template string used to generate fragment base names. {i} will be + * replaced by an auto incremented integer. + * + * Since: 6.0.0 + */ + spec = g_param_spec_string("base-name-template", + "Base name template", + "Template string used to generate fragment " + "base names. {i} will be replaced by " + "an auto incremented integer", + NULL, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_BASE_NAME_TEMPLATE, spec); +} + +/** + * gadataset_file_system_dataset_write_options_new: + * + * Returns: The newly created #GADatasetFileSystemDatasetWriteOptions. 
+ * + * Since: 6.0.0 + */ +GADatasetFileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_new(void) +{ + return GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS, + NULL)); +} + + typedef struct GADatasetFileSystemDatasetPrivate_ { GADatasetFileFormat *format; GArrowFileSystem *file_system; + GADatasetPartitioning *partitioning; } GADatasetFileSystemDatasetPrivate; enum { - PROP_FORMAT = 1, - PROP_FILE_SYSTEM, + PROP_FILE_SYSTEM_DATASET_FORMAT = 1, + PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM, + PROP_FILE_SYSTEM_DATASET_PARTITIONING, }; G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, @@ -236,12 +546,15 @@ gadataset_file_system_dataset_set_property(GObject *object, auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); switch (prop_id) { - case PROP_FORMAT: + case PROP_FILE_SYSTEM_DATASET_FORMAT: priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); break; - case PROP_FILE_SYSTEM: + case PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM: priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); break; + case PROP_FILE_SYSTEM_DATASET_PARTITIONING: + priv->partitioning = GADATASET_PARTITIONING(g_value_dup_object(value)); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -257,12 +570,15 @@ gadataset_file_system_dataset_get_property(GObject *object, auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); switch (prop_id) { - case PROP_FORMAT: + case PROP_FILE_SYSTEM_DATASET_FORMAT: g_value_set_object(value, priv->format); break; - case PROP_FILE_SYSTEM: + case PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM: g_value_set_object(value, priv->file_system); break; + case PROP_FILE_SYSTEM_DATASET_PARTITIONING: + g_value_set_object(value, priv->partitioning); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); break; @@ -296,7 +612,9 @@ gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) GADATASET_TYPE_FILE_FORMAT, static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_FORMAT, + spec); /** * GADatasetFileSystemDataset:file-system: @@ -311,7 +629,52 @@ gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) GARROW_TYPE_FILE_SYSTEM, static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_FILE_SYSTEM, + spec); + + /** + * GADatasetFileSystemDataset:partitioning: + * + * Partitioning of the dataset. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("partitioning", + "Partitioning", + "Partitioning of the dataset", + GADATASET_TYPE_PARTITIONING, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_FILE_SYSTEM_DATASET_PARTITIONING, + spec); +} + +/** + * gadataset_file_system_dataset_write_scanner: + * @scanner: A #GADatasetScanner that produces data to be written. + * @options: A #GADatasetFileSystemDatasetWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 6.0.0 + */ +gboolean +gadataset_file_system_dataset_write_scanner( + GADatasetScanner *scanner, + GADatasetFileSystemDatasetWriteOptions *options, + GError **error) +{ + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_options = + gadataset_file_system_dataset_write_options_get_raw(options); + auto status = + arrow::dataset::FileSystemDataset::Write(*arrow_options, arrow_scanner); + return garrow::check(error, + status, + "[file-system-dataset][write-scanner]"); } @@ -363,3 +726,11 @@ gadataset_dataset_get_raw(GADatasetDataset *dataset) auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); return priv->dataset; } + +arrow::dataset::FileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_get_raw( + GADatasetFileSystemDatasetWriteOptions *options) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_WRITE_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h index 97cf35d74d7..86d077caa98 100644 --- a/c_glib/arrow-dataset-glib/dataset.h +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -24,6 +24,7 @@ G_BEGIN_DECLS typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; +typedef struct _GADatasetScanner GADatasetScanner; #define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, @@ -49,6 +50,23 @@ gchar * gadataset_dataset_get_type_name(GADatasetDataset *dataset); +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_WRITE_OPTIONS \ + (gadataset_file_system_dataset_write_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetWriteOptions, + gadataset_file_system_dataset_write_options, + GADATASET, + FILE_SYSTEM_DATASET_WRITE_OPTIONS, + GObject) +struct _GADatasetFileSystemDatasetWriteOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetFileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_new(void); + + #define GADATASET_TYPE_FILE_SYSTEM_DATASET \ (gadataset_file_system_dataset_get_type()) G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, @@ -61,5 +79,12 @@ struct _GADatasetFileSystemDatasetClass GADatasetDatasetClass parent_class; }; +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_system_dataset_write_scanner( + GADatasetScanner *scanner, + GADatasetFileSystemDatasetWriteOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset.hpp b/c_glib/arrow-dataset-glib/dataset.hpp index 94dddd2eb7a..1dab391e8af 100644 --- a/c_glib/arrow-dataset-glib/dataset.hpp +++ b/c_glib/arrow-dataset-glib/dataset.hpp @@ -23,6 +23,7 @@ #include + GADatasetDataset * gadataset_dataset_new_raw( std::shared_ptr *arrow_dataset); @@ -39,10 +40,7 @@ gadataset_dataset_new_raw_valist( std::shared_ptr gadataset_dataset_get_raw(GADatasetDataset *dataset); -GADatasetFileFormat * -gadataset_file_format_new_raw( - std::shared_ptr *arrow_format); -std::shared_ptr -gadataset_dataset_get_raw(GADatasetDataset *dataset); - +arrow::dataset::FileSystemDatasetWriteOptions * +gadataset_file_system_dataset_write_options_get_raw( + GADatasetFileSystemDatasetWriteOptions *options); diff --git a/c_glib/arrow-dataset-glib/enums.c.template b/c_glib/arrow-dataset-glib/enums.c.template new file mode 100644 index 00000000000..8921ab06252 --- /dev/null +++ b/c_glib/arrow-dataset-glib/enums.c.template @@ -0,0 +1,52 @@ +/*** BEGIN file-header ***/ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +/*** END file-header ***/ + +/*** BEGIN file-production ***/ + +/* enumerations from "@filename@" */ +/*** END file-production ***/ + +/*** BEGIN value-header ***/ +GType +@enum_name@_get_type(void) +{ + static GType etype = 0; + if (G_UNLIKELY(etype == 0)) { + static const G@Type@Value values[] = { +/*** END value-header ***/ + +/*** BEGIN value-production ***/ + {@VALUENAME@, "@VALUENAME@", "@valuenick@"}, +/*** END value-production ***/ + +/*** BEGIN value-tail ***/ + {0, NULL, NULL} + }; + etype = g_@type@_register_static(g_intern_static_string("@EnumName@"), values); + } + return etype; +} +/*** END value-tail ***/ + +/*** BEGIN file-tail ***/ +/*** END file-tail ***/ diff --git a/c_glib/arrow-dataset-glib/enums.h.template b/c_glib/arrow-dataset-glib/enums.h.template new file mode 100644 index 00000000000..d6a0a455f5a --- /dev/null +++ b/c_glib/arrow-dataset-glib/enums.h.template @@ -0,0 +1,41 @@ +/*** BEGIN file-header ***/ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS +/*** END file-header ***/ + +/*** BEGIN file-production ***/ + +/* enumerations from "@filename@" */ +/*** END file-production ***/ + +/*** BEGIN value-header ***/ +GType @enum_name@_get_type(void) G_GNUC_CONST; +#define @ENUMPREFIX@_TYPE_@ENUMSHORT@ (@enum_name@_get_type()) +/*** END value-header ***/ + +/*** BEGIN file-tail ***/ + +G_END_DECLS +/*** END file-tail ***/ diff --git a/c_glib/arrow-dataset-glib/file-format.cpp b/c_glib/arrow-dataset-glib/file-format.cpp index 43f6a198f23..c0c92d966f8 100644 --- a/c_glib/arrow-dataset-glib/file-format.cpp +++ b/c_glib/arrow-dataset-glib/file-format.cpp @@ -18,6 +18,11 @@ */ #include +#include +#include +#include +#include +#include #include @@ -29,6 +34,11 @@ G_BEGIN_DECLS * @title: File format classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * + * #GADatasetFileWriteOptions is a class for options to write a file + * of this format. + * + * #GADatasetFileWriter is a class for writing a file of this format. 
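One implementation pattern recurs in every wrapper class below and is easy to misread: the C++ std::shared_ptr lives inside the GObject private struct, which GLib hands out as zeroed raw memory, so init() has to construct it with placement new and finalize() has to invoke the destructor explicitly before chaining up. A minimal sketch of the pattern, with a hypothetical GFoo/arrow::Foo pair standing in for the real types:

// Hypothetical illustration; the real classes below follow this shape.
typedef struct GFooPrivate_ {
  std::shared_ptr<arrow::Foo> foo;  // zeroed raw memory until init() runs
} GFooPrivate;

static void
g_foo_init(GFoo *object)
{
  auto priv = G_FOO_GET_PRIVATE(object);
  new(&priv->foo) std::shared_ptr<arrow::Foo>;  // placement new: construct in place
}

static void
g_foo_finalize(GObject *object)
{
  auto priv = G_FOO_GET_PRIVATE(object);
  priv->foo.~shared_ptr();  // destroy in place; GLib frees the raw memory itself
  G_OBJECT_CLASS(g_foo_parent_class)->finalize(object);
}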
+ * * #GADatasetFileFormat is a base class for file format classes. * * #GADatasetCSVFileFormat is a class for CSV file format. @@ -40,12 +50,218 @@ G_BEGIN_DECLS * Since: 3.0.0 */ +typedef struct GADatasetFileWriteOptionsPrivate_ { + std::shared_ptr options; +} GADatasetFileWriteOptionsPrivate; + +enum { + PROP_OPTIONS = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions, + gadataset_file_write_options, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_write_options_get_instance_private( \ + GADATASET_FILE_WRITE_OPTIONS(obj))) + +static void +gadataset_file_write_options_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + priv->options.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object); +} + +static void +gadataset_file_write_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_OPTIONS: + priv->options = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_write_options_init(GADatasetFileWriteOptions *object) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object); + new(&priv->options) std::shared_ptr; +} + +static void +gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_write_options_finalize; + gobject_class->set_property = gadataset_file_write_options_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("options", + "Options", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); +} + + +typedef struct GADatasetFileWriterPrivate_ { + std::shared_ptr writer; +} GADatasetFileWriterPrivate; + +enum { + PROP_WRITER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter, + gadataset_file_writer, + G_TYPE_OBJECT) + +#define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_writer_get_instance_private( \ + GADATASET_FILE_WRITER(obj))) + +static void +gadataset_file_writer_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + priv->writer.~shared_ptr(); + G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object); +} + +static void +gadataset_file_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_WRITER: + priv->writer = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_writer_init(GADatasetFileWriter *object) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object); + new(&(priv->writer)) std::shared_ptr; +} + +static void +gadataset_file_writer_class_init(GADatasetFileWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_file_writer_finalize; + gobject_class->set_property = gadataset_file_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("writer", + "Writer", + "The raw " + "std::shared *", + 
static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_WRITER, spec); +} + +/** + * gadataset_file_writer_write_record_batch: + * @writer: A #GADatasetFileWriter. + * @record_batch: A #GArrowRecordBatch to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + auto status = arrow_writer->Write(arrow_record_batch); + return garrow::check(error, status, "[file-writer][write-record-batch]"); +} + +/** + * gadataset_file_writer_write_record_batch_reader: + * @writer: A #GADatasetFileWriter. + * @reader: A #GArrowRecordBatchReader to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, + GArrowRecordBatchReader *reader, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto status = arrow_writer->Write(arrow_reader.get()); + return garrow::check(error, + status, + "[file-writer][write-record-batch-reader]"); +} + +/** + * gadataset_file_writer_finish: + * @writer: A #GADatasetFileWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 6.0.0 + */ +gboolean +gadataset_file_writer_finish(GADatasetFileWriter *writer, + GError **error) +{ + const auto arrow_writer = gadataset_file_writer_get_raw(writer); + auto status = arrow_writer->Finish(); + return garrow::check(error, + status, + "[file-writer][finish]"); +} + + typedef struct GADatasetFileFormatPrivate_ { - std::shared_ptr file_format; + std::shared_ptr format; } GADatasetFileFormatPrivate; enum { - PROP_FILE_FORMAT = 1, + PROP_FORMAT = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat, @@ -61,9 +277,7 @@ static void gadataset_file_format_finalize(GObject *object) { auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); - - priv->file_format.~shared_ptr(); - + priv->format.~shared_ptr(); G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object); } @@ -76,8 +290,8 @@ gadataset_file_format_set_property(GObject *object, auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); switch (prop_id) { - case PROP_FILE_FORMAT: - priv->file_format = + case PROP_FORMAT: + priv->format = *static_cast *>( g_value_get_pointer(value)); break; @@ -91,7 +305,7 @@ static void gadataset_file_format_init(GADatasetFileFormat *object) { auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); - new(&priv->file_format) std::shared_ptr; + new(&priv->format) std::shared_ptr; } static void @@ -103,49 +317,106 @@ gadataset_file_format_class_init(GADatasetFileFormatClass *klass) gobject_class->set_property = gadataset_file_format_set_property; GParamSpec *spec; - spec = g_param_spec_pointer("file-format", - "FileFormat", + spec = g_param_spec_pointer("format", + "Format", "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FILE_FORMAT, spec); + 
g_object_class_install_property(gobject_class, PROP_FORMAT, spec); } /** * gadataset_file_format_get_type_name: - * @file_format: A #GADatasetFileFormat. + * @format: A #GADatasetFileFormat. * - * Returns: The type name of @file_format. + * Returns: The type name of @format. * * It should be freed with g_free() when no longer needed. * * Since: 3.0.0 */ gchar * -gadataset_file_format_get_type_name(GADatasetFileFormat *file_format) +gadataset_file_format_get_type_name(GADatasetFileFormat *format) { - const auto arrow_file_format = gadataset_file_format_get_raw(file_format); - const auto &type_name = arrow_file_format->type_name(); + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto &type_name = arrow_format->type_name(); return g_strndup(type_name.data(), type_name.size()); } +/** + * gadataset_file_format_get_default_write_options: + * @format: A #GADatasetFileFormat. + * + * Returns: (transfer full): The default #GADatasetFileWriteOptions of @format. + * + * Since: 6.0.0 + */ +GADatasetFileWriteOptions * +gadataset_file_format_get_default_write_options(GADatasetFileFormat *format) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_options = arrow_format->DefaultWriteOptions(); + return gadataset_file_write_options_new_raw(&arrow_options); +} + +/** + * gadataset_file_format_open_writer: + * @format: A #GADatasetFileFormat. + * @destination: A #GArrowOutputStream. + * @file_system: The #GArrowFileSystem of @destination. + * @path: The path of @destination. + * @schema: A #GArrowSchema that is used by written record batches. + * @options: A #GADatasetFileWriteOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The newly created #GADatasetFileWriter of @format + * on success, %NULL on error. + * + * Since: 6.0.0 + */ +GADatasetFileWriter * +gadataset_file_format_open_writer(GADatasetFileFormat *format, + GArrowOutputStream *destination, + GArrowFileSystem *file_system, + const gchar *path, + GArrowSchema *schema, + GADatasetFileWriteOptions *options, + GError **error) +{ + const auto arrow_format = gadataset_file_format_get_raw(format); + auto arrow_destination = garrow_output_stream_get_raw(destination); + auto arrow_file_system = garrow_file_system_get_raw(file_system); + auto arrow_schema = garrow_schema_get_raw(schema); + auto arrow_options = gadataset_file_write_options_get_raw(options); + auto arrow_writer_result = + arrow_format->MakeWriter(arrow_destination, + arrow_schema, + arrow_options, + {arrow_file_system, path}); + if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) { + auto arrow_writer = *arrow_writer_result; + return gadataset_file_writer_new_raw(&arrow_writer); + } else { + return NULL; + } +} + /** * gadataset_file_format_equal: - * @file_format: A #GADatasetFileFormat. - * @other_file_format: A #GADatasetFileFormat to be compared. + * @format: A #GADatasetFileFormat. + * @other_format: A #GADatasetFileFormat to be compared. * * Returns: %TRUE if they are the same content file format, %FALSE otherwise. 
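A sketch of the new writer API end to end, assuming format came from e.g. gadataset_ipc_file_format_new() and that output, file_system, schema, and record_batch already exist; the destination path is hypothetical and cleanup is omitted:

GError *error = NULL;
GADatasetFileWriteOptions *write_options =
  gadataset_file_format_get_default_write_options(format);
GADatasetFileWriter *writer =
  gadataset_file_format_open_writer(format,
                                    output,       /* GArrowOutputStream */
                                    file_system,  /* GArrowFileSystem */
                                    "/tmp/data.arrow",
                                    schema,
                                    write_options,
                                    &error);
if (writer &&
    gadataset_file_writer_write_record_batch(writer, record_batch, &error)) {
  gadataset_file_writer_finish(writer, &error);
}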
* * Since: 3.0.0 */ gboolean -gadataset_file_format_equal(GADatasetFileFormat *file_format, - GADatasetFileFormat *other_file_format) +gadataset_file_format_equal(GADatasetFileFormat *format, + GADatasetFileFormat *other_format) { - const auto arrow_file_format = gadataset_file_format_get_raw(file_format); - const auto arrow_other_file_format = - gadataset_file_format_get_raw(other_file_format); - return arrow_file_format->Equals(*arrow_other_file_format); + const auto arrow_format = gadataset_file_format_get_raw(format); + const auto arrow_other_format = gadataset_file_format_get_raw(other_format); + return arrow_format->Equals(*arrow_other_format); } @@ -173,10 +444,9 @@ gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass) GADatasetCSVFileFormat * gadataset_csv_file_format_new(void) { - std::shared_ptr arrow_file_format = + std::shared_ptr arrow_format = std::make_shared(); - return GADATASET_CSV_FILE_FORMAT( - gadataset_file_format_new_raw(&arrow_file_format)); + return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); } @@ -204,10 +474,9 @@ gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass) GADatasetIPCFileFormat * gadataset_ipc_file_format_new(void) { - std::shared_ptr arrow_file_format = + std::shared_ptr arrow_format = std::make_shared(); - return GADATASET_IPC_FILE_FORMAT( - gadataset_file_format_new_raw(&arrow_file_format)); + return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format)); } @@ -235,21 +504,56 @@ gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass) GADatasetParquetFileFormat * gadataset_parquet_file_format_new(void) { - std::shared_ptr arrow_file_format = + std::shared_ptr arrow_format = std::make_shared(); return GADATASET_PARQUET_FILE_FORMAT( - gadataset_file_format_new_raw(&arrow_file_format)); + gadataset_file_format_new_raw(&arrow_format)); } G_END_DECLS +GADatasetFileWriteOptions * +gadataset_file_write_options_new_raw( + std::shared_ptr *arrow_options) +{ + return GADATASET_FILE_WRITE_OPTIONS( + g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS, + "options", arrow_options, + NULL)); +} + +std::shared_ptr +gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options) +{ + auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options); + return priv->options; +} + + +GADatasetFileWriter * +gadataset_file_writer_new_raw( + std::shared_ptr *arrow_writer) +{ + return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER, + "writer", arrow_writer, + NULL)); +} + +std::shared_ptr +gadataset_file_writer_get_raw(GADatasetFileWriter *writer) +{ + auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer); + return priv->writer; +} + + GADatasetFileFormat * gadataset_file_format_new_raw( - std::shared_ptr *arrow_file_format) + std::shared_ptr *arrow_format) { GType type = GADATASET_TYPE_FILE_FORMAT; - const auto &type_name = (*arrow_file_format)->type_name(); + const auto &type_name = (*arrow_format)->type_name(); if (type_name == "csv") { type = GADATASET_TYPE_CSV_FILE_FORMAT; } else if (type_name == "ipc") { @@ -258,13 +562,13 @@ gadataset_file_format_new_raw( type = GADATASET_TYPE_PARQUET_FILE_FORMAT; } return GADATASET_FILE_FORMAT(g_object_new(type, - "file-format", arrow_file_format, + "format", arrow_format, NULL)); } std::shared_ptr -gadataset_file_format_get_raw(GADatasetFileFormat *file_format) +gadataset_file_format_get_raw(GADatasetFileFormat *format) { - auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(file_format); - return 
priv->file_format; + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format); + return priv->format; } diff --git a/c_glib/arrow-dataset-glib/file-format.h b/c_glib/arrow-dataset-glib/file-format.h index 7a6f46f56e9..16a8340747c 100644 --- a/c_glib/arrow-dataset-glib/file-format.h +++ b/c_glib/arrow-dataset-glib/file-format.h @@ -23,6 +23,47 @@ G_BEGIN_DECLS +#define GADATASET_TYPE_FILE_WRITE_OPTIONS \ + (gadataset_file_write_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileWriteOptions, + gadataset_file_write_options, + GADATASET, + FILE_WRITE_OPTIONS, + GObject) +struct _GADatasetFileWriteOptionsClass +{ + GObjectClass parent_class; +}; + + +#define GADATASET_TYPE_FILE_WRITER \ + (gadataset_file_writer_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileWriter, + gadataset_file_writer, + GADATASET, + FILE_WRITER, + GObject) +struct _GADatasetFileWriterClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error); +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer, + GArrowRecordBatchReader *reader, + GError **error); +GARROW_AVAILABLE_IN_6_0 +gboolean +gadataset_file_writer_finish(GADatasetFileWriter *writer, + GError **error); + + #define GADATASET_TYPE_FILE_FORMAT (gadataset_file_format_get_type()) G_DECLARE_DERIVABLE_TYPE(GADatasetFileFormat, gadataset_file_format, @@ -36,12 +77,24 @@ struct _GADatasetFileFormatClass GARROW_AVAILABLE_IN_3_0 gchar * -gadataset_file_format_get_type_name(GADatasetFileFormat *file_format); +gadataset_file_format_get_type_name(GADatasetFileFormat *format); +GARROW_AVAILABLE_IN_6_0 +GADatasetFileWriteOptions * +gadataset_file_format_get_default_write_options(GADatasetFileFormat *format); +GARROW_AVAILABLE_IN_6_0 +GADatasetFileWriter * +gadataset_file_format_open_writer(GADatasetFileFormat *format, + GArrowOutputStream *destination, + GArrowFileSystem *file_system, + const gchar *path, + GArrowSchema *schema, + GADatasetFileWriteOptions *options, + GError **error); GARROW_AVAILABLE_IN_3_0 gboolean -gadataset_file_format_equal(GADatasetFileFormat *file_format, - GADatasetFileFormat *other_file_format); +gadataset_file_format_equal(GADatasetFileFormat *format, + GADatasetFileFormat *other_format); #define GADATASET_TYPE_CSV_FILE_FORMAT (gadataset_csv_file_format_get_type()) diff --git a/c_glib/arrow-dataset-glib/file-format.hpp b/c_glib/arrow-dataset-glib/file-format.hpp index 5dfb20b3caa..636dc5c015b 100644 --- a/c_glib/arrow-dataset-glib/file-format.hpp +++ b/c_glib/arrow-dataset-glib/file-format.hpp @@ -23,8 +23,22 @@ #include +GADatasetFileWriteOptions * +gadataset_file_write_options_new_raw( + std::shared_ptr *arrow_options); +std::shared_ptr +gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options); + + +GADatasetFileWriter * +gadataset_file_writer_new_raw( + std::shared_ptr *arrow_writer); +std::shared_ptr +gadataset_file_writer_get_raw(GADatasetFileWriter *writer); + + GADatasetFileFormat * gadataset_file_format_new_raw( - std::shared_ptr *arrow_file_format); + std::shared_ptr *arrow_format); std::shared_ptr -gadataset_file_format_get_raw(GADatasetFileFormat *file_format); +gadataset_file_format_get_raw(GADatasetFileFormat *format); diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index b3f617330cf..0d9b8564ecb 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ 
b/c_glib/arrow-dataset-glib/meson.build @@ -22,6 +22,7 @@ sources = files( 'dataset.cpp', 'file-format.cpp', 'fragment.cpp', + 'partitioning.cpp', 'scanner.cpp', ) @@ -31,6 +32,7 @@ c_headers = files( 'dataset.h', 'file-format.h', 'fragment.h', + 'partitioning.h', 'scanner.h', ) @@ -40,9 +42,22 @@ cpp_headers = files( 'dataset.hpp', 'file-format.hpp', 'fragment.hpp', + 'partitioning.hpp', 'scanner.hpp', ) +enums = gnome.mkenums('enums', + sources: c_headers, + identifier_prefix: 'GADataset', + symbol_prefix: 'gadataset', + c_template: 'enums.c.template', + h_template: 'enums.h.template', + install_dir: join_paths(include_dir, meson.project_name()), + install_header: true) +enums_source = enums[0] +enums_header = enums[1] + + headers = c_headers + cpp_headers install_headers(headers, subdir: 'arrow-dataset-glib') @@ -51,7 +66,7 @@ dependencies = [ arrow_glib, ] libarrow_dataset_glib = library('arrow-dataset-glib', - sources: sources, + sources: sources + enums, install: true, dependencies: dependencies, include_directories: base_include_directories, @@ -59,7 +74,8 @@ libarrow_dataset_glib = library('arrow-dataset-glib', version: library_version) arrow_dataset_glib = declare_dependency(link_with: libarrow_dataset_glib, include_directories: base_include_directories, - dependencies: dependencies) + dependencies: dependencies, + sources: enums_header) pkgconfig.generate(libarrow_dataset_glib, filebase: 'arrow-dataset-glib', @@ -71,7 +87,7 @@ pkgconfig.generate(libarrow_dataset_glib, if have_gi gnome.generate_gir(libarrow_dataset_glib, dependencies: declare_dependency(sources: arrow_glib_gir), - sources: sources + c_headers, + sources: sources + c_headers + enums, namespace: 'ArrowDataset', nsversion: api_version, identifier_prefix: 'GADataset', diff --git a/c_glib/arrow-dataset-glib/partitioning.cpp b/c_glib/arrow-dataset-glib/partitioning.cpp new file mode 100644 index 00000000000..bce33671a35 --- /dev/null +++ b/c_glib/arrow-dataset-glib/partitioning.cpp @@ -0,0 +1,440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: partitioning + * @section_id: partitioning + * @title: Partitioning classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetPartitioningOptions is a class for partitioning options. + * + * #GADatasetPartitioning is a base class for partitioning classes + * such as #GADatasetDirectoryPartitioning. + * + * #GADatasetKeyValuePartitioning is a base class for key-value style + * partitioning classes such as #GADatasetDirectoryPartitioning. + * + * #GADatasetDirectoryPartitioning is a class for partitioning that + * uses directory structure. 
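For concreteness, a brief illustration of what "uses directory structure" means here; this is editorial, not patch content:

// With a partitioning schema {year: int16, month: int8},
// DirectoryPartitioning reads path segments positionally:
//   "/2009/11/part-0.arrow"  =>  year == 2009, month == 11
// Key=value segments ("/year=2009/month=11/...") belong to Hive-style
// partitioning, which these bindings do not cover in this patch.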
+ * + * Since: 6.0.0 + */ + +typedef struct GADatasetPartitioningOptionsPrivate_ { + gboolean infer_dictionary; + GArrowSchema *schema; + GADatasetSegmentEncoding segment_encoding; +} GADatasetPartitioningOptionsPrivate; + +enum { + PROP_INFER_DICTIONARY = 1, + PROP_SCHEMA, + PROP_SEGMENT_ENCODING, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioningOptions, + gadataset_partitioning_options, + G_TYPE_OBJECT) + +#define GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_partitioning_options_get_instance_private( \ + GADATASET_PARTITIONING_OPTIONS(obj))) + +static void +gadataset_partitioning_options_dispose(GObject *object) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + if (priv->schema) { + g_object_unref(priv->schema); + priv->schema = nullptr; + } + + G_OBJECT_CLASS(gadataset_partitioning_options_parent_class)->dispose(object); +} + +static void +gadataset_partitioning_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_INFER_DICTIONARY: + priv->infer_dictionary = g_value_get_boolean(value); + break; + case PROP_SCHEMA: + { + auto schema = g_value_get_object(value); + if (priv->schema == schema) { + break; + } + auto old_schema = priv->schema; + if (schema) { + g_object_ref(schema); + priv->schema = GARROW_SCHEMA(schema); + } else { + priv->schema = NULL; + } + if (old_schema) { + g_object_unref(old_schema); + } + } + break; + case PROP_SEGMENT_ENCODING: + priv->segment_encoding = + static_cast(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_INFER_DICTIONARY: + g_value_set_boolean(value, priv->infer_dictionary); + break; + case PROP_SCHEMA: + g_value_set_object(value, priv->schema); + break; + case PROP_SEGMENT_ENCODING: + g_value_set_enum(value, priv->segment_encoding); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_options_init(GADatasetPartitioningOptions *object) +{ +} + +static void +gadataset_partitioning_options_class_init( + GADatasetPartitioningOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gadataset_partitioning_options_dispose; + gobject_class->set_property = gadataset_partitioning_options_set_property; + gobject_class->get_property = gadataset_partitioning_options_get_property; + + arrow::dataset::PartitioningFactoryOptions default_options; + GParamSpec *spec; + /** + * GADatasetPartitioningOptions:infer-dictionary: + * + * When inferring a schema for partition fields, yield dictionary + * encoded types instead of plain. This can be more efficient when + * materializing virtual columns, and Expressions parsed by the + * finished Partitioning will include dictionaries of all unique + * inspected values for each field. 
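The schema setter above hand-rolls the usual swap: ref the incoming object, store it, then unref the old one, with an early return when both are the same. For reference only (not a change this patch makes), GLib's g_set_object(), available since GLib 2.44, encapsulates exactly those steps:

case PROP_SCHEMA:
  /* Refs the new schema, unrefs the old one, no-op if they are equal. */
  g_set_object(&priv->schema, GARROW_SCHEMA(g_value_get_object(value)));
  break;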
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("infer-dictionary", + "Infer dictionary", + "Whether encode partitioned field values as " + "dictionary", + default_options.infer_dictionary, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_INFER_DICTIONARY, spec); + + /** + * GADatasetPartitioningOptions:schema: + * + * Optionally, an expected schema can be provided, in which case + * inference will only check discovered fields against the schema + * and update internal state (such as dictionaries). + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("schema", + "Schema", + "Inference will only check discovered fields " + "against the schema and update internal state", + GARROW_TYPE_SCHEMA, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SCHEMA, spec); + + /** + * GADatasetPartitioningOptions:segment-encoding: + * + * After splitting a path into components, decode the path + * components before parsing according to this scheme. + * + * Since: 6.0.0 + */ + spec = g_param_spec_enum("segment-encoding", + "Segment encoding", + "After splitting a path into components, " + "decode the path components before " + "parsing according to this scheme", + GADATASET_TYPE_SEGMENT_ENCODING, + static_cast( + default_options.segment_encoding), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SEGMENT_ENCODING, spec); +} + +/** + * gadataset_partitioning_options_new: + * + * Returns: The newly created #GADatasetPartitioningOptions. + * + * Since: 6.0.0 + */ +GADatasetPartitioningOptions * +gadataset_partitioning_options_new(void) +{ + return GADATASET_PARTITIONING_OPTIONS( + g_object_new(GADATASET_TYPE_PARTITIONING_OPTIONS, + NULL)); +} + + +typedef struct GADatasetPartitioningPrivate_ { + std::shared_ptr partitioning; +} GADatasetPartitioningPrivate; + +enum { + PROP_PARTITIONING = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetPartitioning, + gadataset_partitioning, + G_TYPE_OBJECT) + +#define GADATASET_PARTITIONING_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_partitioning_get_instance_private( \ + GADATASET_PARTITIONING(obj))) + +static void +gadataset_partitioning_finalize(GObject *object) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + priv->partitioning.~shared_ptr(); + G_OBJECT_CLASS(gadataset_partitioning_parent_class)->finalize(object); +} + +static void +gadataset_partitioning_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_PARTITIONING: + priv->partitioning = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_partitioning_init(GADatasetPartitioning *object) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(object); + new(&priv->partitioning) std::shared_ptr; +} + +static void +gadataset_partitioning_class_init(GADatasetPartitioningClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gadataset_partitioning_finalize; + gobject_class->set_property = gadataset_partitioning_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("partitioning", + "Partitioning", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_PARTITIONING, spec); +} + +/** + * 
gadataset_partitioning_new: + * + * Returns: The newly created #GADatasetPartitioning that doesn't + * partition. + * + * Since: 6.0.0 + */ +GADatasetPartitioning * +gadataset_partitioning_new(void) +{ + auto arrow_partitioning = arrow::dataset::Partitioning::Default(); + return GADATASET_PARTITIONING( + g_object_new(GADATASET_TYPE_PARTITIONING, + "partitioning", &arrow_partitioning, + NULL)); +} + +/** + * gadataset_partitioning_get_type_name: + * @partitioning: A #GADatasetPartitioning. + * + * Returns: The type name of @partitioning. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 6.0.0 + */ +gchar * +gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) +{ + auto arrow_partitioning = gadataset_partitioning_get_raw(partitioning); + auto arrow_type_name = arrow_partitioning->type_name(); + return g_strndup(arrow_type_name.c_str(), + arrow_type_name.size()); +} + + +G_DEFINE_TYPE(GADatasetKeyValuePartitioning, + gadataset_key_value_partitioning, + GADATASET_TYPE_PARTITIONING) + +static void +gadataset_key_value_partitioning_init(GADatasetKeyValuePartitioning *object) +{ +} + +static void +gadataset_key_value_partitioning_class_init( + GADatasetKeyValuePartitioningClass *klass) +{ +} + + +G_DEFINE_TYPE(GADatasetDirectoryPartitioning, + gadataset_directory_partitioning, + GADATASET_TYPE_KEY_VALUE_PARTITIONING) + +static void +gadataset_directory_partitioning_init(GADatasetDirectoryPartitioning *object) +{ +} + +static void +gadataset_directory_partitioning_class_init( + GADatasetDirectoryPartitioningClass *klass) +{ +} + +/** + * gadataset_directory_partitioning_new: + * @schema: A #GArrowSchema that describes all partitioned segments. + * @dictionaries: (nullable) (element-type GArrowArray): A list of #GArrowArray + * for dictionary data types in @schema. + * @options: (nullable): A #GADatasetPartitioningOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: The newly created #GADatasetDirectoryPartitioning on success, + * %NULL on error. 
+ * + * Since: 6.0.0 + */ +GADatasetDirectoryPartitioning * +gadataset_directory_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetPartitioningOptions *options, + GError **error) +{ + auto arrow_schema = garrow_schema_get_raw(schema); + std::vector> arrow_dictionaries; + for (auto node = dictionaries; node; node = node->next) { + auto dictionary = GARROW_ARRAY(node->data); + if (dictionary) { + arrow_dictionaries.push_back(garrow_array_get_raw(dictionary)); + } else { + arrow_dictionaries.push_back(nullptr); + } + } + arrow::dataset::KeyValuePartitioningOptions arrow_options; + if (options) { + arrow_options = + gadataset_partitioning_options_get_raw_key_value_partitioning_options( + options); + } + auto arrow_partitioning = + std::make_shared( + arrow_schema, + arrow_dictionaries, + arrow_options); + return GADATASET_DIRECTORY_PARTITIONING( + g_object_new(GADATASET_TYPE_DIRECTORY_PARTITIONING, + "partitioning", &arrow_partitioning, + NULL)); +} + + +G_END_DECLS + +arrow::dataset::KeyValuePartitioningOptions +gadataset_partitioning_options_get_raw_key_value_partitioning_options( + GADatasetPartitioningOptions *options) +{ + auto priv = GADATASET_PARTITIONING_OPTIONS_GET_PRIVATE(options); + arrow::dataset::KeyValuePartitioningOptions arrow_options; + arrow_options.segment_encoding = + static_cast(priv->segment_encoding); + return arrow_options; +} + +std::shared_ptr +gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning) +{ + auto priv = GADATASET_PARTITIONING_GET_PRIVATE(partitioning); + return priv->partitioning; +} diff --git a/c_glib/arrow-dataset-glib/partitioning.h b/c_glib/arrow-dataset-glib/partitioning.h new file mode 100644 index 00000000000..d408d9bd502 --- /dev/null +++ b/c_glib/arrow-dataset-glib/partitioning.h @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +/** + * GADatasetSegmentEncoding + * @GADATASET_SEGMENT_ENCODING_NONE: No encoding. + * @GADATASET_SEGMENT_ENCODING_URI: Segment values are URL-encoded. + * + * They are corresponding to `arrow::dataset::SegmentEncoding` values. 
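A construction sketch for the class above, mirroring the Ruby tests added later in this patch. Note that, per the options helper at the end of partitioning.cpp, only segment-encoding reaches arrow::dataset::KeyValuePartitioningOptions here, while infer-dictionary and schema are aimed at factory-based discovery. Error checking and unrefs are omitted:

GError *error = NULL;
GList *fields = NULL;
fields = g_list_append(fields,
                       garrow_field_new("label",
                                        GARROW_DATA_TYPE(garrow_string_data_type_new())));
GArrowSchema *schema = garrow_schema_new(fields);
GADatasetPartitioningOptions *p_options = gadataset_partitioning_options_new();
g_object_set(p_options,
             "segment-encoding", GADATASET_SEGMENT_ENCODING_URI,
             NULL);
GADatasetDirectoryPartitioning *partitioning =
  gadataset_directory_partitioning_new(schema, NULL, p_options, &error);
/* type_name() reports "schema" for directory partitioning, as the
   Ruby test added below asserts. */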
+ * + * Since: 6.0.0 + */ +typedef enum { + GADATASET_SEGMENT_ENCODING_NONE, + GADATASET_SEGMENT_ENCODING_URI, +} GADatasetSegmentEncoding; + + +#define GADATASET_TYPE_PARTITIONING_OPTIONS \ + (gadataset_partitioning_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioningOptions, + gadataset_partitioning_options, + GADATASET, + PARTITIONING_OPTIONS, + GObject) +struct _GADatasetPartitioningOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetPartitioningOptions * +gadataset_partitioning_options_new(void); + + +#define GADATASET_TYPE_PARTITIONING (gadataset_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetPartitioning, + gadataset_partitioning, + GADATASET, + PARTITIONING, + GObject) +struct _GADatasetPartitioningClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetPartitioning * +gadataset_partitioning_new(void); +GARROW_AVAILABLE_IN_6_0 +gchar * +gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); + + +#define GADATASET_TYPE_KEY_VALUE_PARTITIONING \ + (gadataset_key_value_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetKeyValuePartitioning, + gadataset_key_value_partitioning, + GADATASET, + KEY_VALUE_PARTITIONING, + GADatasetPartitioning) +struct _GADatasetKeyValuePartitioningClass +{ + GADatasetPartitioningClass parent_class; +}; + + +#define GADATASET_TYPE_DIRECTORY_PARTITIONING \ + (gadataset_directory_partitioning_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDirectoryPartitioning, + gadataset_directory_partitioning, + GADATASET, + DIRECTORY_PARTITIONING, + GADatasetKeyValuePartitioning) +struct _GADatasetDirectoryPartitioningClass +{ + GADatasetKeyValuePartitioningClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GADatasetDirectoryPartitioning * +gadataset_directory_partitioning_new(GArrowSchema *schema, + GList *dictionaries, + GADatasetPartitioningOptions *options, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/partitioning.hpp b/c_glib/arrow-dataset-glib/partitioning.hpp new file mode 100644 index 00000000000..2481ecb3340 --- /dev/null +++ b/c_glib/arrow-dataset-glib/partitioning.hpp @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
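The GADatasetSegmentEncoding enum above is what the new glib-mkenums templates and the meson mkenums target turn into a registered GType (GADATASET_TYPE_SEGMENT_ENCODING, used by the segment-encoding property spec). The generated registration looks roughly like this; illustrative, not the literal generated file:

GType
gadataset_segment_encoding_get_type(void)
{
  static GType etype = 0;
  if (G_UNLIKELY(etype == 0)) {
    static const GEnumValue values[] = {
      {GADATASET_SEGMENT_ENCODING_NONE, "GADATASET_SEGMENT_ENCODING_NONE", "none"},
      {GADATASET_SEGMENT_ENCODING_URI, "GADATASET_SEGMENT_ENCODING_URI", "uri"},
      {0, NULL, NULL}
    };
    etype = g_enum_register_static(g_intern_static_string("GADatasetSegmentEncoding"),
                                   values);
  }
  return etype;
}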
+ */ + +#pragma once + +#include + +#include + +arrow::dataset::KeyValuePartitioningOptions +gadataset_partitioning_options_get_raw_key_value_partitioning_options( + GADatasetPartitioningOptions *options); + +std::shared_ptr +gadataset_partitioning_get_raw(GADatasetPartitioning *partitioning); diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index efa2a5c3287..ddd3fd88af7 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -225,6 +226,24 @@ gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) } } +/** + * gadataset_scanner_builder_new_record_batch_reader: + * @reader: A #GArrowRecordBatchReader that produces record batches. + * + * Returns: (nullable): A newly created #GADatasetScannerBuilder. + * + * Since: 6.0.0 + */ +GADatasetScannerBuilder * +gadataset_scanner_builder_new_record_batch_reader( + GArrowRecordBatchReader *reader) +{ + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto arrow_scanner_builder = + arrow::dataset::ScannerBuilder::FromRecordBatchReader(arrow_reader); + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); +} + /** * gadataset_scanner_builder_finish: * @builder: A #GADatasetScannerBuilder. diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index 446815d6db1..ba7f9c6b7c3 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -55,6 +55,10 @@ GARROW_AVAILABLE_IN_5_0 GADatasetScannerBuilder * gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error); +GARROW_AVAILABLE_IN_6_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new_record_batch_reader( + GArrowRecordBatchReader *reader); GARROW_AVAILABLE_IN_5_0 GADatasetScanner * gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 8783510728a..825d296dd26 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1271,7 +1271,7 @@ garrow_execute_plan_build_source_node(GArrowExecutePlan *plan, /** * garrow_execute_plan_build_aggregate_node: * @plan: A #GArrowExecutePlan. - * @input: A #GarrowExecuteNode. + * @input: A #GArrowExecuteNode. * @options: A #GArrowAggregateNodeOptions. * @error: (nullable): Return location for a #GError or %NULL. * @@ -1304,7 +1304,7 @@ garrow_execute_plan_build_aggregate_node(GArrowExecutePlan *plan, /** * garrow_execute_plan_build_sink_node: * @plan: A #GArrowExecutePlan. - * @input: A #GarrowExecuteNode. + * @input: A #GArrowExecuteNode. * @options: A #GArrowSinkNodeOptions. * @error: (nullable): Return location for a #GError or %NULL. * diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index 64f366a6282..57a13e65a1f 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -50,6 +50,8 @@ G_BEGIN_DECLS * * #GArrowBufferInputStream is a class to read data on buffer. * + * #GArrowFileInputStream is a class to read data in file. + * * #GArrowMemoryMappedInputStream is a class to read data in file by * mapping the file on memory. It supports zero copy. 
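Back in scanner.cpp, the reader-based builder above makes it possible to scan purely in-memory data without building a dataset first; a sketch, assuming table already exists:

GError *error = NULL;
GArrowTableBatchReader *reader = garrow_table_batch_reader_new(table);
GADatasetScannerBuilder *builder =
  gadataset_scanner_builder_new_record_batch_reader(
    GARROW_RECORD_BATCH_READER(reader));
GADatasetScanner *scanner = gadataset_scanner_builder_finish(builder, &error);
GArrowTable *round_tripped =
  scanner ? gadataset_scanner_to_table(scanner, &error) : NULL;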
 *
@@ -631,6 +633,86 @@ garrow_buffer_input_stream_get_buffer(GArrowBufferInputStream *input_stream)
 }
 
+
+G_DEFINE_TYPE(GArrowFileInputStream,
+              garrow_file_input_stream,
+              GARROW_TYPE_SEEKABLE_INPUT_STREAM);
+
+static void
+garrow_file_input_stream_init(GArrowFileInputStream *object)
+{
+}
+
+static void
+garrow_file_input_stream_class_init(GArrowFileInputStreamClass *klass)
+{
+}
+
+/**
+ * garrow_file_input_stream_new:
+ * @path: The path of the file to be opened.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowFileInputStream
+ *   or %NULL on error.
+ *
+ * Since: 6.0.0
+ */
+GArrowFileInputStream *
+garrow_file_input_stream_new(const gchar *path,
+                             GError **error)
+{
+  auto arrow_stream_result = arrow::io::ReadableFile::Open(path);
+  if (garrow::check(error, arrow_stream_result, "[file-input-stream][new]")) {
+    auto arrow_stream = *arrow_stream_result;
+    return garrow_file_input_stream_new_raw(&arrow_stream);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_file_input_stream_new_file_descriptor:
+ * @fd: The file descriptor of this input stream.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowFileInputStream
+ *   or %NULL on error.
+ *
+ * Since: 6.0.0
+ */
+GArrowFileInputStream *
+garrow_file_input_stream_new_file_descriptor(gint fd,
+                                             GError **error)
+{
+  auto arrow_stream_result = arrow::io::ReadableFile::Open(fd);
+  if (garrow::check(error,
+                    arrow_stream_result,
+                    "[file-input-stream][new-file-descriptor]")) {
+    auto arrow_stream = *arrow_stream_result;
+    return garrow_file_input_stream_new_raw(&arrow_stream);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_file_input_stream_get_file_descriptor:
+ * @stream: A #GArrowFileInputStream.
+ *
+ * Returns: The file descriptor of @stream.
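Usage of the new stream class is straightforward; a sketch with a hypothetical path:

GError *error = NULL;
GArrowFileInputStream *input =
  garrow_file_input_stream_new("/tmp/data.arrow", &error);
if (input) {
  gint fd = garrow_file_input_stream_get_file_descriptor(input);
  g_print("reading through file descriptor %d\n", fd);
  g_object_unref(input);
}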
+ * + * Since: 6.0.0 + */ +gint +garrow_file_input_stream_get_file_descriptor(GArrowFileInputStream *stream) +{ + auto arrow_stream = + std::static_pointer_cast( + garrow_input_stream_get_raw(GARROW_INPUT_STREAM(stream))); + return arrow_stream->file_descriptor(); +} + + G_DEFINE_TYPE(GArrowMemoryMappedInputStream, garrow_memory_mapped_input_stream, GARROW_TYPE_SEEKABLE_INPUT_STREAM); @@ -657,18 +739,14 @@ GArrowMemoryMappedInputStream * garrow_memory_mapped_input_stream_new(const gchar *path, GError **error) { - auto arrow_memory_mapped_file_result = - arrow::io::MemoryMappedFile::Open(std::string(path), - arrow::io::FileMode::READ); - if (arrow_memory_mapped_file_result.ok()) { - auto arrow_memory_mapped_file = - arrow_memory_mapped_file_result.ValueOrDie(); - return garrow_memory_mapped_input_stream_new_raw(&(arrow_memory_mapped_file)); + auto arrow_stream_result = + arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ); + if (garrow::check(error, + arrow_stream_result, + "[memory-mapped-input-stream][new]")) { + auto arrow_stream = *arrow_stream_result; + return garrow_memory_mapped_input_stream_new_raw(&arrow_stream); } else { - std::string context("[memory-mapped-input-stream][open]: <"); - context += path; - context += ">"; - garrow::check(error, arrow_memory_mapped_file_result, context.c_str()); return NULL; } } @@ -1203,16 +1281,28 @@ garrow_buffer_input_stream_get_raw(GArrowBufferInputStream *buffer_input_stream) return arrow_buffer_reader; } + +GArrowFileInputStream * +garrow_file_input_stream_new_raw( + std::shared_ptr *arrow_stream) +{ + return GARROW_FILE_INPUT_STREAM(g_object_new(GARROW_TYPE_FILE_INPUT_STREAM, + "input-stream", arrow_stream, + NULL)); +} + + GArrowMemoryMappedInputStream * -garrow_memory_mapped_input_stream_new_raw(std::shared_ptr *arrow_memory_mapped_file) +garrow_memory_mapped_input_stream_new_raw( + std::shared_ptr *arrow_stream) { - auto object = g_object_new(GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM, - "input-stream", arrow_memory_mapped_file, - NULL); - auto memory_mapped_input_stream = GARROW_MEMORY_MAPPED_INPUT_STREAM(object); - return memory_mapped_input_stream; + return GARROW_MEMORY_MAPPED_INPUT_STREAM( + g_object_new(GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM, + "input-stream", arrow_stream, + NULL)); } + GArrowCompressedInputStream * garrow_compressed_input_stream_new_raw(std::shared_ptr *arrow_raw, GArrowCodec *codec, diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 4b4c51eb3e7..5ead66b8389 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -104,54 +104,42 @@ GArrowBufferInputStream *garrow_buffer_input_stream_new(GArrowBuffer *buffer); GArrowBuffer *garrow_buffer_input_stream_get_buffer(GArrowBufferInputStream *input_stream); -#define GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM \ - (garrow_memory_mapped_input_stream_get_type()) -#define GARROW_MEMORY_MAPPED_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_CAST((obj), \ - GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM, \ - GArrowMemoryMappedInputStream)) -#define GARROW_MEMORY_MAPPED_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_CAST((klass), \ - GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM, \ - GArrowMemoryMappedInputStreamClass)) -#define GARROW_IS_MEMORY_MAPPED_INPUT_STREAM(obj) \ - (G_TYPE_CHECK_INSTANCE_TYPE((obj), \ - GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM)) -#define GARROW_IS_MEMORY_MAPPED_INPUT_STREAM_CLASS(klass) \ - (G_TYPE_CHECK_CLASS_TYPE((klass), \ - GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM)) -#define 
GARROW_MEMORY_MAPPED_INPUT_STREAM_GET_CLASS(obj) \ - (G_TYPE_INSTANCE_GET_CLASS((obj), \ - GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM, \ - GArrowMemoryMappedInputStreamClass)) - -typedef struct _GArrowMemoryMappedInputStream GArrowMemoryMappedInputStream; -#ifndef __GTK_DOC_IGNORE__ -typedef struct _GArrowMemoryMappedInputStreamClass GArrowMemoryMappedInputStreamClass; -#endif - -/** - * GArrowMemoryMappedInputStream: - * - * It wraps `arrow::io::MemoryMappedFile`. - */ -struct _GArrowMemoryMappedInputStream +#define GARROW_TYPE_FILE_INPUT_STREAM (garrow_file_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFileInputStream, + garrow_file_input_stream, + GARROW, + FILE_INPUT_STREAM, + GArrowSeekableInputStream) +struct _GArrowFileInputStreamClass { - /*< private >*/ - GArrowSeekableInputStream parent_instance; + GArrowSeekableInputStreamClass parent_class; }; -#ifndef __GTK_DOC_IGNORE__ +GArrowFileInputStream * +garrow_file_input_stream_new(const gchar *path, + GError **error); +GArrowFileInputStream * +garrow_file_input_stream_new_file_descriptor(gint fd, + GError **error); +gint +garrow_file_input_stream_get_file_descriptor(GArrowFileInputStream *stream); + + +#define GARROW_TYPE_MEMORY_MAPPED_INPUT_STREAM \ + (garrow_memory_mapped_input_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowMemoryMappedInputStream, + garrow_memory_mapped_input_stream, + GARROW, + MEMORY_MAPPED_INPUT_STREAM, + GArrowSeekableInputStream) struct _GArrowMemoryMappedInputStreamClass { GArrowSeekableInputStreamClass parent_class; }; -#endif -GType garrow_memory_mapped_input_stream_get_type(void) G_GNUC_CONST; - -GArrowMemoryMappedInputStream *garrow_memory_mapped_input_stream_new(const gchar *path, - GError **error); +GArrowMemoryMappedInputStream * +garrow_memory_mapped_input_stream_new(const gchar *path, + GError **error); #define GARROW_TYPE_GIO_INPUT_STREAM \ diff --git a/c_glib/arrow-glib/input-stream.hpp b/c_glib/arrow-glib/input-stream.hpp index 88fbb8f64c1..2a0a3d3ddcc 100644 --- a/c_glib/arrow-glib/input-stream.hpp +++ b/c_glib/arrow-glib/input-stream.hpp @@ -40,7 +40,16 @@ garrow_buffer_input_stream_new_raw(std::shared_ptr *arr GArrowBuffer *buffer); std::shared_ptr garrow_buffer_input_stream_get_raw(GArrowBufferInputStream *input_stream); -GArrowMemoryMappedInputStream *garrow_memory_mapped_input_stream_new_raw(std::shared_ptr *arrow_memory_mapped_file); + +GArrowFileInputStream * +garrow_file_input_stream_new_raw( + std::shared_ptr *arrow_stream); + + +GArrowMemoryMappedInputStream * +garrow_memory_mapped_input_stream_new_raw( + std::shared_ptr *arrow_stream); + GArrowCompressedInputStream * garrow_compressed_input_stream_new_raw(std::shared_ptr *arrow_raw, diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 3e8da5bd9d1..b13195b0703 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -39,6 +39,8 @@ Data + Partitioning + Dataset Dataset factory @@ -66,8 +68,12 @@ Index of deprecated API + + Index of new symbols in 6.0.0 + + - Index of new symbols in 4.0.0 + Index of new symbols in 5.0.0 diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb index 6d6ec3b18c6..0e856b678f8 100644 --- a/c_glib/test/dataset/test-file-system-dataset.rb +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -16,19 +16,73 @@ # under the License. 
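The input-stream.h hunk above is a mechanical modernization: each hand-written block of GType macros plus instance and class structs collapses into a single G_DECLARE_DERIVABLE_TYPE() call and a class struct. The shape of the pattern, shown with a hypothetical type name:

#define GARROW_TYPE_EXAMPLE_STREAM (garrow_example_stream_get_type())
G_DECLARE_DERIVABLE_TYPE(GArrowExampleStream,       /* instance type */
                         garrow_example_stream,     /* function prefix */
                         GARROW,                    /* namespace */
                         EXAMPLE_STREAM,            /* short name */
                         GArrowSeekableInputStream) /* parent type */
struct _GArrowExampleStreamClass
{
  GArrowSeekableInputStreamClass parent_class;
};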
class TestDatasetFileSystemDataset < Test::Unit::TestCase + include Helper::Buildable + include Helper::Readable + def setup omit("Arrow Dataset is required") unless defined?(ArrowDataset) Dir.mktmpdir do |tmpdir| @dir = tmpdir - format = ArrowDataset::IPCFileFormat.new - factory = ArrowDataset::FileSystemDatasetFactory.new(format) - factory.file_system = Arrow::LocalFileSystem.new - @dataset = factory.finish + @format = ArrowDataset::IPCFileFormat.new + @factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + @file_system = Arrow::LocalFileSystem.new + @factory.file_system = @file_system + partitioning_schema = build_schema(label: Arrow::StringDataType.new) + @partitioning = + ArrowDataset::DirectoryPartitioning.new(partitioning_schema) + @factory.partitioning = @partitioning yield end end def test_type_name - assert_equal("filesystem", @dataset.type_name) + dataset = @factory.finish + assert_equal("filesystem", dataset.type_name) + end + + def test_format + dataset = @factory.finish + assert_equal(@format, dataset.format) + end + + def test_file_system + dataset = @factory.finish + assert_equal(@file_system, dataset.file_system) + end + + def test_partitioning + dataset = @factory.finish + assert_equal(@partitioning, dataset.partitioning) + end + + def test_read_write + table = build_table(label: build_string_array(["a", "a", "b", "c"]), + count: build_int32_array([1, 10, 2, 3])) + table_reader = Arrow::TableBatchReader.new(table) + scanner_builder = ArrowDataset::ScannerBuilder.new(table_reader) + scanner = scanner_builder.finish + options = ArrowDataset::FileSystemDatasetWriteOptions.new + options.file_write_options = @format.default_write_options + options.file_system = Arrow::LocalFileSystem.new + options.base_dir = @dir + options.base_name_template = "{i}.arrow" + options.partitioning = @partitioning + ArrowDataset::FileSystemDataset.write_scanner(scanner, options) + Find.find(@dir) do |path| + @factory.add_path(path) if File.file?(path) + end + @factory.partition_base_dir = @dir + dataset = @factory.finish + assert_equal(build_table(count: [ + build_int32_array([1, 10]), + build_int32_array([2]), + build_int32_array([3]), + ], + label: [ + build_string_array(["a", "a"]), + build_string_array(["b"]), + build_string_array(["c"]), + ]), + dataset.to_table) end end diff --git a/c_glib/test/dataset/test-file-writer.rb b/c_glib/test/dataset/test-file-writer.rb new file mode 100644 index 00000000000..5b25d6044d6 --- /dev/null +++ b/c_glib/test/dataset/test-file-writer.rb @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+class TestDatasetFileWriter < Test::Unit::TestCase
+  include Helper::Buildable
+  include Helper::Readable
+
+  def setup
+    omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+    Dir.mktmpdir do |tmpdir|
+      @dir = tmpdir
+      @format = ArrowDataset::IPCFileFormat.new
+      @file_system = Arrow::LocalFileSystem.new
+      @path = File.join(@dir, "data.arrow")
+      @output = @file_system.open_output_stream(@path)
+      @schema = build_schema(visible: Arrow::BooleanDataType.new,
+                             point: Arrow::UInt8DataType.new)
+      @writer = @format.open_writer(@output,
+                                    @file_system,
+                                    @path,
+                                    @schema,
+                                    @format.default_write_options)
+      yield
+    end
+  end
+
+  def test_write_record_batch
+    record_batch = build_record_batch(
+      visible: build_boolean_array([true, false, true]),
+      point: build_uint8_array([1, 2, 3]))
+    @writer.write_record_batch(record_batch)
+    @writer.finish
+    @output.close
+    read_table(@path) do |written_table|
+      assert_equal(Arrow::Table.new(record_batch.schema,
+                                    [record_batch]),
+                   written_table)
+    end
+  end
+
+  def test_write_record_batch_reader
+    table = build_table(visible: build_boolean_array([true, false, true]),
+                        point: build_uint8_array([1, 2, 3]))
+    @writer.write_record_batch_reader(Arrow::TableBatchReader.new(table))
+    @writer.finish
+    @output.close
+    read_table(@path) do |written_table|
+      assert_equal(table, written_table)
+    end
+  end
+end
diff --git a/c_glib/test/dataset/test-partitioning-options.rb b/c_glib/test/dataset/test-partitioning-options.rb
new file mode 100644
index 00000000000..9ff585aa7cf
--- /dev/null
+++ b/c_glib/test/dataset/test-partitioning-options.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetPartitioningOptions < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+    @options = ArrowDataset::PartitioningOptions.new
+  end
+
+  def test_infer_dictionary
+    assert_false(@options.infer_dictionary?)
+    @options.infer_dictionary = true
+    assert_true(@options.infer_dictionary?)
+  end
+
+  def test_schema
+    assert_nil(@options.schema)
+    schema = build_schema(year: Arrow::UInt16DataType.new)
+    @options.schema = schema
+    assert_equal(schema, @options.schema)
+  end
+
+  def test_segment_encoding
+    assert_equal(ArrowDataset::SegmentEncoding::NONE,
+                 @options.segment_encoding)
+    @options.segment_encoding = :uri
+    assert_equal(ArrowDataset::SegmentEncoding::URI,
+                 @options.segment_encoding)
+  end
+end
diff --git a/c_glib/test/dataset/test-partitioning.rb b/c_glib/test/dataset/test-partitioning.rb
new file mode 100644
index 00000000000..d98e51f3c59
--- /dev/null
+++ b/c_glib/test/dataset/test-partitioning.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestDatasetPartitioning < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    omit("Arrow Dataset is required") unless defined?(ArrowDataset)
+  end
+
+  def test_default
+    assert_equal("default", ArrowDataset::Partitioning.new.type_name)
+  end
+
+  def test_directory
+    schema = build_schema(year: Arrow::UInt16DataType.new)
+    partitioning = ArrowDataset::DirectoryPartitioning.new(schema)
+    assert_equal("schema", partitioning.type_name)
+  end
+end
diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb
index f7702d4905f..ed6a706c6f2 100644
--- a/c_glib/test/dataset/test-scanner.rb
+++ b/c_glib/test/dataset/test-scanner.rb
@@ -45,4 +45,11 @@ def setup
   def test_to_table
     assert_equal(@table, @scanner.to_table)
   end
+
+  def test_new_record_batch_reader
+    reader = Arrow::TableBatchReader.new(@table)
+    builder = ArrowDataset::ScannerBuilder.new(reader)
+    scanner = builder.finish
+    assert_equal(@table, scanner.to_table)
+  end
 end
diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb
index 356fa651c6a..3a1240cfa1f 100644
--- a/c_glib/test/helper/buildable.rb
+++ b/c_glib/test/helper/buildable.rb
@@ -17,6 +17,13 @@
 
 module Helper
   module Buildable
+    def build_schema(fields)
+      fields = fields.collect do |name, data_type|
+        Arrow::Field.new(name, data_type)
+      end
+      Arrow::Schema.new(fields)
+    end
+
     def build_null_array(values)
       build_array(Arrow::NullArrayBuilder.new, values)
     end
diff --git a/c_glib/test/helper/readable.rb b/c_glib/test/helper/readable.rb
new file mode 100644
index 00000000000..81bf0795c6b
--- /dev/null
+++ b/c_glib/test/helper/readable.rb
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
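+
+# read_table below dispatches on its input: an Arrow::Buffer is wrapped
+# in a BufferInputStream, anything else is treated as a path and opened
+# with a FileInputStream; type: :file selects the IPC file reader, any
+# other type the streaming reader. A hedged usage sketch (the path is
+# illustrative, not used by the tests):
+#
+#   read_table("/tmp/data.arrow") do |table|
+#     p table.n_rows
+#   end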
+
+module Helper
+  module Readable
+    def read_table(input, type: :file)
+      if input.is_a?(Arrow::Buffer)
+        input_stream = Arrow::BufferInputStream.new(input)
+      else
+        input_stream = Arrow::FileInputStream.new(input)
+      end
+      begin
+        if type == :file
+          reader = Arrow::RecordBatchFileReader.new(input_stream)
+          record_batches = []
+          reader.n_record_batches.times do |i|
+            record_batches << reader.read_record_batch(i)
+          end
+          yield(Arrow::Table.new(record_batches[0].schema, record_batches))
+        else
+          reader = Arrow::RecordBatchStreamReader.new(input_stream)
+          begin
+            yield(reader.read_all)
+          ensure
+            reader.close
+          end
+        end
+      ensure
+        input_stream.close
+      end
+    end
+  end
+end
diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb
index abae4e722c5..621c78c3986 100755
--- a/c_glib/test/run-test.rb
+++ b/c_glib/test/run-test.rb
@@ -84,6 +84,7 @@ def should_unlock_gvl?(info, klass)
 end
 
 require "fileutils"
+require "find"
 require "rbconfig"
 require "stringio"
 require "tempfile"
@@ -97,6 +98,7 @@ def should_unlock_gvl?(info, klass)
 end
 require_relative "helper/omittable"
 require_relative "helper/plasma-store"
+require_relative "helper/readable"
 require_relative "helper/writable"
 
 exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/c_glib/test/test-file-input-stream.rb b/c_glib/test/test-file-input-stream.rb
new file mode 100644
index 00000000000..2b43f97f5dd
--- /dev/null
+++ b/c_glib/test/test-file-input-stream.rb
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestFileInputStream < Test::Unit::TestCase
+  def setup
+    @data = "Hello World"
+    @tempfile = Tempfile.open("arrow-file-input-stream")
+    @tempfile.write(@data)
+    @tempfile.close
+  end
+
+  def test_new
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    begin
+      buffer = input.read(5)
+      assert_equal("Hello", buffer.data.to_s)
+    ensure
+      input.close
+    end
+  end
+
+  def test_close
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    assert do
+      not input.closed?
+    end
+    input.close
+    assert do
+      input.closed?
+    end
+  end
+
+  def test_size
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    begin
+      assert_equal(@data.bytesize, input.size)
+    ensure
+      input.close
+    end
+  end
+
+  def test_read
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    begin
+      buffer = input.read(5)
+      assert_equal("Hello", buffer.data.to_s)
+    ensure
+      input.close
+    end
+  end
+
+  def test_read_at
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    begin
+      buffer = input.read_at(6, 5)
+      assert_equal("World", buffer.data.to_s)
+    ensure
+      input.close
+    end
+  end
+
+  def test_mode
+    input = Arrow::FileInputStream.new(@tempfile.path)
+    begin
+      assert_equal(Arrow::FileMode::READ, input.mode)
+    ensure
+      input.close
+    end
+  end
+
+  def test_file_descriptor
+    @tempfile.open
+    begin
+      fd = @tempfile.fileno
+      input = Arrow::FileInputStream.new(fd)
+      begin
+        assert_equal(fd, input.file_descriptor)
+      ensure
+        input.close
+      end
+    ensure
+      begin
+        @tempfile.close
+      rescue
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
new file mode 100644
index 00000000000..f3896f04b92
--- /dev/null
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/arrow-table-savable.rb
@@ -0,0 +1,68 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module ArrowDataset
+  module ArrowTableSavable
+    private
+    def save_to_uri
+      format = FileFormat.resolve(@options[:format])
+      options = FileSystemDatasetWriteOptions.new
+      options.file_write_options = format.default_write_options
+      path = @output.path
+      if @output.scheme.nil?
+        options.file_system = Arrow::LocalFileSystem.new
+      else
+        options.file_system = Arrow::FileSystem.create(@output.to_s)
+        # /C:/... -> C:/...
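+        # A URI such as file:///C:/a/b parses to the path "/C:/a/b"; when
+        # the expanded current directory does not start with "/" (the
+        # Windows case), the leading slash is dropped below so the file
+        # system sees "C:/a/b".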
+        unless File.expand_path(".").start_with?("/")
+          path = path.gsub(/\A\//, "")
+        end
+      end
+      partitioning = @options[:partitioning]
+      if partitioning
+        # TODO
+        options.base_dir = File.dirname(path)
+        options.base_name_template = File.basename(path)
+        options.partitioning = Partitioning.resolve(@options[:partitioning])
+        scanner_builder = ScannerBuilder.new(@table)
+        scanner = scanner_builder.finish
+        FileSystemDataset.write_scanner(scanner, options)
+      else
+        dir = File.dirname(path)
+        unless File.exist?(dir)
+          options.file_system.create_dir(dir, true)
+        end
+        options.file_system.open_output_stream(path) do |output_stream|
+          format.open_writer(output_stream,
+                             options.file_system,
+                             path,
+                             @table.schema,
+                             format.default_write_options) do |writer|
+            reader = Arrow::TableBatchReader.new(@table)
+            writer.write_record_batch_reader(reader)
+          end
+        end
+      end
+    end
+  end
+end
+
+module Arrow
+  class TableSaver
+    include ArrowDataset::ArrowTableSavable
+  end
+end
diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
index 266ea49e3f5..83e61c4b24a 100644
--- a/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/file-format.rb
@@ -41,5 +41,19 @@ def resolve(format)
         end
       end
     end
+
+    alias_method :open_writer_raw, :open_writer
+    def open_writer(destination, file_system, path, schema, options)
+      writer = open_writer_raw(destination, file_system, path, schema, options)
+      if block_given?
+        begin
+          yield(writer)
+        ensure
+          writer.finish
+        end
+      else
+        writer
+      end
+    end
   end
 end
diff --git a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
index 599c055e84c..b1be000f7c3 100644
--- a/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
+++ b/ruby/red-arrow-dataset/lib/arrow-dataset/loader.rb
@@ -30,6 +30,7 @@ def post_load(repository, namespace)
 
     def require_libraries
       require "arrow-dataset/arrow-table-loadable"
+      require "arrow-dataset/arrow-table-savable"
       require "arrow-dataset/dataset"
       require "arrow-dataset/file-format"
       require "arrow-dataset/file-system-dataset-factory"
diff --git a/ruby/red-arrow-dataset/test/test-arrow-table.rb b/ruby/red-arrow-dataset/test/test-arrow-table.rb
index 6ae9a905d2b..a9ab40337b6 100644
--- a/ruby/red-arrow-dataset/test/test-arrow-table.rb
+++ b/ruby/red-arrow-dataset/test/test-arrow-table.rb
@@ -19,10 +19,9 @@ class TestArrowTable < Test::Unit::TestCase
   def setup
     Dir.mktmpdir do |tmpdir|
       @dir = tmpdir
-      @path = File.join(@dir, "table.arrow")
+      @path = File.join(@dir, "data", "table.arrow")
       @table = Arrow::Table.new(visible: [true, false, true],
                                 point: [1, 2, 3])
-      @table.save(@path)
       yield
     end
   end
@@ -40,12 +39,14 @@ def build_file_uri(path)
   def test_no_scheme
     Dir.chdir(@dir) do
       uri = URI(File.basename(@path))
+      @table.save(uri)
       assert_equal(@table, Arrow::Table.load(uri))
     end
   end
 
   def test_file
     uri = build_file_uri(@path)
+    @table.save(uri)
     assert_equal(@table, Arrow::Table.load(uri))
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/file-system.rb b/ruby/red-arrow/lib/arrow/file-system.rb
new file mode 100644
index 00000000000..7d105b42a3e
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/file-system.rb
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  class FileSystem
+    alias_method :open_output_stream_raw, :open_output_stream
+    def open_output_stream(path)
+      stream = open_output_stream_raw(path)
+      if block_given?
+        begin
+          yield(stream)
+        ensure
+          stream.close
+        end
+      else
+        stream
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb
index 5ede8eeea1d..9ca7e2619ef 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -72,6 +72,7 @@ def require_libraries
       require "arrow/equal-options"
       require "arrow/field"
       require "arrow/file-output-stream"
+      require "arrow/file-system"
       require "arrow/fixed-size-binary-array"
       require "arrow/fixed-size-binary-array-builder"
       require "arrow/group"
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb
index bc2296a0a07..207a10a8217 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -32,6 +32,29 @@ def initialize(table, output, options={})
     end
 
     def save
+      if @output.is_a?(URI)
+        custom_save_method = "save_to_uri"
+      else
+        custom_save_method = "save_to_file"
+      end
+      unless respond_to?(custom_save_method, true)
+        available_schemes = []
+        (methods(true) | private_methods(true)).each do |name|
+          match_data = /\Asave_to_/.match(name.to_s)
+          if match_data
+            available_schemes << match_data.post_match
+          end
+        end
+        message = "Arrow::Table save output must be one of ["
+        message << available_schemes.join(", ")
+        message << "]: #{@output.scheme.inspect}"
+        raise ArgumentError, message
+      end
+      __send__(custom_save_method)
+    end
+
+    private
+    def save_to_file
       format = @options[:format]
       custom_save_method = "save_as_#{format}"
       unless respond_to?(custom_save_method, true)
@@ -57,21 +80,24 @@ def save
       end
     end
 
-    private
     def fill_options
       if @options[:format] and @options.key?(:compression)
         return
       end
 
-      if @output.is_a?(Buffer)
+      case @output
+      when Buffer
         info = {}
+      when URI
+        extension = PathExtension.new(@output.path)
+        info = extension.extract
       else
         extension = PathExtension.new(@output)
         info = extension.extract
       end
       format = info[:format]
       @options = @options.dup
-      if format and respond_to?("save_as_#{format}", true)
+      if format
         @options[:format] ||= format.to_sym
       else
         @options[:format] ||= :arrow

From 1440d5ab95c423003ecbe06372663a85cfccf769 Mon Sep 17 00:00:00 2001
From: karldw
Date: Thu, 2 Sep 2021 20:19:57 -0400
Subject: [PATCH 15/93] ARROW-13768: [R] Allow JSON to be an optional
 component

I templated from ARROW-11735. Let's see how all the tests go!
Closes #11046 from karldw/arrow-12981 Authored-by: karldw Signed-off-by: Ian Cook --- dev/tasks/conda-recipes/r-arrow/configure.win | 2 +- dev/tasks/r/azure.linux.yml | 1 + dev/tasks/tasks.yml | 1 + r/NAMESPACE | 1 + r/R/arrow-package.R | 11 + r/R/json.R | 2 +- r/configure | 5 + r/configure.win | 2 +- r/data-raw/codegen.R | 2 +- r/inst/build_arrow_static.sh | 2 +- r/man/arrow_available.Rd | 5 + r/man/read_json_arrow.Rd | 2 +- r/src/arrowExports.cpp | 910 +++++++++--------- r/src/arrow_types.h | 6 + r/src/json.cpp | 12 +- r/tests/testthat/test-json.R | 2 + r/tools/autobrew | 2 +- r/vignettes/install.Rmd | 1 + 18 files changed, 505 insertions(+), 464 deletions(-) diff --git a/dev/tasks/conda-recipes/r-arrow/configure.win b/dev/tasks/conda-recipes/r-arrow/configure.win index 43a5945558a..0b11d1335c0 100755 --- a/dev/tasks/conda-recipes/r-arrow/configure.win +++ b/dev/tasks/conda-recipes/r-arrow/configure.win @@ -3,7 +3,7 @@ set -euxo pipefail # Remove the -I../inst/include/ when unvendoring cpp11 in ARROW-13610 -echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -I../inst/include/" > src/Makevars.win +echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3 -DARROW_R_WITH_JSON -I../inst/include/" > src/Makevars.win echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win echo 'CXX_STD=CXX11' >> src/Makevars.win echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -lparquet -larrow" >> src/Makevars.win diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml index 8564935162a..ff7c112dddb 100644 --- a/dev/tasks/r/azure.linux.yml +++ b/dev/tasks/r/azure.linux.yml @@ -54,6 +54,7 @@ jobs: {{ flags|default("") }} \ -e ARROW_DATASET={{ arrow_dataset|default("") }} \ -e ARROW_PARQUET={{ arrow_parquet|default("") }} \ + -e ARROW_JSON={{ arrow_json|default("") }} \ -e ARROW_S3={{ arrow_s3|default("") }} \ -e ARROW_WITH_RE2={{ arrow_with_re2|default("") }} \ -e ARROW_WITH_UTF8PROC={{ arrow_with_utf8proc|default("") }} \ diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 8b87c982983..176d44ec35f 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1103,6 +1103,7 @@ tasks: r_tag: latest arrow_dataset: "OFF" arrow_parquet: "OFF" + arrow_json: "OFF" arrow_s3: "OFF" arrow_with_re2: "OFF" arrow_with_utf8proc: "OFF" diff --git a/r/NAMESPACE b/r/NAMESPACE index 8bcc58653fb..8ce6d162eb0 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -187,6 +187,7 @@ export(UnionDataset) export(arrow_available) export(arrow_info) export(arrow_with_dataset) +export(arrow_with_json) export(arrow_with_parquet) export(arrow_with_s3) export(binary) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 10c14a00af4..537eebb1b1d 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -105,12 +105,14 @@ #' * The Arrow C++ library (check with `arrow_available()`) #' * Arrow Dataset support enabled (check with `arrow_with_dataset()`) #' * Parquet support enabled (check with `arrow_with_parquet()`) +#' * JSON support enabled (check with `arrow_with_json()`) #' * Amazon S3 support enabled (check with `arrow_with_s3()`) #' @export #' @examples #' arrow_available() #' arrow_with_dataset() #' arrow_with_parquet() +#' arrow_with_json() #' arrow_with_s3() #' @seealso If any of these are `FALSE`, see #' `vignette("install", package = "arrow")` for guidance on 
reinstalling the @@ -145,6 +147,14 @@ arrow_with_s3 <- function() { }) } +#' @rdname arrow_available +#' @export +arrow_with_json <- function() { + tryCatch(.Call(`_json_available`), error = function(e) { + return(FALSE) + }) +} + option_use_threads <- function() { !is_false(getOption("arrow.use_threads")) } @@ -174,6 +184,7 @@ arrow_info <- function() { capabilities = c( dataset = arrow_with_dataset(), parquet = arrow_with_parquet(), + json = arrow_with_json(), s3 = arrow_with_s3(), utf8proc = "utf8_upper" %in% compute_funcs, re2 = "replace_substring_regex" %in% compute_funcs, diff --git a/r/R/json.R b/r/R/json.R index 6560a07fe06..0d54c8a8aee 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -25,7 +25,7 @@ #' #' @return A `data.frame`, or a Table if `as_data_frame = FALSE`. #' @export -#' @examplesIf arrow_available() +#' @examplesIf arrow_with_json() #' tf <- tempfile() #' on.exit(unlink(tf)) #' writeLines(' diff --git a/r/configure b/r/configure index d56fc9040f0..88aef7e1d35 100755 --- a/r/configure +++ b/r/configure @@ -267,6 +267,11 @@ if [ $? -eq 0 ]; then BUNDLED_LIBS="$BUNDLED_LIBS -lssl -lcrypto -lcurl" fi fi + # Check for JSON + grep 'set(ARROW_JSON "ON")' $ARROW_OPTS_CMAKE >/dev/null 2>&1 + if [ $? -eq 0 ]; then + PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_JSON" + fi # prepend PKG_DIRS and append BUNDLED_LIBS to PKG_LIBS PKG_LIBS="$PKG_DIRS $PKG_LIBS $BUNDLED_LIBS" echo "PKG_CFLAGS=$PKG_CFLAGS" diff --git a/r/configure.win b/r/configure.win index d830a238c39..6d731bb0988 100644 --- a/r/configure.win +++ b/r/configure.win @@ -49,7 +49,7 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer -laws-cpp-sdk-identity-man # NOTE: If you make changes to the libraries below, you should also change # ci/scripts/r_windows_build.sh and ci/scripts/PKGBUILD -PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET" +PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_DS_STATIC -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON" PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH)$(CRT) '"-lparquet -larrow_dataset -larrow -larrow_bundled_dependencies -lutf8proc -lthrift -lsnappy -lz -lzstd -llz4 -lole32 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}" # S3 and re2 support only for Rtools40 (i.e. 
R >= 4.0) diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index 7bdd8486d39..46b02fd64bf 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -44,7 +44,7 @@ # Ensure that all machines are sorting the same way invisible(Sys.setlocale("LC_COLLATE", "C")) -features <- c("arrow", "dataset", "parquet", "s3") +features <- c("arrow", "dataset", "parquet", "s3", "json") suppressPackageStartupMessages({ library(decor) diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 84a9f971246..578d8b6e5b2 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -59,7 +59,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_FILESYSTEM=ON \ -DARROW_JEMALLOC=${ARROW_JEMALLOC:-$ARROW_DEFAULT_PARAM} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-ON} \ - -DARROW_JSON=ON \ + -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_PARQUET=${ARROW_PARQUET:-ON} \ -DARROW_S3=${ARROW_S3:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-$ARROW_DEFAULT_PARAM} \ diff --git a/r/man/arrow_available.Rd b/r/man/arrow_available.Rd index fdb23dfba30..3061d10dc9c 100644 --- a/r/man/arrow_available.Rd +++ b/r/man/arrow_available.Rd @@ -5,6 +5,7 @@ \alias{arrow_with_dataset} \alias{arrow_with_parquet} \alias{arrow_with_s3} +\alias{arrow_with_json} \title{Is the C++ Arrow library available?} \usage{ arrow_available() @@ -14,6 +15,8 @@ arrow_with_dataset() arrow_with_parquet() arrow_with_s3() + +arrow_with_json() } \value{ \code{TRUE} or \code{FALSE} depending on whether the package was installed @@ -22,6 +25,7 @@ with: \item The Arrow C++ library (check with \code{arrow_available()}) \item Arrow Dataset support enabled (check with \code{arrow_with_dataset()}) \item Parquet support enabled (check with \code{arrow_with_parquet()}) +\item JSON support enabled (check with \code{arrow_with_json()}) \item Amazon S3 support enabled (check with \code{arrow_with_s3()}) } } @@ -33,6 +37,7 @@ for diagnostic purposes. arrow_available() arrow_with_dataset() arrow_with_parquet() +arrow_with_json() arrow_with_s3() } \seealso{ diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 53d7107ae81..610867ca403 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -39,7 +39,7 @@ A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}. 
Using \link{JsonTableReader} } \examples{ -\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\dontshow{if (arrow_with_json()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} tf <- tempfile() on.exit(unlink(tf)) writeLines(' diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index d99abf2605d..cb69ce17442 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4408,7 +4408,7 @@ extern "C" SEXP _arrow_io___BufferOutputStream__Write(SEXP stream_sexp, SEXP byt #endif // json.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) std::shared_ptr json___ReadOptions__initialize(bool use_threads, int block_size); extern "C" SEXP _arrow_json___ReadOptions__initialize(SEXP use_threads_sexp, SEXP block_size_sexp){ BEGIN_CPP11 @@ -4424,7 +4424,7 @@ extern "C" SEXP _arrow_json___ReadOptions__initialize(SEXP use_threads_sexp, SEX #endif // json.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) std::shared_ptr json___ParseOptions__initialize1(bool newlines_in_values); extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_sexp){ BEGIN_CPP11 @@ -4439,7 +4439,7 @@ extern "C" SEXP _arrow_json___ParseOptions__initialize1(SEXP newlines_in_values_ #endif // json.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) std::shared_ptr json___ParseOptions__initialize2(bool newlines_in_values, const std::shared_ptr& explicit_schema); extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_sexp, SEXP explicit_schema_sexp){ BEGIN_CPP11 @@ -4455,7 +4455,7 @@ extern "C" SEXP _arrow_json___ParseOptions__initialize2(SEXP newlines_in_values_ #endif // json.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) std::shared_ptr json___TableReader__Make(const std::shared_ptr& input, const std::shared_ptr& read_options, const std::shared_ptr& parse_options); extern "C" SEXP _arrow_json___TableReader__Make(SEXP input_sexp, SEXP read_options_sexp, SEXP parse_options_sexp){ BEGIN_CPP11 @@ -4472,7 +4472,7 @@ extern "C" SEXP _arrow_json___TableReader__Make(SEXP input_sexp, SEXP read_optio #endif // json.cpp -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) std::shared_ptr json___TableReader__Read(const std::shared_ptr& table_reader); extern "C" SEXP _arrow_json___TableReader__Read(SEXP table_reader_sexp){ BEGIN_CPP11 @@ -7020,455 +7020,465 @@ return Rf_ScalarLogical( #endif ); } +extern "C" SEXP _json_available() { +return Rf_ScalarLogical( +#if defined(ARROW_R_WITH_JSON) + TRUE +#else + FALSE +#endif +); +} static const R_CallMethodDef CallEntries[] = { { "_arrow_available", (DL_FUNC)& _arrow_available, 0 }, { "_dataset_available", (DL_FUNC)& _dataset_available, 0 }, { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, - { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, - { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, - { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, - { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, - { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, - { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, - { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, - { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, - { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, - { "_arrow_Array__ToString", (DL_FUNC) 
&_arrow_Array__ToString, 1}, - { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, - { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, - { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, - { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, - { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, - { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, - { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, - { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, - { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, - { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, - { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, - { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, - { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, - { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, - { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, - { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, - { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, - { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, - { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, - { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, - { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, - { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, - { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, - { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, - { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, - { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, - { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, - { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, - { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, - { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, - { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, - { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, - { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, - { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, - { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, - { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, - { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, - { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, - { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, - { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, - { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, - { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, - { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, - { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) 
&_arrow_ChunkedArray__num_chunks, 1}, - { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, - { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, - { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, - { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, - { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, - { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, - { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, - { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, - { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, - { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, - { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, - { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, - { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, - { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, - { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, - { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, - { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, - { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, - { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, - { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, - { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, - { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, - { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, - { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, - { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, - { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, - { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, - { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, - { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, - { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, - { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, - { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, - { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, - { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, - { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, - { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, - { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, - { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, - { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, - { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, - { "_arrow_dataset___Dataset__type_name", (DL_FUNC) 
&_arrow_dataset___Dataset__type_name, 1}, - { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, - { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, - { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, - { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, - { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, - { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, - { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, - { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, - { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, - { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, - { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, - { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, - { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, - { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, - { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, - { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, - { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, - { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, - { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, - { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, - { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, - { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, - { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, - { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, - { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, - { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, - { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, - { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, - { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, - { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) 
&_arrow_dataset___HivePartitioning__MakeFactory, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, - { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, - { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, - { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, - { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, - { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, - { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, - { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, - { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, - { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, - { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, - { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, - { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, - { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, - { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, - { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, - { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, - { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, - { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, - { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, - { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, - { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, - { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, - { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, - { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, - { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, - { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, - { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, - { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, - { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, - { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, - { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, - { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, - { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, - { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, - { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, - { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, - { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, - { "_arrow_Timestamp__initialize", 
(DL_FUNC) &_arrow_Timestamp__initialize, 2}, - { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, - { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, - { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, - { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, - { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, - { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, - { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, - { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, - { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, - { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, - { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, - { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, - { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, - { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, - { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, - { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, - { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, - { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, - { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, - { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, - { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, - { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, - { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, - { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, - { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, - { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, - { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, - { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, - { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, - { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, - { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, - { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, - { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, - { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, - { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, - { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, - { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, - { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, - { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, - { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, - { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, - { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, - { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, - { "_arrow_ipc___WriteFeather__Table", 
(DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, - { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, - { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, - { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, - { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, - { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, - { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, - { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, - { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, - { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, - { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, - { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, - { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, - { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, - { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, - { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, - { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, - { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, - { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, - { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, - { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, - { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, - { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1}, - { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, - { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, - { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, - { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, - { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, - { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, - { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, - { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, - { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, - { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, - { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3}, - { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, - { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, - { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, - { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, - { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, - { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, - { 
"_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, - { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, - { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, - { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, - { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, - { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, - { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, - { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, - { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, - { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, - { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, - { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, - { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, - { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, - { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, - { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, - { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, - { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, - { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1}, - { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, - { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, - { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, - { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, - { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, - { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, - { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1}, - { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, - { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, - { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, - { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, - { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, - { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, - { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, - { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, - { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, - { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, - { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, - { 
"_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, - { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, - { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, - { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, - { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, - { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, - { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, - { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, - { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, - { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, - { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, - { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, - { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, - { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, - { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, - { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, - { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, - { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1}, - { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, - { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, - { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, - { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, - { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) 
&_arrow_parquet___WriterProperties___Builder__version, 2}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, - { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, - { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, - { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, - { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, - { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, - { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, - { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, - { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, - { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, - { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, - { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, - { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, - { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, - { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, - { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, - { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, - { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, - { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, - { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, - { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, - { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, - { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, - { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, - { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, - { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, - { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, - { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, - { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, - { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, - { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, - { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, - { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, - { "_arrow_RecordBatch__columns", 
(DL_FUNC) &_arrow_RecordBatch__columns, 1}, - { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, - { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, - { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, - { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, - { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, - { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, - { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, - { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, - { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, - { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, - { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, - { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, - { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, - { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, - { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, - { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, - { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, - { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, - { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, - { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, - { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, - { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, - { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, - { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, - { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, - { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, - { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, - { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, - { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, - { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, - { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, - { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, - { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, - { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, - { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, - { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, - { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, - { "_arrow_Scalar__type", (DL_FUNC) 
&_arrow_Scalar__type, 1}, - { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, - { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, - { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, - { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, - { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, - { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, - { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, - { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, - { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, - { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, - { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, - { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, - { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, - { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, - { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, - { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, - { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, - { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, - { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, - { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, - { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, - { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, - { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, - { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, - { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, - { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, - { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, - { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, - { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, - { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, - { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, - { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, - { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, - { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2}, - { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, - { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, - { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, - { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, - { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, - { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, - { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, - { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, - { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, - { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, - { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, - { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, + { "_json_available", (DL_FUNC)& _json_available, 0 }, + { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, + { 
"_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, + { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, + { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, + { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, + { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, + { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, + { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, + { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, + { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1}, + { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, + { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, + { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, + { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, + { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, + { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, + { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, + { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, + { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, + { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, + { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, + { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, + { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, + { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, + { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, + { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, + { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, + { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, + { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, + { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, + { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, + { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, + { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, + { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, + { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, + { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, + { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, + { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, + { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, + { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, + { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, + { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, + { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, + { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, + { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, + { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, + { 
"_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, + { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, + { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, + { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, + { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, + { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, + { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, + { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, + { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, + { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, + { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, + { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, + { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, + { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, + { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, + { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, + { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, + { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, + { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, + { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, + { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, + { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, + { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, + { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, + { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, + { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, + { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, + { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, + { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, + { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, + { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, + { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, + { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, + { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, + { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, + { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, + { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, + { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, + { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, + { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, + { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, + { 
"_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, + { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, + { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, + { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, + { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, + { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, + { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, + { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, + { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, + { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, + { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, + { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, + { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, + { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, + { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, + { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, + { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, + { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, + { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, + { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, + { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, + { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, + { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, + { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, + { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, + { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, + { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, + { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, + { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, + { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, + { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, + { 
"_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, + { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, + { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, + { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, + { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, + { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, + { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, + { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, + { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, + { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, + { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, + { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, + { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, + { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, + { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, + { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, + { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, + { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, + { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, + { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, + { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, + { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, + { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, + { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, + { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, + { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, + { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, + { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, + { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, + { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, + { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, + { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, + { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, + { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, + { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_LargeUtf8__initialize", (DL_FUNC) 
&_arrow_LargeUtf8__initialize, 0}, + { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, + { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, + { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, + { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, + { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, + { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, + { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, + { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2}, + { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, + { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, + { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, + { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, + { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, + { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, + { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, + { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, + { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, + { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, + { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, + { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, + { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, + { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, + { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, + { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, + { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, + { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, + { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, + { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, + { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, + { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, + { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, + { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, + { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, + { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, + { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, + { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, + { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, + { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, + { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, + { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, + { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, + { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, + { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, + { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, + { 
"_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, + { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, + { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, + { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, + { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, + { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, + { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, + { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, + { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, + { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, + { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, + { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, + { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, + { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, + { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, + { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, + { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, + { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, + { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, + { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, + { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, + { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, + { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, + { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, + { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, + { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, + { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, + { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, + { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, + { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1}, + { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, + { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, + { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, + { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, + { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, + { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, + { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, + { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, + { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, + { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, + { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) 
&_arrow_fs___FileSystem__CopyFile, 3}, + { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, + { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, + { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, + { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, + { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, + { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, + { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, + { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, + { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, + { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, + { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, + { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, + { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, + { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, + { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, + { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, + { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, + { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, + { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, + { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, + { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, + { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, + { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, + { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1}, + { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, + { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, + { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, + { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, + { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, + { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, + { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1}, + { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, + { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, + { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, + { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, + { 
"_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, + { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, + { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, + { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, + { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, + { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, + { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, + { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, + { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, + { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, + { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, + { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, + { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, + { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, + { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, + { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, + { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, + { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, + { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, + { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, + { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, + { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, + { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, + { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, + { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__num_columns, 1}, + { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, + { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, + { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, + { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, + { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, + { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, + { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, + { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, + { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, + { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, + { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, + { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, + { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, + { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, + { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, + { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, + { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, + { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, + { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, + { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, + { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, + { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, + { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, + { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, + { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, + { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, + { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, + { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, + { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, + { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, + { 
"_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, + { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, + { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, + { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, + { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, + { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, + { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, + { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, + { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, + { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, + { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, + { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, + { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, + { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, + { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, + { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, + { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, + { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, + { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, + { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, + { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, + { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, + { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, + { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, + { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, + { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, + { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, + { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, + { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, + { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, + { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, + { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, + { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, + { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, + { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, + { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, + { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, + { "_arrow_ipc___RecordBatchStreamWriter__Open", 
(DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, + { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, + { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, + { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, + { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, + { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, + { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, + { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, + { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, + { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, + { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, + { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, + { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, + { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, + { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, + { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, + { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, + { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, + { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, + { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, + { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, + { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, + { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, + { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, + { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, + { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, + { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, + { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, + { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, + { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, + { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, + { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, + { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, + { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, + { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, + { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, + { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, + { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, + { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, + { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, + { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, + { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, + { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2}, + { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, + { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, + { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, + { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, + { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, + { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) 
&_arrow_GetCpuThreadPoolCapacity, 0}, + { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, + { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, + { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, + { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, + { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, + { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, {NULL, NULL, 0} }; extern "C" void R_init_arrow(DllInfo* dll){ @@ -7480,5 +7490,3 @@ extern "C" void R_init_arrow(DllInfo* dll){ #endif } - - diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 9419d956877..88a3339dd1e 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -43,7 +43,11 @@ #include #include #include + +#if defined(ARROW_R_WITH_JSON) #include +#endif + #include #include @@ -221,9 +225,11 @@ R6_CLASS_NAME(parquet::arrow::FileWriter, "ParquetFileWriter"); R6_CLASS_NAME(arrow::ipc::feather::Reader, "FeatherReader"); +#if defined(ARROW_R_WITH_JSON) R6_CLASS_NAME(arrow::json::ReadOptions, "JsonReadOptions"); R6_CLASS_NAME(arrow::json::ParseOptions, "JsonParseOptions"); R6_CLASS_NAME(arrow::json::TableReader, "JsonTableReader"); +#endif #undef R6_CLASS_NAME diff --git a/r/src/json.cpp b/r/src/json.cpp index edc5e075754..ec00e54be6c 100644 --- a/r/src/json.cpp +++ b/r/src/json.cpp @@ -16,11 +16,11 @@ // under the License. #include "./arrow_types.h" -#if defined(ARROW_R_WITH_ARROW) +#if defined(ARROW_R_WITH_JSON) #include -// [[arrow::export]] +// [[json::export]] std::shared_ptr json___ReadOptions__initialize(bool use_threads, int block_size) { auto res = @@ -30,7 +30,7 @@ std::shared_ptr json___ReadOptions__initialize(bool us return res; } -// [[arrow::export]] +// [[json::export]] std::shared_ptr json___ParseOptions__initialize1( bool newlines_in_values) { auto res = @@ -39,7 +39,7 @@ std::shared_ptr json___ParseOptions__initialize1( return res; } -// [[arrow::export]] +// [[json::export]] std::shared_ptr json___ParseOptions__initialize2( bool newlines_in_values, const std::shared_ptr& explicit_schema) { auto res = @@ -49,7 +49,7 @@ std::shared_ptr json___ParseOptions__initialize2( return res; } -// [[arrow::export]] +// [[json::export]] std::shared_ptr json___TableReader__Make( const std::shared_ptr& input, const std::shared_ptr& read_options, @@ -58,7 +58,7 @@ std::shared_ptr json___TableReader__Make( *read_options, *parse_options)); } -// [[arrow::export]] +// [[json::export]] std::shared_ptr json___TableReader__Read( const std::shared_ptr& table_reader) { return ValueOrStop(table_reader->Read()); diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index 668b040d868..c39e1b7a423 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+skip_if_not_available("json") + context("JsonTableReader") test_that("Can read json file with scalars columns (ARROW-5503)", { diff --git a/r/tools/autobrew b/r/tools/autobrew index 0288a6eacd3..d40729e18aa 100644 --- a/r/tools/autobrew +++ b/r/tools/autobrew @@ -60,7 +60,7 @@ for FILE in $BREWDIR/Cellar/*/*/lib/*.a; do PKG_LIBS=`echo $PKG_LIBS | sed "s/-l$LIBNAME/-lbrew$LIBNAME/g"` done -PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include" +PKG_CFLAGS="-I$BREWDIR/opt/$PKG_BREW_NAME/include -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_JSON -DARROW_R_WITH_S3" unset HOMEBREW_NO_ANALYTICS unset HOMEBREW_NO_AUTO_UPDATE diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index aca860473d8..92daff31529 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -294,6 +294,7 @@ Some features are optional when you build Arrow from source. With the exception * `ARROW_JEMALLOC` for the `jemalloc` memory allocator * `ARROW_PARQUET` * `ARROW_DATASET` +* `ARROW_JSON` for the JSON parsing library * `ARROW_WITH_RE2` for the RE2 regular expression library, used in some string compute functions * `ARROW_WITH_UTF8PROC` for the UTF8Proc string library, used in many other string compute functions From a45fc3fccd13fed1acf73124012f0fe754022275 Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 3 Sep 2021 08:43:28 -0400 Subject: [PATCH 16/93] ARROW-13782: [C++] Add skip_nulls/min_count to tdigest/mode/quantile Closes #11061 from lidavidm/arrow-13782 Authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/compute/api_aggregate.cc | 46 +++- cpp/src/arrow/compute/api_aggregate.h | 29 ++- .../arrow/compute/kernels/aggregate_mode.cc | 35 ++- .../compute/kernels/aggregate_quantile.cc | 18 +- .../compute/kernels/aggregate_tdigest.cc | 30 ++- .../arrow/compute/kernels/aggregate_test.cc | 231 ++++++++++++++++-- .../arrow/compute/kernels/hash_aggregate.cc | 42 +++- .../compute/kernels/hash_aggregate_test.cc | 48 ++-- python/pyarrow/_compute.pyx | 29 ++- python/pyarrow/compute.py | 10 +- python/pyarrow/includes/libarrow.pxd | 14 +- python/pyarrow/tests/test_compute.py | 11 +- r/src/compute.cpp | 10 +- 13 files changed, 462 insertions(+), 91 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 6d7bdfa6cf9..1216fe27d4e 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -85,18 +85,23 @@ static auto kScalarAggregateOptionsType = GetFunctionOptionsType(DataMember("mode", &CountOptions::mode)); -static auto kModeOptionsType = - GetFunctionOptionsType(DataMember("n", &ModeOptions::n)); +static auto kModeOptionsType = GetFunctionOptionsType( + DataMember("n", &ModeOptions::n), DataMember("skip_nulls", &ModeOptions::skip_nulls), + DataMember("min_count", &ModeOptions::min_count)); static auto kVarianceOptionsType = GetFunctionOptionsType( DataMember("ddof", &VarianceOptions::ddof), DataMember("skip_nulls", &VarianceOptions::skip_nulls), DataMember("min_count", &VarianceOptions::min_count)); static auto kQuantileOptionsType = GetFunctionOptionsType( DataMember("q", &QuantileOptions::q), - DataMember("interpolation", &QuantileOptions::interpolation)); + DataMember("interpolation", &QuantileOptions::interpolation), + DataMember("skip_nulls", &QuantileOptions::skip_nulls), + DataMember("min_count", &QuantileOptions::min_count)); static auto kTDigestOptionsType = GetFunctionOptionsType( DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta), - 
DataMember("buffer_size", &TDigestOptions::buffer_size)); + DataMember("buffer_size", &TDigestOptions::buffer_size), + DataMember("skip_nulls", &TDigestOptions::skip_nulls), + DataMember("min_count", &TDigestOptions::min_count)); static auto kIndexOptionsType = GetFunctionOptionsType(DataMember("value", &IndexOptions::value)); } // namespace @@ -112,7 +117,11 @@ CountOptions::CountOptions(CountMode mode) : FunctionOptions(internal::kCountOptionsType), mode(mode) {} constexpr char CountOptions::kTypeName[]; -ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {} +ModeOptions::ModeOptions(int64_t n, bool skip_nulls, uint32_t min_count) + : FunctionOptions(internal::kModeOptionsType), + n{n}, + skip_nulls{skip_nulls}, + min_count{min_count} {} constexpr char ModeOptions::kTypeName[]; VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count) @@ -122,27 +131,38 @@ VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count) min_count(min_count) {} constexpr char VarianceOptions::kTypeName[]; -QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation) +QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation, + bool skip_nulls, uint32_t min_count) : FunctionOptions(internal::kQuantileOptionsType), q{q}, - interpolation{interpolation} {} -QuantileOptions::QuantileOptions(std::vector q, enum Interpolation interpolation) + interpolation{interpolation}, + skip_nulls{skip_nulls}, + min_count{min_count} {} +QuantileOptions::QuantileOptions(std::vector q, enum Interpolation interpolation, + bool skip_nulls, uint32_t min_count) : FunctionOptions(internal::kQuantileOptionsType), q{std::move(q)}, - interpolation{interpolation} {} + interpolation{interpolation}, + skip_nulls{skip_nulls}, + min_count{min_count} {} constexpr char QuantileOptions::kTypeName[]; -TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size) +TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size, + bool skip_nulls, uint32_t min_count) : FunctionOptions(internal::kTDigestOptionsType), q{q}, delta{delta}, - buffer_size{buffer_size} {} + buffer_size{buffer_size}, + skip_nulls{skip_nulls}, + min_count{min_count} {} TDigestOptions::TDigestOptions(std::vector q, uint32_t delta, - uint32_t buffer_size) + uint32_t buffer_size, bool skip_nulls, uint32_t min_count) : FunctionOptions(internal::kTDigestOptionsType), q{std::move(q)}, delta{delta}, - buffer_size{buffer_size} {} + buffer_size{buffer_size}, + skip_nulls{skip_nulls}, + min_count{min_count} {} constexpr char TDigestOptions::kTypeName[]; IndexOptions::IndexOptions(std::shared_ptr value) diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 8c27da49765..c8df81773d4 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -82,11 +82,16 @@ class ARROW_EXPORT CountOptions : public FunctionOptions { /// By default, returns the most common value and count. class ARROW_EXPORT ModeOptions : public FunctionOptions { public: - explicit ModeOptions(int64_t n = 1); + explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0); constexpr static char const kTypeName[] = "ModeOptions"; static ModeOptions Defaults() { return ModeOptions{}; } int64_t n = 1; + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. 
+ bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; }; /// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel @@ -121,10 +126,12 @@ class ARROW_EXPORT QuantileOptions : public FunctionOptions { MIDPOINT, }; - explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR); + explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR, + bool skip_nulls = true, uint32_t min_count = 0); explicit QuantileOptions(std::vector q, - enum Interpolation interpolation = LINEAR); + enum Interpolation interpolation = LINEAR, + bool skip_nulls = true, uint32_t min_count = 0); constexpr static char const kTypeName[] = "QuantileOptions"; static QuantileOptions Defaults() { return QuantileOptions{}; } @@ -132,6 +139,11 @@ class ARROW_EXPORT QuantileOptions : public FunctionOptions { /// quantile must be between 0 and 1 inclusive std::vector q; enum Interpolation interpolation; + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. + uint32_t min_count; }; /// \brief Control TDigest approximate quantile kernel behavior @@ -140,9 +152,11 @@ class ARROW_EXPORT QuantileOptions : public FunctionOptions { class ARROW_EXPORT TDigestOptions : public FunctionOptions { public: explicit TDigestOptions(double q = 0.5, uint32_t delta = 100, - uint32_t buffer_size = 500); + uint32_t buffer_size = 500, bool skip_nulls = true, + uint32_t min_count = 0); explicit TDigestOptions(std::vector q, uint32_t delta = 100, - uint32_t buffer_size = 500); + uint32_t buffer_size = 500, bool skip_nulls = true, + uint32_t min_count = 0); constexpr static char const kTypeName[] = "TDigestOptions"; static TDigestOptions Defaults() { return TDigestOptions{}; } @@ -152,6 +166,11 @@ class ARROW_EXPORT TDigestOptions : public FunctionOptions { uint32_t delta; /// input buffer size, default 500 uint32_t buffer_size; + /// If true (the default), null values are ignored. Otherwise, if any value is null, + /// emit null. + bool skip_nulls; + /// If less than this many non-null values are observed, emit null. 
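  // A short sketch of how skip_nulls and min_count combine (not part of this
  // diff; the behavior is taken from the TDigest kernel tests in this patch,
  // and the TDigest() convenience wrapper declared in this header is assumed):
  //
  //   auto arr = ArrayFromJSON(float64(), "[1.0, 2.0, 3.0, null]");
  //   // any null poisons the result when skip_nulls is false:
  //   TDigest(arr, TDigestOptions(0.5, 100, 500, /*skip_nulls=*/false));  // []
  //   // nulls are ignored and the 3 non-null values satisfy min_count:
  //   TDigest(arr, TDigestOptions(0.5, 100, 500, /*skip_nulls=*/true,
  //                               /*min_count=*/3));                      // [2.0]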
+ uint32_t min_count; }; /// \brief Control Index kernel behavior diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 6ad0eeb6456..f225f6bf569 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -130,6 +130,13 @@ struct CountModer { Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // count values in all chunks, ignore nulls const Datum& datum = batch[0]; + + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && datum.null_count() > 0) || + (datum.length() - datum.null_count() < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, out).status(); + } + CountValues(this->counts.data(), datum, this->min); // generator to emit next value:count pair @@ -154,9 +161,16 @@ struct CountModer { template <> struct CountModer { Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const Datum& datum = batch[0]; + + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && datum.null_count() > 0) || + (datum.length() - datum.null_count() < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, out).status(); + } + int64_t counts[2]{}; - const Datum& datum = batch[0]; for (const auto& array : datum.chunks()) { if (array->length() > array->null_count()) { const int64_t true_count = @@ -167,7 +181,6 @@ struct CountModer { } } - const ModeOptions& options = ModeState::Get(ctx); const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0); const int64_t n = std::min(options.n, distinct_values); @@ -198,12 +211,19 @@ struct SortModer { using Allocator = arrow::stl::allocator; Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const Datum& datum = batch[0]; + const int64_t in_length = datum.length() - datum.null_count(); + + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && datum.null_count() > 0) || + (in_length < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, out).status(); + } + // copy all chunks to a buffer, ignore nulls and nans std::vector in_buffer(Allocator(ctx->memory_pool())); uint64_t nan_count = 0; - const Datum& datum = batch[0]; - const int64_t in_length = datum.length() - datum.null_count(); if (in_length > 0) { in_buffer.resize(in_length); CopyNonNullValues(datum, in_buffer.data()); @@ -305,6 +325,13 @@ struct Moder::value>> { template Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) { using CType = typename T::c_type; + + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && !scalar.is_valid) || + (static_cast(scalar.is_valid) < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, out).status(); + } + if (scalar.is_valid) { bool called = false; return Finalize(ctx, out, [&]() { diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 7d2ffe0770c..bfd97f813e5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -79,12 +79,18 @@ struct SortQuantiler { Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const QuantileOptions& options = QuantileState::Get(ctx); + const Datum& datum = batch[0]; // copy all chunks to a buffer, ignore nulls and nans std::vector in_buffer(Allocator(ctx->memory_pool())); + int64_t in_length = 0; + if ((!options.skip_nulls && datum.null_count() > 0) || + 
(datum.length() - datum.null_count() < options.min_count)) { + in_length = 0; + } else { + in_length = datum.length() - datum.null_count(); + } - const Datum& datum = batch[0]; - const int64_t in_length = datum.length() - datum.null_count(); if (in_length > 0) { in_buffer.resize(in_length); CopyNonNullValues(datum, in_buffer.data()); @@ -232,7 +238,11 @@ struct CountQuantiler { // count values in all chunks, ignore nulls const Datum& datum = batch[0]; - int64_t in_length = CountValues(this->counts.data(), datum, this->min); + int64_t in_length = 0; + if ((options.skip_nulls || (!options.skip_nulls && datum.null_count() == 0)) && + (datum.length() - datum.null_count() >= options.min_count)) { + in_length = CountValues(this->counts.data(), datum, this->min); + } // prepare out array int64_t out_length = options.q.size(); @@ -394,7 +404,7 @@ Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options, const Scalar& scalar, Datum* out) { using CType = typename T::c_type; ArrayData* output = out->mutable_array(); - if (!scalar.is_valid) { + if (!scalar.is_valid || options.min_count > 1) { output->length = 0; output->null_count = 0; return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc index be8d66c4c24..3b616c664a9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc @@ -37,14 +37,23 @@ struct TDigestImpl : public ScalarAggregator { using CType = typename ArrowType::c_type; explicit TDigestImpl(const TDigestOptions& options) - : q{options.q}, tdigest{options.delta, options.buffer_size} {} + : options{options}, + tdigest{options.delta, options.buffer_size}, + count{0}, + all_valid{true} {} Status Consume(KernelContext*, const ExecBatch& batch) override { + if (!this->all_valid) return Status::OK(); + if (!options.skip_nulls && batch[0].null_count() > 0) { + this->all_valid = false; + return Status::OK(); + } if (batch[0].is_array()) { const ArrayData& data = *batch[0].array(); const CType* values = data.GetValues(1); if (data.length > data.GetNullCount()) { + this->count += data.length - data.GetNullCount(); VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) { for (int64_t i = 0; i < len; ++i) { @@ -55,6 +64,7 @@ struct TDigestImpl : public ScalarAggregator { } else { const CType value = UnboxScalar::Unbox(*batch[0].scalar()); if (batch[0].scalar()->is_valid) { + this->count += 1; for (int64_t i = 0; i < batch.length; i++) { this->tdigest.NanAdd(value); } @@ -64,13 +74,21 @@ struct TDigestImpl : public ScalarAggregator { } Status MergeFrom(KernelContext*, KernelState&& src) override { - auto& other = checked_cast(src); + const auto& other = checked_cast(src); + if (!this->all_valid || !other.all_valid) { + this->all_valid = false; + return Status::OK(); + } this->tdigest.Merge(other.tdigest); + this->count += other.count; return Status::OK(); } Status Finalize(KernelContext* ctx, Datum* out) override { - const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size(); + const int64_t out_length = + (this->tdigest.is_empty() || !this->all_valid || this->count < options.min_count) + ? 
0 + : options.q.size(); auto out_data = ArrayData::Make(float64(), out_length, 0); out_data->buffers.resize(2, nullptr); @@ -79,7 +97,7 @@ struct TDigestImpl : public ScalarAggregator { ctx->Allocate(out_length * sizeof(double))); double* out_buffer = out_data->template GetMutableValues(1); for (int64_t i = 0; i < out_length; ++i) { - out_buffer[i] = this->tdigest.Quantile(this->q[i]); + out_buffer[i] = this->tdigest.Quantile(this->options.q[i]); } } @@ -87,8 +105,10 @@ struct TDigestImpl : public ScalarAggregator { return Status::OK(); } - const std::vector q; + const TDigestOptions options; TDigest tdigest; + int64_t count; + bool all_valid; }; struct TDigestInitState { diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index eb73e703b6e..587e2033184 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1954,10 +1954,10 @@ class TestPrimitiveModeKernel : public ::testing::Test { using Traits = TypeTraits; using CType = typename ArrowType::c_type; - void AssertModesAre(const Datum& array, const int n, + void AssertModesAre(const Datum& array, const ModeOptions options, const std::vector& expected_modes, const std::vector& expected_counts) { - ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n})); + ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, options)); ValidateOutput(out); const StructArray out_array(out.array()); ASSERT_EQ(out_array.length(), expected_modes.size()); @@ -1978,11 +1978,18 @@ class TestPrimitiveModeKernel : public ::testing::Test { const std::vector& expected_modes, const std::vector& expected_counts) { auto array = ArrayFromJSON(type_singleton(), json); - AssertModesAre(array, n, expected_modes, expected_counts); + AssertModesAre(array, ModeOptions(n), expected_modes, expected_counts); + } + + void AssertModesAre(const std::string& json, const ModeOptions options, + const std::vector& expected_modes, + const std::vector& expected_counts) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertModesAre(array, options, expected_modes, expected_counts); } void AssertModeIs(const Datum& array, CType expected_mode, int64_t expected_count) { - AssertModesAre(array, 1, {expected_mode}, {expected_count}); + AssertModesAre(array, ModeOptions(1), {expected_mode}, {expected_count}); } void AssertModeIs(const std::string& json, CType expected_mode, @@ -1997,8 +2004,8 @@ class TestPrimitiveModeKernel : public ::testing::Test { AssertModeIs(chunked, expected_mode, expected_count); } - void AssertModesEmpty(const Datum& array, int n) { - ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n})); + void AssertModesEmpty(const Datum& array, ModeOptions options) { + ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, options)); auto out_array = out.make_array(); ValidateOutput(*out_array); ASSERT_EQ(out.array()->length, 0); @@ -2006,12 +2013,17 @@ class TestPrimitiveModeKernel : public ::testing::Test { void AssertModesEmpty(const std::string& json, int n = 1) { auto array = ArrayFromJSON(type_singleton(), json); - AssertModesEmpty(array, n); + AssertModesEmpty(array, ModeOptions(n)); } void AssertModesEmpty(const std::vector& json, int n = 1) { auto chunked = ChunkedArrayFromJSON(type_singleton(), json); - AssertModesEmpty(chunked, n); + AssertModesEmpty(chunked, ModeOptions(n)); + } + + void AssertModesEmpty(const std::string& json, ModeOptions options) { + auto array = ArrayFromJSON(type_singleton(), json); + AssertModesEmpty(array, options); } 
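  // Note: when skip_nulls/min_count short-circuit the kernel, Mode emits a
  // zero-length struct<mode, count> array rather than a null scalar; that is
  // what the AssertModesEmpty() overloads above verify. A sketch of the
  // underlying assertion (using the helpers already defined in this file):
  //
  //   ASSERT_OK_AND_ASSIGN(
  //       Datum out, Mode(ArrayFromJSON(int64(), "[1, null]"),
  //                       ModeOptions(/*n=*/1, /*skip_nulls=*/false)));
  //   ASSERT_EQ(out.array()->length, 0);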
std::shared_ptr type_singleton() { return Traits::type_singleton(); } @@ -2049,13 +2061,37 @@ TEST_F(TestBooleanModeKernel, Basics) { {true, false}, {3, 2}); this->AssertModesEmpty({"[null, null]", "[]", "[null]"}, 4); - auto ty = struct_({field("mode", boolean()), field("count", int64())}); - Datum mode_true = ArrayFromJSON(ty, "[[true, 1]]"); - Datum mode_false = ArrayFromJSON(ty, "[[false, 1]]"); - Datum mode_empty = ArrayFromJSON(ty, "[]"); - EXPECT_THAT(Mode(Datum(true)), ResultWith(mode_true)); - EXPECT_THAT(Mode(Datum(false)), ResultWith(mode_false)); - EXPECT_THAT(Mode(MakeNullScalar(boolean())), ResultWith(mode_empty)); + auto in_ty = boolean(); + this->AssertModesAre("[true, false, false, null]", ModeOptions(/*n=*/1), {false}, {2}); + this->AssertModesEmpty("[true, false, false, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesAre("[true, false, false, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3), + {false}, {2}); + this->AssertModesEmpty("[false, false, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3)); + this->AssertModesAre("[true, false, false]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), + {false}, {2}); + this->AssertModesEmpty("[true, false, false, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesEmpty("[true, false]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesAre(ScalarFromJSON(in_ty, "true"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false), {true}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "true"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "true"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + + this->AssertModesAre(ScalarFromJSON(in_ty, "true"), ModeOptions(/*n=*/1), {true}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), ModeOptions(/*n=*/1)); } TYPED_TEST_SUITE(TestIntegerModeKernel, IntegralArrowTypes); @@ -2077,10 +2113,35 @@ TYPED_TEST(TestIntegerModeKernel, Basics) { this->AssertModesEmpty("[null, null, null]", 10); auto in_ty = this->type_singleton(); - auto ty = struct_({field("mode", in_ty), field("count", int64())}); - EXPECT_THAT(Mode(*MakeScalar(in_ty, 5)), - ResultWith(Datum(ArrayFromJSON(ty, "[[5, 1]]")))); - EXPECT_THAT(Mode(MakeNullScalar(in_ty)), ResultWith(Datum(ArrayFromJSON(ty, "[]")))); + + this->AssertModesAre("[1, 2, 2, null]", ModeOptions(/*n=*/1), {2}, {2}); + this->AssertModesEmpty("[1, 2, 2, null]", ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesAre("[1, 2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3), {2}, + {2}); + this->AssertModesEmpty("[2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3)); + this->AssertModesAre( + "[1, 2, 2]", ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), {2}, {2}); + this->AssertModesEmpty("[1, 2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesEmpty("[1, 2]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesAre(ScalarFromJSON(in_ty, "1"), + 
ModeOptions(/*n=*/1, /*skip_nulls=*/false), {1}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "1"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "1"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + + this->AssertModesAre(ScalarFromJSON(in_ty, "5"), ModeOptions(/*n=*/1), {5}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), ModeOptions(/*n=*/1)); } TYPED_TEST_SUITE(TestFloatingModeKernel, RealArrowTypes); @@ -2108,10 +2169,35 @@ TYPED_TEST(TestFloatingModeKernel, Floats) { this->AssertModesAre("[NaN, NaN, 1, null, 1, 2, 2]", 3, {1, 2, NAN}, {2, 2, 2}); auto in_ty = this->type_singleton(); - auto ty = struct_({field("mode", in_ty), field("count", int64())}); - EXPECT_THAT(Mode(*MakeScalar(in_ty, 5.0)), - ResultWith(Datum(ArrayFromJSON(ty, "[[5.0, 1]]")))); - EXPECT_THAT(Mode(MakeNullScalar(in_ty)), ResultWith(Datum(ArrayFromJSON(ty, "[]")))); + + this->AssertModesAre("[1, 2, 2, null]", ModeOptions(/*n=*/1), {2}, {2}); + this->AssertModesEmpty("[1, 2, 2, null]", ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesAre("[1, 2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3), {2}, + {2}); + this->AssertModesEmpty("[2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/3)); + this->AssertModesAre( + "[1, 2, 2]", ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3), {2}, {2}); + this->AssertModesEmpty("[1, 2, 2, null]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesEmpty("[1, 2]", + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/3)); + this->AssertModesAre(ScalarFromJSON(in_ty, "1"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false), {1}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "1"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/true, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "1"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), + ModeOptions(/*n=*/1, /*skip_nulls=*/false, /*min_count=*/2)); + + this->AssertModesAre(ScalarFromJSON(in_ty, "5"), ModeOptions(/*n=*/1), {5}, {1}); + this->AssertModesEmpty(ScalarFromJSON(in_ty, "null"), ModeOptions(/*n=*/1)); } TEST_F(TestInt8ModeKernelValueRange, Basics) { @@ -2672,6 +2758,36 @@ TYPED_TEST(TestIntegerQuantileKernel, Basics) { this->AssertQuantilesEmpty({"[null, null]", "[]", "[null]"}, {0.3, 0.4}); auto ty = this->type_singleton(); + + QuantileOptions keep_nulls(/*q=*/0.5, QuantileOptions::LINEAR, /*skip_nulls=*/false, + /*min_count=*/0); + QuantileOptions min_count(/*q=*/0.5, QuantileOptions::LINEAR, /*skip_nulls=*/true, + /*min_count=*/3); + QuantileOptions keep_nulls_min_count(/*q=*/0.5, QuantileOptions::LINEAR, + /*skip_nulls=*/false, /*min_count=*/3); + auto not_empty = ResultWith(ArrayFromJSON(float64(), "[3.0]")); + auto empty = ResultWith(ArrayFromJSON(float64(), "[]")); + 
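  // (For reference: with LINEAR interpolation, the q=0.5 quantile of the four
  // non-null values [1, 2, 4, 5] interpolates between 2 and 4, so every case
  // below that passes the null/min_count checks is expected to yield [3.0].)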
EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), keep_nulls), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), keep_nulls), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), keep_nulls), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), min_count), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), min_count), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), keep_nulls_min_count), + not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), keep_nulls_min_count), + empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), keep_nulls_min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), keep_nulls_min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), keep_nulls_min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), keep_nulls_min_count), empty); + for (const auto interpolation : this->interpolations_) { QuantileOptions options({0.0, 0.5, 1.0}, interpolation); auto expected_ty = (interpolation == QuantileOptions::LINEAR || @@ -2718,6 +2834,36 @@ TYPED_TEST(TestFloatingQuantileKernel, Floats) { this->AssertQuantilesEmpty({"[NaN, NaN]", "[]", "[null]"}, {0.3, 0.4}); auto ty = this->type_singleton(); + + QuantileOptions keep_nulls(/*q=*/0.5, QuantileOptions::LINEAR, /*skip_nulls=*/false, + /*min_count=*/0); + QuantileOptions min_count(/*q=*/0.5, QuantileOptions::LINEAR, /*skip_nulls=*/true, + /*min_count=*/3); + QuantileOptions keep_nulls_min_count(/*q=*/0.5, QuantileOptions::LINEAR, + /*skip_nulls=*/false, /*min_count=*/3); + auto not_empty = ResultWith(ArrayFromJSON(float64(), "[3.0]")); + auto empty = ResultWith(ArrayFromJSON(float64(), "[]")); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), keep_nulls), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), keep_nulls), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), keep_nulls), not_empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), keep_nulls), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), min_count), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), min_count), not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5]"), keep_nulls_min_count), + not_empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 2, 4, 5, null]"), keep_nulls_min_count), + empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5]"), keep_nulls_min_count), empty); + EXPECT_THAT(Quantile(ArrayFromJSON(ty, "[1, 5, null]"), keep_nulls_min_count), 
empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "3"), keep_nulls_min_count), empty); + EXPECT_THAT(Quantile(ScalarFromJSON(ty, "null"), keep_nulls_min_count), empty); + for (const auto interpolation : this->interpolations_) { QuantileOptions options({0.0, 0.5, 1.0}, interpolation); auto expected_ty = (interpolation == QuantileOptions::LINEAR || @@ -3015,5 +3161,44 @@ TEST(TestTDigestKernel, Scalar) { } } +TEST(TestTDigestKernel, Options) { + auto ty = float64(); + TDigestOptions keep_nulls(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/0); + TDigestOptions min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/true, /*min_count=*/3); + TDigestOptions keep_nulls_min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500, + /*skip_nulls=*/false, /*min_count=*/3); + + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, 3.0]"), keep_nulls), + ResultWith(ArrayFromJSON(ty, "[2.0]"))); + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, 3.0, null]"), keep_nulls), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "1.0"), keep_nulls), + ResultWith(ArrayFromJSON(ty, "[1.0]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "null"), keep_nulls), + ResultWith(ArrayFromJSON(ty, "[]"))); + + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, 3.0, null]"), min_count), + ResultWith(ArrayFromJSON(ty, "[2.0]"))); + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, null]"), min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "1.0"), min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "null"), min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, 3.0]"), keep_nulls_min_count), + ResultWith(ArrayFromJSON(ty, "[2.0]"))); + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0]"), keep_nulls_min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ArrayFromJSON(ty, "[1.0, 2.0, 3.0, null]"), keep_nulls_min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "1.0"), keep_nulls_min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); + EXPECT_THAT(TDigest(ScalarFromJSON(ty, "null"), keep_nulls_min_count), + ResultWith(ArrayFromJSON(ty, "[]"))); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 3ea692857cf..23bb73f2a7f 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1593,6 +1593,8 @@ struct GroupedTDigestImpl : public GroupedAggregator { options_ = *checked_cast(options); ctx_ = ctx; pool_ = ctx->memory_pool(); + counts_ = TypedBufferBuilder(pool_); + no_nulls_ = TypedBufferBuilder(pool_); return Status::OK(); } @@ -1602,12 +1604,21 @@ struct GroupedTDigestImpl : public GroupedAggregator { for (int64_t i = 0; i < added_groups; i++) { tdigests_.emplace_back(options_.delta, options_.buffer_size); } + RETURN_NOT_OK(counts_.Append(new_num_groups, 0)); + RETURN_NOT_OK(no_nulls_.Append(new_num_groups, true)); return Status::OK(); } Status Consume(const ExecBatch& batch) override { - VisitGroupedValuesNonNull( - batch, [&](uint32_t g, CType value) { tdigests_[g].NanAdd(value); }); + int64_t* counts = counts_.mutable_data(); + uint8_t* no_nulls = no_nulls_.mutable_data(); + VisitGroupedValues( + batch, + [&](uint32_t g, CType value) { + tdigests_[g].NanAdd(value); + counts[g]++; + }, + 
[&](uint32_t g) { BitUtil::SetBitTo(no_nulls, g, false); });
     return Status::OK();
   }
 
@@ -1615,15 +1626,26 @@ struct GroupedTDigestImpl : public GroupedAggregator {
                const ArrayData& group_id_mapping) override {
     auto other = checked_cast<GroupedTDigestImpl*>(&raw_other);
 
+    int64_t* counts = counts_.mutable_data();
+    uint8_t* no_nulls = no_nulls_.mutable_data();
+
+    const int64_t* other_counts = other->counts_.data();
+    const uint8_t* other_no_nulls = other->no_nulls_.data();
+
     auto g = group_id_mapping.GetValues<uint32_t>(1);
     for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) {
       tdigests_[*g].Merge(other->tdigests_[other_g]);
+      counts[*g] += other_counts[other_g];
+      BitUtil::SetBitTo(
+          no_nulls, *g,
+          BitUtil::GetBit(no_nulls, *g) && BitUtil::GetBit(other_no_nulls, other_g));
     }
 
     return Status::OK();
   }
 
   Result<Datum> Finalize() override {
+    const int64_t* counts = counts_.data();
     std::shared_ptr<Buffer> null_bitmap;
     ARROW_ASSIGN_OR_RAISE(
         std::shared_ptr<Buffer> values,
@@ -1633,7 +1655,7 @@ struct GroupedTDigestImpl : public GroupedAggregator {
     double* results = reinterpret_cast<double*>(values->mutable_data());
     for (int64_t i = 0; static_cast<size_t>(i) < tdigests_.size(); ++i) {
-      if (!tdigests_[i].is_empty()) {
+      if (!tdigests_[i].is_empty() && counts[i] >= options_.min_count) {
         for (int64_t j = 0; j < slot_length; j++) {
           results[i * slot_length + j] = tdigests_[i].Quantile(options_.q[j]);
         }
@@ -1649,6 +1671,18 @@ struct GroupedTDigestImpl : public GroupedAggregator {
       std::fill(&results[i * slot_length], &results[(i + 1) * slot_length], 0.0);
     }
 
+    if (!options_.skip_nulls) {
+      null_count = kUnknownNullCount;
+      if (null_bitmap) {
+        arrow::internal::BitmapAnd(null_bitmap->data(), /*left_offset=*/0,
+                                   no_nulls_.data(), /*right_offset=*/0,
+                                   static_cast<int64_t>(tdigests_.size()),
+                                   /*out_offset=*/0, null_bitmap->mutable_data());
+      } else {
+        ARROW_ASSIGN_OR_RAISE(null_bitmap, no_nulls_.Finish());
+      }
+    }
+
     auto child = ArrayData::Make(float64(), tdigests_.size() * options_.q.size(),
                                  {nullptr, std::move(values)}, /*null_count=*/0);
     return ArrayData::Make(out_type(), tdigests_.size(), {std::move(null_bitmap)},
@@ -1661,6 +1695,8 @@ struct GroupedTDigestImpl : public GroupedAggregator {
 
   TDigestOptions options_;
   std::vector<TDigest> tdigests_;
+  TypedBufferBuilder<int64_t> counts_;
+  TypedBufferBuilder<bool> no_nulls_;
   ExecContext* ctx_;
   MemoryPool* pool_;
 };
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
index 32e8efa0ab8..df13bd569ea 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
@@ -1086,27 +1086,40 @@ TEST(GroupBy, VarianceAndStddev) {
 TEST(GroupBy, TDigest) {
   auto batch = RecordBatchFromJSON(
       schema({field("argument", float64()), field("key", int64())}), R"([
-    [1, 1],
-    [null, 1],
-    [0, 2],
-    [null, 3],
-    [4, null],
-    [3, 1],
-    [0, 2],
-    [-1, 2],
-    [1, null],
-    [NaN, 3]
+    [1, 1],
+    [null, 1],
+    [0, 2],
+    [null, 3],
+    [1, 4],
+    [4, null],
+    [3, 1],
+    [0, 2],
+    [-1, 2],
+    [1, null],
+    [NaN, 3],
+    [1, 4],
+    [1, 4],
+    [null, 4]
   ])");
   TDigestOptions options1(std::vector<double>{0.5, 0.9, 0.99});
   TDigestOptions options2(std::vector<double>{0.5, 0.9, 0.99}, /*delta=*/50,
                           /*buffer_size=*/1024);
+  TDigestOptions keep_nulls(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500,
+                            /*skip_nulls=*/false, /*min_count=*/0);
+  TDigestOptions min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500,
+                           /*skip_nulls=*/true, /*min_count=*/3);
+  TDigestOptions keep_nulls_min_count(/*q=*/0.5, /*delta=*/100, /*buffer_size=*/500,
+                                      /*skip_nulls=*/false, 
/*min_count=*/3); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, internal::GroupBy( { batch->GetColumnByName("argument"), batch->GetColumnByName("argument"), batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), + batch->GetColumnByName("argument"), }, { batch->GetColumnByName("key"), @@ -1115,6 +1128,9 @@ TEST(GroupBy, TDigest) { {"hash_tdigest", nullptr}, {"hash_tdigest", &options1}, {"hash_tdigest", &options2}, + {"hash_tdigest", &keep_nulls}, + {"hash_tdigest", &min_count}, + {"hash_tdigest", &keep_nulls_min_count}, })); AssertDatumsApproxEqual( @@ -1122,13 +1138,17 @@ TEST(GroupBy, TDigest) { field("hash_tdigest", fixed_size_list(float64(), 1)), field("hash_tdigest", fixed_size_list(float64(), 3)), field("hash_tdigest", fixed_size_list(float64(), 3)), + field("hash_tdigest", fixed_size_list(float64(), 1)), + field("hash_tdigest", fixed_size_list(float64(), 1)), + field("hash_tdigest", fixed_size_list(float64(), 1)), field("key_0", int64()), }), R"([ - [[1.0], [1.0, 3.0, 3.0], [1.0, 3.0, 3.0], 1], - [[0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], 2], - [null, null, null, 3], - [[1.0], [1.0, 4.0, 4.0], [1.0, 4.0, 4.0], null] + [[1.0], [1.0, 3.0, 3.0], [1.0, 3.0, 3.0], null, null, null, 1], + [[0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0], [0.0], [0.0], 2], + [null, null, null, null, null, null, 3], + [[1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], null, [1.0], null, 4], + [[1.0], [1.0, 4.0, 4.0], [1.0, 4.0, 4.0], [1.0], null, null, null] ])"), aggregated_and_grouped, /*verbose=*/true); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 39bb5315f7a..29c579f85a9 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -924,13 +924,13 @@ class IndexOptions(_IndexOptions): cdef class _ModeOptions(FunctionOptions): - def _set_options(self, n): - self.wrapped.reset(new CModeOptions(n)) + def _set_options(self, n, skip_nulls, min_count): + self.wrapped.reset(new CModeOptions(n, skip_nulls, min_count)) class ModeOptions(_ModeOptions): - def __init__(self, n=1): - self._set_options(n) + def __init__(self, n=1, skip_nulls=True, min_count=0): + self._set_options(n, skip_nulls, min_count) cdef class _SetLookupOptions(FunctionOptions): @@ -1096,7 +1096,7 @@ class SortOptions(_SortOptions): cdef class _QuantileOptions(FunctionOptions): - def _set_options(self, quantiles, interp): + def _set_options(self, quantiles, interp, skip_nulls, min_count): interp_dict = { 'linear': CQuantileInterp_LINEAR, 'lower': CQuantileInterp_LOWER, @@ -1109,24 +1109,29 @@ cdef class _QuantileOptions(FunctionOptions): '{!r} is not a valid interpolation' .format(interp)) self.wrapped.reset( - new CQuantileOptions(quantiles, interp_dict[interp])) + new CQuantileOptions(quantiles, interp_dict[interp], + skip_nulls, min_count)) class QuantileOptions(_QuantileOptions): - def __init__(self, *, q=0.5, interpolation='linear'): + def __init__(self, *, q=0.5, interpolation='linear', + skip_nulls=True, min_count=0): if not isinstance(q, (list, tuple, np.ndarray)): q = [q] - self._set_options(q, interpolation) + self._set_options(q, interpolation, skip_nulls, min_count) cdef class _TDigestOptions(FunctionOptions): - def _set_options(self, quantiles, delta, buffer_size): + def _set_options(self, quantiles, delta, buffer_size, + skip_nulls, min_count): self.wrapped.reset( - new CTDigestOptions(quantiles, delta, buffer_size)) + new CTDigestOptions(quantiles, delta, buffer_size, + skip_nulls, min_count)) class TDigestOptions(_TDigestOptions): 
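    # A usage sketch (not part of this diff): the new keywords are forwarded
    # through _set_options to the C++ CTDigestOptions declared in
    # libarrow.pxd, e.g.
    #
    #   options = TDigestOptions(q=0.5, skip_nulls=False)
    #   pc.call_function("tdigest", [arr], options)  # empty result if arr
    #                                                # contains any null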
-    def __init__(self, *, q=0.5, delta=100, buffer_size=500):
+    def __init__(self, *, q=0.5, delta=100, buffer_size=500,
+                 skip_nulls=True, min_count=0):
         if not isinstance(q, (list, tuple, np.ndarray)):
             q = [q]
-        self._set_options(q, delta, buffer_size)
+        self._set_options(q, delta, buffer_size, skip_nulls, min_count)
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 6f8b9fa3dae..4bc4034f64c 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -445,7 +445,7 @@ def match_substring_regex(array, pattern, *, ignore_case=False):
                          MatchSubstringOptions(pattern, ignore_case))
 
 
-def mode(array, n=1):
+def mode(array, n=1, skip_nulls=True, min_count=0):
     """
     Return top-n most common values and number of times they occur in a
    passed numerical (chunked) array, in descending order of occurrence. If there are
@@ -454,6 +454,12 @@ def mode(array, n=1):
     Parameters
     ----------
     array : pyarrow.Array or pyarrow.ChunkedArray
+    skip_nulls : bool, default True
+        If True (the default), ignore nulls in the input. Otherwise, return
+        an empty array if any input value is null.
+    min_count : int, default 0
+        If there are fewer than this many non-null values in the input,
+        return an empty array.
 
     Returns
     -------
@@ -470,7 +476,7 @@
     >>> modes[1]
     """
-    options = ModeOptions(n=n)
+    options = ModeOptions(n=n, skip_nulls=skip_nulls, min_count=min_count)
     return call_function("mode", [array], options)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 4f9f4184b2d..29351e0b648 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1993,8 +1993,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
 
     cdef cppclass CModeOptions \
             "arrow::compute::ModeOptions"(CFunctionOptions):
-        CModeOptions(int64_t n)
+        CModeOptions(int64_t n, c_bool skip_nulls, uint32_t min_count)
         int64_t n
+        c_bool skip_nulls
+        uint32_t min_count
 
     cdef cppclass CIndexOptions \
             "arrow::compute::IndexOptions"(CFunctionOptions):
@@ -2041,17 +2043,23 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
 
     cdef cppclass CQuantileOptions \
             "arrow::compute::QuantileOptions"(CFunctionOptions):
-        CQuantileOptions(vector[double] q, CQuantileInterp interpolation)
+        CQuantileOptions(vector[double] q, CQuantileInterp interpolation,
+                         c_bool skip_nulls, uint32_t min_count)
         vector[double] q
         CQuantileInterp interpolation
+        c_bool skip_nulls
+        uint32_t min_count
 
     cdef cppclass CTDigestOptions \
             "arrow::compute::TDigestOptions"(CFunctionOptions):
         CTDigestOptions(vector[double] q,
-                        unsigned int delta, unsigned int buffer_size)
+                        unsigned int delta, unsigned int buffer_size,
+                        c_bool skip_nulls, uint32_t min_count)
         vector[double] q
         unsigned int delta
         unsigned int buffer_size
+        c_bool skip_nulls
+        uint32_t min_count
 
     enum DatumType" arrow::Datum::type":
         DatumType_NONE" arrow::Datum::NONE"
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index c6a106fbebd..bbef46f2477 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -306,6 +306,14 @@ def test_mode_array():
     arr = pa.array([], type='int64')
     assert len(pc.mode(arr)) == 0
 
+    arr = pa.array([1, 1, 3, 4, 3, None], type='int64')
+    mode = pc.mode(arr, skip_nulls=False)
+    assert len(mode) == 0
+    mode = pc.mode(arr, min_count=6)
+    assert len(mode) == 0
+    mode = pc.mode(arr, skip_nulls=False, min_count=5)
+    assert len(mode) == 0
+
 
 def test_mode_chunked_array():
     # ARROW-9917
@@ -650,7 +658,8 @@ def 
test_generated_signatures(): "options=None, skip_nulls=True, min_count=1)") sig = inspect.signature(pc.quantile) assert str(sig) == ("(array, *, memory_pool=None, " - "options=None, q=0.5, interpolation='linear')") + "options=None, q=0.5, interpolation='linear', " + "skip_nulls=True, min_count=0)") sig = inspect.signature(pc.binary_join_element_wise) assert str(sig) == ("(*strings, memory_pool=None, options=None, " "null_handling='emit_null', null_replacement='')") diff --git a/r/src/compute.cpp b/r/src/compute.cpp index e84f70016a5..7d17f111d74 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -225,6 +225,12 @@ std::shared_ptr make_compute_options( cpp11::as_cpp( interpolation); } + if (!Rf_isNull(options["na.min_count"])) { + out->min_count = cpp11::as_cpp(options["na.min_count"]); + } + if (!Rf_isNull(options["na.rm"])) { + out->skip_nulls = cpp11::as_cpp(options["na.rm"]); + } return out; } @@ -376,8 +382,8 @@ std::shared_ptr make_compute_options( using Options = arrow::compute::VarianceOptions; auto out = std::make_shared(); out->ddof = cpp11::as_cpp(options["ddof"]); - if (!Rf_isNull(options["na.min_count"])) { - out->min_count = cpp11::as_cpp(options["na.min_count"]); + if (!Rf_isNull(options["min_count"])) { + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["na.rm"])) { out->skip_nulls = cpp11::as_cpp(options["na.rm"]); From 5ead37593472c42f61c76396dde7dcb8954bde70 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 3 Sep 2021 09:44:54 -0400 Subject: [PATCH 17/93] ARROW-13855: [C++][Python] Implement C data interface support for extension types Closes #11071 from pitrou/ARROW-13855-export-extension Authored-by: Antoine Pitrou Signed-off-by: David Li --- cpp/src/arrow/c/bridge.cc | 128 +++++-- cpp/src/arrow/c/bridge_test.cc | 349 +++++++++++++----- cpp/src/arrow/extension_type_test.cc | 2 + cpp/src/arrow/ipc/read_write_test.cc | 10 +- cpp/src/arrow/ipc/test_common.cc | 17 + cpp/src/arrow/ipc/test_common.h | 3 + cpp/src/arrow/testing/extension_type.h | 41 +- cpp/src/arrow/testing/gtest_util.cc | 84 +++-- .../arrow/testing/json_integration_test.cc | 7 +- cpp/src/arrow/util/key_value_metadata.cc | 8 +- cpp/src/arrow/util/key_value_metadata.h | 5 +- python/pyarrow/includes/libarrow.pxd | 9 + python/pyarrow/tests/test_cffi.py | 64 +++- python/pyarrow/tests/test_extension_type.py | 28 ++ python/pyarrow/types.pxi | 39 ++ 15 files changed, 626 insertions(+), 168 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 8b8153465ee..9484b44590a 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -28,6 +28,7 @@ #include "arrow/buffer.h" #include "arrow/c/helpers.h" #include "arrow/c/util_internal.h" +#include "arrow/extension_type.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" #include "arrow/result.h" @@ -56,8 +57,6 @@ using internal::ArrayExportTraits; using internal::SchemaExportGuard; using internal::SchemaExportTraits; -// TODO export / import Extension types and arrays - namespace { Status ExportingNotImplemented(const DataType& type) { @@ -171,23 +170,26 @@ struct SchemaExporter { export_.name_ = field.name(); flags_ = field.nullable() ? 
ARROW_FLAG_NULLABLE : 0; - const DataType& type = *field.type(); - RETURN_NOT_OK(ExportFormat(type)); - RETURN_NOT_OK(ExportChildren(type.fields())); + const DataType* type = UnwrapExtension(field.type().get()); + RETURN_NOT_OK(ExportFormat(*type)); + RETURN_NOT_OK(ExportChildren(type->fields())); RETURN_NOT_OK(ExportMetadata(field.metadata().get())); return Status::OK(); } - Status ExportType(const DataType& type) { + Status ExportType(const DataType& orig_type) { flags_ = ARROW_FLAG_NULLABLE; - RETURN_NOT_OK(ExportFormat(type)); - RETURN_NOT_OK(ExportChildren(type.fields())); + const DataType* type = UnwrapExtension(&orig_type); + RETURN_NOT_OK(ExportFormat(*type)); + RETURN_NOT_OK(ExportChildren(type->fields())); + // There may be additional metadata to export + RETURN_NOT_OK(ExportMetadata(nullptr)); return Status::OK(); } Status ExportSchema(const Schema& schema) { - static StructType dummy_struct_type({}); + static const StructType dummy_struct_type({}); flags_ = 0; RETURN_NOT_OK(ExportFormat(dummy_struct_type)); @@ -232,6 +234,17 @@ struct SchemaExporter { c_struct->release = ReleaseExportedSchema; } + const DataType* UnwrapExtension(const DataType* type) { + if (type->id() == Type::EXTENSION) { + const auto& ext_type = checked_cast(*type); + additional_metadata_.reserve(2); + additional_metadata_.emplace_back(kExtensionTypeKeyName, ext_type.extension_name()); + additional_metadata_.emplace_back(kExtensionMetadataKeyName, ext_type.Serialize()); + return ext_type.storage_type().get(); + } + return type; + } + Status ExportFormat(const DataType& type) { if (type.id() == Type::DICTIONARY) { const auto& dict_type = checked_cast(type); @@ -259,10 +272,29 @@ struct SchemaExporter { return Status::OK(); } - Status ExportMetadata(const KeyValueMetadata* metadata) { - if (metadata != nullptr && metadata->size() >= 0) { - ARROW_ASSIGN_OR_RAISE(export_.metadata_, EncodeMetadata(*metadata)); + Status ExportMetadata(const KeyValueMetadata* orig_metadata) { + static const KeyValueMetadata empty_metadata; + + if (orig_metadata == nullptr) { + orig_metadata = &empty_metadata; } + if (additional_metadata_.empty()) { + if (orig_metadata->size() > 0) { + ARROW_ASSIGN_OR_RAISE(export_.metadata_, EncodeMetadata(*orig_metadata)); + } + return Status::OK(); + } + // Additional metadata needs to be appended to the existing + // (for extension types) + KeyValueMetadata metadata(orig_metadata->keys(), orig_metadata->values()); + for (const auto& kv : additional_metadata_) { + // The metadata may already be there => ignore + if (metadata.Contains(kv.first)) { + continue; + } + metadata.Append(kv.first, kv.second); + } + ARROW_ASSIGN_OR_RAISE(export_.metadata_, EncodeMetadata(metadata)); return Status::OK(); } @@ -442,6 +474,7 @@ struct SchemaExporter { ExportedSchemaPrivateData export_; int64_t flags_ = 0; + std::vector> additional_metadata_; std::unique_ptr dict_exporter_; std::vector child_exporters_; }; @@ -721,7 +754,13 @@ class FormatStringParser { size_t index_; }; -Result> DecodeMetadata(const char* metadata) { +struct DecodedMetadata { + std::shared_ptr metadata; + std::string extension_name; + std::string extension_serialized; +}; + +Result DecodeMetadata(const char* metadata) { auto read_int32 = [&](int32_t* out) -> Status { int32_t v; memcpy(&v, metadata, 4); @@ -744,21 +783,29 @@ Result> DecodeMetadata(const char* metadata) { return Status::OK(); }; + DecodedMetadata decoded; + if (metadata == nullptr) { - return nullptr; + return decoded; } int32_t npairs; RETURN_NOT_OK(read_int32(&npairs)); if 
(npairs == 0) { - return nullptr; + return decoded; } std::vector keys(npairs); std::vector values(npairs); for (int32_t i = 0; i < npairs; ++i) { RETURN_NOT_OK(read_string(&keys[i])); RETURN_NOT_OK(read_string(&values[i])); + if (keys[i] == kExtensionTypeKeyName) { + decoded.extension_name = values[i]; + } else if (keys[i] == kExtensionMetadataKeyName) { + decoded.extension_serialized = values[i]; + } } - return key_value_metadata(std::move(keys), std::move(values)); + decoded.metadata = key_value_metadata(std::move(keys), std::move(values)); + return decoded; } struct SchemaImporter { @@ -775,10 +822,9 @@ struct SchemaImporter { } Result> MakeField() const { - ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata)); const char* name = c_struct_->name ? c_struct_->name : ""; bool nullable = (c_struct_->flags & ARROW_FLAG_NULLABLE) != 0; - return field(name, type_, nullable, std::move(metadata)); + return field(name, type_, nullable, std::move(metadata_.metadata)); } Result> MakeSchema() const { @@ -787,8 +833,7 @@ struct SchemaImporter { "Cannot import schema: ArrowSchema describes non-struct type ", type_->ToString()); } - ARROW_ASSIGN_OR_RAISE(auto metadata, DecodeMetadata(c_struct_->metadata)); - return schema(type_->fields(), std::move(metadata)); + return schema(type_->fields(), std::move(metadata_.metadata)); } Result> MakeType() const { return type_; } @@ -836,6 +881,20 @@ struct SchemaImporter { bool ordered = (c_struct_->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0; type_ = dictionary(type_, dict_importer.type_, ordered); } + + // Import metadata + ARROW_ASSIGN_OR_RAISE(metadata_, DecodeMetadata(c_struct_->metadata)); + + // Detect extension type + if (!metadata_.extension_name.empty()) { + const auto registered_ext_type = GetExtensionType(metadata_.extension_name); + if (registered_ext_type) { + ARROW_ASSIGN_OR_RAISE( + type_, registered_ext_type->Deserialize(std::move(type_), + metadata_.extension_serialized)); + } + } + return Status::OK(); } @@ -1130,6 +1189,7 @@ struct SchemaImporter { int64_t recursion_level_; std::vector child_importers_; std::shared_ptr type_; + DecodedMetadata metadata_; }; } // namespace @@ -1255,8 +1315,15 @@ struct ArrayImporter { } Status DoImport() { + // Unwrap extension type + const DataType* storage_type = type_.get(); + if (storage_type->id() == Type::EXTENSION) { + storage_type = + checked_cast(*storage_type).storage_type().get(); + } + // First import children (required for reconstituting parent array data) - const auto& fields = type_->fields(); + const auto& fields = storage_type->fields(); if (c_struct_->n_children != static_cast(fields.size())) { return Status::Invalid("ArrowArray struct has ", c_struct_->n_children, " children, expected ", fields.size(), " for type ", @@ -1270,15 +1337,15 @@ struct ArrayImporter { } // Import main data - RETURN_NOT_OK(ImportMainData()); + RETURN_NOT_OK(VisitTypeInline(*storage_type, this)); - bool is_dict_type = (type_->id() == Type::DICTIONARY); + bool is_dict_type = (storage_type->id() == Type::DICTIONARY); if (c_struct_->dictionary != nullptr) { if (!is_dict_type) { return Status::Invalid("Import type is ", type_->ToString(), " but dictionary field in ArrowArray struct is not null"); } - const auto& dict_type = checked_cast(*type_); + const auto& dict_type = checked_cast(*storage_type); // Import dictionary values ArrayImporter dict_importer(dict_type.value_type()); RETURN_NOT_OK(dict_importer.ImportDict(this, c_struct_->dictionary)); @@ -1292,13 +1359,11 @@ struct ArrayImporter { 
return Status::OK(); } - Status ImportMainData() { return VisitTypeInline(*type_, this); } - Status Visit(const DataType& type) { return Status::NotImplemented("Cannot import array of type ", type_->ToString()); } - Status Visit(const FixedWidthType& type) { return ImportFixedSizePrimitive(); } + Status Visit(const FixedWidthType& type) { return ImportFixedSizePrimitive(type); } Status Visit(const NullType& type) { RETURN_NOT_OK(CheckNoChildren()); @@ -1352,16 +1417,15 @@ struct ArrayImporter { return Status::OK(); } - Status ImportFixedSizePrimitive() { - const auto& fw_type = checked_cast(*type_); + Status ImportFixedSizePrimitive(const FixedWidthType& type) { RETURN_NOT_OK(CheckNoChildren()); RETURN_NOT_OK(CheckNumBuffers(2)); RETURN_NOT_OK(AllocateArrayData()); RETURN_NOT_OK(ImportNullBitmap()); - if (BitUtil::IsMultipleOf8(fw_type.bit_width())) { - RETURN_NOT_OK(ImportFixedSizeBuffer(1, fw_type.bit_width() / 8)); + if (BitUtil::IsMultipleOf8(type.bit_width())) { + RETURN_NOT_OK(ImportFixedSizeBuffer(1, type.bit_width() / 8)); } else { - DCHECK_EQ(fw_type.bit_width(), 1); + DCHECK_EQ(type.bit_width(), 1); RETURN_NOT_OK(ImportBitsBuffer(1)); } return Status::OK(); diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 54ce0efcf9d..c51cb66c03b 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -31,8 +31,10 @@ #include "arrow/c/util_internal.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" @@ -45,6 +47,7 @@ using internal::ArrayExportGuard; using internal::ArrayExportTraits; using internal::ArrayStreamExportGuard; using internal::ArrayStreamExportTraits; +using internal::checked_cast; using internal::SchemaExportGuard; using internal::SchemaExportTraits; @@ -122,6 +125,10 @@ using ArrayReleaseCallback = ReleaseCallback; static const std::vector kMetadataKeys1{"key1", "key2"}; static const std::vector kMetadataValues1{"", "bar"}; + +static const std::vector kMetadataKeys2{"key"}; +static const std::vector kMetadataValues2{"abcde"}; + // clang-format off static const std::string kEncodedMetadata1{ // NOLINT: runtime/string #if ARROW_LITTLE_ENDIAN @@ -133,11 +140,7 @@ static const std::string kEncodedMetadata1{ // NOLINT: runtime/string 0, 0, 0, 4, 'k', 'e', 'y', '1', 0, 0, 0, 0, 0, 0, 0, 4, 'k', 'e', 'y', '2', 0, 0, 0, 3, 'b', 'a', 'r'}; #endif -// clang-format on -static const std::vector kMetadataKeys2{"key"}; -static const std::vector kMetadataValues2{"abcde"}; -// clang-format off static const std::string kEncodedMetadata2{ // NOLINT: runtime/string #if ARROW_LITTLE_ENDIAN 1, 0, 0, 0, @@ -146,6 +149,51 @@ static const std::string kEncodedMetadata2{ // NOLINT: runtime/string 0, 0, 0, 1, 0, 0, 0, 3, 'k', 'e', 'y', 0, 0, 0, 5, 'a', 'b', 'c', 'd', 'e'}; #endif + +static const std::string kEncodedUuidMetadata = // NOLINT: runtime/string +#if ARROW_LITTLE_ENDIAN + std::string {2, 0, 0, 0} + + std::string {20, 0, 0, 0} + kExtensionTypeKeyName + + std::string {4, 0, 0, 0} + "uuid" + + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + + std::string {15, 0, 0, 0} + "uuid-serialized"; +#else + std::string {0, 0, 0, 2} + + std::string {0, 0, 0, 20} + kExtensionTypeKeyName + + std::string {0, 0, 0, 4} + "uuid" + + std::string {0, 0, 0, 24} + 
kExtensionMetadataKeyName + + std::string {0, 0, 0, 15} + "uuid-serialized"; +#endif + +static const std::string kEncodedDictExtensionMetadata = // NOLINT: runtime/string +#if ARROW_LITTLE_ENDIAN + std::string {2, 0, 0, 0} + + std::string {20, 0, 0, 0} + kExtensionTypeKeyName + + std::string {14, 0, 0, 0} + "dict-extension" + + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + + std::string {25, 0, 0, 0} + "dict-extension-serialized"; +#else + std::string {0, 0, 0, 2} + + std::string {0, 0, 0, 20} + kExtensionTypeKeyName + + std::string {0, 0, 0, 14} + "dict-extension" + + std::string {0, 0, 0, 24} + kExtensionMetadataKeyName + + std::string {0, 0, 0, 25} + "dict-extension-serialized"; +#endif + +static const std::string kEncodedComplex128Metadata = // NOLINT: runtime/string +#if ARROW_LITTLE_ENDIAN + std::string {2, 0, 0, 0} + + std::string {20, 0, 0, 0} + kExtensionTypeKeyName + + std::string {10, 0, 0, 0} + "complex128" + + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + + std::string {21, 0, 0, 0} + "complex128-serialized"; +#else + std::string {0, 0, 0, 2} + + std::string {0, 0, 0, 20} + kExtensionTypeKeyName + + std::string {0, 0, 0, 10} + "complex128" + + std::string {0, 0, 0, 24} + kExtensionMetadataKeyName + + std::string {0, 0, 0, 21} + "complex128-serialized"; +#endif // clang-format on static constexpr int64_t kDefaultFlags = ARROW_FLAG_NULLABLE; @@ -404,6 +452,16 @@ TEST_F(TestSchemaExport, Dictionary) { } } +TEST_F(TestSchemaExport, Extension) { + TestPrimitive(uuid(), "w:16", "", kDefaultFlags, kEncodedUuidMetadata); + + TestNested(dict_extension_type(), {"c", "u"}, {"", ""}, {kDefaultFlags, kDefaultFlags}, + {kEncodedDictExtensionMetadata, ""}); + + TestNested(complex128(), {"+s", "g", "g"}, {"", "real", "imag"}, + {ARROW_FLAG_NULLABLE, 0, 0}, {kEncodedComplex128Metadata, "", ""}); +} + TEST_F(TestSchemaExport, ExportField) { TestPrimitive(field("thing", null()), "n", "thing", ARROW_FLAG_NULLABLE); // With nullable = false @@ -507,11 +565,9 @@ class TestArrayExport : public ::testing::Test { public: void SetUp() override { pool_ = default_memory_pool(); } - static std::function*)> JSONArrayFactory( + static std::function>()> JSONArrayFactory( std::shared_ptr type, const char* json) { - return [=](std::shared_ptr* out) -> Status { - return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); - }; + return [=]() { return ArrayFromJSON(type, json); }; } template @@ -519,7 +575,7 @@ class TestArrayExport : public ::testing::Test { auto orig_bytes = pool_->bytes_allocated(); std::shared_ptr arr; - ASSERT_OK(factory(&arr)); + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); const ArrayData& data = *arr->data(); // non-owning reference struct ArrowArray c_export; ASSERT_OK(ExportArray(*arr, &c_export)); @@ -562,7 +618,7 @@ class TestArrayExport : public ::testing::Test { auto orig_bytes = pool_->bytes_allocated(); std::shared_ptr arr; - ASSERT_OK(factory(&arr)); + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); const ArrayData& data = *arr->data(); // non-owning reference struct ArrowArray c_export_temp, c_export_final; ASSERT_OK(ExportArray(*arr, &c_export_temp)); @@ -607,7 +663,7 @@ class TestArrayExport : public ::testing::Test { auto orig_bytes = pool_->bytes_allocated(); std::shared_ptr arr; - ASSERT_OK(factory(&arr)); + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); struct ArrowArray c_export_parent, c_export_child; ASSERT_OK(ExportArray(*arr, &c_export_parent)); @@ -661,7 +717,7 @@ class TestArrayExport : public ::testing::Test { auto orig_bytes = 
pool_->bytes_allocated(); std::shared_ptr arr; - ASSERT_OK(factory(&arr)); + ASSERT_OK_AND_ASSIGN(arr, ToResult(factory())); struct ArrowArray c_export_parent; ASSERT_OK(ExportArray(*arr, &c_export_parent)); @@ -752,10 +808,7 @@ TEST_F(TestArrayExport, Primitive) { } TEST_F(TestArrayExport, PrimitiveSliced) { - auto factory = [](std::shared_ptr* out) -> Status { - *out = ArrayFromJSON(int16(), "[1, 2, null, -3]")->Slice(1, 2); - return Status::OK(); - }; + auto factory = []() { return ArrayFromJSON(int16(), "[1, 2, null, -3]")->Slice(1, 2); }; TestPrimitive(factory); } @@ -802,18 +855,17 @@ TEST_F(TestArrayExport, List) { TEST_F(TestArrayExport, ListSliced) { { - auto factory = [](std::shared_ptr* out) -> Status { - *out = ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") - ->Slice(1, 2); - return Status::OK(); + auto factory = []() { + return ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); }; TestNested(factory); } { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->Slice(1, 6); auto offsets = ArrayFromJSON(int32(), "[0, 2, 3, 5, 6]")->Slice(2, 4); - return ListArray::FromArrays(*offsets, *values).Value(out); + return ListArray::FromArrays(*offsets, *values); }; TestNested(factory); } @@ -847,28 +899,25 @@ TEST_F(TestArrayExport, Union) { TEST_F(TestArrayExport, Dictionary) { { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); auto indices = ArrayFromJSON(uint16(), "[0, 2, 1, null, 1]"); return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), - indices, values) - .Value(out); + indices, values); }; TestNested(factory); } { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); return DictionaryArray::FromArrays( - dictionary(indices->type(), values->type(), /*ordered=*/true), indices, - values) - .Value(out); + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values); }; TestNested(factory); } { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() -> Result> { auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); ARROW_ASSIGN_OR_RAISE( @@ -876,13 +925,20 @@ TEST_F(TestArrayExport, Dictionary) { DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), indices, values)); auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); - RETURN_NOT_OK(LargeListArray::FromArrays(*offsets, *dict_array).Value(out)); - return (*out)->ValidateFull(); + ARROW_ASSIGN_OR_RAISE(auto arr, LargeListArray::FromArrays(*offsets, *dict_array)); + RETURN_NOT_OK(arr->ValidateFull()); + return arr; }; TestNested(factory); } } +TEST_F(TestArrayExport, Extension) { + TestPrimitive(ExampleUuid); + TestPrimitive(ExampleSmallint); + TestPrimitive(ExampleComplex128); +} + TEST_F(TestArrayExport, MovePrimitive) { TestMovePrimitive(int8(), "[1, 2, null, -3]"); TestMovePrimitive(fixed_size_binary(3), R"(["foo", "bar", null])"); @@ -898,17 +954,16 @@ TEST_F(TestArrayExport, MoveNested) { TEST_F(TestArrayExport, MoveDictionary) { { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(utf8(), R"(["foo", 
"bar", "quux"])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), - indices, values) - .Value(out); + indices, values); }; TestMoveNested(factory); } { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() -> Result> { auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); ARROW_ASSIGN_OR_RAISE( @@ -916,8 +971,9 @@ TEST_F(TestArrayExport, MoveDictionary) { DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), indices, values)); auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); - RETURN_NOT_OK(LargeListArray::FromArrays(*offsets, *dict_array).Value(out)); - return (*out)->ValidateFull(); + ARROW_ASSIGN_OR_RAISE(auto arr, LargeListArray::FromArrays(*offsets, *dict_array)); + RETURN_NOT_OK(arr->ValidateFull()); + return arr; }; TestMoveNested(factory); } @@ -934,7 +990,7 @@ TEST_F(TestArrayExport, MoveChild) { R"([[1, "foo"], [2, null]])", /*child_id=*/1); { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() -> Result> { auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); ARROW_ASSIGN_OR_RAISE( @@ -942,8 +998,9 @@ TEST_F(TestArrayExport, MoveChild) { DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), indices, values)); auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); - RETURN_NOT_OK(LargeListArray::FromArrays(*offsets, *dict_array).Value(out)); - return (*out)->ValidateFull(); + ARROW_ASSIGN_OR_RAISE(auto arr, LargeListArray::FromArrays(*offsets, *dict_array)); + RETURN_NOT_OK(arr->ValidateFull()); + return arr; }; TestMoveChild(factory, /*child_id=*/0); } @@ -1400,6 +1457,32 @@ TEST_F(TestSchemaImport, Dictionary) { CheckImport(expected); } +TEST_F(TestSchemaImport, UnregisteredExtension) { + FillPrimitive("w:16"); + c_struct_.metadata = kEncodedUuidMetadata.c_str(); + auto expected = fixed_size_binary(16); + CheckImport(expected); +} + +TEST_F(TestSchemaImport, RegisteredExtension) { + { + ExtensionTypeGuard guard(uuid()); + FillPrimitive("w:16"); + c_struct_.metadata = kEncodedUuidMetadata.c_str(); + auto expected = uuid(); + CheckImport(expected); + } + { + ExtensionTypeGuard guard(dict_extension_type()); + FillPrimitive(AddChild(), "u"); + FillPrimitive("c"); + FillDictionary(); + c_struct_.metadata = kEncodedDictExtensionMetadata.c_str(); + auto expected = dict_extension_type(); + CheckImport(expected); + } +} + TEST_F(TestSchemaImport, FormatStringError) { FillPrimitive(""); CheckImportError(); @@ -1481,6 +1564,22 @@ TEST_F(TestSchemaImport, DictionaryError) { CheckImportError(); } +TEST_F(TestSchemaImport, ExtensionError) { + ExtensionTypeGuard guard(uuid()); + + // Storage type doesn't match + FillPrimitive("w:15"); + c_struct_.metadata = kEncodedUuidMetadata.c_str(); + CheckImportError(); + + // Invalid serialization + std::string bogus_metadata = kEncodedUuidMetadata; + bogus_metadata[bogus_metadata.size() - 5] += 1; + FillPrimitive("w:16"); + c_struct_.metadata = bogus_metadata.c_str(); + CheckImportError(); +} + TEST_F(TestSchemaImport, RecursionError) { FillPrimitive(AddChild(), "c", "unused"); auto c = AddChild(); @@ -2163,21 +2262,44 @@ TEST_F(TestArrayImport, DictionaryWithOffset) { FillPrimitive(3, 0, 0, primitive_buffers_no_nulls4); FillDictionary(); - auto dict_values = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); - 
auto indices = ArrayFromJSON(int8(), "[1, 2, 0]"); - ASSERT_OK_AND_ASSIGN( - auto expected, - DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values)); + auto expected = DictArrayFromJSON(dictionary(int8(), utf8()), "[1, 2, 0]", + R"(["", "bar", "quux"])"); CheckImport(expected); FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); FillPrimitive(4, 0, 2, primitive_buffers_no_nulls4); FillDictionary(); - dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); - indices = ArrayFromJSON(int8(), "[0, 1, 3, 0]"); - ASSERT_OK_AND_ASSIGN(expected, DictionaryArray::FromArrays(dictionary(int8(), utf8()), - indices, dict_values)); + expected = DictArrayFromJSON(dictionary(int8(), utf8()), "[0, 1, 3, 0]", + R"(["foo", "", "bar", "quux"])"); + CheckImport(expected); +} + +TEST_F(TestArrayImport, RegisteredExtension) { + ExtensionTypeGuard guard({smallint(), dict_extension_type(), complex128()}); + + // smallint + FillPrimitive(3, 0, 0, primitive_buffers_no_nulls1_16); + auto expected = + ExtensionType::WrapArray(smallint(), ArrayFromJSON(int16(), "[513, 1027, 1541]")); + CheckImport(expected); + + // dict_extension_type + FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto storage = DictArrayFromJSON(dictionary(int8(), utf8()), "[1, 2, 0, 1, 3, 0]", + R"(["foo", "", "bar", "quux"])"); + expected = ExtensionType::WrapArray(dict_extension_type(), storage); + CheckImport(expected); + + // complex128 + FillPrimitive(AddChild(), 3, 0, /*offset=*/0, primitive_buffers_no_nulls6); + FillPrimitive(AddChild(), 3, 0, /*offset=*/3, primitive_buffers_no_nulls6); + FillStructLike(3, 0, 0, 2, buffers_no_nulls_no_data); + expected = MakeComplex128(ArrayFromJSON(float64(), "[0.0, 1.5, -2.0]"), + ArrayFromJSON(float64(), "[3.0, 4.0, 5.0]")); CheckImport(expected); } @@ -2341,8 +2463,9 @@ class TestSchemaRoundtrip : public ::testing::Test { public: void SetUp() override { pool_ = default_memory_pool(); } - template - void TestWithTypeFactory(TypeFactory&& factory) { + template + void TestWithTypeFactory(TypeFactory&& factory, + ExpectedTypeFactory&& factory_expected) { std::shared_ptr type, actual; struct ArrowSchema c_schema {}; // zeroed SchemaExportGuard schema_guard(&c_schema); @@ -2359,7 +2482,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the type ASSERT_OK_AND_ASSIGN(actual, ImportType(&c_schema)); - type = factory(); + type = factory_expected(); AssertTypeEqual(*type, *actual); type.reset(); actual.reset(); @@ -2367,6 +2490,11 @@ class TestSchemaRoundtrip : public ::testing::Test { ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); } + template + void TestWithTypeFactory(TypeFactory&& factory) { + TestWithTypeFactory(factory, factory); + } + template void TestWithSchemaFactory(SchemaFactory&& factory) { std::shared_ptr schema, actual; @@ -2459,6 +2587,27 @@ TEST_F(TestSchemaRoundtrip, Dictionary) { } } +TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { + TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); + TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); + + // Inside nested type + TestWithTypeFactory([]() { return list(dict_extension_type()); }, + []() { return list(dictionary(int8(), utf8())); }); +} + +TEST_F(TestSchemaRoundtrip, RegisteredExtension) { + ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()}); + TestWithTypeFactory(uuid); + 
TestWithTypeFactory(dict_extension_type); + TestWithTypeFactory(complex128); + + // Inside nested type + TestWithTypeFactory([]() { return list(uuid()); }); + TestWithTypeFactory([]() { return list(dict_extension_type()); }); + TestWithTypeFactory([]() { return list(complex128()); }); +} + TEST_F(TestSchemaRoundtrip, Map) { TestWithTypeFactory([&]() { return map(utf8(), int32()); }); TestWithTypeFactory([&]() { return map(list(utf8()), int32()); }); @@ -2482,28 +2631,30 @@ TEST_F(TestSchemaRoundtrip, Schema) { class TestArrayRoundtrip : public ::testing::Test { public: - using ArrayFactory = std::function*)>; + using ArrayFactory = std::function>()>; void SetUp() override { pool_ = default_memory_pool(); } static ArrayFactory JSONArrayFactory(std::shared_ptr type, const char* json) { - return [=](std::shared_ptr* out) -> Status { - return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); - }; + return [=]() { return ArrayFromJSON(type, json); }; } static ArrayFactory SlicedArrayFactory(ArrayFactory factory) { - return [=](std::shared_ptr* out) -> Status { - std::shared_ptr arr; - RETURN_NOT_OK(factory(&arr)); + return [=]() -> Result> { + ARROW_ASSIGN_OR_RAISE(auto arr, factory()); DCHECK_GE(arr->length(), 2); - *out = arr->Slice(1, arr->length() - 2); - return Status::OK(); + return arr->Slice(1, arr->length() - 2); }; } template void TestWithArrayFactory(ArrayFactory&& factory) { + TestWithArrayFactory(factory, factory); + } + + template + void TestWithArrayFactory(ArrayFactory&& factory, + ExpectedArrayFactory&& factory_expected) { std::shared_ptr array; struct ArrowArray c_array {}; struct ArrowSchema c_schema {}; @@ -2512,7 +2663,7 @@ class TestArrayRoundtrip : public ::testing::Test { auto orig_bytes = pool_->bytes_allocated(); - ASSERT_OK(factory(&array)); + ASSERT_OK_AND_ASSIGN(array, ToResult(factory())); ASSERT_OK(ExportType(*array->type(), &c_schema)); ASSERT_OK(ExportArray(*array, &c_array)); @@ -2539,7 +2690,7 @@ class TestArrayRoundtrip : public ::testing::Test { // Check value of imported array { std::shared_ptr expected; - ASSERT_OK(factory(&expected)); + ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); AssertTypeEqual(*expected->type(), *array->type()); AssertArraysEqual(*expected, *array, true); } @@ -2556,7 +2707,7 @@ class TestArrayRoundtrip : public ::testing::Test { SchemaExportGuard schema_guard(&c_schema); auto orig_bytes = pool_->bytes_allocated(); - ASSERT_OK(factory(&batch)); + ASSERT_OK_AND_ASSIGN(batch, ToResult(factory())); ASSERT_OK(ExportSchema(*batch->schema(), &c_schema)); ASSERT_OK(ExportRecordBatch(*batch, &c_array)); @@ -2579,7 +2730,7 @@ class TestArrayRoundtrip : public ::testing::Test { // Check value of imported record batch { std::shared_ptr expected; - ASSERT_OK(factory(&expected)); + ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); AssertSchemaEqual(*expected->schema(), *batch->schema()); AssertBatchesEqual(*expected, *batch); } @@ -2621,15 +2772,15 @@ TEST_F(TestArrayRoundtrip, Primitive) { } TEST_F(TestArrayRoundtrip, UnknownNullCount) { - TestWithArrayFactory([](std::shared_ptr* arr) -> Status { - *arr = ArrayFromJSON(int32(), "[0, 1, 2]"); - if ((*arr)->null_bitmap()) { + TestWithArrayFactory([]() -> Result> { + auto arr = ArrayFromJSON(int32(), "[0, 1, 2]"); + if (arr->null_bitmap()) { return Status::Invalid( "Failed precondition: " "the array shouldn't have a null bitmap."); } - (*arr)->data()->SetNullCount(kUnknownNullCount); - return Status::OK(); + arr->data()->SetNullCount(kUnknownNullCount); + return arr; 
}); } @@ -2670,30 +2821,62 @@ TEST_F(TestArrayRoundtrip, Nested) { TEST_F(TestArrayRoundtrip, Dictionary) { { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), - indices, values) - .Value(out); + indices, values); }; TestWithArrayFactory(factory); TestWithArrayFactory(SlicedArrayFactory(factory)); } { - auto factory = [](std::shared_ptr* out) -> Status { + auto factory = []() { auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); return DictionaryArray::FromArrays( - dictionary(indices->type(), values->type(), /*ordered=*/true), indices, - values) - .Value(out); + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values); }; TestWithArrayFactory(factory); TestWithArrayFactory(SlicedArrayFactory(factory)); } } +TEST_F(TestArrayRoundtrip, RegisteredExtension) { + ExtensionTypeGuard guard({smallint(), complex128(), dict_extension_type(), uuid()}); + + TestWithArrayFactory(ExampleSmallint); + TestWithArrayFactory(ExampleUuid); + TestWithArrayFactory(ExampleComplex128); + TestWithArrayFactory(ExampleDictExtension); + + // Nested inside outer array + auto NestedFactory = [](ArrayFactory factory) { + return [factory]() -> Result> { + ARROW_ASSIGN_OR_RAISE(auto arr, ToResult(factory())); + return FixedSizeListArray::FromArrays(arr, /*list_size=*/1); + }; + }; + TestWithArrayFactory(NestedFactory(ExampleSmallint)); + TestWithArrayFactory(NestedFactory(ExampleUuid)); + TestWithArrayFactory(NestedFactory(ExampleComplex128)); + TestWithArrayFactory(NestedFactory(ExampleDictExtension)); +} + +TEST_F(TestArrayRoundtrip, UnregisteredExtension) { + auto StorageExtractor = [](ArrayFactory factory) { + return [factory]() -> Result> { + ARROW_ASSIGN_OR_RAISE(auto arr, ToResult(factory())); + return checked_cast(*arr).storage(); + }; + }; + + TestWithArrayFactory(ExampleSmallint, StorageExtractor(ExampleSmallint)); + TestWithArrayFactory(ExampleUuid, StorageExtractor(ExampleUuid)); + TestWithArrayFactory(ExampleComplex128, StorageExtractor(ExampleComplex128)); + TestWithArrayFactory(ExampleDictExtension, StorageExtractor(ExampleDictExtension)); +} + TEST_F(TestArrayRoundtrip, RecordBatch) { auto schema = ::arrow::schema( {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); @@ -2701,22 +2884,18 @@ TEST_F(TestArrayRoundtrip, RecordBatch) { auto arr1 = ArrayFromJSON(boolean(), "[false, true, false]"); { - auto factory = [&](std::shared_ptr* out) -> Status { - *out = RecordBatch::Make(schema, 3, {arr0, arr1}); - return Status::OK(); - }; + auto factory = [&]() { return RecordBatch::Make(schema, 3, {arr0, arr1}); }; TestWithBatchFactory(factory); } { // With schema and field metadata - auto factory = [&](std::shared_ptr* out) -> Status { + auto factory = [&]() { auto f0 = schema->field(0); auto f1 = schema->field(1); f1 = f1->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); auto schema_with_md = ::arrow::schema({f0, f1}, key_value_metadata(kMetadataKeys2, kMetadataValues2)); - *out = RecordBatch::Make(schema_with_md, 3, {arr0, arr1}); - return Status::OK(); + return RecordBatch::Make(schema_with_md, 3, {arr0, arr1}); }; TestWithBatchFactory(factory); } diff --git a/cpp/src/arrow/extension_type_test.cc 
b/cpp/src/arrow/extension_type_test.cc index cd1c3b9790e..31222d74806 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -325,10 +325,12 @@ TEST_F(TestExtensionType, ValidateExtensionArray) { auto p1_type = std::make_shared(6); auto ext_arr2 = ExampleParametric(p1_type, "[null, 1, 2, 3]"); auto ext_arr3 = ExampleStruct(); + auto ext_arr4 = ExampleComplex128(); ASSERT_OK(ext_arr1->ValidateFull()); ASSERT_OK(ext_arr2->ValidateFull()); ASSERT_OK(ext_arr3->ValidateFull()); + ASSERT_OK(ext_arr4->ValidateFull()); } } // namespace arrow diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 245534b1d5c..d7b7fb54eaf 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -355,20 +355,18 @@ const std::vector kBatchCases = { &MakeFloatBatch, &MakeIntervals, &MakeUuid, + &MakeComplex128, &MakeDictExtension}; static int g_file_number = 0; class ExtensionTypesMixin { public: - ExtensionTypesMixin() { - // Register the extension types required to ensure roundtripping - ext_guards_.emplace_back(uuid()); - ext_guards_.emplace_back(dict_extension_type()); - } + // Register the extension types required to ensure roundtripping + ExtensionTypesMixin() : ext_guard_({uuid(), dict_extension_type(), complex128()}) {} protected: - std::vector ext_guards_; + ExtensionTypeGuard ext_guard_; }; class IpcTestFixture : public io::MemoryMapFixture, public ExtensionTypesMixin { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index c93f1f60e6e..5068eca001a 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -985,6 +985,23 @@ Status MakeUuid(std::shared_ptr* out) { return Status::OK(); } +Status MakeComplex128(std::shared_ptr* out) { + auto type = complex128(); + auto storage_type = checked_cast(*type).storage_type(); + + auto f0 = field("f0", type); + auto f1 = field("f1", type, /*nullable=*/false); + auto schema = ::arrow::schema({f0, f1}); + + auto a0 = ExtensionType::WrapArray(complex128(), + ArrayFromJSON(storage_type, "[[1.0, -2.5], null]")); + auto a1 = ExtensionType::WrapArray( + complex128(), ArrayFromJSON(storage_type, "[[1.0, -2.5], [3.0, -4.0]]")); + + *out = RecordBatch::Make(schema, a1->length(), {a0, a1}); + return Status::OK(); +} + Status MakeDictExtension(std::shared_ptr* out) { auto type = dict_extension_type(); auto storage_type = checked_cast(*type).storage_type(); diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 2217bae39fc..48df28b2d5a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -159,6 +159,9 @@ Status MakeNull(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeUuid(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeComplex128(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeDictExtension(std::shared_ptr* out); diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 4163c9d8358..5afe2340076 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -19,6 +19,7 @@ #include #include +#include #include "arrow/extension_type.h" #include "arrow/testing/visibility.h" @@ -87,6 +88,30 @@ class ARROW_TESTING_EXPORT DictExtensionType : public ExtensionType { std::string Serialize() const override { return "dict-extension-serialized"; } }; +class ARROW_TESTING_EXPORT Complex128Array : public ExtensionArray { + public: + using 
ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT Complex128Type : public ExtensionType {
+ public:
+  Complex128Type()
+      : ExtensionType(struct_({::arrow::field("real", float64(), /*nullable=*/false),
+                               ::arrow::field("imag", float64(), /*nullable=*/false)})) {}
+
+  std::string extension_name() const override { return "complex128"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "complex128-serialized"; }
+};
+
 ARROW_TESTING_EXPORT
 std::shared_ptr<DataType> uuid();
 
@@ -96,24 +121,38 @@ std::shared_ptr<DataType> smallint();
 ARROW_TESTING_EXPORT
 std::shared_ptr<DataType> dict_extension_type();
 
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> complex128();
+
 ARROW_TESTING_EXPORT
 std::shared_ptr<Array> ExampleUuid();
 
 ARROW_TESTING_EXPORT
 std::shared_ptr<Array> ExampleSmallint();
 
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleDictExtension();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleComplex128();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> MakeComplex128(const std::shared_ptr<Array>& real,
+                                      const std::shared_ptr<Array>& imag);
+
 // A RAII class that registers an extension type on construction
 // and unregisters it on destruction.
 class ARROW_TESTING_EXPORT ExtensionTypeGuard {
  public:
  explicit ExtensionTypeGuard(const std::shared_ptr<DataType>& type);
+  explicit ExtensionTypeGuard(const DataTypeVector& types);
  ~ExtensionTypeGuard();
  ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionTypeGuard);
 
 protected:
  ARROW_DISALLOW_COPY_AND_ASSIGN(ExtensionTypeGuard);
 
-  std::string extension_name_;
+  std::vector<std::string> extension_names_;
 };
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc
index 3e7c9b78c6b..587154c1f30 100644
--- a/cpp/src/arrow/testing/gtest_util.cc
+++ b/cpp/src/arrow/testing/gtest_util.cc
@@ -811,6 +811,28 @@ Result<std::shared_ptr<DataType>> DictExtensionType::Deserialize(
   return std::make_shared<DictExtensionType>();
 }
 
+bool Complex128Type::ExtensionEquals(const ExtensionType& other) const {
+  return (other.extension_name() == this->extension_name());
+}
+
+std::shared_ptr<Array> Complex128Type::MakeArray(std::shared_ptr<ArrayData> data) const {
+  DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  DCHECK(ExtensionEquals(checked_cast<const ExtensionType&>(*data->type)));
+  return std::make_shared<Complex128Array>(data);
+}
+
+Result<std::shared_ptr<DataType>> Complex128Type::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized) const {
+  if (serialized != "complex128-serialized") {
+    return Status::Invalid("Type identifier did not match: '", serialized, "'");
+  }
+  if (!storage_type->Equals(*storage_type_)) {
+    return Status::Invalid("Invalid storage type for Complex128Type: ",
+                           storage_type->ToString());
+  }
+  return std::make_shared<Complex128Type>();
+}
+
 std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }
 
 std::shared_ptr<DataType> smallint() { return std::make_shared<SmallintType>(); }
@@ -819,40 +841,58 @@ std::shared_ptr<DataType> dict_extension_type() {
   return std::make_shared<DictExtensionType>();
 }
 
-std::shared_ptr<Array> ExampleUuid() {
-  auto storage_type = fixed_size_binary(16);
-  auto ext_type = uuid();
+std::shared_ptr<DataType> complex128() { return std::make_shared<Complex128Type>(); }
 
+std::shared_ptr<Array> MakeComplex128(const std::shared_ptr<Array>& real,
+                                      const std::shared_ptr<Array>& imag) {
+  auto type = complex128();
+  std::shared_ptr<Array> storage(
+      new StructArray(checked_cast<const ExtensionType&>(*type).storage_type(),
+                      real->length(), {real, imag}));
+  return ExtensionType::WrapArray(type, storage);
+}
+
+std::shared_ptr<Array> ExampleUuid() {
   auto arr = ArrayFromJSON(
-      storage_type,
+      
fixed_size_binary(16), "[null, \"abcdefghijklmno0\", \"abcdefghijklmno1\", \"abcdefghijklmno2\"]"); - - auto ext_data = arr->data()->Copy(); - ext_data->type = ext_type; - return MakeArray(ext_data); + return ExtensionType::WrapArray(uuid(), arr); } std::shared_ptr ExampleSmallint() { - auto storage_type = int16(); - auto ext_type = smallint(); - auto arr = ArrayFromJSON(storage_type, "[-32768, null, 1, 2, 3, 4, 32767]"); - auto ext_data = arr->data()->Copy(); - ext_data->type = ext_type; - return MakeArray(ext_data); + auto arr = ArrayFromJSON(int16(), "[-32768, null, 1, 2, 3, 4, 32767]"); + return ExtensionType::WrapArray(smallint(), arr); } -ExtensionTypeGuard::ExtensionTypeGuard(const std::shared_ptr& type) { - ARROW_CHECK_EQ(type->id(), Type::EXTENSION); - auto ext_type = checked_pointer_cast(type); +std::shared_ptr ExampleDictExtension() { + auto arr = DictArrayFromJSON(dictionary(int8(), utf8()), "[0, 1, null, 1]", + R"(["foo", "bar"])"); + return ExtensionType::WrapArray(dict_extension_type(), arr); +} + +std::shared_ptr ExampleComplex128() { + auto arr = ArrayFromJSON(struct_({field("", float64()), field("", float64())}), + "[[1.0, -2.5], null, [3.0, -4.5]]"); + return ExtensionType::WrapArray(complex128(), arr); +} - ARROW_CHECK_OK(RegisterExtensionType(ext_type)); - extension_name_ = ext_type->extension_name(); - DCHECK(!extension_name_.empty()); +ExtensionTypeGuard::ExtensionTypeGuard(const std::shared_ptr& type) + : ExtensionTypeGuard(DataTypeVector{type}) {} + +ExtensionTypeGuard::ExtensionTypeGuard(const DataTypeVector& types) { + for (const auto& type : types) { + ARROW_CHECK_EQ(type->id(), Type::EXTENSION); + auto ext_type = checked_pointer_cast(type); + + ARROW_CHECK_OK(RegisterExtensionType(ext_type)); + extension_names_.push_back(ext_type->extension_name()); + DCHECK(!extension_names_.back().empty()); + } } ExtensionTypeGuard::~ExtensionTypeGuard() { - if (!extension_name_.empty()) { - ARROW_CHECK_OK(UnregisterExtensionType(extension_name_)); + for (const auto& name : extension_names_) { + ARROW_CHECK_OK(UnregisterExtensionType(name)); } } diff --git a/cpp/src/arrow/testing/json_integration_test.cc b/cpp/src/arrow/testing/json_integration_test.cc index 34b871c56c1..55620119550 100644 --- a/cpp/src/arrow/testing/json_integration_test.cc +++ b/cpp/src/arrow/testing/json_integration_test.cc @@ -197,8 +197,7 @@ Status RunCommand(const std::string& json_path, const std::string& arrow_path, const std::string& command) { // Make sure the required extension types are registered, as they will be // referenced in test data. 
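  // For illustration, the consolidated guard below is roughly equivalent to
  // calling RegisterExtensionType() for each type up front and
  // UnregisterExtensionType() for each recorded name when the guard goes out
  // of scope (see the ExtensionTypeGuard implementation in gtest_util.cc
  // above), so a failed command cannot leak registrations into later tests.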
-  ExtensionTypeGuard uuid_ext_guard(uuid());
-  ExtensionTypeGuard dict_ext_guard(dict_extension_type());
+  ExtensionTypeGuard ext_guard({uuid(), dict_extension_type()});
 
   if (json_path == "") {
     return Status::Invalid("Must specify json file name");
   }
@@ -1105,8 +1104,7 @@ class TestJsonRoundTrip : public ::testing::TestWithParam {
 };
 
 void CheckRoundtrip(const RecordBatch& batch) {
-  ExtensionTypeGuard uuid_ext_guard(uuid());
-  ExtensionTypeGuard dict_ext_guard(dict_extension_type());
+  ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()});
 
   TestSchemaRoundTrip(*batch.schema());
 
@@ -1160,6 +1158,7 @@ const std::vector kBatchCases = {
     &MakeFloatBatch,
     &MakeIntervals,
     &MakeUuid,
+    &MakeComplex128,
     &MakeDictExtension};
 
 INSTANTIATE_TEST_SUITE_P(TestJsonRoundTrip, TestJsonRoundTrip,
diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc
index ad3b686a9bd..fd179a8bf38 100644
--- a/cpp/src/arrow/util/key_value_metadata.cc
+++ b/cpp/src/arrow/util/key_value_metadata.cc
@@ -56,8 +56,6 @@ static std::vector<std::string> UnorderedMapValues(
   return values;
 }
 
-KeyValueMetadata::KeyValueMetadata() : keys_(), values_() {}
-
 KeyValueMetadata::KeyValueMetadata(
     const std::unordered_map<std::string, std::string>& map)
     : keys_(UnorderedMapKeys(map)), values_(UnorderedMapValues(map)) {
@@ -85,9 +83,9 @@ void KeyValueMetadata::ToUnorderedMap(
   }
 }
 
-void KeyValueMetadata::Append(const std::string& key, const std::string& value) {
-  keys_.push_back(key);
-  values_.push_back(value);
+void KeyValueMetadata::Append(std::string key, std::string value) {
+  keys_.push_back(std::move(key));
+  values_.push_back(std::move(value));
 }
 
 Result<std::string> KeyValueMetadata::Get(const std::string& key) const {
diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h
index d42ab78f667..2a31bf378b0 100644
--- a/cpp/src/arrow/util/key_value_metadata.h
+++ b/cpp/src/arrow/util/key_value_metadata.h
@@ -34,16 +34,15 @@ namespace arrow {
 
 /// \brief A container for key-value pair type metadata. 
Not thread-safe class ARROW_EXPORT KeyValueMetadata { public: - KeyValueMetadata(); + KeyValueMetadata() = default; KeyValueMetadata(std::vector keys, std::vector values); explicit KeyValueMetadata(const std::unordered_map& map); - virtual ~KeyValueMetadata() = default; static std::shared_ptr Make(std::vector keys, std::vector values); void ToUnorderedMap(std::unordered_map* out) const; - void Append(const std::string& key, const std::string& value); + void Append(std::string key, std::string value); Result Get(const std::string& key) const; bool Contains(const std::string& key) const; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 29351e0b648..dc41e577d94 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -146,6 +146,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type id() c_bool Equals(const CDataType& other) + c_bool Equals(const shared_ptr[CDataType]& other) shared_ptr[CField] field(int i) const vector[shared_ptr[CField]] fields() @@ -2349,6 +2350,14 @@ cdef extern from 'arrow/extension_type.h' namespace 'arrow': c_string extension_name() shared_ptr[CDataType] storage_type() + @staticmethod + shared_ptr[CArray] WrapArray(shared_ptr[CDataType] ext_type, + shared_ptr[CArray] storage) + + @staticmethod + shared_ptr[CChunkedArray] WrapArray(shared_ptr[CDataType] ext_type, + shared_ptr[CChunkedArray] storage) + cdef cppclass CExtensionArray" arrow::ExtensionArray"(CArray): CExtensionArray(shared_ptr[CDataType], shared_ptr[CArray] storage) diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 2ac30fd2cf2..f0ce42909f1 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -47,15 +47,41 @@ ValueError, match="Cannot import released ArrowArrayStream") +class ParamExtType(pa.PyExtensionType): + + def __init__(self, width): + self._width = width + pa.PyExtensionType.__init__(self, pa.binary(width)) + + @property + def width(self): + return self._width + + def __reduce__(self): + return ParamExtType, (self.width,) + + def make_schema(): return pa.schema([('ints', pa.list_(pa.int32()))], metadata={b'key1': b'value1'}) +def make_extension_schema(): + return pa.schema([('ext', ParamExtType(3))], + metadata={b'key1': b'value1'}) + + def make_batch(): return pa.record_batch([[[1], [2, 42]]], make_schema()) +def make_extension_batch(): + schema = make_extension_schema() + ext_col = schema[0].type.wrap_array(pa.array([b"foo", b"bar"], + type=pa.binary(3))) + return pa.record_batch([ext_col], schema) + + def make_batches(): schema = make_schema() return [ @@ -174,19 +200,18 @@ def test_export_import_array(): pa.Array._import_from_c(ptr_array, ptr_schema) -@needs_cffi -def test_export_import_schema(): +def check_export_import_schema(schema_factory): c_schema = ffi.new("struct ArrowSchema*") ptr_schema = int(ffi.cast("uintptr_t", c_schema)) gc.collect() # Make sure no Arrow data dangles in a ref cycle old_allocated = pa.total_allocated_bytes() - make_schema()._export_to_c(ptr_schema) + schema_factory()._export_to_c(ptr_schema) assert pa.total_allocated_bytes() > old_allocated # Delete and recreate C++ object from exported pointer schema_new = pa.Schema._import_from_c(ptr_schema) - assert schema_new == make_schema() + assert schema_new == schema_factory() assert pa.total_allocated_bytes() == old_allocated del schema_new assert pa.total_allocated_bytes() == old_allocated @@ -205,7 +230,16 @@ def test_export_import_schema(): @needs_cffi 
-def test_export_import_batch(): +def test_export_import_schema(): + check_export_import_schema(make_schema) + + +@needs_cffi +def test_export_import_schema_with_extension(): + check_export_import_schema(make_extension_schema) + + +def check_export_import_batch(batch_factory): c_schema = ffi.new("struct ArrowSchema*") ptr_schema = int(ffi.cast("uintptr_t", c_schema)) c_array = ffi.new("struct ArrowArray*") @@ -215,8 +249,8 @@ def test_export_import_batch(): old_allocated = pa.total_allocated_bytes() # Schema is known up front - schema = make_schema() - batch = make_batch() + batch = batch_factory() + schema = batch.schema py_value = batch.to_pydict() batch._export_to_c(ptr_array) assert pa.total_allocated_bytes() > old_allocated @@ -233,14 +267,14 @@ def test_export_import_batch(): pa.RecordBatch._import_from_c(ptr_array, make_schema()) # Type is exported and imported at the same time - batch = make_batch() + batch = batch_factory() py_value = batch.to_pydict() batch._export_to_c(ptr_array, ptr_schema) # Delete and recreate C++ objects from exported pointers del batch batch_new = pa.RecordBatch._import_from_c(ptr_array, ptr_schema) assert batch_new.to_pydict() == py_value - assert batch_new.schema == make_schema() + assert batch_new.schema == batch_factory().schema assert pa.total_allocated_bytes() > old_allocated del batch_new assert pa.total_allocated_bytes() == old_allocated @@ -250,7 +284,7 @@ def test_export_import_batch(): # Not a struct type pa.int32()._export_to_c(ptr_schema) - make_batch()._export_to_c(ptr_array) + batch_factory()._export_to_c(ptr_array) with pytest.raises(ValueError, match="ArrowSchema describes non-struct type"): pa.RecordBatch._import_from_c(ptr_array, ptr_schema) @@ -259,6 +293,16 @@ def test_export_import_batch(): pa.RecordBatch._import_from_c(ptr_array, ptr_schema) +@needs_cffi +def test_export_import_batch(): + check_export_import_batch(make_batch) + + +@needs_cffi +def test_export_import_batch_with_extension(): + check_export_import_batch(make_extension_batch) + + def _export_import_batch_reader(ptr_stream, reader_factory): # Prepare input batches = make_batches() diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 391149772cc..d166c2af83e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -201,6 +201,34 @@ def test_ext_array_equality(): assert not d.equals(f) +def test_ext_array_wrap_array(): + ty = ParamExtType(3) + storage = pa.array([b"foo", b"bar", None], type=pa.binary(3)) + arr = ty.wrap_array(storage) + arr.validate(full=True) + assert isinstance(arr, pa.ExtensionArray) + assert arr.type == ty + assert arr.storage == storage + + storage = pa.chunked_array([[b"abc", b"def"], [b"ghi"]], + type=pa.binary(3)) + arr = ty.wrap_array(storage) + arr.validate(full=True) + assert isinstance(arr, pa.ChunkedArray) + assert arr.type == ty + assert arr.chunk(0).storage == storage.chunk(0) + assert arr.chunk(1).storage == storage.chunk(1) + + # Wrong storage type + storage = pa.array([b"foo", b"bar", None]) + with pytest.raises(TypeError, match="Incompatible storage type"): + ty.wrap_array(storage) + + # Not an array or chunked array + with pytest.raises(TypeError, match="Expected array or chunked array"): + ty.wrap_array(None) + + def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 06f753fa18d..5b478ed7746 100644 --- 
a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -708,6 +708,45 @@ cdef class BaseExtensionType(DataType):
         """
         return pyarrow_wrap_data_type(self.ext_type.storage_type())
 
+    def wrap_array(self, storage):
+        """
+        Wrap the given storage array as an extension array.
+
+        Parameters
+        ----------
+        storage : Array or ChunkedArray
+
+        Returns
+        -------
+        array : Array or ChunkedArray
+            Extension array wrapping the storage array
+        """
+        cdef:
+            shared_ptr[CDataType] c_storage_type
+
+        if isinstance(storage, Array):
+            c_storage_type = (<Array> storage).ap.type()
+        elif isinstance(storage, ChunkedArray):
+            c_storage_type = (<ChunkedArray> storage).chunked_array.type()
+        else:
+            raise TypeError(
+                f"Expected array or chunked array, got {storage.__class__}")
+
+        if not c_storage_type.get().Equals(deref(self.ext_type)
+                                           .storage_type()):
+            raise TypeError(
+                f"Incompatible storage type for {self}: "
+                f"expected {self.storage_type}, got {storage.type}")
+
+        if isinstance(storage, Array):
+            return pyarrow_wrap_array(
+                self.ext_type.WrapArray(
+                    self.sp_type, (<Array> storage).sp_array))
+        else:
+            return pyarrow_wrap_chunked_array(
+                self.ext_type.WrapArray(
+                    self.sp_type, (<ChunkedArray> storage).sp_chunked_array))
+
 
 cdef class ExtensionType(BaseExtensionType):
     """

From e9251b015184753c53f9a26f32e8f89a8d21a760 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Fri, 3 Sep 2021 12:51:23 -0400
Subject: [PATCH 18/93] ARROW-13740: [R] summarize() should not eagerly evaluate

- [x] collect() uses ExecPlan
- [x] arrange() uses an OrderBySink
- [x] .data inside of arrow_dplyr_query can itself be arrow_dplyr_query
- [x] can build more query after calling summarize()
- [x] handle non-deterministic dataset collect() tests
- [x] fix group_by-expression behavior
- [x] make official collapse() method with more testing of faithful behavior after collapsing
- [x] make sort after summarize be configurable by option (default FALSE, though local_options TRUE in the tests)
- [x] add print method for collapsed query
- [x] Skip 32-bit rtools35 dataset tests/examples
~~- [ ] should queries on in-memory data evaluate eagerly (like dplyr)?~~

Followups:

* ARROW-13777: [R] mutate after group_by should be ok as long as there are only scalar functions
* ARROW-13778: [R] Handle complex summarize expressions
* ARROW-13779: [R] Disallow expressions that depend on order after arrange()
* ARROW-13852: [R] Handle Dataset schema metadata in ExecPlan
* ARROW-13854: [R] More accurately determine output type of an aggregation expression
* ARROW-13893: [R] Improve head/tail/[ methods on Dataset and queries

Closes #10992 from nealrichardson/subquery

Lead-authored-by: Neal Richardson
Co-authored-by: Jonathan Keane
Signed-off-by: Neal Richardson
---
 .github/workflows/r.yml                       |   1 +
 r/NAMESPACE                                   |   1 +
 r/NEWS.md                                     |   5 +
 r/R/arrow-package.R                           |   4 +-
 r/R/arrowExports.R                            |   8 +-
 r/R/dataset-scan.R                            |  14 +-
 r/R/dplyr-arrange.R                           |   2 +-
 r/R/dplyr-collect.R                           |  74 +-
 r/R/dplyr-filter.R                            |   2 +-
 r/R/dplyr-functions.R                         |  15 +
 r/R/dplyr-group-by.R                          |   2 +-
 r/R/dplyr-mutate.R                            |  17 +-
 r/R/dplyr-select.R                            |   6 +-
 r/R/dplyr-summarize.R                         |  69 +-
 r/R/dplyr.R                                   |  85 +-
 r/R/duckdb.R                                  |   2 +-
 r/R/expression.R                              |   3 +
 r/R/query-engine.R                            | 126 ++-
 r/src/arrowExports.cpp                        | 916 +++++++++---------
 r/src/compute-exec.cpp                        |  16 +-
 r/src/expression.cpp                          |   5 +
 r/tests/testthat/helper-skip.R                |  14 +
 r/tests/testthat/test-dataset.R               |  24 +-
 r/tests/testthat/test-dplyr-collapse.R        | 210 ++++
 ...lyr-aggregate.R => test-dplyr-summarize.R} | 120 ++-
 r/tests/testthat/test-duckdb.R                |   1 +
 r/tests/testthat/test-metadata.R              |   5 +- 
r/tests/testthat/test-python.R | 5 +- r/tests/testthat/test-s3-minio.R | 22 +- 29 files changed, 1124 insertions(+), 650 deletions(-) create mode 100644 r/tests/testthat/test-dplyr-collapse.R rename r/tests/testthat/{test-dplyr-aggregate.R => test-dplyr-summarize.R} (72%) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 5acb47a0ae0..e160ba8128a 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -247,6 +247,7 @@ jobs: Sys.setenv( RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "libarrow.zip"), MAKEFLAGS = paste0("-j", parallel::detectCores()), + ARROW_R_DEV = TRUE, "_R_CHECK_FORCE_SUGGESTS_" = FALSE ) rcmdcheck::rcmdcheck("r", diff --git a/r/NAMESPACE b/r/NAMESPACE index 8ce6d162eb0..5e78d04de52 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -291,6 +291,7 @@ importFrom(bit64,print.integer64) importFrom(bit64,str.integer64) importFrom(methods,as) importFrom(purrr,as_mapper) +importFrom(purrr,imap) importFrom(purrr,imap_chr) importFrom(purrr,keep) importFrom(purrr,map) diff --git a/r/NEWS.md b/r/NEWS.md index 2a22681e457..eb8001d4718 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,11 @@ # arrow 5.0.0.9000 +## Breaking changes + +* `dplyr::summarize()` on an in-memory Arrow Table or RecordBatch no longer eagerly evaluates. Call `compute()` or `collect()` to evaluate the query. +* Row order of data from a Dataset query is no longer deterministic. If you need a stable sort order, you should explicitly `arrange()` the query. For calls to `summarize()`, you can set `options(arrow.summarise.sort = TRUE)` to match the current `dplyr` behavior of sorting on the grouping columns. + # arrow 5.0.0 ## More dplyr diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 537eebb1b1d..c09b8f05319 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -17,7 +17,7 @@ #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class -#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap_chr +#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind as_label set_names exec @@ -35,7 +35,7 @@ c( "select", "filter", "collect", "summarise", "group_by", "groups", "group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute", - "arrange", "rename", "pull", "relocate", "compute" + "arrange", "rename", "pull", "relocate", "compute", "collapse" ) ) for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) { diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 72a5e455858..b852a3d8ca9 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -280,8 +280,8 @@ ExecPlan_create <- function(use_threads) { .Call(`_arrow_ExecPlan_create`, use_threads) } -ExecPlan_run <- function(plan, final_node) { - .Call(`_arrow_ExecPlan_run`, plan, final_node) +ExecPlan_run <- function(plan, final_node, sort_options) { + .Call(`_arrow_ExecPlan_run`, plan, final_node, sort_options) } ExecNode_Scan <- function(plan, dataset, filter, materialized_field_names) { @@ -816,6 +816,10 @@ FixedSizeListType__list_size <- function(type) { .Call(`_arrow_FixedSizeListType__list_size`, type) } +compute___expr__equals <- function(lhs, rhs) { + .Call(`_arrow_compute___expr__equals`, lhs, rhs) +} + compute___expr__call <- function(func_name, 
argument_list, options) { .Call(`_arrow_compute___expr__call`, func_name, argument_list, options) } diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index 615b0f945a8..75108df1052 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -73,18 +73,14 @@ Scanner$create <- function(dataset, projection = NULL, filter = TRUE, use_threads = option_use_threads(), - use_async = NULL, + use_async = getOption("arrow.use_async", FALSE), batch_size = NULL, fragment_scan_options = NULL, ...) { - if (is.null(use_async)) { - use_async <- getOption("arrow.use_async", FALSE) - } - if (inherits(dataset, "arrow_dplyr_query")) { - if (inherits(dataset$.data, "ArrowTabular")) { - # To handle mutate() on Table/RecordBatch, we need to collect(as_data_frame=FALSE) now - dataset <- dplyr::collect(dataset, as_data_frame = FALSE) + if (is_collapsed(dataset)) { + # TODO: Is there a way to get a RecordBatchReader rather than evaluating? + dataset$.data <- as_adq(dplyr::compute(dataset$.data))$.data } proj <- c(dataset$selected_columns, dataset$temp_columns) @@ -117,7 +113,7 @@ Scanner$create <- function(dataset, ... )) } - if (inherits(dataset, c("data.frame", "RecordBatch", "Table"))) { + if (inherits(dataset, c("data.frame", "ArrowTabular"))) { dataset <- InMemoryDataset$create(dataset) } assert_is(dataset, "Dataset") diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index 345fc183295..017e1d6b302 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -30,7 +30,7 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { # Nothing to do return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # find and remove any dplyr::desc() and tidy-eval # the arrange expressions inside an Arrow data_mask sorts <- vector("list", length(exprs)) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index cec56ab9110..8a5488bf599 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -19,19 +19,8 @@ # The following S3 methods are registered on load if dplyr is present collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) { - x <- ensure_group_vars(x) - x <- ensure_arrange_vars(x) # this sets x$temp_columns - # Pull only the selected rows and cols into R - # See dataset.R for Dataset and Scanner(Builder) classes - tab <- Scanner$create(x)$ToTable() - # Arrange rows - if (length(x$arrange_vars) > 0) { - tab <- tab[ - tab$SortIndices(names(x$arrange_vars), x$arrange_desc), - names(x$selected_columns), # this omits x$temp_columns from the result - drop = FALSE - ] - } + # See query-engine.R for ExecPlan/Nodes + tab <- do_exec_plan(x) if (as_data_frame) { df <- as.data.frame(tab) tab$invalidate() @@ -47,16 +36,71 @@ collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) { x } } -collect.Dataset <- function(x, ...) dplyr::collect(arrow_dplyr_query(x), ...) +collect.Dataset <- function(x, ...) dplyr::collect(as_adq(x), ...) compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = FALSE) compute.ArrowTabular <- function(x, ...) 
x compute.Dataset <- compute.arrow_dplyr_query pull.arrow_dplyr_query <- function(.data, var = -1) { - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) var <- vars_pull(names(.data), !!enquo(var)) .data$selected_columns <- set_names(.data$selected_columns[var], var) dplyr::collect(.data)[[1]] } pull.Dataset <- pull.ArrowTabular <- pull.arrow_dplyr_query + +# TODO: Correctly handle group_vars after summarize; also in collapse() +restore_dplyr_features <- function(df, query) { + # An arrow_dplyr_query holds some attributes that Arrow doesn't know about + # After calling collect(), make sure these features are carried over + + if (length(query$group_by_vars) > 0) { + # Preserve groupings, if present + if (is.data.frame(df)) { + df <- dplyr::grouped_df( + df, + dplyr::group_vars(query), + drop = dplyr::group_by_drop_default(query) + ) + } else { + # This is a Table, via compute() or collect(as_data_frame = FALSE) + df <- as_adq(df) + df$group_by_vars <- query$group_by_vars + df$drop_empty_groups <- query$drop_empty_groups + } + } + df +} + +collapse.arrow_dplyr_query <- function(x, ...) { + # Figure out what schema will result from the query + x$schema <- implicit_schema(x) + # Nest inside a new arrow_dplyr_query + arrow_dplyr_query(x) +} +collapse.Dataset <- collapse.ArrowTabular <- function(x, ...) { + arrow_dplyr_query(x) +} + +implicit_schema <- function(.data) { + .data <- ensure_group_vars(.data) + old_schm <- .data$.data$schema + + if (is.null(.data$aggregations)) { + new_fields <- map(.data$selected_columns, ~ .$type(old_schm)) + } else { + new_fields <- map(summarize_projection(.data), ~ .$type(old_schm)) + # * Put group_by_vars first (this can't be done by summarize, + # they have to be last per the aggregate node signature, + # and they get projected to this order after aggregation) + # * Infer the output types from the aggregations + group_fields <- new_fields[.data$group_by_vars] + agg_fields <- imap( + new_fields[setdiff(names(new_fields), .data$group_by_vars)], + ~ output_type(.data$aggregations[[.y]][["fun"]], .x) + ) + new_fields <- c(group_fields, agg_fields) + } + schema(!!!new_fields) +} diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index a44750a9c81..61f27010e77 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -26,7 +26,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .preserve = FALSE) { return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # tidy-eval the filter expressions inside an Arrow data_mask filters <- lapply(filts, arrow_eval, arrow_mask(.data)) bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index e535546dd1b..72731216f50 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -840,3 +840,18 @@ agg_funcs$n <- function() { options = list() ) } + +output_type <- function(fun, input_type) { + # These are quick and dirty heuristics. 
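+  # For example, under the rules below: output_type("any", bool()) is bool();
+  # output_type("sum", int32()) stays int32(), an acknowledged approximation,
+  # since Arrow may widen a sum to a larger integer type;
+  # output_type("mean", int32()) is float64(); and an unlisted function such
+  # as "min" takes the fallback branch and keeps its input type.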
+ if (fun %in% c("any", "all")) { + bool() + } else if (fun %in% "sum") { + # It may upcast to a bigger type but this is close enough + input_type + } else if (fun %in% c("mean", "stddev", "variance")) { + float64() + } else { + # Just so things don't error, assume the resulting type is the same + input_type + } +} diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 42cca039022..a89144d6c4a 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -23,7 +23,7 @@ group_by.arrow_dplyr_query <- function(.data, .add = FALSE, add = .add, .drop = dplyr::group_by_drop_default(.data)) { - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) new_groups <- enquos(...) # ... can contain expressions (i.e. can add (or rename?) columns) and so we # need to identify those and add them on to the query with mutate. Specifically, diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index f19505c1958..051c5254e50 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -24,7 +24,7 @@ mutate.arrow_dplyr_query <- function(.data, .before = NULL, .after = NULL) { call <- match.call() - exprs <- quos(...) + exprs <- ensure_named_exprs(quos(...)) .keep <- match.arg(.keep) .before <- enquo(.before) @@ -35,7 +35,7 @@ mutate.arrow_dplyr_query <- function(.data, return(.data) } - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # Restrict the cases we support for now if (length(dplyr::group_vars(.data)) > 0) { @@ -45,11 +45,6 @@ mutate.arrow_dplyr_query <- function(.data, return(abandon_ship(call, .data, "mutate() on grouped data not supported in Arrow")) } - # Check for unnamed expressions and fix if any - unnamed <- !nzchar(names(exprs)) - # Deparse and take the first element in case they're long expressions - names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) - mask <- arrow_mask(.data) results <- list() for (i in seq_along(exprs)) { @@ -133,3 +128,11 @@ check_transmute_args <- function(..., .keep, .before, .after) { } enquos(...) } + +ensure_named_exprs <- function(exprs) { + # Check for unnamed expressions and fix if any + unnamed <- !nzchar(names(exprs)) + # Deparse and take the first element in case they're long expressions + names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) + exprs +} diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index ee740db4cfb..9a867ced964 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -22,13 +22,13 @@ tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns) select.arrow_dplyr_query <- function(.data, ...) { check_select_helpers(enexprs(...)) - column_select(arrow_dplyr_query(.data), !!!enquos(...)) + column_select(as_adq(.data), !!!enquos(...)) } select.Dataset <- select.ArrowTabular <- select.arrow_dplyr_query rename.arrow_dplyr_query <- function(.data, ...) 
{ check_select_helpers(enexprs(...)) - column_select(arrow_dplyr_query(.data), !!!enquos(...), .FUN = vars_rename) + column_select(as_adq(.data), !!!enquos(...), .FUN = vars_rename) } rename.Dataset <- rename.ArrowTabular <- rename.arrow_dplyr_query @@ -60,7 +60,7 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL # at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R # TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829 - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) # Assign the schema to the expressions map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 394e5fe2ac9..cd93e28f07e 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -20,7 +20,7 @@ summarise.arrow_dplyr_query <- function(.data, ..., .engine = c("arrow", "duckdb")) { call <- match.call() - .data <- arrow_dplyr_query(.data) + .data <- as_adq(.data) exprs <- quos(...) # Only retain the columns we need to do our aggregations vars_to_keep <- unique(c( @@ -47,11 +47,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # ARROW-13550 abort("`summarize()` with `.groups` argument not supported in Arrow") } - exprs <- quos(...) - # Check for unnamed expressions and fix if any - unnamed <- !nzchar(names(exprs)) - # Deparse and take the first element in case they're long expressions - names(exprs)[unnamed] <- map_chr(exprs[unnamed], as_label) + exprs <- ensure_named_exprs(quos(...)) mask <- arrow_mask(.data, aggregation = TRUE) @@ -68,61 +64,20 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { ) stop(msg, call. = FALSE) } - # Put it in the data mask too? - # Should we: mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } - # Now, from that, split out the data (expressions) and options - .data$aggregations <- lapply(results, function(x) x[c("fun", "options")]) - - inputs <- lapply(results, function(x) x$data) - # This is essentially a projection, and the column names don't matter - # (but must exist) - names(inputs) <- as.character(seq_along(inputs)) - .data$selected_columns <- inputs - - # Eventually, we will return .data here if (dataset) but do it eagerly now - do_exec_plan(.data, group_vars = dplyr::group_vars(.data)) + .data$aggregations <- results + # TODO: should in-memory query evaluate eagerly? 
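+  # A usage sketch (hypothetical data, but the API introduced in this patch):
+  #   Table$create(x = c(1, 1, 2), y = c(10, 20, 30)) %>%
+  #     group_by(x) %>%
+  #     summarize(total = sum(y))
+  # now returns a lazy query; nothing is evaluated until collect() or
+  # compute() runs the ExecPlan.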
+ collapse.arrow_dplyr_query(.data) } -do_exec_plan <- function(.data, group_vars = NULL) { - plan <- ExecPlan$create() - - grouped <- length(group_vars) > 0 - - # Collect the target names first because we have to add back the group vars - target_names <- names(.data) - - if (grouped) { - .data <- ensure_group_vars(.data) - # We also need to prefix all of the aggregation function names with "hash_" - .data$aggregations <- lapply(.data$aggregations, function(x) { - x[["fun"]] <- paste0("hash_", x[["fun"]]) - x - }) - } - - start_node <- plan$Scan(.data) - # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again - if (inherits(.data$filtered_rows, "Expression")) { - start_node <- start_node$Filter(.data$filtered_rows) - } - # If any columns are derived we need to Project (otherwise this may be no-op) - project_node <- start_node$Project(.data$selected_columns) - - final_node <- project_node$Aggregate( - options = .data$aggregations, - target_names = target_names, - out_field_names = names(.data$aggregations), - key_names = group_vars +summarize_projection <- function(.data) { + c( + map(.data$aggregations, ~ .$data), + .data$selected_columns[.data$group_by_vars] ) +} - out <- plan$Run(final_node) - if (grouped) { - # The result will have result columns first then the grouping cols. - # dplyr orders group cols first, so adapt the result to meet that expectation. - n_results <- length(.data$aggregations) - out <- out[c((n_results + 1):ncol(out), seq_along(.data$aggregations))] - } - out +format_aggregation <- function(x) { + paste0(x$fun, "(", x$data$ToString(), ")") } diff --git a/r/R/dplyr.R b/r/R/dplyr.R index b2793bdb3c3..199120887b9 100644 --- a/r/R/dplyr.R +++ b/r/R/dplyr.R @@ -23,14 +23,10 @@ arrow_dplyr_query <- function(.data) { # An arrow_dplyr_query is a container for an Arrow data object (Table, # RecordBatch, or Dataset) and the state of the user's dplyr query--things # like selected columns, filters, and group vars. - - # For most dplyr methods, - # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query - # This works because the functions all pass .data through arrow_dplyr_query() - if (inherits(.data, "arrow_dplyr_query")) { - return(.data) + # An arrow_dplyr_query can contain another arrow_dplyr_query in .data + if (!inherits(.data, c("Dataset", "arrow_dplyr_query"))) { + .data <- InMemoryDataset$create(.data) } - # Evaluating expressions on a dataset with duplicated fieldnames will error dupes <- duplicated(names(.data)) if (any(dupes)) { @@ -42,19 +38,14 @@ arrow_dplyr_query <- function(.data) { ) )) } - structure( list( - .data = if (inherits(.data, "Dataset")) { - .data$clone() - } else { - InMemoryDataset$create(.data) - }, + .data = .data, # selected_columns is a named list: # * contents are references/expressions pointing to the data # * names are the names they should be in the end (i.e. this # records any renaming) - selected_columns = make_field_refs(names(.data)), + selected_columns = make_field_refs(names(.data$schema)), # filtered_rows will be an Expression filtered_rows = TRUE, # group_by_vars is a character vector of columns (as renamed) @@ -75,6 +66,21 @@ arrow_dplyr_query <- function(.data) { ) } +# The only difference between `arrow_dplyr_query()` and `as_adq()` is that if +# `.data` is already an `arrow_dplyr_query`, `as_adq()`, will return it as is, but +# `arrow_dplyr_query()` will nest it inside a new `arrow_dplyr_query`. 
The only +# place where `arrow_dplyr_query()` should be called directly is inside +# `collapse()` methods; everywhere else, call `as_adq()`. +as_adq <- function(.data) { + # For most dplyr methods, + # method.Table == method.RecordBatch == method.Dataset == method.arrow_dplyr_query + # This works because the functions all pass .data through as_adq() + if (inherits(.data, "arrow_dplyr_query")) { + return(.data) + } + arrow_dplyr_query(.data) +} + make_field_refs <- function(field_names) { set_names(lapply(field_names, Expression$field_ref), field_names) } @@ -96,9 +102,14 @@ print.arrow_dplyr_query <- function(x, ...) { } }) fields <- paste(names(types), types, sep = ": ", collapse = "\n") - cat(class(x$.data)[1], " (query)\n", sep = "") + cat(class(source_data(x))[1], " (query)\n", sep = "") cat(fields, "\n", sep = "") cat("\n") + if (length(x$aggregations)) { + cat("* Aggregations:\n") + aggs <- paste0(names(x$aggregations), ": ", map_chr(x$aggregations, format_aggregation), collapse = "\n") + cat(aggs, "\n", sep = "") + } if (!isTRUE(x$filtered_rows)) { filter_string <- x$filtered_rows$ToString() cat("* Filter: ", filter_string, "\n", sep = "") @@ -133,7 +144,10 @@ names.arrow_dplyr_query <- function(x) names(x$selected_columns) dim.arrow_dplyr_query <- function(x) { cols <- length(names(x)) - if (isTRUE(x$filtered)) { + if (is_collapsed(x)) { + # Don't evaluate just for nrow + rows <- NA_integer_ + } else if (isTRUE(x$filtered)) { rows <- x$.data$num_rows } else { rows <- Scanner$create(x)$CountRows() @@ -148,12 +162,14 @@ as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALS #' @export head.arrow_dplyr_query <- function(x, n = 6L, ...) { + # TODO (ARROW-13893): refactor out <- head.Dataset(x, n, ...) restore_dplyr_features(out, x) } #' @export tail.arrow_dplyr_query <- function(x, n = 6L, ...) { + # TODO (ARROW-13893): refactor out <- tail.Dataset(x, n, ...) restore_dplyr_features(out, x) } @@ -161,6 +177,7 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) { #' @export `[.arrow_dplyr_query` <- `[.Dataset` # TODO: ^ should also probably restore_dplyr_features, and/or that should be moved down +# TODO (ARROW-13893): refactor ensure_group_vars <- function(x) { if (inherits(x, "arrow_dplyr_query")) { @@ -191,42 +208,30 @@ ensure_arrange_vars <- function(x) { x } -restore_dplyr_features <- function(df, query) { - # An arrow_dplyr_query holds some attributes that Arrow doesn't know about - # After calling collect(), make sure these features are carried over - - if (length(query$group_by_vars) > 0) { - # Preserve groupings, if present - if (is.data.frame(df)) { - df <- dplyr::grouped_df( - df, - dplyr::group_vars(query), - drop = dplyr::group_by_drop_default(query) - ) - } else { - # This is a Table, via compute() or collect(as_data_frame = FALSE) - df <- arrow_dplyr_query(df) - df$group_by_vars <- query$group_by_vars - df$drop_empty_groups <- query$drop_empty_groups - } - } - df -} - # Helper to handle unsupported dplyr features # * For Table/RecordBatch, we collect() and then call the dplyr method in R # * For Dataset, we just error abandon_ship <- function(call, .data, msg) { + msg <- trimws(msg) dplyr_fun_name <- sub("^(.*?)\\..*", "\\1", as.character(call[[1]])) if (query_on_dataset(.data)) { stop(msg, "\nCall collect() first to pull data into R.", call. = FALSE) } # else, collect and call dplyr method - msg <- sub("\\n$", "", msg) warning(msg, "; pulling data into R", immediate. = TRUE, call. 
= FALSE)
   call$.data <- dplyr::collect(.data)
   call[[1]] <- get(dplyr_fun_name, envir = asNamespace("dplyr"))
   eval.parent(call, 2)
 }
 
-query_on_dataset <- function(x) !inherits(x$.data, "InMemoryDataset")
+query_on_dataset <- function(x) !inherits(source_data(x), "InMemoryDataset")
+
+source_data <- function(x) {
+  if (is_collapsed(x)) {
+    source_data(x$.data)
+  } else {
+    x$.data
+  }
+}
+
+is_collapsed <- function(x) inherits(x$.data, "arrow_dplyr_query")
diff --git a/r/R/duckdb.R b/r/R/duckdb.R
index edef5cdc143..87d1b2cfad6 100644
--- a/r/R/duckdb.R
+++ b/r/R/duckdb.R
@@ -60,7 +60,7 @@ to_duckdb <- function(.data,
                       con = arrow_duck_connection(),
                       table_name = unique_arrow_tablename(),
                       auto_disconnect = TRUE) {
-  .data <- arrow_dplyr_query(.data)
+  .data <- as_adq(.data)
   duckdb::duckdb_register_arrow(con, table_name, .data)
 
   tbl <- tbl(con, table_name)
diff --git a/r/R/expression.R b/r/R/expression.R
index aa9af9270c9..82e21ccf2e1 100644
--- a/r/R/expression.R
+++ b/r/R/expression.R
@@ -125,6 +125,9 @@ Expression <- R6Class("Expression",
   inherit = ArrowObject,
   public = list(
     ToString = function() compute___expr__ToString(self),
+    Equals = function(other, ...) {
+      inherits(other, "Expression") && compute___expr__equals(self, other)
+    },
     # TODO: Implement type determination without storing
     # schemas in Expression objects (ARROW-13186)
     schema = NULL,
diff --git a/r/R/query-engine.R b/r/R/query-engine.R
index 4de2f87165b..a96378671af 100644
--- a/r/R/query-engine.R
+++ b/r/R/query-engine.R
@@ -15,6 +15,19 @@
 # specific language governing permissions and limitations
 # under the License.
 
+do_exec_plan <- function(.data) {
+  plan <- ExecPlan$create()
+  final_node <- plan$Build(.data)
+  tab <- plan$Run(final_node)
+
+  if (length(final_node$sort$temp_columns) > 0) {
+    # If arrange() created $temp_columns, make sure to omit them from the result
+    tab <- tab[, setdiff(names(tab), final_node$sort$temp_columns), drop = FALSE]
+  }
+
+  tab
+}
+
 ExecPlan <- R6Class("ExecPlan",
   inherit = ArrowObject,
   public = list(
@@ -31,6 +44,7 @@ ExecPlan <- R6Class("ExecPlan",
         field_names_in_expression
       )))
       dataset <- dataset$.data
+      assert_is(dataset, "Dataset")
     } else {
       if (inherits(dataset, "ArrowTabular")) {
         dataset <- InMemoryDataset$create(dataset)
@@ -42,11 +56,97 @@ ExecPlan <- R6Class("ExecPlan",
       }
       # ScanNode needs the filter to do predicate pushdown and skip partitions,
      # and it needs to know which fields to materialize (and which are unnecessary)
-      ExecNode_Scan(self, dataset, filter, colnames)
+      ExecNode_Scan(self, dataset, filter, colnames %||% character(0))
+    },
+    Build = function(.data) {
+      # This method takes an arrow_dplyr_query and chains together the
+      # ExecNodes that it produces. It does not evaluate them--that is Run().
+      group_vars <- dplyr::group_vars(.data)
+      grouped <- length(group_vars) > 0
+
+      # Collect the target names first because we have to add back the group vars
+      target_names <- names(.data)
+      .data <- ensure_group_vars(.data)
+      .data <- ensure_arrange_vars(.data) # this sets .data$temp_columns
+
+      if (inherits(.data$.data, "arrow_dplyr_query")) {
+        # We have a nested query. Recurse. 
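+        # (Illustrative note, not part of the original patch: collapse() is what
+        # produces queries whose .data is itself an arrow_dplyr_query, e.g.
+        #   tab %>% group_by(g) %>% summarize(total = sum(x)) %>% filter(total > 1)
+        # stores the summarize() query as the inner .data, so Build() assembles
+        # the inner plan first and then layers this query's filter and
+        # projection on top of the node that the recursive call returns.)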
+ node <- self$Build(.data$.data) + } else { + node <- self$Scan(.data) + } + + # ARROW-13498: Even though Scan takes the filter, apparently we have to do it again + if (inherits(.data$filtered_rows, "Expression")) { + node <- node$Filter(.data$filtered_rows) + } + + if (!is.null(.data$aggregations)) { + # Project to include just the data required for each aggregation, + # plus group_by_vars (last) + # TODO: validate that none of names(aggregations) are the same as names(group_by_vars) + # dplyr does not error on this but the result it gives isn't great + node <- node$Project(summarize_projection(.data)) + + if (grouped) { + # We need to prefix all of the aggregation function names with "hash_" + .data$aggregations <- lapply(.data$aggregations, function(x) { + x[["fun"]] <- paste0("hash_", x[["fun"]]) + x + }) + } + + node <- node$Aggregate( + options = map(.data$aggregations, ~ .[c("fun", "options")]), + target_names = names(.data$aggregations), + out_field_names = names(.data$aggregations), + key_names = group_vars + ) + + if (grouped) { + # The result will have result columns first then the grouping cols. + # dplyr orders group cols first, so adapt the result to meet that expectation. + node <- node$Project( + make_field_refs(c(group_vars, names(.data$aggregations))) + ) + if (getOption("arrow.summarise.sort", FALSE)) { + # Add sorting instructions for the rows too to match dplyr + # (see below about why sorting isn't itself a Node) + node$sort <- list( + names = group_vars, + orders = rep(0L, length(group_vars)) + ) + } + } + } else { + # If any columns are derived, reordered, or renamed we need to Project + # If there are aggregations, the projection was already handled above + # We have to project at least once to eliminate some junk columns + # that the ExecPlan adds: + # __fragment_index, __batch_index, __last_in_fragment + # Presumably extraneous repeated projection of the same thing + # (as when we've done collapse() and not projected after) is cheap/no-op + projection <- c(.data$selected_columns, .data$temp_columns) + node <- node$Project(projection) + } + + # Apply sorting: this is currently not an ExecNode itself, it is a + # sink node option. + # TODO: handle some cases: + # (1) arrange > summarize > arrange + # (2) ARROW-13779: arrange then operation where order matters (e.g. 
cumsum) + if (length(.data$arrange_vars)) { + node$sort <- list( + names = names(.data$arrange_vars), + orders = as.integer(.data$arrange_desc), + temp_columns = names(.data$temp_columns) + ) + } + node }, Run = function(node) { assert_is(node, "ExecNode") - ExecPlan_run(self, node) + ExecPlan_run(self, node, node$sort %||% list()) } ) ) @@ -57,16 +157,30 @@ ExecPlan$create <- function(use_threads = option_use_threads()) { ExecNode <- R6Class("ExecNode", inherit = ArrowObject, public = list( + # `sort` is a slight hack to be able to keep around arrange() params, + # which don't currently yield their own ExecNode but rather are consumed + # in the SinkNode (in ExecPlan$run()) + sort = NULL, + preserve_sort = function(new_node) { + new_node$sort <- self$sort + new_node + }, Project = function(cols) { - assert_is_list_of(cols, "Expression") - ExecNode_Project(self, cols, names(cols)) + if (length(cols)) { + assert_is_list_of(cols, "Expression") + self$preserve_sort(ExecNode_Project(self, cols, names(cols))) + } else { + self$preserve_sort(ExecNode_Project(self, character(0), character(0))) + } }, Filter = function(expr) { assert_is(expr, "Expression") - ExecNode_Filter(self, expr) + self$preserve_sort(ExecNode_Filter(self, expr)) }, Aggregate = function(options, target_names, out_field_names, key_names) { - ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + self$preserve_sort( + ExecNode_Aggregate(self, options, target_names, out_field_names, key_names) + ) } ) ) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index cb69ce17442..f33b81c08f0 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1094,16 +1094,17 @@ extern "C" SEXP _arrow_ExecPlan_create(SEXP use_threads_sexp){ // compute-exec.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ExecPlan_run(const std::shared_ptr& plan, const std::shared_ptr& final_node); -extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ +std::shared_ptr ExecPlan_run(const std::shared_ptr& plan, const std::shared_ptr& final_node, cpp11::list sort_options); +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp){ BEGIN_CPP11 arrow::r::Input&>::type plan(plan_sexp); arrow::r::Input&>::type final_node(final_node_sexp); - return cpp11::as_sexp(ExecPlan_run(plan, final_node)); + arrow::r::Input::type sort_options(sort_options_sexp); + return cpp11::as_sexp(ExecPlan_run(plan, final_node, sort_options)); END_CPP11 } #else -extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp){ +extern "C" SEXP _arrow_ExecPlan_run(SEXP plan_sexp, SEXP final_node_sexp, SEXP sort_options_sexp){ Rf_error("Cannot call ExecPlan_run(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -3173,6 +3174,22 @@ extern "C" SEXP _arrow_FixedSizeListType__list_size(SEXP type_sexp){ } #endif +// expression.cpp +#if defined(ARROW_R_WITH_ARROW) +bool compute___expr__equals(const std::shared_ptr& lhs, const std::shared_ptr& rhs); +extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type lhs(lhs_sexp); + arrow::r::Input&>::type rhs(rhs_sexp); + return cpp11::as_sexp(compute___expr__equals(lhs, rhs)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_compute___expr__equals(SEXP lhs_sexp, SEXP rhs_sexp){ + Rf_error("Cannot call compute___expr__equals(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); +} +#endif + // expression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr compute___expr__call(std::string func_name, cpp11::list argument_list, cpp11::list options); @@ -7035,450 +7052,451 @@ static const R_CallMethodDef CallEntries[] = { { "_parquet_available", (DL_FUNC)& _parquet_available, 0 }, { "_s3_available", (DL_FUNC)& _s3_available, 0 }, { "_json_available", (DL_FUNC)& _json_available, 0 }, - { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, - { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, - { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, - { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, - { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, - { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, - { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, - { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, - { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, - { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1}, - { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, - { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, - { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, - { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, - { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, - { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, - { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, - { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, - { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, - { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, - { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, - { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, - { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, - { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, - { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, - { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, - { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, - { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, - { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, - { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, - { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, - { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, - { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, - { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, - { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, - { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, - { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, - { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, - { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, - { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, - { 
"_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, - { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, - { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, - { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, - { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, - { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, - { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, - { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, - { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, - { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, - { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, - { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, - { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, - { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, - { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, - { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, - { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, - { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, - { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, - { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, - { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, - { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, - { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, - { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, - { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, - { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, - { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, - { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, - { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, - { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, - { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 2}, - { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, - { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, - { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, - { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, - { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, - { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, - { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, - { "_arrow_compute__GetFunctionNames", (DL_FUNC) &_arrow_compute__GetFunctionNames, 0}, - { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, - { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, - { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, - { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, - { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, 
- { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, - { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, - { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, - { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, - { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, - { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, - { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, - { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, - { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, - { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, - { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, - { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, - { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, - { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, - { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, - { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, - { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, - { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, - { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, - { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, - { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, - { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, - { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, - { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, - { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, - { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, - { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, - { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, - { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, - { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, - { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, - { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, - { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, - { 
"_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, - { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, - { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, - { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, - { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, - { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, - { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, - { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, - { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, - { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, - { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, - { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, - { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, - { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, - { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, - { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, - { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, - { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, - { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, - { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, - { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, - { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, - { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, - { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, - { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, - { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, - { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, - { "_arrow_Int8__initialize", (DL_FUNC) &_arrow_Int8__initialize, 0}, - { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, - { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, - { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, - { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, - { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, - { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, - { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, - { 
"_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, - { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, - { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, - { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, - { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, - { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, - { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, - { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, - { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, - { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, - { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, - { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, - { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, - { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2}, - { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, - { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, - { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, - { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, - { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, - { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, - { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, - { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, - { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, - { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, - { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, - { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, - { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, - { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, - { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, - { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, - { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, - { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, - { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, - { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, - { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, - { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, - { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, - { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, - { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, - { "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, - { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, - { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, - { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, - { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, - { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, - { "_arrow_LargeListType__value_type", (DL_FUNC) 
&_arrow_LargeListType__value_type, 1}, - { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, - { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, - { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, - { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, - { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, - { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, - { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, - { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, - { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, - { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, - { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, - { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, - { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, - { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, - { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, - { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, - { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, - { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, - { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, - { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, - { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, - { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, - { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, - { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, - { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, - { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, - { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, - { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, - { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, - { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, - { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, - { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, - { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, - { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) &_arrow_fs___FileSelector__allow_not_found, 1}, - { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, - { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, - { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, - { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, - { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, - { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, - { 
"_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, - { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, - { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, - { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, - { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3}, - { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, - { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, - { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, - { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, - { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, - { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, - { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, - { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, - { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, - { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, - { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, - { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, - { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, - { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, - { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, - { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, - { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, - { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, - { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, - { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, - { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, - { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, - { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, - { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, - { "_arrow_io___ReadableFile__Open", (DL_FUNC) &_arrow_io___ReadableFile__Open, 1}, - { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, - { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, - { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, - { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, - { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, - { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, - { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) 
&_arrow_io___BufferOutputStream__Finish, 1}, - { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, - { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, - { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, - { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, - { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, - { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, - { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, - { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, - { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, - { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, - { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, - { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, - { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, - { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, - { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, - { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, - { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, - { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, - { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, - { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, - { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, - { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, - { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, - { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, - { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, - { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, - { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, - { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) 
&_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, - { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, - { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, - { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1}, - { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, - { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, - { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, - { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, - { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, - { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, - { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, - { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, - { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, - { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, - { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, - { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, - { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, - { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, - { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, - { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, - { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, - { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, - { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, - { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, - { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, - { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, - { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, - { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, 
- { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, - { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, - { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, - { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, - { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, - { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, - { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, - { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, - { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, - { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, - { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, - { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, - { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, - { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, - { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, - { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, - { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, - { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, - { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, - { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, - { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, - { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, - { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, - { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, - { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, - { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, - { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, - { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, - { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, - { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, - { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, - { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, - { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, - { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, - { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, - { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, - { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, - { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, - { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, - { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, - { 
"_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, - { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, - { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, - { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, - { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, - { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, - { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, - { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, - { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, - { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, - { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, - { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, - { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, - { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, - { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, - { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, - { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, - { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, - { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, - { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, - { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, - { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, - { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, - { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, - { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, - { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, - { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, - { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, - { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, - { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, - { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, - { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, - { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, - { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, - { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, - { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, - { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, - { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, - { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, - { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, - { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, - { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, - { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, - { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, - { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, - { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, - { "_arrow_Table__RemoveColumn", 
(DL_FUNC) &_arrow_Table__RemoveColumn, 2}, - { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, - { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, - { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, - { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, - { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, - { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, - { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, - { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, - { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, - { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, - { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, - { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, + { "_arrow_is_altrep", (DL_FUNC) &_arrow_is_altrep, 1}, + { "_arrow_Array__Slice1", (DL_FUNC) &_arrow_Array__Slice1, 2}, + { "_arrow_Array__Slice2", (DL_FUNC) &_arrow_Array__Slice2, 3}, + { "_arrow_Array__IsNull", (DL_FUNC) &_arrow_Array__IsNull, 2}, + { "_arrow_Array__IsValid", (DL_FUNC) &_arrow_Array__IsValid, 2}, + { "_arrow_Array__length", (DL_FUNC) &_arrow_Array__length, 1}, + { "_arrow_Array__offset", (DL_FUNC) &_arrow_Array__offset, 1}, + { "_arrow_Array__null_count", (DL_FUNC) &_arrow_Array__null_count, 1}, + { "_arrow_Array__type", (DL_FUNC) &_arrow_Array__type, 1}, + { "_arrow_Array__ToString", (DL_FUNC) &_arrow_Array__ToString, 1}, + { "_arrow_Array__type_id", (DL_FUNC) &_arrow_Array__type_id, 1}, + { "_arrow_Array__Equals", (DL_FUNC) &_arrow_Array__Equals, 2}, + { "_arrow_Array__ApproxEquals", (DL_FUNC) &_arrow_Array__ApproxEquals, 2}, + { "_arrow_Array__Diff", (DL_FUNC) &_arrow_Array__Diff, 2}, + { "_arrow_Array__data", (DL_FUNC) &_arrow_Array__data, 1}, + { "_arrow_Array__RangeEquals", (DL_FUNC) &_arrow_Array__RangeEquals, 5}, + { "_arrow_Array__View", (DL_FUNC) &_arrow_Array__View, 2}, + { "_arrow_Array__Validate", (DL_FUNC) &_arrow_Array__Validate, 1}, + { "_arrow_DictionaryArray__indices", (DL_FUNC) &_arrow_DictionaryArray__indices, 1}, + { "_arrow_DictionaryArray__dictionary", (DL_FUNC) &_arrow_DictionaryArray__dictionary, 1}, + { "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, + { "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, + { "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, + { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, + { "_arrow_LargeListArray__value_type", (DL_FUNC) &_arrow_LargeListArray__value_type, 1}, + { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, + { "_arrow_LargeListArray__values", (DL_FUNC) &_arrow_LargeListArray__values, 1}, + { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, + { "_arrow_LargeListArray__value_length", (DL_FUNC) &_arrow_LargeListArray__value_length, 2}, + { "_arrow_FixedSizeListArray__value_length", (DL_FUNC) &_arrow_FixedSizeListArray__value_length, 2}, + { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, + { "_arrow_LargeListArray__value_offset", (DL_FUNC) &_arrow_LargeListArray__value_offset, 2}, + { "_arrow_FixedSizeListArray__value_offset", (DL_FUNC) &_arrow_FixedSizeListArray__value_offset, 2}, + { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) 
&_arrow_ListArray__raw_value_offsets, 1}, + { "_arrow_LargeListArray__raw_value_offsets", (DL_FUNC) &_arrow_LargeListArray__raw_value_offsets, 1}, + { "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, + { "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 2}, + { "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, + { "_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 2}, + { "_arrow_ArrayData__get_type", (DL_FUNC) &_arrow_ArrayData__get_type, 1}, + { "_arrow_ArrayData__get_length", (DL_FUNC) &_arrow_ArrayData__get_length, 1}, + { "_arrow_ArrayData__get_null_count", (DL_FUNC) &_arrow_ArrayData__get_null_count, 1}, + { "_arrow_ArrayData__get_offset", (DL_FUNC) &_arrow_ArrayData__get_offset, 1}, + { "_arrow_ArrayData__buffers", (DL_FUNC) &_arrow_ArrayData__buffers, 1}, + { "_arrow_Buffer__is_mutable", (DL_FUNC) &_arrow_Buffer__is_mutable, 1}, + { "_arrow_Buffer__ZeroPadding", (DL_FUNC) &_arrow_Buffer__ZeroPadding, 1}, + { "_arrow_Buffer__capacity", (DL_FUNC) &_arrow_Buffer__capacity, 1}, + { "_arrow_Buffer__size", (DL_FUNC) &_arrow_Buffer__size, 1}, + { "_arrow_r___RBuffer__initialize", (DL_FUNC) &_arrow_r___RBuffer__initialize, 1}, + { "_arrow_Buffer__data", (DL_FUNC) &_arrow_Buffer__data, 1}, + { "_arrow_Buffer__Equals", (DL_FUNC) &_arrow_Buffer__Equals, 2}, + { "_arrow_ChunkedArray__length", (DL_FUNC) &_arrow_ChunkedArray__length, 1}, + { "_arrow_ChunkedArray__null_count", (DL_FUNC) &_arrow_ChunkedArray__null_count, 1}, + { "_arrow_ChunkedArray__num_chunks", (DL_FUNC) &_arrow_ChunkedArray__num_chunks, 1}, + { "_arrow_ChunkedArray__chunk", (DL_FUNC) &_arrow_ChunkedArray__chunk, 2}, + { "_arrow_ChunkedArray__chunks", (DL_FUNC) &_arrow_ChunkedArray__chunks, 1}, + { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, + { "_arrow_ChunkedArray__Slice1", (DL_FUNC) &_arrow_ChunkedArray__Slice1, 2}, + { "_arrow_ChunkedArray__Slice2", (DL_FUNC) &_arrow_ChunkedArray__Slice2, 3}, + { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, + { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, + { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, + { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, + { "_arrow_ChunkedArray__from_list", (DL_FUNC) &_arrow_ChunkedArray__from_list, 2}, + { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, + { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, + { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, + { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, + { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, + { "_arrow_ExecPlan_create", (DL_FUNC) &_arrow_ExecPlan_create, 1}, + { "_arrow_ExecPlan_run", (DL_FUNC) &_arrow_ExecPlan_run, 3}, + { "_arrow_ExecNode_Scan", (DL_FUNC) &_arrow_ExecNode_Scan, 4}, + { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, + { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, + { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 5}, + { "_arrow_RecordBatch__cast", (DL_FUNC) &_arrow_RecordBatch__cast, 3}, + { "_arrow_Table__cast", (DL_FUNC) &_arrow_Table__cast, 3}, + { "_arrow_compute__CallFunction", (DL_FUNC) &_arrow_compute__CallFunction, 3}, + { "_arrow_compute__GetFunctionNames", (DL_FUNC) 
&_arrow_compute__GetFunctionNames, 0}, + { "_arrow_build_info", (DL_FUNC) &_arrow_build_info, 0}, + { "_arrow_runtime_info", (DL_FUNC) &_arrow_runtime_info, 0}, + { "_arrow_csv___WriteOptions__initialize", (DL_FUNC) &_arrow_csv___WriteOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__initialize", (DL_FUNC) &_arrow_csv___ReadOptions__initialize, 1}, + { "_arrow_csv___ParseOptions__initialize", (DL_FUNC) &_arrow_csv___ParseOptions__initialize, 1}, + { "_arrow_csv___ReadOptions__column_names", (DL_FUNC) &_arrow_csv___ReadOptions__column_names, 1}, + { "_arrow_csv___ConvertOptions__initialize", (DL_FUNC) &_arrow_csv___ConvertOptions__initialize, 1}, + { "_arrow_csv___TableReader__Make", (DL_FUNC) &_arrow_csv___TableReader__Make, 4}, + { "_arrow_csv___TableReader__Read", (DL_FUNC) &_arrow_csv___TableReader__Read, 1}, + { "_arrow_TimestampParser__kind", (DL_FUNC) &_arrow_TimestampParser__kind, 1}, + { "_arrow_TimestampParser__format", (DL_FUNC) &_arrow_TimestampParser__format, 1}, + { "_arrow_TimestampParser__MakeStrptime", (DL_FUNC) &_arrow_TimestampParser__MakeStrptime, 1}, + { "_arrow_TimestampParser__MakeISO8601", (DL_FUNC) &_arrow_TimestampParser__MakeISO8601, 0}, + { "_arrow_csv___WriteCSV__Table", (DL_FUNC) &_arrow_csv___WriteCSV__Table, 3}, + { "_arrow_csv___WriteCSV__RecordBatch", (DL_FUNC) &_arrow_csv___WriteCSV__RecordBatch, 3}, + { "_arrow_dataset___Dataset__NewScan", (DL_FUNC) &_arrow_dataset___Dataset__NewScan, 1}, + { "_arrow_dataset___Dataset__schema", (DL_FUNC) &_arrow_dataset___Dataset__schema, 1}, + { "_arrow_dataset___Dataset__type_name", (DL_FUNC) &_arrow_dataset___Dataset__type_name, 1}, + { "_arrow_dataset___Dataset__ReplaceSchema", (DL_FUNC) &_arrow_dataset___Dataset__ReplaceSchema, 2}, + { "_arrow_dataset___UnionDataset__create", (DL_FUNC) &_arrow_dataset___UnionDataset__create, 2}, + { "_arrow_dataset___InMemoryDataset__create", (DL_FUNC) &_arrow_dataset___InMemoryDataset__create, 1}, + { "_arrow_dataset___UnionDataset__children", (DL_FUNC) &_arrow_dataset___UnionDataset__children, 1}, + { "_arrow_dataset___FileSystemDataset__format", (DL_FUNC) &_arrow_dataset___FileSystemDataset__format, 1}, + { "_arrow_dataset___FileSystemDataset__filesystem", (DL_FUNC) &_arrow_dataset___FileSystemDataset__filesystem, 1}, + { "_arrow_dataset___FileSystemDataset__files", (DL_FUNC) &_arrow_dataset___FileSystemDataset__files, 1}, + { "_arrow_dataset___DatasetFactory__Finish1", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish1, 2}, + { "_arrow_dataset___DatasetFactory__Finish2", (DL_FUNC) &_arrow_dataset___DatasetFactory__Finish2, 2}, + { "_arrow_dataset___DatasetFactory__Inspect", (DL_FUNC) &_arrow_dataset___DatasetFactory__Inspect, 2}, + { "_arrow_dataset___UnionDatasetFactory__Make", (DL_FUNC) &_arrow_dataset___UnionDatasetFactory__Make, 1}, + { "_arrow_dataset___FileSystemDatasetFactory__Make0", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make0, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make2", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make2, 4}, + { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, + { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, + { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, + { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, + { 
"_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 2}, + { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, + { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, + { "_arrow_dataset___IpcFileWriteOptions__update2", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update2, 4}, + { "_arrow_dataset___IpcFileWriteOptions__update1", (DL_FUNC) &_arrow_dataset___IpcFileWriteOptions__update1, 3}, + { "_arrow_dataset___CsvFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___CsvFileWriteOptions__update, 2}, + { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, + { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 3}, + { "_arrow_dataset___FragmentScanOptions__type_name", (DL_FUNC) &_arrow_dataset___FragmentScanOptions__type_name, 1}, + { "_arrow_dataset___CsvFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___CsvFragmentScanOptions__Make, 2}, + { "_arrow_dataset___ParquetFragmentScanOptions__Make", (DL_FUNC) &_arrow_dataset___ParquetFragmentScanOptions__Make, 3}, + { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 2}, + { "_arrow_dataset___DirectoryPartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning__MakeFactory, 2}, + { "_arrow_dataset___HivePartitioning", (DL_FUNC) &_arrow_dataset___HivePartitioning, 3}, + { "_arrow_dataset___HivePartitioning__MakeFactory", (DL_FUNC) &_arrow_dataset___HivePartitioning__MakeFactory, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectNames", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectNames, 2}, + { "_arrow_dataset___ScannerBuilder__ProjectExprs", (DL_FUNC) &_arrow_dataset___ScannerBuilder__ProjectExprs, 3}, + { "_arrow_dataset___ScannerBuilder__Filter", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Filter, 2}, + { "_arrow_dataset___ScannerBuilder__UseThreads", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseThreads, 2}, + { "_arrow_dataset___ScannerBuilder__UseAsync", (DL_FUNC) &_arrow_dataset___ScannerBuilder__UseAsync, 2}, + { "_arrow_dataset___ScannerBuilder__BatchSize", (DL_FUNC) &_arrow_dataset___ScannerBuilder__BatchSize, 2}, + { "_arrow_dataset___ScannerBuilder__FragmentScanOptions", (DL_FUNC) &_arrow_dataset___ScannerBuilder__FragmentScanOptions, 2}, + { "_arrow_dataset___ScannerBuilder__schema", (DL_FUNC) &_arrow_dataset___ScannerBuilder__schema, 1}, + { "_arrow_dataset___ScannerBuilder__Finish", (DL_FUNC) &_arrow_dataset___ScannerBuilder__Finish, 1}, + { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, + { "_arrow_dataset___Scanner__ScanBatches", (DL_FUNC) &_arrow_dataset___Scanner__ScanBatches, 1}, + { "_arrow_dataset___Scanner__ToRecordBatchReader", (DL_FUNC) &_arrow_dataset___Scanner__ToRecordBatchReader, 1}, + { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, + { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, + { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, + { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, + { "_arrow_dataset___Scanner__TakeRows", (DL_FUNC) &_arrow_dataset___Scanner__TakeRows, 2}, + { "_arrow_dataset___Scanner__CountRows", (DL_FUNC) &_arrow_dataset___Scanner__CountRows, 1}, + { "_arrow_Int8__initialize", 
(DL_FUNC) &_arrow_Int8__initialize, 0}, + { "_arrow_Int16__initialize", (DL_FUNC) &_arrow_Int16__initialize, 0}, + { "_arrow_Int32__initialize", (DL_FUNC) &_arrow_Int32__initialize, 0}, + { "_arrow_Int64__initialize", (DL_FUNC) &_arrow_Int64__initialize, 0}, + { "_arrow_UInt8__initialize", (DL_FUNC) &_arrow_UInt8__initialize, 0}, + { "_arrow_UInt16__initialize", (DL_FUNC) &_arrow_UInt16__initialize, 0}, + { "_arrow_UInt32__initialize", (DL_FUNC) &_arrow_UInt32__initialize, 0}, + { "_arrow_UInt64__initialize", (DL_FUNC) &_arrow_UInt64__initialize, 0}, + { "_arrow_Float16__initialize", (DL_FUNC) &_arrow_Float16__initialize, 0}, + { "_arrow_Float32__initialize", (DL_FUNC) &_arrow_Float32__initialize, 0}, + { "_arrow_Float64__initialize", (DL_FUNC) &_arrow_Float64__initialize, 0}, + { "_arrow_Boolean__initialize", (DL_FUNC) &_arrow_Boolean__initialize, 0}, + { "_arrow_Utf8__initialize", (DL_FUNC) &_arrow_Utf8__initialize, 0}, + { "_arrow_LargeUtf8__initialize", (DL_FUNC) &_arrow_LargeUtf8__initialize, 0}, + { "_arrow_Binary__initialize", (DL_FUNC) &_arrow_Binary__initialize, 0}, + { "_arrow_LargeBinary__initialize", (DL_FUNC) &_arrow_LargeBinary__initialize, 0}, + { "_arrow_Date32__initialize", (DL_FUNC) &_arrow_Date32__initialize, 0}, + { "_arrow_Date64__initialize", (DL_FUNC) &_arrow_Date64__initialize, 0}, + { "_arrow_Null__initialize", (DL_FUNC) &_arrow_Null__initialize, 0}, + { "_arrow_Decimal128Type__initialize", (DL_FUNC) &_arrow_Decimal128Type__initialize, 2}, + { "_arrow_FixedSizeBinary__initialize", (DL_FUNC) &_arrow_FixedSizeBinary__initialize, 1}, + { "_arrow_Timestamp__initialize", (DL_FUNC) &_arrow_Timestamp__initialize, 2}, + { "_arrow_Time32__initialize", (DL_FUNC) &_arrow_Time32__initialize, 1}, + { "_arrow_Time64__initialize", (DL_FUNC) &_arrow_Time64__initialize, 1}, + { "_arrow_list__", (DL_FUNC) &_arrow_list__, 1}, + { "_arrow_large_list__", (DL_FUNC) &_arrow_large_list__, 1}, + { "_arrow_fixed_size_list__", (DL_FUNC) &_arrow_fixed_size_list__, 2}, + { "_arrow_struct__", (DL_FUNC) &_arrow_struct__, 1}, + { "_arrow_DataType__ToString", (DL_FUNC) &_arrow_DataType__ToString, 1}, + { "_arrow_DataType__name", (DL_FUNC) &_arrow_DataType__name, 1}, + { "_arrow_DataType__Equals", (DL_FUNC) &_arrow_DataType__Equals, 2}, + { "_arrow_DataType__num_fields", (DL_FUNC) &_arrow_DataType__num_fields, 1}, + { "_arrow_DataType__fields", (DL_FUNC) &_arrow_DataType__fields, 1}, + { "_arrow_DataType__id", (DL_FUNC) &_arrow_DataType__id, 1}, + { "_arrow_ListType__ToString", (DL_FUNC) &_arrow_ListType__ToString, 1}, + { "_arrow_FixedWidthType__bit_width", (DL_FUNC) &_arrow_FixedWidthType__bit_width, 1}, + { "_arrow_DateType__unit", (DL_FUNC) &_arrow_DateType__unit, 1}, + { "_arrow_TimeType__unit", (DL_FUNC) &_arrow_TimeType__unit, 1}, + { "_arrow_DecimalType__precision", (DL_FUNC) &_arrow_DecimalType__precision, 1}, + { "_arrow_DecimalType__scale", (DL_FUNC) &_arrow_DecimalType__scale, 1}, + { "_arrow_TimestampType__timezone", (DL_FUNC) &_arrow_TimestampType__timezone, 1}, + { "_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1}, + { "_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3}, + { "_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1}, + { "_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1}, + { "_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1}, + { "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, + { 
"_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, + { "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, + { "_arrow_StructType__field_names", (DL_FUNC) &_arrow_StructType__field_names, 1}, + { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, + { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, + { "_arrow_LargeListType__value_field", (DL_FUNC) &_arrow_LargeListType__value_field, 1}, + { "_arrow_LargeListType__value_type", (DL_FUNC) &_arrow_LargeListType__value_type, 1}, + { "_arrow_FixedSizeListType__value_field", (DL_FUNC) &_arrow_FixedSizeListType__value_field, 1}, + { "_arrow_FixedSizeListType__value_type", (DL_FUNC) &_arrow_FixedSizeListType__value_type, 1}, + { "_arrow_FixedSizeListType__list_size", (DL_FUNC) &_arrow_FixedSizeListType__list_size, 1}, + { "_arrow_compute___expr__equals", (DL_FUNC) &_arrow_compute___expr__equals, 2}, + { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, + { "_arrow_field_names_in_expression", (DL_FUNC) &_arrow_field_names_in_expression, 1}, + { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, + { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, + { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, + { "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1}, + { "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2}, + { "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2}, + { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, + { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, + { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, + { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, + { "_arrow_ipc___feather___Reader__schema", (DL_FUNC) &_arrow_ipc___feather___Reader__schema, 1}, + { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, + { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, + { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, + { "_arrow_Field__Equals", (DL_FUNC) &_arrow_Field__Equals, 2}, + { "_arrow_Field__nullable", (DL_FUNC) &_arrow_Field__nullable, 1}, + { "_arrow_Field__type", (DL_FUNC) &_arrow_Field__type, 1}, + { "_arrow_fs___FileInfo__type", (DL_FUNC) &_arrow_fs___FileInfo__type, 1}, + { "_arrow_fs___FileInfo__set_type", (DL_FUNC) &_arrow_fs___FileInfo__set_type, 2}, + { "_arrow_fs___FileInfo__path", (DL_FUNC) &_arrow_fs___FileInfo__path, 1}, + { "_arrow_fs___FileInfo__set_path", (DL_FUNC) &_arrow_fs___FileInfo__set_path, 2}, + { "_arrow_fs___FileInfo__size", (DL_FUNC) &_arrow_fs___FileInfo__size, 1}, + { "_arrow_fs___FileInfo__set_size", (DL_FUNC) &_arrow_fs___FileInfo__set_size, 2}, + { "_arrow_fs___FileInfo__base_name", (DL_FUNC) &_arrow_fs___FileInfo__base_name, 1}, + { "_arrow_fs___FileInfo__extension", (DL_FUNC) &_arrow_fs___FileInfo__extension, 1}, + { "_arrow_fs___FileInfo__mtime", (DL_FUNC) &_arrow_fs___FileInfo__mtime, 1}, + { "_arrow_fs___FileInfo__set_mtime", (DL_FUNC) &_arrow_fs___FileInfo__set_mtime, 2}, + { "_arrow_fs___FileSelector__base_dir", (DL_FUNC) &_arrow_fs___FileSelector__base_dir, 1}, + { "_arrow_fs___FileSelector__allow_not_found", (DL_FUNC) 
&_arrow_fs___FileSelector__allow_not_found, 1}, + { "_arrow_fs___FileSelector__recursive", (DL_FUNC) &_arrow_fs___FileSelector__recursive, 1}, + { "_arrow_fs___FileSelector__create", (DL_FUNC) &_arrow_fs___FileSelector__create, 3}, + { "_arrow_fs___FileSystem__GetTargetInfos_Paths", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_Paths, 2}, + { "_arrow_fs___FileSystem__GetTargetInfos_FileSelector", (DL_FUNC) &_arrow_fs___FileSystem__GetTargetInfos_FileSelector, 2}, + { "_arrow_fs___FileSystem__CreateDir", (DL_FUNC) &_arrow_fs___FileSystem__CreateDir, 3}, + { "_arrow_fs___FileSystem__DeleteDir", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDir, 2}, + { "_arrow_fs___FileSystem__DeleteDirContents", (DL_FUNC) &_arrow_fs___FileSystem__DeleteDirContents, 2}, + { "_arrow_fs___FileSystem__DeleteFile", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFile, 2}, + { "_arrow_fs___FileSystem__DeleteFiles", (DL_FUNC) &_arrow_fs___FileSystem__DeleteFiles, 2}, + { "_arrow_fs___FileSystem__Move", (DL_FUNC) &_arrow_fs___FileSystem__Move, 3}, + { "_arrow_fs___FileSystem__CopyFile", (DL_FUNC) &_arrow_fs___FileSystem__CopyFile, 3}, + { "_arrow_fs___FileSystem__OpenInputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputStream, 2}, + { "_arrow_fs___FileSystem__OpenInputFile", (DL_FUNC) &_arrow_fs___FileSystem__OpenInputFile, 2}, + { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, + { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, + { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, + { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, + { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, + { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, + { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, + { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, + { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, + { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, + { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, + { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, + { "_arrow_io___RandomAccessFile__GetSize", (DL_FUNC) &_arrow_io___RandomAccessFile__GetSize, 1}, + { "_arrow_io___RandomAccessFile__supports_zero_copy", (DL_FUNC) &_arrow_io___RandomAccessFile__supports_zero_copy, 1}, + { "_arrow_io___RandomAccessFile__Seek", (DL_FUNC) &_arrow_io___RandomAccessFile__Seek, 2}, + { "_arrow_io___RandomAccessFile__Tell", (DL_FUNC) &_arrow_io___RandomAccessFile__Tell, 1}, + { "_arrow_io___RandomAccessFile__Read0", (DL_FUNC) &_arrow_io___RandomAccessFile__Read0, 1}, + { "_arrow_io___RandomAccessFile__ReadAt", (DL_FUNC) &_arrow_io___RandomAccessFile__ReadAt, 3}, + { "_arrow_io___MemoryMappedFile__Create", (DL_FUNC) &_arrow_io___MemoryMappedFile__Create, 2}, + { "_arrow_io___MemoryMappedFile__Open", (DL_FUNC) &_arrow_io___MemoryMappedFile__Open, 2}, + { "_arrow_io___MemoryMappedFile__Resize", (DL_FUNC) &_arrow_io___MemoryMappedFile__Resize, 2}, + { "_arrow_io___ReadableFile__Open", (DL_FUNC) 
&_arrow_io___ReadableFile__Open, 1}, + { "_arrow_io___BufferReader__initialize", (DL_FUNC) &_arrow_io___BufferReader__initialize, 1}, + { "_arrow_io___Writable__write", (DL_FUNC) &_arrow_io___Writable__write, 2}, + { "_arrow_io___OutputStream__Tell", (DL_FUNC) &_arrow_io___OutputStream__Tell, 1}, + { "_arrow_io___FileOutputStream__Open", (DL_FUNC) &_arrow_io___FileOutputStream__Open, 1}, + { "_arrow_io___BufferOutputStream__Create", (DL_FUNC) &_arrow_io___BufferOutputStream__Create, 1}, + { "_arrow_io___BufferOutputStream__capacity", (DL_FUNC) &_arrow_io___BufferOutputStream__capacity, 1}, + { "_arrow_io___BufferOutputStream__Finish", (DL_FUNC) &_arrow_io___BufferOutputStream__Finish, 1}, + { "_arrow_io___BufferOutputStream__Tell", (DL_FUNC) &_arrow_io___BufferOutputStream__Tell, 1}, + { "_arrow_io___BufferOutputStream__Write", (DL_FUNC) &_arrow_io___BufferOutputStream__Write, 2}, + { "_arrow_json___ReadOptions__initialize", (DL_FUNC) &_arrow_json___ReadOptions__initialize, 2}, + { "_arrow_json___ParseOptions__initialize1", (DL_FUNC) &_arrow_json___ParseOptions__initialize1, 1}, + { "_arrow_json___ParseOptions__initialize2", (DL_FUNC) &_arrow_json___ParseOptions__initialize2, 2}, + { "_arrow_json___TableReader__Make", (DL_FUNC) &_arrow_json___TableReader__Make, 3}, + { "_arrow_json___TableReader__Read", (DL_FUNC) &_arrow_json___TableReader__Read, 1}, + { "_arrow_MemoryPool__default", (DL_FUNC) &_arrow_MemoryPool__default, 0}, + { "_arrow_MemoryPool__bytes_allocated", (DL_FUNC) &_arrow_MemoryPool__bytes_allocated, 1}, + { "_arrow_MemoryPool__max_memory", (DL_FUNC) &_arrow_MemoryPool__max_memory, 1}, + { "_arrow_MemoryPool__backend_name", (DL_FUNC) &_arrow_MemoryPool__backend_name, 1}, + { "_arrow_supported_memory_backends", (DL_FUNC) &_arrow_supported_memory_backends, 0}, + { "_arrow_ipc___Message__body_length", (DL_FUNC) &_arrow_ipc___Message__body_length, 1}, + { "_arrow_ipc___Message__metadata", (DL_FUNC) &_arrow_ipc___Message__metadata, 1}, + { "_arrow_ipc___Message__body", (DL_FUNC) &_arrow_ipc___Message__body, 1}, + { "_arrow_ipc___Message__Verify", (DL_FUNC) &_arrow_ipc___Message__Verify, 1}, + { "_arrow_ipc___Message__type", (DL_FUNC) &_arrow_ipc___Message__type, 1}, + { "_arrow_ipc___Message__Equals", (DL_FUNC) &_arrow_ipc___Message__Equals, 2}, + { "_arrow_ipc___ReadRecordBatch__Message__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__Message__Schema, 2}, + { "_arrow_ipc___ReadSchema_InputStream", (DL_FUNC) &_arrow_ipc___ReadSchema_InputStream, 1}, + { "_arrow_ipc___ReadSchema_Message", (DL_FUNC) &_arrow_ipc___ReadSchema_Message, 1}, + { "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1}, + { "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, + { "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2}, + { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) 
&_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3}, + { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2}, + { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1}, + { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroup2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroup2, 3}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups1, 2}, + { "_arrow_parquet___arrow___FileReader__ReadRowGroups2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadRowGroups2, 3}, + { "_arrow_parquet___arrow___FileReader__num_rows", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_rows, 1}, + { "_arrow_parquet___arrow___FileReader__num_columns", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_columns, 1}, + { "_arrow_parquet___arrow___FileReader__num_row_groups", (DL_FUNC) &_arrow_parquet___arrow___FileReader__num_row_groups, 1}, + { "_arrow_parquet___arrow___FileReader__ReadColumn", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadColumn, 2}, + { "_arrow_parquet___ArrowWriterProperties___create", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___create, 3}, + { "_arrow_parquet___WriterProperties___Builder__create", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__create, 0}, + { "_arrow_parquet___WriterProperties___Builder__version", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__version, 2}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compressions", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compressions, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_compression_levels, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_use_dictionary, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__set_write_statistics, 3}, + { "_arrow_parquet___ArrowWriterProperties___Builder__data_page_size", (DL_FUNC) &_arrow_parquet___ArrowWriterProperties___Builder__data_page_size, 2}, + { "_arrow_parquet___WriterProperties___Builder__build", (DL_FUNC) &_arrow_parquet___WriterProperties___Builder__build, 1}, + { "_arrow_parquet___arrow___ParquetFileWriter__Open", (DL_FUNC) &_arrow_parquet___arrow___ParquetFileWriter__Open, 4}, + { "_arrow_parquet___arrow___FileWriter__WriteTable", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__WriteTable, 3}, + { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, + { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, + { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, + { "_arrow_allocate_arrow_schema", (DL_FUNC) &_arrow_allocate_arrow_schema, 0}, + { "_arrow_delete_arrow_schema", (DL_FUNC) &_arrow_delete_arrow_schema, 1}, + { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, + { "_arrow_delete_arrow_array", 
(DL_FUNC) &_arrow_delete_arrow_array, 1}, + { "_arrow_allocate_arrow_array_stream", (DL_FUNC) &_arrow_allocate_arrow_array_stream, 0}, + { "_arrow_delete_arrow_array_stream", (DL_FUNC) &_arrow_delete_arrow_array_stream, 1}, + { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 2}, + { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 2}, + { "_arrow_ImportSchema", (DL_FUNC) &_arrow_ImportSchema, 1}, + { "_arrow_ImportField", (DL_FUNC) &_arrow_ImportField, 1}, + { "_arrow_ImportType", (DL_FUNC) &_arrow_ImportType, 1}, + { "_arrow_ImportRecordBatchReader", (DL_FUNC) &_arrow_ImportRecordBatchReader, 1}, + { "_arrow_ExportType", (DL_FUNC) &_arrow_ExportType, 2}, + { "_arrow_ExportField", (DL_FUNC) &_arrow_ExportField, 2}, + { "_arrow_ExportSchema", (DL_FUNC) &_arrow_ExportSchema, 2}, + { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 3}, + { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 3}, + { "_arrow_ExportRecordBatchReader", (DL_FUNC) &_arrow_ExportRecordBatchReader, 2}, + { "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 3}, + { "_arrow_vec_to_arrow", (DL_FUNC) &_arrow_vec_to_arrow, 2}, + { "_arrow_DictionaryArray__FromArrays", (DL_FUNC) &_arrow_DictionaryArray__FromArrays, 3}, + { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, + { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, + { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, + { "_arrow_RecordBatch__RenameColumns", (DL_FUNC) &_arrow_RecordBatch__RenameColumns, 2}, + { "_arrow_RecordBatch__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_RecordBatch__ReplaceSchemaMetadata, 2}, + { "_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, + { "_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, + { "_arrow_RecordBatch__GetColumnByName", (DL_FUNC) &_arrow_RecordBatch__GetColumnByName, 2}, + { "_arrow_RecordBatch__SelectColumns", (DL_FUNC) &_arrow_RecordBatch__SelectColumns, 2}, + { "_arrow_RecordBatch__Equals", (DL_FUNC) &_arrow_RecordBatch__Equals, 3}, + { "_arrow_RecordBatch__AddColumn", (DL_FUNC) &_arrow_RecordBatch__AddColumn, 4}, + { "_arrow_RecordBatch__SetColumn", (DL_FUNC) &_arrow_RecordBatch__SetColumn, 4}, + { "_arrow_RecordBatch__RemoveColumn", (DL_FUNC) &_arrow_RecordBatch__RemoveColumn, 2}, + { "_arrow_RecordBatch__column_name", (DL_FUNC) &_arrow_RecordBatch__column_name, 2}, + { "_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, + { "_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, + { "_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, + { "_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, + { "_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, + { "_arrow_RecordBatch__from_arrays", (DL_FUNC) &_arrow_RecordBatch__from_arrays, 2}, + { "_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, + { "_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, + { "_arrow_RecordBatchReader__batches", (DL_FUNC) &_arrow_RecordBatchReader__batches, 1}, + { "_arrow_Table__from_RecordBatchReader", (DL_FUNC) &_arrow_Table__from_RecordBatchReader, 1}, + { "_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, + { "_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) 
&_arrow_ipc___RecordBatchFileReader__schema, 1}, + { "_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, + { "_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, + { "_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, + { "_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, + { "_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, + { "_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, + { "_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, + { "_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, + { "_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 4}, + { "_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 4}, + { "_arrow_Array__GetScalar", (DL_FUNC) &_arrow_Array__GetScalar, 2}, + { "_arrow_Scalar__ToString", (DL_FUNC) &_arrow_Scalar__ToString, 1}, + { "_arrow_StructScalar__field", (DL_FUNC) &_arrow_StructScalar__field, 2}, + { "_arrow_StructScalar__GetFieldByName", (DL_FUNC) &_arrow_StructScalar__GetFieldByName, 2}, + { "_arrow_Scalar__as_vector", (DL_FUNC) &_arrow_Scalar__as_vector, 1}, + { "_arrow_MakeArrayFromScalar", (DL_FUNC) &_arrow_MakeArrayFromScalar, 2}, + { "_arrow_Scalar__is_valid", (DL_FUNC) &_arrow_Scalar__is_valid, 1}, + { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, + { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, + { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, + { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, + { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, + { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, + { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, + { "_arrow_Schema__AddField", (DL_FUNC) &_arrow_Schema__AddField, 3}, + { "_arrow_Schema__SetField", (DL_FUNC) &_arrow_Schema__SetField, 3}, + { "_arrow_Schema__RemoveField", (DL_FUNC) &_arrow_Schema__RemoveField, 2}, + { "_arrow_Schema__GetFieldByName", (DL_FUNC) &_arrow_Schema__GetFieldByName, 2}, + { "_arrow_Schema__fields", (DL_FUNC) &_arrow_Schema__fields, 1}, + { "_arrow_Schema__field_names", (DL_FUNC) &_arrow_Schema__field_names, 1}, + { "_arrow_Schema__HasMetadata", (DL_FUNC) &_arrow_Schema__HasMetadata, 1}, + { "_arrow_Schema__metadata", (DL_FUNC) &_arrow_Schema__metadata, 1}, + { "_arrow_Schema__WithMetadata", (DL_FUNC) &_arrow_Schema__WithMetadata, 2}, + { "_arrow_Schema__serialize", (DL_FUNC) &_arrow_Schema__serialize, 1}, + { "_arrow_Schema__Equals", (DL_FUNC) &_arrow_Schema__Equals, 3}, + { "_arrow_arrow__UnifySchemas", (DL_FUNC) &_arrow_arrow__UnifySchemas, 1}, + { "_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, + { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, + { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, + { "_arrow_Table__ReplaceSchemaMetadata", (DL_FUNC) &_arrow_Table__ReplaceSchemaMetadata, 2}, + { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, + { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, + { "_arrow_Table__columns", (DL_FUNC) 
&_arrow_Table__columns, 1}, + { "_arrow_Table__ColumnNames", (DL_FUNC) &_arrow_Table__ColumnNames, 1}, + { "_arrow_Table__RenameColumns", (DL_FUNC) &_arrow_Table__RenameColumns, 2}, + { "_arrow_Table__Slice1", (DL_FUNC) &_arrow_Table__Slice1, 2}, + { "_arrow_Table__Slice2", (DL_FUNC) &_arrow_Table__Slice2, 3}, + { "_arrow_Table__Equals", (DL_FUNC) &_arrow_Table__Equals, 3}, + { "_arrow_Table__Validate", (DL_FUNC) &_arrow_Table__Validate, 1}, + { "_arrow_Table__ValidateFull", (DL_FUNC) &_arrow_Table__ValidateFull, 1}, + { "_arrow_Table__GetColumnByName", (DL_FUNC) &_arrow_Table__GetColumnByName, 2}, + { "_arrow_Table__RemoveColumn", (DL_FUNC) &_arrow_Table__RemoveColumn, 2}, + { "_arrow_Table__AddColumn", (DL_FUNC) &_arrow_Table__AddColumn, 4}, + { "_arrow_Table__SetColumn", (DL_FUNC) &_arrow_Table__SetColumn, 4}, + { "_arrow_Table__SelectColumns", (DL_FUNC) &_arrow_Table__SelectColumns, 2}, + { "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1}, + { "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2}, + { "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0}, + { "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1}, + { "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0}, + { "_arrow_SetIOThreadPoolCapacity", (DL_FUNC) &_arrow_SetIOThreadPoolCapacity, 1}, + { "_arrow_Array__infer_type", (DL_FUNC) &_arrow_Array__infer_type, 1}, + { "_arrow_Table__Reset", (DL_FUNC) &_arrow_Table__Reset, 1}, + { "_arrow_RecordBatch__Reset", (DL_FUNC) &_arrow_RecordBatch__Reset, 1}, {NULL, NULL, 0} }; extern "C" void R_init_arrow(DllInfo* dll){ @@ -7490,3 +7508,5 @@ extern "C" void R_init_arrow(DllInfo* dll){ #endif } + + diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index 3d0bbca63d2..cab1a09c6ae 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -57,12 +57,22 @@ std::shared_ptr MakeExecNodeOrStop( // [[arrow::export]] std::shared_ptr ExecPlan_run( const std::shared_ptr& plan, - const std::shared_ptr& final_node) { + const std::shared_ptr& final_node, cpp11::list sort_options) { // For now, don't require R to construct SinkNodes. // Instead, just pass the node we should collect as an argument. 
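  // sink_gen receives batches from whichever sink node is added below
  // (a plain "sink", or an "order_by_sink" when sort_options are given);
  // the reader handed back to R drains this generator.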
   arrow::AsyncGenerator<arrow::util::optional<compute::ExecBatch>> sink_gen;
-  MakeExecNodeOrStop("sink", plan.get(), {final_node.get()},
-                     compute::SinkNodeOptions{&sink_gen});
+
+  // Sorting uses a different sink node; there is no general sort yet
+  if (sort_options.size() > 0) {
+    MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()},
+                       compute::OrderBySinkNodeOptions{
+                           *std::dynamic_pointer_cast<compute::SortOptions>(
+                               make_compute_options("sort_indices", sort_options)),
+                           &sink_gen});
+  } else {
+    MakeExecNodeOrStop("sink", plan.get(), {final_node.get()},
+                       compute::SinkNodeOptions{&sink_gen});
+  }
 
   StopIfNotOk(plan->Validate());
   StopIfNotOk(plan->StartProducing());
diff --git a/r/src/expression.cpp b/r/src/expression.cpp
index 3fcba46e911..97a8a746bba 100644
--- a/r/src/expression.cpp
+++ b/r/src/expression.cpp
@@ -27,6 +27,11 @@ namespace compute = ::arrow::compute;
 std::shared_ptr<compute::FunctionOptions> make_compute_options(std::string func_name,
                                                                cpp11::list options);
 
+// [[arrow::export]]
+bool compute___expr__equals(const std::shared_ptr<compute::Expression>& lhs,
+                            const std::shared_ptr<compute::Expression>& rhs) {
+  return lhs->Equals(*rhs);
+}
 
 // [[arrow::export]]
 std::shared_ptr<compute::Expression> compute___expr__call(std::string func_name,
                                                           cpp11::list argument_list,
diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R
index 906963e38d1..3ec18a63019 100644
--- a/r/tests/testthat/helper-skip.R
+++ b/r/tests/testthat/helper-skip.R
@@ -25,6 +25,10 @@ skip_if_not_available <- function(feature) {
   if (feature == "re2") {
     # RE2 does not support valgrind (on purpose): https://github.com/google/re2/issues/177
     skip_on_valgrind()
+  } else if (feature == "dataset") {
+    # These tests often hang on 32-bit windows rtools35, and we haven't been
+    # able to figure out how to make them work safely
+    skip_if_multithreading_disabled()
   }
 
   yes <- feature %in% names(build_features) && build_features[feature]
@@ -35,6 +39,7 @@ skip_if_not_available <- function(feature) {
 
 skip_if_no_pyarrow <- function() {
   skip_on_valgrind()
+  skip_on_os("windows")
 
   skip_if_not_installed("reticulate")
   if (!reticulate::py_module_available("pyarrow")) {
@@ -68,6 +73,15 @@ skip_on_valgrind <- function() {
   }
 }
 
+skip_if_multithreading_disabled <- function() {
+  is_32bit <- .Machine$sizeof.pointer < 8
+  is_old_r <- getRversion() < "4.0.0"
+  is_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
+  if (is_32bit && is_old_r && is_windows) {
+    skip("Multithreading does not work properly on this system")
+  }
+}
+
 process_is_running <- function(x) {
   cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x)
   tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE)
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index b027dc98702..41265f0e638 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -27,15 +27,6 @@ ipc_dir <- make_temp_dir()
 csv_dir <- make_temp_dir()
 tsv_dir <- make_temp_dir()
 
-skip_if_multithreading_disabled <- function() {
-  is_32bit <- .Machine$sizeof.pointer < 8
-  is_old_r <- getRversion() < "4.0.0"
-  is_windows <- tolower(Sys.info()[["sysname"]]) == "windows"
-  if (is_32bit && is_old_r && is_windows) {
-    skip("Multithreading does not work properly on this system")
-  }
-}
-
 first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
 
 df1 <- tibble(
@@ -133,7 +124,7 @@ test_that("Simple interface for datasets", {
 
   # Collecting virtual partition column works
   expect_equal(
-    collect(ds) %>% pull(part),
+    ds %>% arrange(part) %>% pull(part),
    c(rep(1, 10), rep(2, 10))
   )
 })
@@ -348,13 +339,12 @@ test_that("IPC/Feather format data", {
 
   # Collecting virtual
partition column works expect_equal( - collect(ds) %>% pull(part), + ds %>% arrange(part) %>% pull(part), c(rep(3, 10), rep(4, 10)) ) }) test_that("CSV dataset", { - skip_if_multithreading_disabled() ds <- open_dataset(csv_dir, partitioning = "part", format = "csv") expect_r6_class(ds$format, "CsvFileFormat") expect_r6_class(ds$filesystem, "LocalFileSystem") @@ -376,13 +366,12 @@ test_that("CSV dataset", { ) # Collecting virtual partition column works expect_equal( - collect(ds) %>% pull(part), + collect(ds) %>% arrange(part) %>% pull(part), c(rep(5, 10), rep(6, 10)) ) }) test_that("CSV scan options", { - skip_if_multithreading_disabled() options <- FragmentScanOptions$create("text") expect_equal(options$type, "csv") options <- FragmentScanOptions$create("csv", @@ -429,7 +418,6 @@ test_that("CSV scan options", { }) test_that("compressed CSV dataset", { - skip_if_multithreading_disabled() skip_if_not_available("gzip") dst_dir <- make_temp_dir() dst_file <- file.path(dst_dir, "data.csv.gz") @@ -453,7 +441,6 @@ test_that("compressed CSV dataset", { }) test_that("CSV dataset options", { - skip_if_multithreading_disabled() dst_dir <- make_temp_dir() dst_file <- file.path(dst_dir, "data.csv") df <- tibble(chr = letters[1:10]) @@ -481,7 +468,6 @@ test_that("CSV dataset options", { }) test_that("Other text delimited dataset", { - skip_if_multithreading_disabled() ds1 <- open_dataset(tsv_dir, partitioning = "part", format = "tsv") expect_equivalent( ds1 %>% @@ -510,7 +496,6 @@ test_that("Other text delimited dataset", { }) test_that("readr parse options", { - skip_if_multithreading_disabled() arrow_opts <- names(formals(CsvParseOptions$create)) readr_opts <- names(formals(readr_to_csv_parse_options)) @@ -804,7 +789,7 @@ test_that("filter scalar validation doesn't crash (ARROW-7772)", { test_that("collect() on Dataset works (if fits in memory)", { skip_if_not_available("parquet") expect_equal( - collect(open_dataset(dataset_dir)), + collect(open_dataset(dataset_dir)) %>% arrange(int), rbind(df1, df2) ) }) @@ -1662,7 +1647,6 @@ test_that("Writing a dataset: Parquet format options", { }) test_that("Writing a dataset: CSV format options", { - skip_if_multithreading_disabled() df <- tibble( int = 1:10, dbl = as.numeric(1:10), diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R new file mode 100644 index 00000000000..331f7b7b62c --- /dev/null +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -0,0 +1,210 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +skip_if_not_available("dataset") + +withr::local_options(list(arrow.summarise.sort = TRUE)) + +library(dplyr) +library(stringr) + +tbl <- example_data +# Add some better string data +tbl$verses <- verses[[1]] +# c(" a ", " b ", " c ", ...) 
increasing padding +# nchar = 3 5 7 9 11 13 15 17 19 21 +tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both") +tbl$some_grouping <- rep(c(1, 2), 5) + +tab <- Table$create(tbl) + +test_that("implicit_schema with select", { + expect_equal( + tab %>% + select(int, lgl) %>% + implicit_schema(), + schema(int = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with rename", { + expect_equal( + tab %>% + select(numbers = int, lgl) %>% + implicit_schema(), + schema(numbers = int32(), lgl = bool()) + ) +}) + +test_that("implicit_schema with mutate", { + expect_equal( + tab %>% + transmute( + numbers = int * 4, + words = as.character(int) + ) %>% + implicit_schema(), + schema(numbers = float64(), words = utf8()) + ) +}) + +test_that("implicit_schema with summarize", { + expect_equal( + tab %>% + summarize( + avg = mean(int) + ) %>% + implicit_schema(), + schema(avg = float64()) + ) +}) + +test_that("implicit_schema with group_by summarize", { + expect_equal( + tab %>% + group_by(some_grouping) %>% + summarize( + avg = mean(int * 5L) + ) %>% + implicit_schema(), + schema(some_grouping = float64(), avg = float64()) + ) +}) + +test_that("collapse", { + q <- tab %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) + expect_false(is_collapsed(q)) + expect_true(is_collapsed(collapse(q))) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + collapse() %>% + filter(int < 5) %>% + select(int, twice) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl > 2, chr == "d" | chr == "f") %>% + collapse() %>% + select(chr, int, lgl) %>% + collapse() %>% + filter(int < 5) %>% + select(int, chr) %>% + collect(), + tbl + ) +}) + +test_that("Properties of collapsed query", { + q <- tab %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + mutate(extra = total * 5) + + # print(tbl %>% + # filter(dbl > 2) %>% + # select(chr, int, lgl) %>% + # mutate(twice = int * 2L) %>% + # group_by(lgl) %>% + # summarize(total = sum(int, na.rm = TRUE)) %>% + # mutate(extra = total * 5)) + + # # A tibble: 3 × 3 + # lgl total extra + # + # 1 FALSE 8 40 + # 2 TRUE 8 40 + # 3 NA 25 125 + + # Avoid evaluating just for nrow + expect_identical(dim(q), c(NA_integer_, 3L)) + + expect_output( + print(q), + "InMemoryDataset (query) +lgl: bool +total: int32 +extra: double (multiply_checked(total, 5)) + +See $.data for the source Arrow object", + fixed = TRUE + ) + expect_output( + print(q$.data), + "InMemoryDataset (query) +int: int32 +lgl: bool + +* Aggregations: +total: sum(int) +* Filter: (dbl > 2) +* Grouped by lgl +See $.data for the source Arrow object", + fixed = TRUE + ) + + expect_equal( + head(q, 1) %>% collect(), + tibble::tibble(lgl = FALSE, total = 8L, extra = 40) + ) + expect_equal( + tail(q, 1) %>% collect(), + tibble::tibble(lgl = NA, total = 25L, extra = 125) + ) +}) + +test_that("query_on_dataset handles collapse()", { + expect_false(query_on_dataset( + tab %>% + select(int, chr) + )) + expect_false(query_on_dataset( + tab %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) + + ds_dir <- tempfile() + dir.create(ds_dir) + on.exit(unlink(ds_dir)) + write_parquet(tab, file.path(ds_dir, "file.parquet")) + ds <- open_dataset(ds_dir) + + expect_true(query_on_dataset( + ds %>% + select(int, chr) + )) + 
expect_true(query_on_dataset( + ds %>% + select(int, chr) %>% + collapse() %>% + select(int) + )) +}) diff --git a/r/tests/testthat/test-dplyr-aggregate.R b/r/tests/testthat/test-dplyr-summarize.R similarity index 72% rename from r/tests/testthat/test-dplyr-aggregate.R rename to r/tests/testthat/test-dplyr-summarize.R index 3a04b6d2314..78d36630e56 100644 --- a/r/tests/testthat/test-dplyr-aggregate.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -17,6 +17,8 @@ skip_if_not_available("dataset") +withr::local_options(list(arrow.summarise.sort = TRUE)) + library(dplyr) library(stringr) @@ -33,7 +35,8 @@ test_that("summarize", { input %>% select(int, chr) %>% filter(int > 5) %>% - summarize(min_int = min(int)), + summarize(min_int = min(int)) %>% + collect(), tbl, warning = TRUE ) @@ -42,12 +45,27 @@ test_that("summarize", { input %>% select(int, chr) %>% filter(int > 5) %>% - summarize(min_int = min(int) / 2), + summarize(min_int = min(int) / 2) %>% + collect(), tbl, warning = TRUE ) }) +test_that("summarize() doesn't evaluate eagerly", { + expect_s3_class( + Table$create(tbl) %>% + summarize(total = sum(int)), + "arrow_dplyr_query" + ) + expect_r6_class( + Table$create(tbl) %>% + summarize(total = sum(int)) %>% + compute(), + "ArrowTabular" + ) +}) + test_that("Can aggregate in Arrow", { expect_dplyr_equal( input %>% @@ -68,7 +86,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -77,7 +94,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int * 4, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -86,7 +102,6 @@ test_that("Group by sum on dataset", { input %>% group_by(some_grouping) %>% summarize(total = sum(int)) %>% - arrange(some_grouping) %>% collect(), tbl, ) @@ -97,7 +112,6 @@ test_that("Group by mean on dataset", { input %>% group_by(some_grouping) %>% summarize(mean = mean(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -106,7 +120,6 @@ test_that("Group by mean on dataset", { input %>% group_by(some_grouping) %>% summarize(mean = mean(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -117,7 +130,6 @@ test_that("Group by sd on dataset", { input %>% group_by(some_grouping) %>% summarize(sd = sd(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -127,7 +139,6 @@ test_that("Group by sd on dataset", { input %>% group_by(some_grouping) %>% summarize(sd = sd(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -138,7 +149,6 @@ test_that("Group by var on dataset", { input %>% group_by(some_grouping) %>% summarize(var = var(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -148,7 +158,6 @@ test_that("Group by var on dataset", { input %>% group_by(some_grouping) %>% summarize(var = var(int, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -178,7 +187,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(any(lgl, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -186,7 +194,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(all(lgl, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -194,7 +201,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(any(lgl, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -202,7 
+208,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(all(lgl, na.rm = FALSE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -212,7 +217,6 @@ test_that("Group by any/all", { mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% summarize(any(has_words, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -221,7 +225,6 @@ test_that("Group by any/all", { mutate(has_words = nchar(verses) < 0) %>% group_by(some_grouping) %>% summarize(all(has_words, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -229,7 +232,6 @@ test_that("Group by any/all", { input %>% group_by(some_grouping) %>% summarize(has_words = all(nchar(verses) < 0, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -274,7 +276,6 @@ test_that("Filter and aggregate", { filter(some_grouping == 2) %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% collect(), tbl ) @@ -284,7 +285,86 @@ test_that("Filter and aggregate", { filter(int > 5) %>% group_by(some_grouping) %>% summarize(total = sum(int, na.rm = TRUE)) %>% - arrange(some_grouping) %>% + collect(), + tbl + ) +}) + +test_that("Group by edge cases", { + expect_dplyr_equal( + input %>% + group_by(some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + group_by(alt = some_grouping * 2) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + collect(), + tbl + ) +}) + +test_that("Do things after summarize", { + group2_sum <- tbl %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + pull() %>% + tail(1) + + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + filter(int > 5) %>% + summarize(total = sum(int, na.rm = TRUE)) %>% + filter(total == group2_sum) %>% + mutate(extra = total * 5) %>% + collect(), + tbl + ) + + expect_dplyr_equal( + input %>% + filter(dbl > 2) %>% + select(chr, int, lgl) %>% + mutate(twice = int * 2L) %>% + group_by(lgl) %>% + summarize( + count = n(), + total = sum(twice, na.rm = TRUE) + ) %>% + mutate(mean = total / count) %>% + collect(), + tbl + ) +}) + +test_that("Expressions on aggregations", { + # This is what it effectively is + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize( + any = any(lgl), + all = all(lgl) + ) %>% + compute() %>% + ungroup() %>% # TODO: loosen the restriction on mutate after group_by + mutate(some = any & !all) %>% + select(some_grouping, some) %>% + collect(), + tbl + ) + # More concisely: + skip("TODO: ARROW-13778") + expect_dplyr_equal( + input %>% + group_by(some_grouping) %>% + summarize(any(lgl) & !all(lgl)) %>% collect(), tbl ) diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R index cdfcb62d02d..56343ad729e 100644 --- a/r/tests/testthat/test-duckdb.R +++ b/r/tests/testthat/test-duckdb.R @@ -18,6 +18,7 @@ skip_if_not_installed("duckdb", minimum_version = "0.2.8") skip_if_not_installed("dbplyr") skip_if_not_available("dataset") + # when we remove this, we should also remove the FALSE in run_duckdb_examples skip("These tests are flaking: https://github.com/duckdb/duckdb/issues/2100") library(duckdb) diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index bc6d285b333..6ae5b54fbf3 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -239,9 +239,10 @@ test_that("metadata of list elements (ARROW-10386)", { ds <- 
open_dataset(dst_dir) expect_warning( df_from_ds <- collect(ds), - "Row-level metadata is not compatible with this operation and has been ignored" + NA # TODO: ARROW-13852 + # "Row-level metadata is not compatible with this operation and has been ignored" ) - expect_equal(df_from_ds[c(1, 4, 3, 2), ], df, check.attributes = FALSE) + expect_equal(arrange(df_from_ds, int), arrange(df, int), check.attributes = FALSE) # however there is *no* warning if we don't select the metadata column expect_warning( diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R index 9e67219e19a..d5815247d51 100644 --- a/r/tests/testthat/test-python.R +++ b/r/tests/testthat/test-python.R @@ -20,9 +20,10 @@ context("To/from Python") test_that("install_pyarrow", { skip_on_cran() skip_if_not_dev_mode() - # Python problems on Apple M1 still - skip_if(grepl("arm-apple|aarch64.*darwin", R.Version()$platform)) + # Windows CI machine doesn't pick up the right python or something + skip_on_os("windows") skip_if_not_installed("reticulate") + venv <- try(reticulate::virtualenv_create("arrow-test")) # Bail out if virtualenv isn't available skip_if(inherits(venv, "try-error")) diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 94451e5351a..a2a13cbf887 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -86,8 +86,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("open_dataset with an S3 file (not directory) URI", { skip_if_not_available("parquet") expect_identical( - open_dataset(minio_uri("test.parquet")) %>% collect(), - example_data + open_dataset(minio_uri("test.parquet")) %>% collect() %>% arrange(int), + example_data %>% arrange(int) ) }) @@ -96,8 +96,10 @@ if (arrow_with_s3() && process_is_running("minio server")) { open_dataset( c(minio_uri("test.feather"), minio_uri("test2.feather")), format = "feather" - ) %>% collect(), - rbind(example_data, example_data) + ) %>% + arrange(int) %>% + collect(), + rbind(example_data, example_data) %>% arrange(int) ) }) @@ -153,8 +155,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("open_dataset with fs", { ds <- open_dataset(fs$path(minio_path("hive_dir"))) expect_identical( - ds %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) }) @@ -170,16 +172,16 @@ if (arrow_with_s3() && process_is_running("minio server")) { expect_length(dir(td), 2) ds <- open_dataset(td) expect_identical( - ds %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) # Let's copy the other way and use a SubTreeFileSystem rather than URI copy_files(td, fs$path(minio_path("hive_dir2"))) ds2 <- open_dataset(fs$path(minio_path("hive_dir2"))) expect_identical( - ds2 %>% select(dbl, lgl) %>% collect(), - rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int), + rbind(df1[, c("int", "dbl", "lgl")], df2[, c("int", "dbl", "lgl")]) %>% arrange(int) ) }) } From 858ac57d71db96eda1f5ef52751d2c3140806b21 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 3 Sep 2021 13:31:21 -0400 Subject: [PATCH 19/93] ARROW-13874: [R] Implement TrimOptions 
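
TrimOptions carries the set of characters that the trim kernels strip
from the ends of a string. With this, those kernels can be reached from
R through call_function() even though they have no dplyr bindings yet.
A minimal sketch of the call this enables (values mirror the tests
added below):

    call_function("utf8_trim",
                  Scalar$create("abracadabra"),
                  options = list(characters = "ab"))
    ## returns a Scalar equal to Scalar$create("racadabr")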
Closes #11074 from thisisnic/ARROW-13874_trimpotions

Authored-by: Nic Crane
Signed-off-by: Neal Richardson
---
 r/src/compute.cpp                           |  7 +++
 r/tests/testthat/test-compute-no-bindings.R | 53 +++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 r/tests/testthat/test-compute-no-bindings.R

diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 7d17f111d74..446e011f548 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -360,6 +360,13 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
     return std::make_shared<Options>(max_splits, reverse);
   }
 
+  if (func_name == "utf8_trim" || func_name == "utf8_ltrim" ||
+      func_name == "utf8_rtrim" || func_name == "ascii_trim" ||
+      func_name == "ascii_ltrim" || func_name == "ascii_rtrim") {
+    using Options = arrow::compute::TrimOptions;
+    return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["characters"]));
+  }
+
   if (func_name == "utf8_slice_codeunits") {
     using Options = arrow::compute::SliceOptions;
diff --git a/r/tests/testthat/test-compute-no-bindings.R b/r/tests/testthat/test-compute-no-bindings.R
new file mode 100644
index 00000000000..33b97e8ed76
--- /dev/null
+++ b/r/tests/testthat/test-compute-no-bindings.R
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
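+# These kernels have no dplyr bindings yet, so they are exercised directly
+# through call_function(); the options list is converted into
+# arrow::compute::TrimOptions by make_compute_options() in r/src/compute.cpp.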
+
+test_that("non-bound compute kernels using TrimOptions", {
+  expect_equal(
+    call_function("utf8_trim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("racadabr")
+  )
+
+  expect_equal(
+    call_function("utf8_ltrim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("racadabra")
+  )
+
+  expect_equal(
+    call_function("utf8_rtrim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("abracadabr")
+  )
+
+  expect_equal(
+    call_function("ascii_trim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("racadabr")
+  )
+
+  expect_equal(
+    call_function("ascii_ltrim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("racadabra")
+  )
+
+  expect_equal(
+    call_function("ascii_rtrim", Scalar$create("abracadabra"), options = list(characters = "ab")),
+    Scalar$create("abracadabr")
+  )
+
+  expect_equal(
+    call_function("utf8_trim", Scalar$create("\u00e1bracadabr\u00e1"), options = list(characters = "\u00e1b")),
+    Scalar$create("racadabr")
+  )
+})

From a49048bd49cd3885b974259db2c5621ef4bb3cdc Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Fri, 3 Sep 2021 13:51:08 -0400
Subject: [PATCH 20/93] ARROW-13543: [R] Handle summarize() with 0 arguments
 or no aggregate functions

Closes #11078 from nealrichardson/summarize-0

Authored-by: Neal Richardson
Signed-off-by: Neal Richardson
---
 r/R/dplyr-summarize.R                   |  2 +-
 r/tests/testthat/test-dplyr-summarize.R | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index cd93e28f07e..3a6c76e28cb 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -51,7 +51,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) {
 
   mask <- arrow_mask(.data, aggregation = TRUE)
 
-  results <- list()
+  results <- empty_named_list()
   for (i in seq_along(exprs)) {
     # Iterate over the indices and not the names because names may be repeated
     # (which overwrites the previous name)
diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R
index 78d36630e56..9f149673c5a 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -369,3 +369,13 @@ test_that("Expressions on aggregations", {
     tbl
   )
 })
+
+test_that("Summarize with 0 arguments", {
+  expect_dplyr_equal(
+    input %>%
+      group_by(some_grouping) %>%
+      summarize() %>%
+      collect(),
+    tbl
+  )
+})

From f12c18e619c168363e4edace8a09a591b87eee41 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Sun, 5 Sep 2021 04:12:48 +0900
Subject: [PATCH 21/93] ARROW-13899: [Ruby] Implement slicer by compute
 kernels

Closes #11083 from kou/ruby-slicer-expression

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 c_glib/arrow-glib/compute.cpp          | 163 +++++++++++++++++++++
 c_glib/arrow-glib/compute.h            |  16 +++
 c_glib/arrow-glib/compute.hpp          |   4 +
 c_glib/test/test-is-in.rb              |  24 ++++
 c_glib/test/test-set-lookup-options.rb |  43 ++++++
 ruby/red-arrow/lib/arrow/datum.rb      |   2 +
 ruby/red-arrow/lib/arrow/slicer.rb     | 187 ++++++------------------
 ruby/red-arrow/test/test-slicer.rb     |  11 +-
 8 files changed, 301 insertions(+), 149 deletions(-)
 create mode 100644 c_glib/test/test-set-lookup-options.rb

diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 825d296dd26..b0839799d9a 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -165,6 +165,9 @@ G_BEGIN_DECLS
  * #GArrowSortOptions is a class to customize the `sort_indices`
the `sort_indices` * function. * + * #GArrowSetLookupOptions is a class to customize the `is_in` function + * and `index_in` function. + * * There are many functions to compute data on an array. */ @@ -2417,6 +2420,157 @@ garrow_sort_options_set_sort_keys(GArrowSortOptions *options, } +typedef struct GArrowSetLookupOptionsPrivate_ { + GArrowDatum *value_set; +} GArrowSetLookupOptionsPrivate; + +enum { + PROP_SET_LOOKUP_OPTIONS_VALUE_SET = 1, + PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowSetLookupOptions, + garrow_set_lookup_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +#define GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_set_lookup_options_get_instance_private( \ + GARROW_SET_LOOKUP_OPTIONS(object))) + +static void +garrow_set_lookup_options_dispose(GObject *object) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + + if (priv->value_set) { + g_object_unref(priv->value_set); + priv->value_set = NULL; + } + + G_OBJECT_CLASS(garrow_set_lookup_options_parent_class)->dispose(object); +} + +static void +garrow_set_lookup_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = + garrow_set_lookup_options_get_raw(GARROW_SET_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_SET_LOOKUP_OPTIONS_VALUE_SET: + priv->value_set = GARROW_DATUM(g_value_dup_object(value)); + options->value_set = garrow_datum_get_raw(priv->value_set); + break; + case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: + options->skip_nulls = g_value_get_boolean(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_set_lookup_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SET_LOOKUP_OPTIONS_GET_PRIVATE(object); + auto options = + garrow_set_lookup_options_get_raw(GARROW_SET_LOOKUP_OPTIONS(object)); + + switch (prop_id) { + case PROP_SET_LOOKUP_OPTIONS_VALUE_SET: + g_value_set_object(value, priv->value_set); + break; + case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: + g_value_set_boolean(value, options->skip_nulls); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_set_lookup_options_init(GArrowSetLookupOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast( + new arrow::compute::SetLookupOptions()); +} + +static void +garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_set_lookup_options_dispose; + gobject_class->set_property = garrow_set_lookup_options_set_property; + gobject_class->get_property = garrow_set_lookup_options_get_property; + + + arrow::compute::SetLookupOptions options; + + GParamSpec *spec; + /** + * GArrowSetLookupOptions:value-set: + * + * The set of values to look up input values into. + * + * Since: 6.0.0 + */ + spec = g_param_spec_object("value-set", + "Value set", + "The set of values to look up input values into", + GARROW_TYPE_DATUM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, + PROP_SET_LOOKUP_OPTIONS_VALUE_SET, + spec); + + /** + * GArrowSetLookupOptions:skip-nulls: + * + * Whether NULLs are skipped or not. 
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("skip-nulls", + "Skip NULLs", + "Whether NULLs are skipped or not", + options.skip_nulls, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, + spec); +} + +/** + * garrow_set_lookup_options_new: + * @value_set: A #GArrowArrayDatum or #GArrowChunkedArrayDatum to be looked up. + * + * Returns: A newly created #GArrowSetLookupOptions. + * + * Since: 6.0.0 + */ +GArrowSetLookupOptions * +garrow_set_lookup_options_new(GArrowDatum *value_set) +{ + return GARROW_SET_LOOKUP_OPTIONS( + g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, + "value-set", value_set, + NULL)); +} + + /** * garrow_array_cast: * @array: A #GArrowArray. @@ -3755,3 +3909,12 @@ garrow_sort_options_get_raw(GArrowSortOptions *options) return static_cast( garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); } + +arrow::compute::SetLookupOptions * +garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options) +{ + return static_cast( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} + + diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 108b27ff7ba..239cc50f9e5 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -436,6 +436,22 @@ garrow_sort_options_add_sort_key(GArrowSortOptions *options, GArrowSortKey *sort_key); +#define GARROW_TYPE_SET_LOOKUP_OPTIONS (garrow_set_lookup_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowSetLookupOptions, + garrow_set_lookup_options, + GARROW, + SET_LOOKUP_OPTIONS, + GArrowFunctionOptions) +struct _GArrowSetLookupOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GArrowSetLookupOptions * +garrow_set_lookup_options_new(GArrowDatum *value_set); + + GArrowArray *garrow_array_cast(GArrowArray *array, GArrowDataType *target_data_type, GArrowCastOptions *options, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 01265eee2a8..c616f6c0226 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -89,3 +89,7 @@ garrow_sort_key_get_raw(GArrowSortKey *sort_key); arrow::compute::SortOptions * garrow_sort_options_get_raw(GArrowSortOptions *options); + + +arrow::compute::SetLookupOptions * +garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options); diff --git a/c_glib/test/test-is-in.rb b/c_glib/test/test-is-in.rb index ba44075d6b3..590b5e3798a 100644 --- a/c_glib/test/test-is-in.rb +++ b/c_glib/test/test-is-in.rb @@ -46,6 +46,16 @@ def test_null_in_both assert_equal(build_boolean_array([false, true, true, true]), left.is_in(right)) end + + def test_options + left = build_int16_array([1, 0, nil, 2]) + right = build_int16_array([2, 0, nil]) + is_in = Arrow::Function.find("is_in") + options = Arrow::SetLookupOptions.new(Arrow::ArrayDatum.new(right)) + assert_equal(build_boolean_array([false, true, true, true]), + is_in.execute([Arrow::ArrayDatum.new(left)], + options).value) + end end sub_test_case("ChunkedArray") do @@ -92,5 +102,19 @@ def test_null_in_both assert_equal(build_boolean_array([false, true, true, true]), left.is_in_chunked_array(right)) end + + def test_options + left = build_int16_array([1, 0, nil, 2]) + chunks = [ + build_int16_array([2, 0]), + build_int16_array([3, nil]) + ] + right = Arrow::ChunkedArray.new(chunks) + is_in = Arrow::Function.find("is_in") + options = Arrow::SetLookupOptions.new(Arrow::ChunkedArrayDatum.new(right)) + assert_equal(build_boolean_array([false, true, 
true, true]), + is_in.execute([Arrow::ArrayDatum.new(left)], + options).value) + end end end diff --git a/c_glib/test/test-set-lookup-options.rb b/c_glib/test/test-set-lookup-options.rb new file mode 100644 index 00000000000..779bacef683 --- /dev/null +++ b/c_glib/test/test-set-lookup-options.rb @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestSetLookupOptions < Test::Unit::TestCase + include Helper::Buildable + + def test_new + value_set = Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])) + options = Arrow::SetLookupOptions.new(value_set) + assert_equal(value_set, options.value_set) + end + + sub_test_case("instance methods") do + def setup + value_set = Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])) + @options = Arrow::SetLookupOptions.new(value_set) + end + + def test_skip_nulls + assert do + not @options.skip_nulls? + end + @options.skip_nulls = true + assert do + @options.skip_nulls? + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/datum.rb b/ruby/red-arrow/lib/arrow/datum.rb index 99d1dae32f8..196a18f54ff 100644 --- a/ruby/red-arrow/lib/arrow/datum.rb +++ b/ruby/red-arrow/lib/arrow/datum.rb @@ -21,6 +21,8 @@ class << self # @api private def try_convert(value) case value + when Table + TableDatum.new(value) when Array ArrayDatum.new(value) when ChunkedArray diff --git a/ruby/red-arrow/lib/arrow/slicer.rb b/ruby/red-arrow/lib/arrow/slicer.rb index fa834766866..6cca7f75e9b 100644 --- a/ruby/red-arrow/lib/arrow/slicer.rb +++ b/ruby/red-arrow/lib/arrow/slicer.rb @@ -16,9 +16,6 @@ # under the License. module Arrow - # Experimental - # - # TODO: Almost codes should be implemented in Apache Arrow C++. class Slicer def initialize(table) @table = table @@ -43,6 +40,21 @@ def method_missing(name, *args, &block) super end + module Helper + class << self + def ensure_boolean(column) + case column.data_type + when Arrow::BooleanDataType + column.data + else + options = CastOptions.new + options.to_data_type = Arrow::BooleanDataType.new + Function.find("cast").execute([column.data], options).value + end + end + end + end + class Condition def evaluate message = "Slicer::Condition must define \#evaluate: #{inspect}" @@ -69,43 +81,28 @@ def initialize(condition1, condition2) end def evaluate - values1 = @condition1.evaluate.each - values2 = @condition2.evaluate.each - raw_array = [] - begin - loop do - value1 = values1.next - value2 = values2.next - if value1.nil? or value2.nil? 
- raw_array << nil - else - raw_array << evaluate_value(value1, value2) - end - end - rescue StopIteration - end - BooleanArray.new(raw_array) + function.execute([@condition1.evaluate, @condition2.evaluate]).value end end class AndCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 and value2 + def function + Function.find("and") end end class OrCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 or value2 + def function + Function.find("or") end end class XorCondition < LogicalCondition private - def evaluate_value(value1, value2) - value1 ^ value2 + def function + Function.find("xor") end end @@ -115,21 +112,7 @@ def initialize(column) end def evaluate - data = @column.data - - case @column.data_type - when BooleanDataType - data - else - if data.n_chunks == 1 - data.get_chunk(0).cast(BooleanDataType.new, nil) - else - arrays = data.each_chunk.collect do |chunk| - chunk.cast(BooleanDataType.new, nil) - end - ChunkedArray.new(arrays) - end - end + Helper.ensure_boolean(@column) end def !@ @@ -187,23 +170,8 @@ def initialize(column) end def evaluate - data = @column.data - raw_array = [] - data.each_chunk do |chunk| - if chunk.is_a?(BooleanArray) - boolean_array = chunk - else - boolean_array = chunk.cast(BooleanDataType.new, nil) - end - boolean_array.each do |value| - if value.nil? - raw_array << value - else - raw_array << !value - end - end - end - BooleanArray.new(raw_array) + data = Helper.ensure_boolean(@column) + Function.find("invert").execute([data]).value end def !@ @@ -222,19 +190,10 @@ def !@ end def evaluate - case @value - when nil - raw_array = @column.collect(&:nil?) - BooleanArray.new(raw_array) + if @value.nil? + Function.find("is_null").execute([@column.data]).value else - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value == value - end - end - BooleanArray.new(raw_array) + Function.find("equal").execute([@column.data, @value]).value end end end @@ -250,25 +209,10 @@ def !@ end def evaluate - case @value - when nil - if @column.n_nulls.zero? - raw_array = [true] * @column.n_rows - else - raw_array = @column.n_rows.times.collect do |i| - @column.valid?(i) - end - end - BooleanArray.new(raw_array) + if @value.nil? + Function.find("is_valid").execute([@column.data]).value else - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value != value - end - end - BooleanArray.new(raw_array) + Function.find("not_equal").execute([@column.data, @value]).value end end end @@ -284,14 +228,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value > value - end - end - BooleanArray.new(raw_array) + Function.find("less").execute([@column.data, @value]).value end end @@ -306,14 +243,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value >= value - end - end - BooleanArray.new(raw_array) + Function.find("less_equal").execute([@column.data, @value]).value end end @@ -328,14 +258,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? - nil - else - @value < value - end - end - BooleanArray.new(raw_array) + Function.find("greater").execute([@column.data, @value]).value end end @@ -350,14 +273,7 @@ def !@ end def evaluate - raw_array = @column.collect do |value| - if value.nil? 
- nil - else - @value <= value - end - end - BooleanArray.new(raw_array) + Function.find("greater_equal").execute([@column.data, @value]).value end end @@ -372,18 +288,10 @@ def !@ end def evaluate - values_index = {} - @values.each do |value| - values_index[value] = true - end - raw_array = @column.collect do |value| - if value.nil? - nil - else - values_index.key?(value) - end - end - BooleanArray.new(raw_array) + values = @values + values = Array.new(values) unless values.is_a?(Array) + options = SetLookupOptions.new(values) + Function.find("is_in").execute([@column.data], options).value end end @@ -398,18 +306,11 @@ def !@ end def evaluate - values_index = {} - @values.each do |value| - values_index[value] = true - end - raw_array = @column.collect do |value| - if value.nil? - nil - else - not values_index.key?(value) - end - end - BooleanArray.new(raw_array) + values = @values + values = Array.new(values) unless values.is_a?(Array) + options = SetLookupOptions.new(values) + booleans = Function.find("is_in").execute([@column.data], options).value + Function.find("invert").execute([booleans]).value end end diff --git a/ruby/red-arrow/test/test-slicer.rb b/ruby/red-arrow/test/test-slicer.rb index b0f2dfa32c6..420086690a0 100644 --- a/ruby/red-arrow/test/test-slicer.rb +++ b/ruby/red-arrow/test/test-slicer.rb @@ -349,12 +349,11 @@ def setup slicer.count.in?([1, 4, 16, 64]) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 4 (null) -2 16 true -3 64 (null) -4 (null) (null) + count visible +0 1 true +1 4 (null) +2 16 true +3 64 (null) TABLE end From 882e8b489ff1de0c60fffed718c265b362350907 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Sat, 4 Sep 2021 13:25:25 -0700 Subject: [PATCH 22/93] MINOR: [Doc][Python] Fix a typo (#11085) --- docs/source/python/parquet.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 812748ad27f..82461ec5da1 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -483,7 +483,7 @@ Reading from Partitioned Datasets ------------------------------------------------ The :class:`~.ParquetDataset` class accepts either a directory name or a list -or file paths, and can discover and infer some common partition structures, +of file paths, and can discover and infer some common partition structures, such as those produced by Hive: .. code-block:: python From 5d3872388f10f181206e145fdf3b7cae662088ae Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 6 Sep 2021 04:08:06 +0900 Subject: [PATCH 23/93] ARROW-13909: [GLib] Add GArrowVarianceOptions Closes #11086 from kou/glib-variance-options Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/compute.cpp | 164 +++++++++++++++++++++++++++++ c_glib/arrow-glib/compute.h | 16 +++ c_glib/arrow-glib/compute.hpp | 4 + c_glib/arrow-glib/input-stream.cpp | 4 +- c_glib/arrow-glib/input-stream.h | 2 +- 5 files changed, 187 insertions(+), 3 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index b0839799d9a..40d7002e7c2 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -168,6 +168,9 @@ G_BEGIN_DECLS * #GArrowSetLookupOptions is a class to customize the `is_in` function * and `index_in` function. * + * #GArrowVarianceOptions is a class to customize the `stddev` function + * and `variance` function. + * * There are many functions to compute data on an array. 
*/ @@ -2571,6 +2574,161 @@ garrow_set_lookup_options_new(GArrowDatum *value_set) } +enum { + PROP_VARIANCE_OPTIONS_DDOF = 1, + PROP_VARIANCE_OPTIONS_SKIP_NULLS, + PROP_VARIANCE_OPTIONS_MIN_COUNT, +}; + +G_DEFINE_TYPE(GArrowVarianceOptions, + garrow_variance_options, + GARROW_TYPE_FUNCTION_OPTIONS) + +#define GARROW_VARIANCE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_variance_options_get_instance_private( \ + GARROW_VARIANCE_OPTIONS(object))) + +static void +garrow_variance_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto options = + garrow_variance_options_get_raw(GARROW_VARIANCE_OPTIONS(object)); + + switch (prop_id) { + case PROP_VARIANCE_OPTIONS_DDOF: + options->ddof = g_value_get_int(value); + break; + case PROP_VARIANCE_OPTIONS_SKIP_NULLS: + options->skip_nulls = g_value_get_boolean(value); + break; + case PROP_VARIANCE_OPTIONS_MIN_COUNT: + options->min_count = g_value_get_uint(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_variance_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto options = + garrow_variance_options_get_raw(GARROW_VARIANCE_OPTIONS(object)); + + switch (prop_id) { + case PROP_VARIANCE_OPTIONS_DDOF: + g_value_set_int(value, options->ddof); + break; + case PROP_VARIANCE_OPTIONS_SKIP_NULLS: + g_value_set_boolean(value, options->skip_nulls); + break; + case PROP_VARIANCE_OPTIONS_MIN_COUNT: + g_value_set_uint(value, options->min_count); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_variance_options_init(GArrowVarianceOptions *object) +{ + auto priv = GARROW_FUNCTION_OPTIONS_GET_PRIVATE(object); + priv->options = static_cast( + new arrow::compute::VarianceOptions()); +} + +static void +garrow_variance_options_class_init(GArrowVarianceOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = garrow_variance_options_set_property; + gobject_class->get_property = garrow_variance_options_get_property; + + + arrow::compute::VarianceOptions options; + + GParamSpec *spec; + /** + * GArrowVarianceOptions:ddof: + * + * The Delta Degrees of Freedom (ddof) to be used. + * + * Since: 6.0.0 + */ + spec = g_param_spec_int("ddof", + "Delta Degrees of Freedom", + "The Delta Degrees of Freedom (ddof) to be used", + G_MININT, + G_MAXINT, + options.ddof, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_VARIANCE_OPTIONS_DDOF, + spec); + + /** + * GArrowVarianceOptions:skip-nulls: + * + * Whether NULLs are skipped or not. + * + * Since: 6.0.0 + */ + spec = g_param_spec_boolean("skip-nulls", + "Skip NULLs", + "Whether NULLs are skipped or not", + options.skip_nulls, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_VARIANCE_OPTIONS_SKIP_NULLS, + spec); + + /** + * GArrowVarianceOptions:min-count: + * + * If less than this many non-null values are observed, emit null. + * + * Since: 6.0.0 + */ + spec = g_param_spec_uint("min-count", + "Min count", + "If less than this many non-null values " + "are observed, emit null", + 0, + G_MAXUINT, + options.min_count, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, + PROP_VARIANCE_OPTIONS_MIN_COUNT, + spec); + +} + +/** + * garrow_variance_options_new: + * + * Returns: A newly created #GArrowVarianceOptions. 
+ * + * Since: 6.0.0 + */ +GArrowVarianceOptions * +garrow_variance_options_new(void) +{ + return GARROW_VARIANCE_OPTIONS( + g_object_new(GARROW_TYPE_VARIANCE_OPTIONS, NULL)); +} + + /** * garrow_array_cast: * @array: A #GArrowArray. @@ -3918,3 +4076,9 @@ garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options) } +arrow::compute::VarianceOptions * +garrow_variance_options_get_raw(GArrowVarianceOptions *options) +{ + return static_cast( + garrow_function_options_get_raw(GARROW_FUNCTION_OPTIONS(options))); +} diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 239cc50f9e5..0c7424c7765 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -452,6 +452,22 @@ GArrowSetLookupOptions * garrow_set_lookup_options_new(GArrowDatum *value_set); +#define GARROW_TYPE_VARIANCE_OPTIONS (garrow_variance_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowVarianceOptions, + garrow_variance_options, + GARROW, + VARIANCE_OPTIONS, + GArrowFunctionOptions) +struct _GArrowVarianceOptionsClass +{ + GArrowFunctionOptionsClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GArrowVarianceOptions * +garrow_variance_options_new(void); + + GArrowArray *garrow_array_cast(GArrowArray *array, GArrowDataType *target_data_type, GArrowCastOptions *options, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index c616f6c0226..4adea1847ac 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -93,3 +93,7 @@ garrow_sort_options_get_raw(GArrowSortOptions *options); arrow::compute::SetLookupOptions * garrow_set_lookup_options_get_raw(GArrowSetLookupOptions *options); + + +arrow::compute::VarianceOptions * +garrow_variance_options_get_raw(GArrowVarianceOptions *options); diff --git a/c_glib/arrow-glib/input-stream.cpp b/c_glib/arrow-glib/input-stream.cpp index 57a13e65a1f..37e4702ff16 100644 --- a/c_glib/arrow-glib/input-stream.cpp +++ b/c_glib/arrow-glib/input-stream.cpp @@ -681,10 +681,10 @@ garrow_file_input_stream_new(const gchar *path, * Since: 6.0.0 */ GArrowFileInputStream * -garrow_file_input_stream_new_file_descriptor(gint fd, +garrow_file_input_stream_new_file_descriptor(gint file_descriptor, GError **error) { - auto arrow_stream_result = arrow::io::ReadableFile::Open(fd); + auto arrow_stream_result = arrow::io::ReadableFile::Open(file_descriptor); if (garrow::check(error, arrow_stream_result, "[file-input-stream][new-file-descriptor]")) { diff --git a/c_glib/arrow-glib/input-stream.h b/c_glib/arrow-glib/input-stream.h index 5ead66b8389..5f583c80486 100644 --- a/c_glib/arrow-glib/input-stream.h +++ b/c_glib/arrow-glib/input-stream.h @@ -119,7 +119,7 @@ GArrowFileInputStream * garrow_file_input_stream_new(const gchar *path, GError **error); GArrowFileInputStream * -garrow_file_input_stream_new_file_descriptor(gint fd, +garrow_file_input_stream_new_file_descriptor(gint file_descriptor, GError **error); gint garrow_file_input_stream_get_file_descriptor(GArrowFileInputStream *stream); From 2588e1773f4570d34146318f5b01e81f36df35dd Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 6 Sep 2021 05:00:29 +0900 Subject: [PATCH 24/93] ARROW-13909: [GLib] Add tests for GArrowVarianceOptions Closes #11089 from kou/glib-variance-options-test Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/test/test-variance-options.rb | 46 ++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 c_glib/test/test-variance-options.rb diff --git a/c_glib/test/test-variance-options.rb 
b/c_glib/test/test-variance-options.rb
new file mode 100644
index 00000000000..64bdf670bf0
--- /dev/null
+++ b/c_glib/test/test-variance-options.rb
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestVarianceOptions < Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    @options = Arrow::VarianceOptions.new
+  end
+
+  def test_ddof
+    assert_equal(0, @options.ddof)
+    @options.ddof = 1
+    assert_equal(1, @options.ddof)
+  end
+
+  def test_skip_nulls
+    assert do
+      @options.skip_nulls?
+    end
+    @options.skip_nulls = false
+    assert do
+      not @options.skip_nulls?
+    end
+  end
+
+  def test_min_count
+    assert_equal(0, @options.min_count)
+    @options.min_count = 1
+    assert_equal(1, @options.min_count)
+  end
+end
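These GLib properties mirror the fields of arrow::compute::VarianceOptions in C++. For reference, a minimal C++ sketch of the same ddof round-trip through the generic function interface (the sample data, includes and printing are illustrative, not taken from the patches):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    #include <iostream>

    arrow::Status VarianceExample() {
      // Build a small double array to aggregate over.
      arrow::DoubleBuilder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({1.0, 2.0, 4.0}));
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values, builder.Finish());

      // ddof=1 requests the sample variance; skip_nulls and min_count
      // correspond to the GLib properties exercised above.
      arrow::compute::VarianceOptions options;
      options.ddof = 1;

      ARROW_ASSIGN_OR_RAISE(
          arrow::Datum out, arrow::compute::CallFunction("variance", {values}, &options));
      std::cout << out.scalar()->ToString() << std::endl;  // prints 2.3333... here
      return arrow::Status::OK();
    }

The "stddev" function accepts the same options; only the function name changes.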
From c83db7e19623c5a9a17ff9ed2eab16a51e29dc46 Mon Sep 17 00:00:00 2001
From: Junwang Zhao
Date: Mon, 6 Sep 2021 12:43:28 +0200
Subject: [PATCH 25/93] ARROW-13793: [C++] Migrate ORCFileReader to Result

Signed-off-by: Junwang Zhao

Closes #11065 from zhjwpku/cpp/migrate_orcfilereader_to_result

Authored-by: Junwang Zhao
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/adapters/orc/adapter.cc      | 68 +++++++++++++++-
 cpp/src/arrow/adapters/orc/adapter.h       | 94 ++++++++++++++++++++++
 cpp/src/arrow/adapters/orc/adapter_test.cc | 21 +++--
 3 files changed, 169 insertions(+), 14 deletions(-)

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 2f74b40e40d..94a3b6e882a 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -430,10 +430,14 @@ ORCFileReader::~ORCFileReader() {}

 Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
                            MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
+  return Open(file, pool).Value(reader);
+}
+
+Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
+    const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
   auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
   RETURN_NOT_OK(result->impl_->Open(file, pool));
-  *reader = std::move(result);
-  return Status::OK();
+  return std::move(result);
 }

 Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
@@ -444,33 +448,79 @@
 Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
   return impl_->ReadSchema(out);
 }

+Result<std::shared_ptr<Schema>> ORCFileReader::ReadSchema() {
+  std::shared_ptr<Schema> schema;
+  RETURN_NOT_OK(impl_->ReadSchema(&schema));
+  return schema;
+}
+
 Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }

+Result<std::shared_ptr<Table>> ORCFileReader::Read() {
+  std::shared_ptr<Table> table;
+  RETURN_NOT_OK(impl_->Read(&table));
+  return table;
+}
+
 Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
                            std::shared_ptr<Table>* out) {
   return impl_->Read(schema, out);
 }

+Result<std::shared_ptr<Table>> ORCFileReader::Read(
+    const std::shared_ptr<Schema>& schema) {
+  std::shared_ptr<Table> table;
+  RETURN_NOT_OK(impl_->Read(schema, &table));
+  return table;
+}
+
 Status ORCFileReader::Read(const std::vector<int>& include_indices,
                            std::shared_ptr<Table>* out) {
   return impl_->Read(include_indices, out);
 }

+Result<std::shared_ptr<Table>> ORCFileReader::Read(
+    const std::vector<int>& include_indices) {
+  std::shared_ptr<Table> table;
+  RETURN_NOT_OK(impl_->Read(include_indices, &table));
+  return table;
+}
+
 Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
                            const std::vector<int>& include_indices,
                            std::shared_ptr<Table>* out) {
   return impl_->Read(schema, include_indices, out);
 }

+Result<std::shared_ptr<Table>> ORCFileReader::Read(
+    const std::shared_ptr<Schema>& schema, const std::vector<int>& include_indices) {
+  std::shared_ptr<Table> table;
+  RETURN_NOT_OK(impl_->Read(schema, include_indices, &table));
+  return table;
+}
+
 Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
   return impl_->ReadStripe(stripe, out);
 }

+Result<std::shared_ptr<RecordBatch>> ORCFileReader::ReadStripe(int64_t stripe) {
+  std::shared_ptr<RecordBatch> recordBatch;
+  RETURN_NOT_OK(impl_->ReadStripe(stripe, &recordBatch));
+  return recordBatch;
+}
+
 Status ORCFileReader::ReadStripe(int64_t stripe,
                                  const std::vector<int>& include_indices,
                                  std::shared_ptr<RecordBatch>* out) {
   return impl_->ReadStripe(stripe, include_indices, out);
 }

+Result<std::shared_ptr<RecordBatch>> ORCFileReader::ReadStripe(
+    int64_t stripe, const std::vector<int>& include_indices) {
+  std::shared_ptr<RecordBatch> recordBatch;
+  RETURN_NOT_OK(impl_->ReadStripe(stripe, include_indices, &recordBatch));
+  return recordBatch;
+}
+
 Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }

 Status ORCFileReader::NextStripeReader(int64_t batch_sizes,
@@ -478,12 +528,26 @@
                                        std::shared_ptr<RecordBatchReader>* out) {
   return impl_->NextStripeReader(batch_sizes, out);
 }

+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+    int64_t batch_size) {
+  std::shared_ptr<RecordBatchReader> reader;
+  RETURN_NOT_OK(impl_->NextStripeReader(batch_size, &reader));
+  return reader;
+}
+
 Status ORCFileReader::NextStripeReader(int64_t batch_size,
                                        const std::vector<int>& include_indices,
                                        std::shared_ptr<RecordBatchReader>* out) {
   return impl_->NextStripeReader(batch_size, include_indices, out);
 }

+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+    int64_t batch_size, const std::vector<int>& include_indices) {
+  std::shared_ptr<RecordBatchReader> reader;
+  RETURN_NOT_OK(impl_->NextStripeReader(batch_size, include_indices, &reader));
+  return reader;
+}
+
 int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }

 int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 012c1701980..036795188f6 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -27,6 +27,7 @@
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"

 namespace arrow {
@@ -45,9 +46,18 @@ class ARROW_EXPORT ORCFileReader {
   /// \param[in] pool a MemoryPool to use for buffer allocations
   /// \param[out] reader the returned reader object
   /// \return Status
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   static Status Open(const std::shared_ptr<io::RandomAccessFile>& file,
                      MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader);

+  /// \brief Creates a new ORC reader
+  ///
+  /// \param[in] file the data source
+  /// \param[in] pool a MemoryPool to use for buffer allocations
+  /// \return the returned reader object
+  static Result<std::unique_ptr<ORCFileReader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);
+
   /// \brief Return the metadata read from the ORC file
   ///
   /// \return A KeyValueMetadata object containing the ORC metadata
@@ -56,31 +66,63 @@ class ARROW_EXPORT ORCFileReader {
   /// \brief Return the schema read from the ORC file
   ///
   /// \param[out] out the returned Schema object
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status ReadSchema(std::shared_ptr<Schema>* out);

+  /// \brief Return the schema read from the ORC file
+  ///
+  /// \return the returned Schema object
+  Result<std::shared_ptr<Schema>> ReadSchema();
+
   /// \brief Read the file as a Table
   ///
   /// The table will be composed of one record batch per stripe.
   ///
   /// \param[out] out the returned Table
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status Read(std::shared_ptr<Table>* out);

+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read();
+
   /// \brief Read the file as a Table
   ///
   /// The table will be composed of one record batch per stripe.
   ///
   /// \param[in] schema the Table schema
   /// \param[out] out the returned Table
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);

+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);
+
   /// \brief Read the file as a Table
   ///
   /// The table will be composed of one record batch per stripe.
   ///
   /// \param[in] include_indices the selected field indices to read
   /// \param[out] out the returned Table
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);

+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);
+
   /// \brief Read the file as a Table
   ///
   /// The table will be composed of one record batch per stripe.
@@ -88,23 +130,50 @@ class ARROW_EXPORT ORCFileReader {
   /// \param[in] schema the Table schema
   /// \param[in] include_indices the selected field indices to read
   /// \param[out] out the returned Table
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status Read(const std::shared_ptr<Schema>& schema,
               const std::vector<int>& include_indices, std::shared_ptr<Table>* out);

+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
+                                      const std::vector<int>& include_indices);
+
   /// \brief Read a single stripe as a RecordBatch
   ///
   /// \param[in] stripe the stripe index
   /// \param[out] out the returned RecordBatch
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);

+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \return the returned RecordBatch
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);
+
   /// \brief Read a single stripe as a RecordBatch
   ///
   /// \param[in] stripe the stripe index
   /// \param[in] include_indices the selected field indices to read
   /// \param[out] out the returned RecordBatch
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
                     std::shared_ptr<RecordBatch>* out);

+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned RecordBatch
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(
+      int64_t stripe, const std::vector<int>& include_indices);
+
   /// \brief Seek to designated row. Invoke NextStripeReader() after seek
   /// will return stripe reader starting from designated row.
   ///
@@ -119,8 +188,19 @@ class ARROW_EXPORT ORCFileReader {
   /// \param[in] batch_size the number of rows each record batch contains in
   /// record batch iteration.
   /// \param[out] out the returned stripe reader
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);

+  /// \brief Get a stripe level record batch iterator with specified row count
+  /// in each record batch. NextStripeReader serves as a fine grain
+  /// alternative to ReadStripe which may cause OOM issue by loading
+  /// the whole stripes into memory.
+  ///
+  /// \param[in] batch_size the number of rows each record batch contains in
+  /// record batch iteration.
+  /// \return the returned stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);
+
   /// \brief Get a stripe level record batch iterator with specified row count
   /// in each record batch. NextStripeReader serves as a fine grain
   /// alternative to ReadStripe which may cause OOM issue by loading
   /// the whole stripes into memory.
@@ -131,9 +211,23 @@ class ARROW_EXPORT ORCFileReader {
   ///
   /// \param[in] include_indices the selected field indices to read
   /// \param[out] out the returned stripe reader
+  ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.")
   Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
                           std::shared_ptr<RecordBatchReader>* out);

+  /// \brief Get a stripe level record batch iterator with specified row count
+  /// in each record batch. NextStripeReader serves as a fine grain
+  /// alternative to ReadStripe which may cause OOM issue by loading
+  /// the whole stripes into memory.
+  ///
+  /// \param[in] batch_size Get a stripe level record batch iterator with specified row
+  /// count in each record batch.
+  ///
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+      int64_t batch_size, const std::vector<int>& include_indices);
+
   /// \brief The number of stripes in the file
   int64_t NumberOfStripes();

diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 9f7fb561362..39c66b90f6d 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -237,13 +237,12 @@ void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
   ARROW_EXPECT_OK(writer->Close());
   EXPECT_OK_AND_ASSIGN(auto buffer, buffer_output_stream->Finish());
   std::shared_ptr<io::RandomAccessFile> in_stream(new io::BufferReader(buffer));
-  std::unique_ptr<adapters::orc::ORCFileReader> reader;
-  ARROW_EXPECT_OK(
-      adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader));
-  std::shared_ptr<Table> actual_output_table;
-  ARROW_EXPECT_OK(reader->Read(&actual_output_table));
+  EXPECT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
   AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);
 }
+
 void AssertArrayWriteReadEqual(const std::shared_ptr<Array>& input_array,
                                const std::shared_ptr<Array>& expected_output_array,
                                const int64_t max_size = kDefaultSmallMemStreamSize) {
@@ -323,9 +322,8 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
       std::make_shared<io::BufferReader>(reinterpret_cast<const uint8_t*>(mem_stream.getData()),
                                          static_cast<int64_t>(mem_stream.getLength()))));
-  std::unique_ptr<adapters::orc::ORCFileReader> reader;
-  ASSERT_TRUE(
-      adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader).ok());
+  ASSERT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
   EXPECT_OK_AND_ASSIGN(auto metadata, reader->ReadMetadata());
   auto expected_metadata = std::const_pointer_cast(
@@ -334,8 +332,7 @@
   ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
   ASSERT_EQ(stripe_count, reader->NumberOfStripes());
   accumulated = 0;
-  std::shared_ptr<RecordBatchReader> stripe_reader;
-  EXPECT_TRUE(reader->NextStripeReader(reader_batch_size, &stripe_reader).ok());
+  EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(reader_batch_size));
   while (stripe_reader) {
     std::shared_ptr<RecordBatch> record_batch;
     EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
@@ -350,14 +347,14 @@
     }
     EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
   }
-  EXPECT_TRUE(reader->NextStripeReader(reader_batch_size, &stripe_reader).ok());
+  EXPECT_OK_AND_ASSIGN(stripe_reader, reader->NextStripeReader(reader_batch_size));
 }

 // test seek operation
 int64_t start_offset = 830;
 EXPECT_TRUE(reader->Seek(stripe_row_count + start_offset).ok());
-EXPECT_TRUE(reader->NextStripeReader(reader_batch_size, &stripe_reader).ok());
+EXPECT_OK_AND_ASSIGN(stripe_reader, reader->NextStripeReader(reader_batch_size));
 std::shared_ptr<RecordBatch> record_batch;
 EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
 while (record_batch) {
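A usage note on the migration: call sites can now compose reads with ARROW_ASSIGN_OR_RAISE instead of threading output parameters. A rough sketch against the new overloads (the file path and the final validation step are illustrative):

    #include <arrow/adapters/orc/adapter.h>
    #include <arrow/api.h>
    #include <arrow/io/api.h>

    arrow::Status ReadOrcFile(const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
      // Open() now returns Result<std::unique_ptr<ORCFileReader>>.
      ARROW_ASSIGN_OR_RAISE(auto reader, arrow::adapters::orc::ORCFileReader::Open(
                                             file, arrow::default_memory_pool()));
      // Read() returns Result<std::shared_ptr<Table>>.
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, reader->Read());
      return table->ValidateFull();
    }

The Status-returning overloads remain available but carry ARROW_DEPRECATED, so existing callers keep compiling with a warning.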
From 5c5af6c1cc1f95c6cf7c2b3087b6bb84c1d80b12 Mon Sep 17 00:00:00 2001
From: Weston Pace
Date: Mon, 6 Sep 2021 12:50:38 +0200
Subject: [PATCH 26/93] ARROW-13871: [C++] JSON reader can fail if a list array
 key is present in one chunk but not in a later chunk

Closes #11072 from westonpace/bugfix/ARROW-13871-json-reader-list-array-null-chunk

Authored-by: Weston Pace
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/json/chunked_builder.cc      |  9 +++++----
 cpp/src/arrow/json/chunked_builder_test.cc | 10 +++-------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/json/chunked_builder.cc b/cpp/src/arrow/json/chunked_builder.cc
index 040009c764f..e95041ea06d 100644
--- a/cpp/src/arrow/json/chunked_builder.cc
+++ b/cpp/src/arrow/json/chunked_builder.cc
@@ -201,6 +201,11 @@ class
ChunkedListArrayBuilder : public ChunkedArrayBuilder { DCHECK_EQ(unconverted->type_id(), Type::LIST); const auto& list_array = checked_cast(*unconverted); - if (null_bitmap_chunks_.size() <= static_cast(block_index)) { - null_bitmap_chunks_.resize(static_cast(block_index) + 1, nullptr); - offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr); - } null_bitmap_chunks_[block_index] = unconverted->null_bitmap(); offset_chunks_[block_index] = list_array.value_offsets(); diff --git a/cpp/src/arrow/json/chunked_builder_test.cc b/cpp/src/arrow/json/chunked_builder_test.cc index a3810316f76..d04f0d5c9bd 100644 --- a/cpp/src/arrow/json/chunked_builder_test.cc +++ b/cpp/src/arrow/json/chunked_builder_test.cc @@ -438,15 +438,11 @@ TEST(InferringChunkedArrayBuilder, MultipleChunkList) { struct_({}), &builder)); std::shared_ptr actual; - AssertBuilding(builder, - { - "{}\n", - "{\"a\": []}\n", - "{\"a\": [1, 2]}\n", - }, + AssertBuilding(builder, {"{}\n", "{\"a\": []}\n", "{\"a\": [1, 2]}\n", "{}\n"}, &actual); - auto expected = ChunkedArrayFromJSON(list(int64()), {"[null]", "[[]]", "[[1, 2]]"}); + auto expected = + ChunkedArrayFromJSON(list(int64()), {"[null]", "[[]]", "[[1, 2]]", "[null]"}); AssertFieldEqual({"a"}, actual, *expected); } From a8953dee9c18ee65cf96de290dedf89d09af9303 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 6 Sep 2021 13:14:59 +0200 Subject: [PATCH 27/93] ARROW-13845: [C++] Reconcile RandomArrayGenerator::ArrayOf implementations Also use a more reasonable default "max_length" value for binary-like and list-like types. WARNING: this may change some benchmark numbers due to the "max_length" change. Closes #11062 from pitrou/ARROW-13845-rng-arrayof-refactor Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/testing/random.cc | 126 +++--------------- .../parquet/arrow/arrow_reader_writer_test.cc | 2 + 2 files changed, 20 insertions(+), 108 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index cd3385e5aee..ce6ec1a6e67 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -389,6 +389,8 @@ std::shared_ptr RandomArrayGenerator::StringWithRepeats(int64_t size, int32_t min_length, int32_t max_length, double null_probability) { + ARROW_CHECK_LE(unique, size); + // Generate a random string dictionary without any nulls auto array = String(unique, min_length, max_length, /*null_probability=*/0); auto dictionary = std::dynamic_pointer_cast(array); @@ -619,110 +621,8 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field namespace { -struct RandomArrayGeneratorOfImpl { - Status Visit(const NullType&) { - out_ = std::make_shared(size_); - return Status::OK(); - } - - Status Visit(const BooleanType&) { - double probability = 0.25; - out_ = rag_->Boolean(size_, probability, null_probability_); - return Status::OK(); - } - - template - enable_if_integer Visit(const T&) { - auto max = std::numeric_limits::max(); - auto min = std::numeric_limits::lowest(); - - out_ = rag_->Numeric(size_, min, max, null_probability_); - return Status::OK(); - } - - template - enable_if_floating_point Visit(const T&) { - out_ = rag_->Numeric(size_, 0., 1., null_probability_); - return Status::OK(); - } - - template - enable_if_t::value && - !std::is_same::value && - !std::is_same::value, - Status> - Visit(const T&) { - auto max = std::numeric_limits::max(); - auto min = std::numeric_limits::lowest(); - auto values = - rag_->Numeric(size_, min, max, null_probability_); - return 
values->View(type_).Value(&out_); - } - - template - enable_if_base_binary Visit(const T& t) { - int32_t min_length = 0; - auto max_length = static_cast(std::sqrt(size_)); - - if (t.layout().buffers[1].byte_width == sizeof(int32_t)) { - out_ = rag_->String(size_, min_length, max_length, null_probability_); - } else { - out_ = rag_->LargeString(size_, min_length, max_length, null_probability_); - } - return out_->View(type_).Value(&out_); - } - - template - enable_if_t::value, Status> Visit(const T& t) { - const int32_t value_size = t.byte_width(); - int64_t data_nbytes = size_ * value_size; - ARROW_ASSIGN_OR_RAISE(std::shared_ptr data, AllocateBuffer(data_nbytes)); - random_bytes(data_nbytes, /*seed=*/0, data->mutable_data()); - auto validity = rag_->Boolean(size_, 1 - null_probability_); - - // Assemble the data for a FixedSizeBinaryArray - auto values_data = std::make_shared(type_, size_); - values_data->buffers = {validity->data()->buffers[1], data}; - out_ = MakeArray(values_data); - return Status::OK(); - } - - Status Visit(const Decimal256Type&) { - out_ = rag_->Decimal256(type_, size_, null_probability_); - return Status::OK(); - } - - Status Visit(const Decimal128Type&) { - out_ = rag_->Decimal128(type_, size_, null_probability_); - return Status::OK(); - } - - Status Visit(const DataType& t) { - return Status::NotImplemented("generation of random arrays of type ", t); - } - - std::shared_ptr Finish() && { - DCHECK_OK(VisitTypeInline(*type_, this)); - DCHECK(type_->Equals(out_->type())); - return std::move(out_); - } - - RandomArrayGenerator* rag_; - const std::shared_ptr& type_; - int64_t size_; - double null_probability_; - std::shared_ptr out_; -}; - -} // namespace - -std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, - int64_t size, - double null_probability) { - return RandomArrayGeneratorOfImpl{this, type, size, null_probability, nullptr}.Finish(); -} - -namespace { +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. 
template ::ArrowType> enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, const std::string& key, @@ -737,14 +637,24 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad } return output; } + } // namespace +std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, + int64_t size, + double null_probability) { + auto metadata = + key_value_metadata({"null_probability"}, {std::to_string(null_probability)}); + auto field = ::arrow::field("", std::move(type), std::move(metadata)); + return ArrayOf(*field, size); +} + std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t length) { #define VALIDATE_RANGE(PARAM, MIN, MAX) \ if (PARAM < MIN || PARAM > MAX) { \ ABORT_NOT_OK(Status::Invalid(field.ToString(), ": ", ARROW_STRINGIFY(PARAM), \ " must be in [", MIN, ", ", MAX, " ] but got ", \ - null_probability)); \ + PARAM)); \ } #define VALIDATE_MIN_MAX(MIN, MAX) \ if (MIN > MAX) { \ @@ -783,7 +693,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t const auto min_length = GetMetadata( \ field.metadata().get(), "min_length", 0); \ const auto max_length = GetMetadata( \ - field.metadata().get(), "max_length", 1024); \ + field.metadata().get(), "max_length", 20); \ const auto lengths = internal::checked_pointer_cast< \ CTypeTraits::ArrayType>( \ Numeric::ArrowType>( \ @@ -835,7 +745,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t const auto min_length = GetMetadata(field.metadata().get(), "min_length", 0); const auto max_length = - GetMetadata(field.metadata().get(), "max_length", 1024); + GetMetadata(field.metadata().get(), "max_length", 20); const auto unique_values = GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { @@ -956,7 +866,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t const auto min_length = GetMetadata(field.metadata().get(), "min_length", 0); const auto max_length = - GetMetadata(field.metadata().get(), "max_length", 1024); + GetMetadata(field.metadata().get(), "max_length", 20); const auto unique_values = GetMetadata(field.metadata().get(), "unique", -1); if (unique_values > 0) { diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index a7b5155c40f..2bb7abc2023 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4004,6 +4004,7 @@ TEST_P(TestArrowReadDictionary, ZeroChunksListOfDictionary) { auto values = std::make_shared(::arrow::ArrayVector{}, ::arrow::list(::arrow::utf8())); options.num_rows = 0; + options.num_uniques = 0; options.num_row_groups = 1; expected_dense_ = MakeSimpleTable(values, false); @@ -4064,6 +4065,7 @@ TEST_P(TestArrowReadDictionary, StreamReadWholeFileDict) { // Recompute generated data with only one row-group options.num_row_groups = 1; options.num_rows = 16; + options.num_uniques = 7; SetUp(); WriteSimple(); From 4390a64fdea75f0a7d334ed3f860330f8c80dc6a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 6 Sep 2021 08:13:27 -0400 Subject: [PATCH 28/93] ARROW-13857: [R][CI] Remove checkbashisms download Closes #11088 from nealrichardson/remove-checkbashisms-dl Authored-by: Neal Richardson Signed-off-by: Neal Richardson --- ci/scripts/r_docker_configure.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 2b9bc03bea0..d138d030eca 100755 --- 
a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -37,7 +37,7 @@ if [ "$RHUB_PLATFORM" = "linux-x86_64-fedora-clang" ]; then dnf install -y libcxx-devel sed -i.bak -E -e 's/(CXX1?1? =.*)/\1 -stdlib=libc++/g' $(${R_BIN} RHOME)/etc/Makeconf rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak - + sed -i.bak -E -e 's/(CXXFLAGS = )(.*)/\1 -g -O3 -Wall -pedantic -frtti -fPIC/' $(${R_BIN} RHOME)/etc/Makeconf rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak fi @@ -75,9 +75,3 @@ fi # Workaround for html help install failure; see https://github.com/r-lib/devtools/issues/2084#issuecomment-530912786 Rscript -e 'x <- file.path(R.home("doc"), "html"); if (!file.exists(x)) {dir.create(x, recursive=TRUE); file.copy(system.file("html/R.css", package="stats"), x)}' - -if [ "`which curl`" ]; then - # We need this on R >= 4.0 - curl -L https://sourceforge.net/projects/checkbaskisms/files/2.0.0.2/checkbashisms/download > /usr/local/bin/checkbashisms - chmod 755 /usr/local/bin/checkbashisms -fi From cf0e5e43cc6aed4ff5009acdb4128388e151086a Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 6 Sep 2021 14:22:27 +0200 Subject: [PATCH 29/93] ARROW-13803: [C++] Don't read past end of buffer in BitUtil::SetBitmap I can only get this to actually fail on M1 ARM with optimizations, but what happens here is we read one past the end of a buffer. On M1, this actually ends up in an unmapped region, crashing the program. On Linux/x64, I tried AddressSanitizer and Valgrind but neither caught the access, oddly enough. Closes #11070 from lidavidm/arrow-13803 Lead-authored-by: Yibo Cai Co-authored-by: David Li Signed-off-by: Antoine Pitrou --- cpp/src/arrow/util/bit_util.cc | 6 ++++-- cpp/src/arrow/util/bit_util_test.cc | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index ee4bcde7713..aa78da76531 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -111,8 +111,10 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) { // clean up DCHECK_LT(length, 8); - data[offset / 8] = - BitUtil::SpliceWord(static_cast(length), set_byte, data[offset / 8]); + if (length > 0) { + data[offset / 8] = + BitUtil::SpliceWord(static_cast(length), set_byte, data[offset / 8]); + } } void SetBitmap(uint8_t* data, int64_t offset, int64_t length) { diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 61d064cc65f..c3fb0832198 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1569,6 +1569,12 @@ TEST(BitUtilTests, TestSetBitmap) { uint8_t false_byte = static_cast(0); ASSERT_BYTES_EQ(bitmap, {false_byte, false_byte, false_byte, fill_byte}); } + { + // ASAN test against out of bound access (ARROW-13803) + uint8_t bitmap[1] = {fill_byte}; + BitUtil::ClearBitmap(bitmap, 0, 8); + ASSERT_EQ(bitmap[0], 0); + } } } From b1cfa7db007e76054e20a011110e8b0f9322b79d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 6 Sep 2021 08:42:52 -0400 Subject: [PATCH 30/93] ARROW-13912: [R] TrimOptions implementation breaks test-r-minimal-build due to dependencies Also skips a puzzling old R version test failure from ARROW-13740 cc @thisisnic Closes #11087 from nealrichardson/fix-nightlies Authored-by: Neal Richardson Signed-off-by: Neal Richardson --- r/tests/testthat/test-compute-no-bindings.R | 2 ++ r/tests/testthat/test-dplyr-collapse.R | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git 
a/r/tests/testthat/test-compute-no-bindings.R b/r/tests/testthat/test-compute-no-bindings.R index 33b97e8ed76..afcc779ea5d 100644 --- a/r/tests/testthat/test-compute-no-bindings.R +++ b/r/tests/testthat/test-compute-no-bindings.R @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. + test_that("non-bound compute kernels using TrimOptions", { +skip_if_not_available("utf8proc") expect_equal( call_function("utf8_trim", Scalar$create("abracadabra"), options = list(characters = "ab")), Scalar$create("racadabr") diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index 331f7b7b62c..b36f70db232 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -169,12 +169,22 @@ See $.data for the source Arrow object", fixed = TRUE ) + skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal") + # On older R versions: + # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ──────── + # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40). + # Component "total": Mean relative difference: 0.3846154 + # Component "extra": Mean relative difference: 0.3846154 + # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ──────── + # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125). + # Component "total": Mean relative difference: 0.9230769 + # Component "extra": Mean relative difference: 0.9230769 expect_equal( - head(q, 1) %>% collect(), + q %>% head(1) %>% collect(), tibble::tibble(lgl = FALSE, total = 8L, extra = 40) ) expect_equal( - tail(q, 1) %>% collect(), + q %>% tail(1) %>% collect(), tibble::tibble(lgl = NA, total = 25L, extra = 125) ) }) From 303b7f4f55a1f1407a1dc16915b586f14b82ff2a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 6 Sep 2021 10:11:31 -0400 Subject: [PATCH 31/93] ARROW-13915: [R][CI] R UCRT C++ bundles are incomplete FYI @jeroen Closes #11096 from nealrichardson/add-ucrt-libs Authored-by: Neal Richardson Signed-off-by: Neal Richardson --- ci/scripts/r_windows_build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index 47120eef433..8a96b3f5e79 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -92,10 +92,10 @@ cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/l # Do the same also for ucrt64 if [ "$RTOOLS_VERSION" != "35" ]; then -ls $MSYS_LIB_DIR/ucrt64/lib/ -mkdir -p $DST_DIR/lib/x64-ucrt -mv ucrt64/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/x64-ucrt -cp $MSYS_LIB_DIR/ucrt64/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt + ls $MSYS_LIB_DIR/ucrt64/lib/ + mkdir -p $DST_DIR/lib/x64-ucrt + mv ucrt64/lib/*.a $DST_DIR/lib/x64-ucrt + cp $MSYS_LIB_DIR/ucrt64/lib/lib{thrift,snappy,zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt fi # Create build artifact From fd47183f09ae873bf52214a747fdcad5a32515b6 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 6 Sep 2021 17:02:36 +0200 Subject: [PATCH 32/93] ARROW-13913: [C++] Don't segfault if IndexOptions omitted Unlike the other aggregates, there's no default options to fall back on here. 
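With the check in place, omitting the options now yields Status::Invalid
("Must provide IndexOptions for index kernel") instead of a segfault. For
reference, a well-formed call looks roughly like the following sketch (the
target value is illustrative; IndexOptions is assumed to wrap the scalar
being searched for):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> FindFirstIndex(
        const std::shared_ptr<arrow::Array>& values) {
      // The options carry the scalar to locate; the result is the index of
      // its first occurrence, or -1 if it is absent.
      arrow::compute::IndexOptions options(arrow::MakeScalar(int64_t{42}));
      return arrow::compute::CallFunction("index", {values}, &options);
    }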
Closes #11097 from lidavidm/arrow-13913 Authored-by: David Li Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/aggregate_basic.cc | 3 +++ cpp/src/arrow/compute/kernels/aggregate_test.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 2952eade96b..b19536d33ab 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -549,6 +549,9 @@ struct IndexInit { static Result> Init(KernelContext* ctx, const KernelInitArgs& args) { + if (!args.options) { + return Status::Invalid("Must provide IndexOptions for index kernel"); + } IndexInit visitor(ctx, static_cast(*args.options), *args.inputs[0].type); return visitor.Create(); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 587e2033184..fcf48e25a92 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1853,6 +1853,10 @@ TYPED_TEST(TestNumericIndexKernel, Basics) { this->AssertIndexIs(chunked_input2, value, 4); this->AssertIndexIs(chunked_input3, value, -1); this->AssertIndexIs(chunked_input4, value, 5); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Must provide IndexOptions"), + CallFunction("index", {ArrayFromJSON(this->type_singleton(), "[0]")})); } TYPED_TEST(TestNumericIndexKernel, Random) { constexpr auto kChunks = 4; From 02343c89e1f18eea82721c41c4067b969a1c05d4 Mon Sep 17 00:00:00 2001 From: Rok Date: Mon, 6 Sep 2021 18:05:55 +0200 Subject: [PATCH 33/93] ARROW-13684: [C++][Compute] Strftime kernel follow-up This is to resolve [ARROW-13684](https://issues.apache.org/jira/browse/ARROW-13684). 1. Default strftime string is now `%Y-%m-%dT%H:%M:%S`. Perhaps `%Y-%m-%dT%H:%M:%S%z` would be better? 2. Timestamps without timezone are now strftime-ed as if they were in `UTC`. Not sure this is the way to go. What if the local time is really invalid but we can't tell? 3. Document `%S` behavior. What would be a good location to do that? Closes #10998 from rok/ARROW-13684 Lead-authored-by: Rok Co-authored-by: Rok Mihevc Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/api_scalar.h | 2 +- .../arrow/compute/kernels/scalar_temporal.cc | 94 ++++++++++++++----- .../compute/kernels/scalar_temporal_test.cc | 16 +++- docs/source/cpp/compute.rst | 28 ++++-- python/pyarrow/_compute.pyx | 2 +- python/pyarrow/tests/test_compute.py | 37 ++++++-- 6 files changed, 129 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 2cbc0fde2b2..e959884b233 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -185,7 +185,7 @@ class ARROW_EXPORT StrftimeOptions : public FunctionOptions { constexpr static char const kTypeName[] = "StrftimeOptions"; - constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%SZ"; + constexpr static const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S"; /// The desired format string. 
std::string format; diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index 44c7f75a038..d70411f8338 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -476,11 +476,17 @@ struct Strftime { static Result Make(KernelContext* ctx, const DataType& type) { const StrftimeOptions& options = StrftimeState::Get(ctx); - const auto& timezone = GetInputTimezone(type); + auto timezone = GetInputTimezone(type); if (timezone.empty()) { - return Status::Invalid( - "Timestamps without a time zone cannot be reliably formatted."); + if ((options.format.find("%z") != std::string::npos) || + (options.format.find("%Z") != std::string::npos)) { + return Status::Invalid( + "Timezone not present, cannot convert to string with timezone: ", + options.format); + } + timezone = "UTC"; } + ARROW_ASSIGN_OR_RAISE(const time_zone* tz, LocateZone(timezone)); ARROW_ASSIGN_OR_RAISE(std::locale locale, GetLocale(options.locale)); @@ -737,107 +743,145 @@ std::shared_ptr MakeSimpleUnaryTemporal( const FunctionDoc year_doc{ "Extract year from timestamp", - "Returns an error if timestamp has a defined timezone. Null values return null.", + ("Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc month_doc{ "Extract month number", ("Month is encoded as January=1, December=12.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc day_doc{ "Extract day number", - "Returns an error if timestamp has a defined timezone. Null values return null.", + ("Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc day_of_week_doc{ "Extract day of the week number", - ("By default, the week starts on Monday represented by 0 and ends on Sunday " + ("By default, the week starts on Monday represented by 0 and ends on Sunday\n" "represented by 6.\n" - "DayOfWeekOptions.week_start can be used to set another starting day using ISO " - "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using " - "DayOfWeekOptions.one_based_numbering parameter.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "`DayOfWeekOptions.week_start` can be used to set another starting day using\n" + "the ISO numbering convention (1=start week on Monday, 7=start week on Sunday).\n" + "Day numbers can start at 0 or 1 based on `DayOfWeekOptions.one_based_numbering`.\n" + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}, "DayOfWeekOptions"}; const FunctionDoc day_of_year_doc{ "Extract number of day of year", ("January 1st maps to day number 1, February 1st to 32, etc.\n" - "Returns an error if timestamp has a defined timezone. 
Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc iso_year_doc{ "Extract ISO year number", ("First week of an ISO year has the majority (4 or more) of its days in January." - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc iso_week_doc{ "Extract ISO week of year number", ("First ISO week has the majority (4 or more) of its days in January.\n" "Week of the year starts with 1 and can run up to 53.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc iso_calendar_doc{ "Extract (ISO year, ISO week, ISO day of week) struct", ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc quarter_doc{ "Extract quarter of year number", ("First quarter maps to 1 and forth quarter maps to 4.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc hour_doc{ "Extract hour value", - "Returns an error if timestamp has a defined timezone. Null values return null.", + ("Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc minute_doc{ "Extract minute values", - "Returns an error if timestamp has a defined timezone. Null values return null.", + ("Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc second_doc{ "Extract second values", - "Returns an error if timestamp has a defined timezone. Null values return null.", + ("Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc millisecond_doc{ "Extract millisecond values", ("Millisecond returns number of milliseconds since the last full second.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc microsecond_doc{ "Extract microsecond values", ("Millisecond returns number of microseconds since the last full millisecond.\n" - "Returns an error if timestamp has a defined timezone. 
Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc nanosecond_doc{ "Extract nanosecond values", ("Nanosecond returns number of nanoseconds since the last full microsecond.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc subsecond_doc{ "Extract subsecond values", ("Subsecond returns the fraction of a second since the last full second.\n" - "Returns an error if timestamp has a defined timezone. Null values return null."), + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database."), {"values"}}; const FunctionDoc strftime_doc{ "Format timestamps according to a format string", ("For each input timestamp, emit a formatted string.\n" "The time format string and locale can be set using StrftimeOptions.\n" - "An error is returned if the timestamps don't have a defined timezone,\n" - "or if the timezone cannot be found in the timezone database."), + "The output precision of the \"%S\" (seconds) format code depends on\n" + "the input timestamp precision: it is an integer for timestamps with\n" + "second precision, a real number with the required number of fractional\n" + "digits for higher precisions.\n" + "Null values emit null.\n" + "An error is returned if the timestamps have a defined timezone but it\n" + "cannot be found in the timezone database, or if the specified locale\n" + "does not exist on this system."), {"timestamps"}, "StrftimeOptions"}; diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index d8199089328..32e46ae5818 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -394,7 +394,7 @@ TEST_F(ScalarTemporalTest, Strftime) { const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])"; const char* default_seconds = R"( - ["1970-01-01T00:00:59Z", "2021-08-18T15:11:50Z", null])"; + ["1970-01-01T00:00:59", "2021-08-18T15:11:50", null])"; const char* string_seconds = R"( ["1970-01-01T00:00:59+0000", "2021-08-18T15:11:50+0000", null])"; const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])"; @@ -414,12 +414,20 @@ TEST_F(ScalarTemporalTest, Strftime) { } TEST_F(ScalarTemporalTest, StrftimeNoTimezone) { + auto options_default = StrftimeOptions(); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND), seconds); + + CheckScalarUnary("strftime", timestamp(TimeUnit::SECOND), seconds, utf8(), seconds, + &options_default); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, - testing::HasSubstr("Timestamps without a time zone cannot be reliably formatted"), - Strftime(arr, StrftimeOptions())); + testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"), + Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%z"))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + testing::HasSubstr("Invalid: Timezone not present, cannot convert to string"), + Strftime(arr, StrftimeOptions("%Y-%m-%dT%H:%M:%S%Z"))); } TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) { @@ -440,7 +448,7 @@ 
TEST_F(ScalarTemporalTest, StrftimeCLocale) { const char* microseconds = R"(["1970-01-01T00:00:59.123456", null])"; const char* nanoseconds = R"(["1970-01-01T00:00:59.123456789", null])"; - const char* default_seconds = R"(["1970-01-01T00:00:59Z", null])"; + const char* default_seconds = R"(["1970-01-01T00:00:59", null])"; const char* string_seconds = R"(["1970-01-01T00:00:59+0000", null])"; const char* string_milliseconds = R"(["1970-01-01T00:00:59.123+0000", null])"; const char* string_microseconds = R"(["1970-01-01T05:30:59.123456+0530", null])"; diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 7263d77acf2..f4ef440f100 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1105,19 +1105,29 @@ number of input and output types. The type to cast to can be passed in a :struct:`CastOptions` instance. As an alternative, the same service is provided by a concrete function :func:`~arrow::compute::Cast`. -+--------------------------+------------+--------------------+------------------+------------------------------+ -| Function name | Arity | Input types | Output type | Options class | -+==========================+============+====================+==================+==============================+ -| cast | Unary | Many | Variable | :struct:`CastOptions` | -+--------------------------+------------+--------------------+------------------+------------------------------+ -| strftime | Unary | Timestamp | String | :struct:`StrftimeOptions` | -+--------------------------+------------+--------------------+------------------+------------------------------+ -| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | -+--------------------------+------------+--------------------+------------------+------------------------------+ ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=================+============+====================+==================+==============================+=======+ +| cast | Unary | Many | Variable | :struct:`CastOptions` | | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| strftime | Unary | Timestamp | String | :struct:`StrftimeOptions` | \(1) | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ +| strptime | Unary | String-like | Timestamp | :struct:`StrptimeOptions` | | ++-----------------+------------+--------------------+------------------+------------------------------+-------+ The conversions available with ``cast`` are listed below. In all cases, a null input value is converted into a null output value. +* \(1) Output precision of ``%S`` (seconds) flag depends on the input timestamp + precision. Timestamps with second precision are represented as integers while + milliseconds, microsecond and nanoseconds are represented as fixed floating + point numbers with 3, 6 and 9 decimal places respectively. To obtain integer + seconds, cast to timestamp with second resolution. + The character for the decimal point is localized according to the locale. + See `detailed formatting documentation`_ for descriptions of other flags. + +.. 
_detailed formatting documentation: https://howardhinnant.github.io/date/date.html#to_stream_formatting + **Truth value extraction** +-----------------------------+------------------------------------+--------------+ diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 29c579f85a9..aaf4c9f2916 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -989,7 +989,7 @@ cdef class _StrftimeOptions(FunctionOptions): class StrftimeOptions(_StrftimeOptions): - def __init__(self, format="%Y-%m-%dT%H:%M:%SZ", locale="C"): + def __init__(self, format="%Y-%m-%dT%H:%M:%S", locale="C"): self._set_options(format, locale) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index bbef46f2477..334a2a7bda3 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1496,10 +1496,18 @@ def _fix_timestamp(s): expected = pa.array(_fix_timestamp(ts.strftime(fmt))) assert result.equals(expected) + fmt = "%Y-%m-%dT%H:%M:%S" + # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) result = pc.strftime(tsa, options=pc.StrftimeOptions()) - expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ"))) + expected = pa.array(_fix_timestamp(ts.strftime(fmt))) + assert result.equals(expected) + + # Default format plus timezone + tsa = pa.array(ts, type=pa.timestamp("s", timezone)) + result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + expected = pa.array(_fix_timestamp(ts.strftime(fmt + "%Z"))) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" @@ -1518,18 +1526,27 @@ def _fix_timestamp(s): # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) - options = pc.StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C") + options = pc.StrftimeOptions(fmt, "C") result = pc.strftime(tsa, options=options) - expected = pa.array(_fix_timestamp(ts.strftime("%Y-%m-%dT%H:%M:%SZ"))) + expected = pa.array(_fix_timestamp(ts.strftime(fmt))) assert result.equals(expected) - for unit in ["s", "ms", "us", "ns"]: - tsa = pa.array(ts, type=pa.timestamp(unit)) - for fmt in formats: - with pytest.raises(pa.ArrowInvalid, - match="Timestamps without a time zone " - "cannot be reliably formatted"): - pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) + # Test timestamps without timezone + fmt = "%Y-%m-%dT%H:%M:%S" + ts = pd.to_datetime(times) + tsa = pa.array(ts, type=pa.timestamp("s")) + result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) + expected = pa.array(_fix_timestamp(ts.strftime(fmt))) + + assert result.equals(expected) + with pytest.raises( + pa.ArrowInvalid, + match="Timezone not present, cannot convert to string"): + pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + with pytest.raises( + pa.ArrowInvalid, + match="Timezone not present, cannot convert to string"): + pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) def _check_datetime_components(timestamps, timezone=None): From 5876e3f06ef1c275e02cbc66be03c875faee6302 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 6 Sep 2021 15:41:19 -0500 Subject: [PATCH 34/93] ARROW-13403: [R] Update developing.Rmd vignette Closes #10930 from thisisnic/ARROW-13403_developing_vignette Lead-authored-by: Nic Crane Co-authored-by: Nic Co-authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- r/vignettes/developing.Rmd | 554 ++++++++++++++++++++----------------- r/vignettes/install.Rmd | 37 +++ 2 files changed, 340 insertions(+), 251 deletions(-) diff --git 
a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd index 3d7f82e3619..59c231724aa 100644 --- a/r/vignettes/developing.Rmd +++ b/r/vignettes/developing.Rmd @@ -9,13 +9,11 @@ vignette: > ```{r setup-options, include=FALSE} knitr::opts_chunk$set(error = TRUE, eval = FALSE) - # Get environment variables describing what to evaluate run <- tolower(Sys.getenv("RUN_DEVDOCS", "false")) == "true" macos <- tolower(Sys.getenv("DEVDOCS_MACOS", "false")) == "true" ubuntu <- tolower(Sys.getenv("DEVDOCS_UBUNTU", "false")) == "true" sys_install <- tolower(Sys.getenv("DEVDOCS_SYSTEM_INSTALL", "false")) == "true" - # Update the source knit_hook to save the chunk (if it is marked to be saved) knit_hooks_source <- knitr::knit_hooks$get("source") knitr::knit_hooks$set(source = function(x, options) { @@ -40,90 +38,111 @@ set -e set -x ``` -If you're looking to contribute to `arrow`, this document can help you set up a development environment that will enable you to write code and run tests locally. It outlines how to build the various components that make up the Arrow project and R package, as well as some common troubleshooting and workflows developers use. Many contributions can be accomplished with the instructions in [R-only development](#r-only-development). But if you're working on both the C++ library and the R package, the [Developer environment setup](#-developer-environment-setup) section will guide you through setting up a developer environment. +If you're looking to contribute to arrow, this vignette can help you set up a development environment that will enable you to write code and run tests locally. It outlines: + +* how to build the components that make up the Arrow project and R package +* workflows that developers use +* some common troubleshooting steps and solutions + +This document is intended only for **developers** of Apache Arrow or the Arrow R package. R package users do not need to do any of this setup. If you're looking for how to install Arrow, see [the instructions in the readme](https://arrow.apache.org/docs/r/#installation). + +This document is a work in progress and will grow and change as the Apache Arrow project grows and changes. We have tried to make these steps as robust as possible (in fact, we even test exactly these instructions on our nightly CI to ensure they don't become stale!), but custom configurations might conflict with these instructions and there are differences of opinion across developers about how to set up development environments like this. -This document is intended only for developers of Apache Arrow or the Arrow R package. Users of the package in R do not need to do any of this setup. If you're looking for how to install Arrow, see [the instructions in the readme](https://arrow.apache.org/docs/r/#installation); Linux users can find more details on building from source at `vignette("install", package = "arrow")`. +We welcome any feedback you have about things that are confusing or additions you would like to see here - please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) if you have any suggestions or requests. -This document is a work in progress and will grow + change as the Apache Arrow project grows and changes. 
We have tried to make these steps as robust as possible (in fact, we even test exactly these instructions on our nightly CI to ensure they don't become stale!), but certain custom configurations might conflict with these instructions and there are differences of opinion across developers about if and what the one true way to set up development environments like this is. We also solicit any feedback you have about things that are confusing or additions you would like to see here. Please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) if there you see anything that is confusing, odd, or just plain wrong. +# Developer environment setup -## R-only development +## R-only {.tabset} Windows and macOS users who wish to contribute to the R package and -don’t need to alter the Arrow C++ library may be able to obtain a -recent version of the library without building from source. On macOS, -you may install the C++ library using [Homebrew](https://brew.sh/): +don't need to alter libarrow (Arrow's C++ library) may be able to obtain a +recent version of the library without building from source. + +### Linux + +On Linux, you can download a .zip file containing libarrow from the +nightly repository. + +To see what nightlies are available, you can use arrow's (or any other S3 client's) S3 listing functionality to see what is in the bucket `s3://arrow-r-nightly/libarrow/bin`: -``` shell +``` +nightly <- s3_bucket("arrow-r-nightly") +nightly$ls("libarrow/bin") +``` +Version numbers in that repository correspond to dates. + +You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it. + +### macOS +On macOS, you can install libarrow using [Homebrew](https://brew.sh/): + +```bash # For the released version: brew install apache-arrow # Or for a development version, you can try: brew install apache-arrow --HEAD ``` -On Windows and Linux, you can download a .zip file with the arrow dependencies from the -nightly repository. -Windows users then can set the `RWINLIB_LOCAL` environment variable to point to that -zip file before installing the `arrow` R package. On Linux, you'll need to create a `libarrow` directory inside the R package directory and unzip that file into it. Version numbers in that -repository correspond to dates, and you will likely want the most recent. +### Windows + +On Windows, you can download a .zip file containing libarrow from the nightly repository. -To see what nightlies are available, you can use Arrow's (or any other S3 client's) S3 listing functionality to see what is in the bucket `s3://arrow-r-nightly/libarrow/bin`: +To see what nightlies are available, you can use arrow's (or any other S3 client's) S3 listing functionality to see what is in the bucket `s3://arrow-r-nightly/libarrow/bin`: ``` nightly <- s3_bucket("arrow-r-nightly") nightly$ls("libarrow/bin") ``` +Version numbers in that repository correspond to dates. -## Developer environment setup +You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package. -If you need to alter both the Arrow C++ library and the R package code, or if you can’t get a binary version of the latest C++ library elsewhere, you’ll need to build it from source too. This section discusses how to set up a C++ build configured to work with the R package. 
For more general resources, see the [Arrow C++ developer -guide](https://arrow.apache.org/docs/developers/cpp/building.html). -There are four major steps to the process — the first three are relevant to all Arrow developers, and the last one is specific to the R bindings: +## R and C++ -1. Configuring the Arrow library build (using `cmake`) — this specifies how you want the build to go, what features to include, etc. -2. Building the Arrow library — this actually compiles the Arrow library -3. Install the Arrow library — this organizes and moves the compiled Arrow library files into the location specified in the configuration -4. Building the R package — this builds the C++ code in the R package, and installs the R package for you +If you need to alter both libarrow and the R package code, or if you can't get a binary version of the latest libarrow elsewhere, you'll need to build it from source. This section discusses how to set up a C++ libarrow build configured to work with the R package. For more general resources, see the [Arrow C++ developer guide](https://arrow.apache.org/docs/developers/cpp/building.html). -### Install dependencies {.tabset} +There are five major steps to the process. -The Arrow C++ library will by default use system dependencies if suitable versions are found; if they are not present, it will build them during its own build process. The only dependencies that one needs to install outside of the build process are `cmake` (for configuring the build) and `openssl` if you are building with S3 support. +### Step 1 - Install dependencies {.tabset} -For a faster build, you may choose to install on the system more C++ library dependencies (such as `lz4`, `zstd`, etc.) so that they don't need to be built from source in the Arrow build. This is optional. +When building libarrow, by default, system dependencies will be used if suitable versions are found. If system dependencies are not present, libarrow will build them during its own build process. The only dependencies that you need to install _outside_ of the build process are [cmake](https://cmake.org/) (for configuring the build) and [openssl](https://www.openssl.org/) if you are building with S3 support. -#### macOS -```{bash, save=run & macos} -brew install cmake openssl -``` +For a faster build, you may choose to pre-install more C++ library dependencies (such as [lz4](http://lz4.github.io/lz4/), [zstd](https://facebook.github.io/zstd/), etc.) on the system so that they don't need to be built from source in the libarrow build. #### Ubuntu ```{bash, save=run & ubuntu} sudo apt install -y cmake libcurl4-openssl-dev libssl-dev ``` -### Configure the Arrow build {.tabset} +#### macOS +```{bash, save=run & macos} +brew install cmake openssl +``` -You can choose to build and then install the Arrow library into a user-defined directory or into a system-level directory. You only need to do one of these two options. +#### Windows -It is recommended that you install the arrow library to a user-level directory to be used in development. This is so that the development version you are using doesn't overwrite a released version of Arrow you may have installed. You are also able to have more than one version of the Arrow library to link to with this approach (by using different `ARROW_HOME` directories for the different versions). This approach also matches the recommendations for other Arrow bindings like [Python](http://arrow.apache.org/docs/developers/python.html). 
+Currently, the R package cannot be made to work with a local libarrow build. This will be resolved in a future release. -#### Configure for installing to a user directory +### Step 2 - Configure the libarrow build -In this example we will install it to a directory called `dist` that has the same parent as our `arrow` checkout, but it could be named or located anywhere you would like. However, note that your installation of the Arrow R package will point to this directory and need it to remain intact for the package to continue to work. This is one reason we recommend *not* placing it inside of the arrow git checkout. +We recommend that you configure libarrow to be built to a user-level directory rather than a system directory for your development work. This is so that the development version you are using doesn't overwrite a released version of libarrow you may already have installed, and so that you are also able work with more than one version of libarrow (by using different `ARROW_HOME` directories for the different versions). + +In the example below, libarrow is installed to a directory called `dist` that has the same parent directory as the `arrow` checkout. Your installation of the Arrow R package can point to any directory with any name, though we recommend *not* placing it inside of the `arrow` git checkout directory as unwanted changes could stop it working properly. ```{bash, save=run & !sys_install} export ARROW_HOME=$(pwd)/dist mkdir $ARROW_HOME ``` -_Special instructions on Linux:_ You will need to set `LD_LIBRARY_PATH` to the `lib` directory that is under where we set `$ARROW_HOME`, before launching R and using Arrow. One way to do this is to add it to your profile (we use `~/.bash_profile` here, but you might need to put this in a different file depending on your setup, e.g. if you use a shell other than `bash`). On macOS we do not need to do this because the macOS shared library paths are hardcoded to their locations during build time. +_Special instructions on Linux:_ You will need to set `LD_LIBRARY_PATH` to the `lib` directory that is under where you set `$ARROW_HOME`, before launching R and using arrow. One way to do this is to add it to your profile (we use `~/.bash_profile` here, but you might need to put this in a different file depending on your setup, e.g. if you use a shell other than `bash`). On macOS you do not need to do this because the macOS shared library paths are hardcoded to their locations during build time. ```{bash, save=run & ubuntu & !sys_install} export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH echo "export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH" >> ~/.bash_profile ``` -Now we can move into the arrow repository to start the build process. You will need to create a directory into which the C++ build will put its contents. It is recommended to make a `build` directory inside of the `cpp` directory of the Arrow git repository (it is git-ignored, so you won't accidentally check it in). And then, change directories to be inside `cpp/build`: +Start by navigating in a terminal to the `arrow` repository. You will need to create a directory into which the C++ build will put its contents. We recommend that you make a `build` directory inside of the `cpp` directory of the Arrow git repository (it is git-ignored, so you won't accidentally check it in). 
Next, change directories to be inside `cpp/build`: ```{bash, save=run & !sys_install} pushd arrow @@ -131,7 +150,7 @@ mkdir -p cpp/build pushd cpp/build ``` -You’ll first call `cmake` to configure the build and then `make install`. For the R package, you’ll need to enable several features in the C++ library using `-D` flags: +You'll first call `cmake` to configure the build and then `make install`. For the R package, you'll need to enable several features in libarrow using `-D` flags: ```{bash, save=run & !sys_install} cmake \ @@ -151,45 +170,13 @@ cmake \ .. ``` -`..` refers to the C++ source directory: we're in `cpp/build`, and the source is in `cpp`. - -#### Configure to install to a system directory - -If you would like to install Arrow as a system library you can do that as well. This is in some respects simpler, but if you already have Arrow libraries installed there, it would disrupt them and possibly require `sudo` permissions. - -Now we can move into the arrow repository to start the build process. You will need to create a directory into which the C++ build will put its contents. It is recommended to make a `build` directory inside of the `cpp` directory of the Arrow git repository (it is git-ignored, so you won't accidentally check it in). And then, change directories to be inside `cpp/build`: +`..` refers to the C++ source directory: you're in `cpp/build` and the source is in `cpp`. -```{bash, save=run & sys_install} -pushd arrow -mkdir -p cpp/build -pushd cpp/build -``` +#### Enabling more Arrow features -You’ll first call `cmake` to configure the build and then `make install`. For the R package, you’ll need to enable several features in the C++ library using `-D` flags: +To enable optional features including: S3 support, an alternative memory allocator, and additional compression libraries, add some or all of these flags to your call to `cmake` (the trailing `\` makes them easier to paste into a bash shell on a new line): -```{bash, save=run & sys_install} -cmake \ - -DARROW_COMPUTE=ON \ - -DARROW_CSV=ON \ - -DARROW_DATASET=ON \ - -DARROW_EXTRA_ERROR_CONTEXT=ON \ - -DARROW_FILESYSTEM=ON \ - -DARROW_INSTALL_NAME_RPATH=OFF \ - -DARROW_JEMALLOC=ON \ - -DARROW_JSON=ON \ - -DARROW_PARQUET=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - .. -``` - -`..` refers to the C++ source directory: we're in `cpp/build`, and the source is in `cpp`. - -### More Arrow features - -To enable optional features including: S3 support, an alternative memory allocator, and additional compression libraries, add some or all of these flags (the trailing `\` makes them easier to paste into a bash shell on a new line): - -``` shell +```bash -DARROW_MIMALLOC=ON \ -DARROW_S3=ON \ -DARROW_WITH_BROTLI=ON \ @@ -201,12 +188,13 @@ To enable optional features including: S3 support, an alternative memory allocat Other flags that may be useful: -* `-DBoost_SOURCE=BUNDLED` and `-DThrift_SOURCE=bundled`, for example, or any other dependency `*_SOURCE`, if you have a system version of a C++ dependency that doesn't work correctly with Arrow. This tells the build to compile its own version of the dependency from source. +* `-DBoost_SOURCE=BUNDLED` and `-DThrift_SOURCE=BUNDLED`, for example, or any other dependency `*_SOURCE`, if you have a system version of a C++ dependency that doesn't work correctly with Arrow. This tells the build to compile its own version of the dependency from source. + * `-DCMAKE_BUILD_TYPE=debug` or `-DCMAKE_BUILD_TYPE=relwithdebinfo` can be useful for debugging. 
You probably don't want to do this generally because a debug build is much slower at runtime than the default `release` build.

-_Note_ `cmake` is particularly sensitive to whitespacing, if you see errors, check that you don't have any errant whitespace around
+_Note_ `cmake` is particularly sensitive to whitespace; if you see errors, check that you don't have any errant whitespace.

-### Build Arrow
+### Step 3 - Building libarrow

You can add `-j#` between `make` and `install` here too to speed up compilation by running in parallel (where `#` is the number of cores you have available).

@@ -214,49 +202,39 @@ You can add `-j#` between `make` and `install` here too to speed up compilation
make -j8 install
```

-If you are installing on linux, and you are installing to the system, you may
-need to use `sudo`:
-
-```{bash, save=run & sys_install & ubuntu}
-sudo make install
-```
-
+### Step 4 - Build the Arrow R package

-### Build the Arrow R package
-
-Once you've built the C++ library, you can install the R package and its
+Once you've built libarrow, you can install the R package and its
dependencies, along with additional dev dependencies, from the git checkout:

```{bash, save=run}
popd # To go back to the root directory of the project, from cpp/build
-
pushd r
R -e 'install.packages("remotes"); remotes::install_deps(dependencies = TRUE)'
-
R CMD INSTALL .
```

-### Compilation flags
+#### Compilation flags

If you need to set any compilation flags while building the C++ extensions, you can use the `ARROW_R_CXXFLAGS` environment variable. For example, if you are using `perf` to profile the R extensions, you may need to set

-``` shell
+```bash
export ARROW_R_CXXFLAGS=-fno-omit-frame-pointer
```

-### Developer Experience
+#### Recompiling the C++ code

-With the setups described here, you should not need to rebuild the Arrow library or even the C++ source in the R package as you iterated and work on the R package. The only time those should need to be rebuilt is if you have changed the C++ in the R package (and even then, `R CMD INSTALL .` should only need to recompile the files that have changed) _or_ if the Arrow library C++ has changed and there is a mismatch between the Arrow Library and the R package. If you find yourself rebuilding either or both each time you install the package or run tests, something is probably wrong with your set up.
+With the setup described here, you should not need to rebuild the Arrow library or even the C++ source in the R package as you iterate and work on the R package. The only time those should need to be rebuilt is if you have changed the C++ in the R package (and even then, `R CMD INSTALL .` should only need to recompile the files that have changed) _or_ if the libarrow C++ has changed and there is a mismatch between libarrow and the R package. If you find yourself rebuilding either or both each time you install the package or run tests, something is probably wrong with your setup.
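As a concrete illustration, the day-to-day loop after touching libarrow source is short; a minimal sketch, assuming the `cpp/build` layout used above:

```bash
# Rebuild and reinstall only what changed after editing libarrow source
pushd arrow/cpp/build
make -j8 install   # recompiles just the modified libarrow files
popd
pushd arrow/r
R CMD INSTALL .    # relink the R package against the updated libarrow
popd
```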
    For a full build: a `cmake` command with all of the R-relevant optional dependencies turned on. Development with other languages might require different flags as well. For example, to develop Python, you would need to also add `-DARROW_PYTHON=ON` (though all of the other flags used for Python are already included here).

    -``` shell +```bash cmake \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DCMAKE_INSTALL_LIBDIR=lib \ @@ -280,193 +258,141 @@ cmake \ .. ```
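After installing, one quick sanity check (a sketch, not part of the build steps themselves) is to confirm from R which optional features the linked libarrow was compiled with:

```r
# arrow_info() reports the capabilities (S3 support, compression codecs, etc.)
# compiled into the libarrow build that the package is linked against
arrow::arrow_info()
```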

    -
    + -### Documentation +## Installing a version of the R package with a specific git reference -The documentation for the R package uses features of `roxygen2` that haven't yet been released on CRAN, such as conditional inclusion of examples via the `@examplesIf` tag. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. +If you need an arrow installation from a specific repository or git reference, on most platforms except Windows, you can run: ```{r} -remotes::install_github("r-lib/roxygen2") -``` - -## Troubleshooting - -Note that after any change to the C++ library, you must reinstall it and -run `make clean` or `git clean -fdx .` to remove any cached object code -in the `r/src/` directory before reinstalling the R package. This is -only necessary if you make changes to the C++ library source; you do not -need to manually purge object files if you are only editing R or C++ -code inside `r/`. - -### Arrow library-R package mismatches - -If the Arrow library and the R package have diverged, you will see errors like: - -``` -Error: package or namespace load failed for ‘arrow’ in dyn.load(file, DLLpath = DLLpath, ...): - unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so': - dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Symbol not found: __ZN5arrow2io16RandomAccessFile9ReadAsyncERKNS0_9IOContextExx - Referenced from: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so - Expected in: flat namespace - in /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so -Error: loading failed -Execution halted -ERROR: loading failed +remotes::install_github("apache/arrow/r", build = FALSE) ``` -To resolve this, try rebuilding the Arrow library from [Building Arrow above](#building-arrow). +The `build = FALSE` argument is important so that the installation can access the +C++ source in the `cpp/` directory in `apache/arrow`. -### Multiple versions of Arrow library +As with other installation methods, setting the environment variables `LIBARROW_MINIMAL=false` and `ARROW_R_DEV=true` will provide a more full-featured version of Arrow and provide more verbose output, respectively. -If rebuilding the Arrow library doesn't work and you are [installing from a user-level directory](#installing-to-another-directory) and you already have a previous installation of libarrow in a system directory or you get you may get errors like the following when you install the R package: +For example, to install from the (fictional) branch `bugfix` from `apache/arrow` you could run: -``` -Error: package or namespace load failed for ‘arrow’ in dyn.load(file, DLLpath = DLLpath, ...): - unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so': - dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: /usr/local/lib/libarrow.400.dylib - Referenced from: /usr/local/lib/libparquet.400.dylib - Reason: image not found +```r +Sys.setenv(LIBARROW_MINIMAL="false") +remotes::install_github("apache/arrow/r@bugfix", build = FALSE) ``` -You need to make sure that you don't let R link to your system library when building arrow. 
You can do this a number of different ways: +Developers may wish to use this method of installing a specific commit +separate from another Arrow development environment or system installation +(e.g. we use this in [arrowbench](https://github.com/ursacomputing/arrowbench) +to install development versions of libarrow isolated from the system install). If +you already have libarrow installed system-wide, you may need to set +some additional variables in order to isolate this build from your system libraries: -* Setting the `MAKEFLAGS` environment variable to `"LDFLAGS="` (see below for an example) this is the recommended way to accomplish this -* Using {withr}'s `with_makevars(list(LDFLAGS = ""), ...)` -* adding `LDFLAGS=` to your `~/.R/Makevars` file (the least recommended way, though it is a common debugging approach suggested online) +* Setting the environment variable `FORCE_BUNDLED_BUILD` to `true` will skip the `pkg-config` search for libarrow and attempt to build from the same source at the repository+ref given. -```{bash, save=run & !sys_install & macos, hide=TRUE} -# Setup troubleshooting section -# install a system-level arrow on macOS -brew install apache-arrow +* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: +```{r} +withr::with_makevars(list(CPPFLAGS = "", LDFLAGS = ""), remotes::install_github(...)) ``` +# Common developer workflow tasks -```{bash, save=run & !sys_install & ubuntu, hide=TRUE} -# Setup troubleshooting section -# install a system-level arrow on Ubuntu -sudo apt update -sudo apt install -y -V ca-certificates lsb-release wget -wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb -sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb -sudo apt update -sudo apt install -y -V libarrow-dev -``` +The `arrow/r` directory contains a `Makefile` to help with some common tasks from the command line (e.g. `make test`, `make doc`, `make clean`, etc.). -```{bash, save=run & !sys_install & macos} -MAKEFLAGS="LDFLAGS=" R CMD INSTALL . -``` +## Loading arrow +You can load the R package via `devtools::load_all()`. -### `rpath` issues +## Rebuilding the documentation -If the package fails to install/load with an error like this: +The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. -``` - ** testing if installed package can be loaded from temporary location - Error: package or namespace load failed for 'arrow' in dyn.load(file, DLLpath = DLLpath, ...): - unable to load shared object '/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so': - dlopen(/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: @rpath/libarrow.14.dylib +```{r} +remotes::install_github("r-lib/roxygen2") ``` -ensure that `-DARROW_INSTALL_NAME_RPATH=OFF` was passed (this is important on -macOS to prevent problems at link time and is a no-op on other platforms). 
-Alternatively, try setting the environment variable `R_LD_LIBRARY_PATH` to -wherever Arrow C++ was put in `make install`, e.g. `export -R_LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package. +You can use `devtools::document()` and `pkgdown::build_site()` to rebuild the documentation and preview the results. -When installing from source, if the R and C++ library versions do not -match, installation may fail. If you’ve previously installed the -libraries and want to upgrade the R package, you’ll need to update the -Arrow C++ library first. - -For any other build/configuration challenges, see the [C++ developer -guide](https://arrow.apache.org/docs/developers/cpp/building.html). +```r +# Update roxygen documentation +devtools::document() +# To preview the documentation website +pkgdown::build_site(preview=TRUE) +``` -## Using `remotes::install_github(...)` +## Styling and linting -If you need an Arrow installation from a specific repository or at a specific ref, -`remotes::install_github("apache/arrow/r", build = FALSE)` -should work on most platforms (with the notable exception of Windows). -The `build = FALSE` argument is important so that the installation can access the -C++ source in the `cpp/` directory in `apache/arrow`. +### R code -As with other installation methods, setting the environment variables `LIBARROW_MINIMAL=false` and `ARROW_R_DEV=true` will provide a more full-featured version of Arrow and provide more verbose output, respectively. +The R code in the package follows [the tidyverse style](https://style.tidyverse.org/). On PR submission (and on pushes) our CI will run linting and will flag possible errors on the pull request with annotations. -For example, to install from the (fictional) branch `bugfix` from `apache/arrow` one could: +To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run -```r -Sys.setenv(LIBARROW_MINIMAL="false") -remotes::install_github("apache/arrow/r@bugfix", build = FALSE) +```{r} +lintr::lint_package("arrow/r") ``` -Developers may wish to use this method of installing a specific commit -separate from another Arrow development environment or system installation -(e.g. we use this in [arrowbench](https://github.com/ursacomputing/arrowbench) to install development versions of arrow isolated from the system install). If you already have Arrow C++ libraries installed system-wide, you may need to set some additional variables in order to isolate this build from your system libraries: - -* Setting the environment variable `FORCE_BUNDLED_BUILD` to `true` will skip the `pkg-config` search for Arrow libraries and attempt to build from the same source at the repository+ref given. -* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of Arrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: `withr::with_makevars(list(CPPFLAGS = "", LDFLAGS = ""), remotes::install_github(...))`. +You can automatically change the formatting of the code in the package using the [styler](https://styler.r-lib.org/) package. There are two ways to do this: -## What happens when you `R CMD INSTALL`? +1. 
Use the comment bot to do this automatically with the command `@github-actions autotune` on a PR, and commit it back to the branch. -There are a number of scripts that are triggered when `R CMD INSTALL .`. For Arrow users, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host) so the installation process is easy. However knowing about these scripts can help troubleshoot if things go wrong in them or things go wrong in an install: +2. Run the styler locally either via Makefile commands: -* `configure` and `configure.win` These scripts are triggered during `R CMD INSTALL .` on non-Windows and Windows platforms, respectively. They handle finding the Arrow library, setting up the build variables necessary, and writing the package Makevars file that is used to compile the C++ code in the R package. -* `tools/nixlibs.R` This script is sometimes called by `configure` on Linux (or on any non-windows OS with the environment variable `FORCE_BUNDLED_BUILD=true`). This sets up the build process for our bundled builds (which is the default on linux). The operative logic is at the end of the script, but it will do the following (and it will stop with the first one that succeeds and some of the steps are only checked if they are enabled via an environment variable): - * Check if there is an already built libarrow in `arrow/r/libarrow-{version}`, use that to link against if it exists. - * Check if a binary is available from our hosted unofficial builds. - * Download the Arrow source and build the Arrow Library from source. - * `*** Proceed without C++` dependencies (this is an error and the package will not work, but if you see this message you know the previous steps have not succeeded/were not enabled) -* `inst/build_arrow_static.sh` this script builds Arrow for a bundled, static build. It is called by `tools/nixlibs.R` when the Arrow library is being built. (If you're looking at this script, and you've gotten this far, it should look _incredibly_ familiar: it's basically the contents of this guide in script form — with a few important changes) - -## Styling and linting of the R code in the R package - -The R code in the package follows [the tidyverse style](https://style.tidyverse.org/). On PR submission (and on pushes) our CI will run linting and will flag possible errors on the pull request with annotations. +```bash +make style # (for only the files changed) +make style-all # (for all files) +``` -To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run `lintr::lint_package("arrow/r")`. +or in R: -One can automatically change the formatting of the code in the package using the [styler](https://styler.r-lib.org/) package. There are two ways to do this: +```{r} +# note the two excluded files which should not be styled +styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R")) -1. Use the comment bot to do this automatically with the command `@github-actions autotune` on a PR and commit it back to the branch. -2. Locally, with the command `make style` (for only the files changed), `make style-all` (for all files), or use `styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R"))` note the two excluded files which should not be styled. 
+
```

-The styler package will fix many styling errors, thought not all lintr errors are automatically fixable with styler. The list of files we habitually do not style is in `r/.styler_excludes.R`.
+The styler package will fix many styling errors, though not all lintr errors are automatically fixable with styler. The list of files we intentionally do not style is in `r/.styler_excludes.R`.

-## Editing C++ code in the R package
+### C++ code

-The `arrow` package uses some customized tools on top of `cpp11` to prepare its
-C++ code in `src/`. This is because we have some features that are only enabled
+The arrow package uses some customized tools on top of [cpp11](https://cpp11.r-lib.org/) to prepare its
+C++ code in `src/`. This is because there are some features that are only enabled
and built conditionally during build time. If you change C++ code in the R
package, you will need to set the `ARROW_R_DEV` environment variable to `true`
(optionally, add it to your `~/.Renviron` file to persist across sessions) so
-that the `data-raw/codegen.R` file is used for code generation. The `Makefile`
+that the `data-raw/codegen.R` file is used for code generation. The `Makefile`
commands also handle this automatically.

We use Google C++ style in our C++ code. The easiest way to accomplish this is
-use an editors/IDE that formats your code for you. Many popular editors/IDEs
-have support for running `clang-format` on C++ files when you save them.
+to use an editor/IDE that formats your code for you. Many popular editors/IDEs
+have support for running `clang-format` on C++ files when you save them.
Installing/enabling the appropriate plugin may save you much frustration.

Check for style errors with

-``` shell
+```bash
./lint.sh
```

Fix any style issues before committing with

-``` shell
+```bash
./lint.sh --fix
```

The lint script requires Python 3 and `clang-format-8`. If the command
-isn't found, you can explicitly provide the path to it like
-`CLANG_FORMAT=$(which clang-format-8) ./lint.sh`. On macOS, you can get
-this by installing LLVM via Homebrew and running the script as
-`CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh`
+isn't found, you can explicitly provide the path to it like:
+
+```bash
+CLANG_FORMAT=$(which clang-format-8) ./lint.sh
+```

-_Note_ that the lint script requires Python 3 and the Python dependencies
+On macOS, you can get this by installing LLVM via Homebrew and running the script as:
+```bash
+CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh
+```
+
+_Note_ that the lint script requires Python 3 and the Python dependencies
(note that `cmake_format` is pinned to a specific version):

* autopep8
@@ -475,65 +401,191 @@ _Note_ that the lint script requires Python 3 and the Python dependencies

## Running tests

+Tests can be run either using `devtools::test()` or the Makefile alternative.
+
+```r
+# Run the test suite, optionally filtering file names
+devtools::test(filter="^regexp$")
+
+# or the Makefile alternative from the arrow/r directory in a shell:
+make test file=regexp
+```
+
Some tests are conditionally enabled based on the availability of certain
features in the package build (S3 support, compression libraries, etc.).
Others are generally skipped by default but can be enabled with environment
variables or other settings:

+* All tests are skipped on Linux if the package builds without the C++ libarrow.
+ To make the build fail if libarrow is not available (as in, to test that + the C++ build was successful), set `TEST_R_WITH_ARROW=true` + * Some tests are disabled unless `ARROW_R_DEV=true` + * Tests that require allocating >2GB of memory to test Large types are disabled unless `ARROW_LARGE_MEMORY_TESTS=true` + * Integration tests against a real S3 bucket are disabled unless credentials are set in `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`; these are available on request + * S3 tests using [MinIO](https://min.io/) locally are enabled if the `minio server` process is found running. If you're running MinIO with custom settings, you can set `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and `MINIO_PORT` to override the defaults. -## Github workflows +## Running checks -On a pull request, there are some actions you can trigger by commenting on the PR. We have additional CI checks that run nightly and can be requested on demand using an internal tool called [crosssbow](https://arrow.apache.org/docs/developers/crossbow.html). A few important GitHub comment commands include: +You can run package checks by using `devtools::check()` and check test coverage +with `covr::package_coverage()`. -* `@github-actions crossbow submit -g r` for all extended R CI tests -* `@github-actions crossbow submit {task-name}` for running a specific task. See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml) for a list of glob expression patterns that match names of items in the `tasks:` list below it. -* `@github-actions autotune` will run and fix lint c++ linting errors + run R documentation (among other cleanup tasks) and commit them to the branch +```r +# All package checks +devtools::check() +# See test coverage statistics +covr::report() +covr::package_coverage() +``` -## Useful functions for Arrow developers +For full package validation, you can run the following commands from a terminal. -Within an R session, these can help with package development: +``` +R CMD build . +R CMD check arrow_*.tar.gz --as-cran +``` -``` r -# Load the dev package -devtools::load_all() -# Run the test suite, optionally filtering file names -devtools::test(filter="^regexp$") -# or the Makefile alternative from the arrow/r directory in a shell: -make test file=regexp +## Running additional CI checks -# Update roxygen documentation -devtools::document() +On a pull request, there are some actions you can trigger by commenting on the +PR. We have additional CI checks that run nightly and can be requested on demand +using an internal tool called +[crossbow](https://arrow.apache.org/docs/developers/crossbow.html). +A few important GitHub comment commands are shown below. -# To preview the documentation website -pkgdown::build_site() +#### Run all extended R CI tasks +``` +@github-actions crossbow submit -g r +``` -# All package checks; see also below -devtools::check() +This runs each of the R-related CI tasks. -# See test coverage statistics -covr::report() -covr::package_coverage() +#### Run a specific task +``` +@github-actions crossbow submit {task-name} ``` -Any of those can be run from the command line by wrapping them in `R -e -'$COMMAND'`. There’s also a `Makefile` to help with some common tasks -from the command line (`make test`, `make doc`, `make clean`, etc.) 
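+A concrete illustration, using a task name that is defined in the `tasks.yml`
+additions later in this patch series (any task name matching the `r:` group
+globs works the same way):
+
+```
+@github-actions crossbow submit test-r-offline-maximal
+```
+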
+See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml)
+for a list of glob expression patterns that match names of items in the `tasks:`
+list below it.

-### Full package validation
+#### Run linting and documentation building tasks

-``` shell
-R CMD build .
-R CMD check arrow_*.tar.gz --as-cran
+```
+@github-actions autotune
```
+
+This will fix C++ linting errors, regenerate the R documentation (among other
+cleanup tasks), run styler on any changed R code, and commit the resulting
+updates to the branch.
+
+# Troubleshooting
+
+Note that after any change to libarrow, you must reinstall it and
+run `make clean` or `git clean -fdx .` to remove any cached object code
+in the `r/src/` directory before reinstalling the R package. This is
+only necessary if you make changes to libarrow source; you do not
+need to manually purge object files if you are only editing R or C++
+code inside `r/`.
+
+## Arrow library - R package mismatches
+
+If libarrow and the R package have diverged, you will see errors like:
+
+```
+Error: package or namespace load failed for ‘arrow’ in dyn.load(file, DLLpath = DLLpath, ...):
+ unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so':
+ dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Symbol not found: __ZN5arrow2io16RandomAccessFile9ReadAsyncERKNS0_9IOContextExx
+ Referenced from: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so
+ Expected in: flat namespace
+ in /Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so
+Error: loading failed
+Execution halted
+ERROR: loading failed
+```
+
+To resolve this, try [rebuilding the Arrow library](#step-3-building-arrow).
+
+## Multiple versions of libarrow
+
+If you are installing from a user-level directory, and you already have a
+previous installation of libarrow in a system directory, you may get
+errors like the following when you install the R package:
+
+```
+Error: package or namespace load failed for ‘arrow’ in dyn.load(file, DLLpath = DLLpath, ...):
+ unable to load shared object '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so':
+ dlopen(/Library/Frameworks/R.framework/Versions/4.0/Resources/library/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: /usr/local/lib/libarrow.400.dylib
+ Referenced from: /usr/local/lib/libparquet.400.dylib
+ Reason: image not found
+```
+
+If this happens, you need to make sure that you don't let R link to your system
+library when building arrow.
+You can do this a number of different ways:
+
+* Setting the `MAKEFLAGS` environment variable to `"LDFLAGS="` (see below for an example); this is the recommended way
+* Using {withr}'s `with_makevars(list(LDFLAGS = ""), ...)`
+* Adding `LDFLAGS=` to your `~/.R/Makevars` file (the least recommended way, though it is a common debugging approach suggested online)

+```{bash, save=run & !sys_install & macos, hide=TRUE}
+# Setup troubleshooting section
+# install a system-level arrow on macOS
+brew install apache-arrow
+```
+
+
+```{bash, save=run & !sys_install & ubuntu, hide=TRUE}
+# Setup troubleshooting section
+# install a system-level arrow on Ubuntu
+sudo apt update
+sudo apt install -y -V ca-certificates lsb-release wget
+wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+sudo apt update
+sudo apt install -y -V libarrow-dev
+```

+```{bash, save=run & !sys_install & macos}
+MAKEFLAGS="LDFLAGS=" R CMD INSTALL .
+```
+
+
+## `rpath` issues
+
+If the package fails to install/load with an error like this:
+
+```
+ ** testing if installed package can be loaded from temporary location
+ Error: package or namespace load failed for 'arrow' in dyn.load(file, DLLpath = DLLpath, ...):
+  unable to load shared object '/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so':
+  dlopen(/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: @rpath/libarrow.14.dylib
+```
+
+ensure that `-DARROW_INSTALL_NAME_RPATH=OFF` was passed (this is important on
+macOS to prevent problems at link time and is a no-op on other platforms).
+Alternatively, try setting the environment variable `R_LD_LIBRARY_PATH` to
+wherever Arrow C++ was put in `make install`, e.g. `export
+R_LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package.
+
+When installing from source, if the R and C++ library versions do not
+match, installation may fail. If you've previously installed the
+libraries and want to upgrade the R package, you'll need to update the
+Arrow C++ library first.
+
+For any other build/configuration challenges, see the [C++ developer
+guide](https://arrow.apache.org/docs/developers/cpp/building.html).
+
+## Other installation issues
+
+There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or elsewhere in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information.
\ No newline at end of file
diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd
index 92daff31529..013e63a113e 100644
--- a/r/vignettes/install.Rmd
+++ b/r/vignettes/install.Rmd
@@ -164,9 +164,46 @@ Depending on your system, building Arrow C++ from source may be slow. For the
specific mechanics of how all this works, see the R package `configure` script,
which calls `tools/nixlibs.R`.
+
If the C++ library is built from source, `inst/build_arrow_static.sh` is
executed. This build script is also what is used to generate the prebuilt
binaries.
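+
+Whether you ended up with a downloaded binary or a local source build, a quick
+way to confirm which features your installation has is a minimal check like the
+following (both functions are exported by the arrow package):
+
+```r
+library(arrow)
+arrow_available()         # FALSE means the C++ library was not found or built
+arrow_info()$capabilities # shows which optional features were compiled in
+```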
+## How the package is installed - advanced
+
+This subsection contains information that is most likely to be relevant to
+Arrow developers and is not necessary for Arrow users to install Arrow.
+
+There are a number of scripts that are triggered when `R CMD INSTALL .` is run.
+For Arrow users, these should all just work without configuration and pull in
+the most complete pieces (e.g. official binaries that we host).
+
+An overview of these scripts is shown below:
+
+* `configure` and `configure.win` - these scripts are triggered during
+`R CMD INSTALL .` on non-Windows and Windows platforms, respectively. They
+handle finding the Arrow library, setting up the build variables necessary, and
+writing the package Makevars file that is used to compile the C++ code in the R
+package.
+
+* `tools/nixlibs.R` - this script is sometimes called by `configure` on Linux
+(or on any non-Windows OS with the environment variable
+`FORCE_BUNDLED_BUILD=true`). This sets up the build process for our bundled
+builds (which is the default on Linux). The operative logic is at the end of
+the script, and it will do the following, stopping at the first step that
+succeeds (some of the steps are only attempted if they are enabled via an
+environment variable):
+  * Check if there is an already built libarrow in `arrow/r/libarrow-{version}`
+  and use that to link against if it exists.
+  * Check if a binary is available from our hosted unofficial builds.
+  * Download the Arrow source and build the Arrow library from source.
+  * `*** Proceed without C++` dependencies (this is an error and the package
+  will not work, but if you see this message you know the previous steps have
+  not succeeded/were not enabled)
+
+* `inst/build_arrow_static.sh` - called by `tools/nixlibs.R` when the Arrow
+library is being built. It builds Arrow for a bundled, static build, and
+mirrors the steps described in the ["Arrow R Developer Guide" vignette](./developing.html).
+
# Troubleshooting

The intent is that `install.packages("arrow")` will just work and handle all C++

From 4cb77a21ee3b68138c3e2bfcc8969234039ed24d Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Tue, 7 Sep 2021 05:47:27 +0900
Subject: [PATCH 35/93] ARROW-13910: [Ruby] Arrow::Table#[]/Arrow::RecordBatch#[] accepts Range and selectors

Closes #11090 from kou/ruby-table-array-ref

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 .../red-arrow/lib/arrow/column-containable.rb | 101 +++++++++++++++++-
 ruby/red-arrow/lib/arrow/map-data-type.rb     |   4 +-
 ruby/red-arrow/lib/arrow/record-batch.rb      |   2 -
 ruby/red-arrow/lib/arrow/table.rb             |  37 -------
 ruby/red-arrow/test/test-record-batch.rb      |  42 ++++++++
 ruby/red-arrow/test/test-table.rb             |  31 +++++-
 6 files changed, 172 insertions(+), 45 deletions(-)

diff --git a/ruby/red-arrow/lib/arrow/column-containable.rb b/ruby/red-arrow/lib/arrow/column-containable.rb
index 51ad88e7080..7d7de66bda0 100644
--- a/ruby/red-arrow/lib/arrow/column-containable.rb
+++ b/ruby/red-arrow/lib/arrow/column-containable.rb
@@ -27,6 +27,17 @@ def each_column(&block)
      columns.each(&block)
    end

+    # @overload find_column(name)
+    #   Find a column that has the given name.
+    #
+    #   @param name [String, Symbol] The column name to be found.
+    #   @return [Column] The found column.
+    #
+    # @overload find_column(index)
+    #   Find the `index`-th column.
+    #
+    #   @param index [Integer] The index to be found.
+    #   @return [Column] The found column.
def find_column(name_or_index) case name_or_index when String, Symbol @@ -40,9 +51,97 @@ def find_column(name_or_index) return nil if index < 0 or index >= n_columns Column.new(self, index) else - message = "column name or index must be String, Symbol or Integer" + message = "column name or index must be String, Symbol or Integer: " + message << name_or_index.inspect raise ArgumentError, message end end + + # Selects columns that are selected by `selectors` and/or `block` + # and creates a new container only with the selected columns. + # + # @param selectors [Array] + # If a selector is `String`, `Symbol` or `Integer`, the selector + # selects a column by {#find_column}. + # + # If a selector is `Range`, the selector selects columns by `::Array#[]`. + # @yield [column] Gives a column to the block to select columns. + # This uses `::Array#select`. + # @yieldparam column [Column] A target column. + # @yieldreturn [Boolean] Whether the given column is selected or not. + # @return [self.class] The newly created container that only has selected + # columns. + def select_columns(*selectors, &block) + if selectors.empty? + return to_enum(__method__) unless block_given? + selected_columns = columns.select(&block) + else + selected_columns = [] + selectors.each do |selector| + case selector + when Range + selected_columns.concat(columns[selector]) + else + column = find_column(selector) + if column.nil? + case selector + when String, Symbol + message = "unknown column: #{selector.inspect}: #{inspect}" + raise KeyError.new(message) + else + message = "out of index (0..#{n_columns - 1}): " + message << "#{selector.inspect}: #{inspect}" + raise IndexError.new(message) + end + end + selected_columns << column + end + end + selected_columns = selected_columns.select(&block) if block_given? + end + self.class.new(selected_columns) + end + + # @overload [](name) + # Find a column that has the given name. + # + # @param name [String, Symbol] The column name to be found. + # @return [Column] The found column. + # @see #find_column + # + # @overload [](index) + # Find the `index`-th column. + # + # @param index [Integer] The index to be found. + # @return [Column] The found column. + # @see #find_column + # + # @overload [](range) + # Selects columns that are in `range` and creates a new container + # only with the selected columns. + # + # @param range [Range] The range to be selected. + # @return [self.class] The newly created container that only has selected + # columns. + # @see #select_columns + # + # @overload [](selectors) + # Selects columns that are selected by `selectors` and creates a + # new container only with the selected columns. + # + # @param selectors [Array] The selectors that are used to select columns. + # @return [self.class] The newly created container that only has selected + # columns. + # @see #select_columns + def [](selector) + case selector + when ::Array + select_columns(*selector) + when Range + select_columns(selector) + else + find_column(selector) + end + end end end diff --git a/ruby/red-arrow/lib/arrow/map-data-type.rb b/ruby/red-arrow/lib/arrow/map-data-type.rb index a157aa512af..67e1343295c 100644 --- a/ruby/red-arrow/lib/arrow/map-data-type.rb +++ b/ruby/red-arrow/lib/arrow/map-data-type.rb @@ -40,7 +40,7 @@ class MapDataType # See {Arrow::DataType.resolve} how to specify data type # description. 
# - # @example Create a map data type for {0: "Hello", 1: "World"} + # @example Create a map data type for `{0: "Hello", 1: "World"}` # key = :int8 # item = :string # Arrow::MapDataType.new(key, item) @@ -66,7 +66,7 @@ class MapDataType # See {Arrow::DataType.resolve} how to specify data type # description. # - # @example Create a maap data type for {0: "Hello", 1: "World"} + # @example Create a map data type for `{0: "Hello", 1: "World"}` # Arrow::MapDataType.new(key: :int8, item: :string) def initialize(*args) n_args = args.size diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb index e7ebf50e197..c5aaf876b07 100644 --- a/ruby/red-arrow/lib/arrow/record-batch.rb +++ b/ruby/red-arrow/lib/arrow/record-batch.rb @@ -50,8 +50,6 @@ def new(*args) alias_method :size, :n_rows alias_method :length, :n_rows - alias_method :[], :find_column - # Converts the record batch to {Arrow::Table}. # # @return [Arrow::Table] diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb index e9bf3221304..aab4c409861 100644 --- a/ruby/red-arrow/lib/arrow/table.rb +++ b/ruby/red-arrow/lib/arrow/table.rb @@ -195,8 +195,6 @@ def each_record_batch alias_method :size, :n_rows alias_method :length, :n_rows - alias_method :[], :find_column - alias_method :slice_raw, :slice # @overload slice(offset, length) @@ -397,41 +395,6 @@ def remove_column(name_or_index) remove_column_raw(index) end - # TODO - # - # @return [Arrow::Table] - def select_columns(*selectors, &block) - if selectors.empty? - return to_enum(__method__) unless block_given? - selected_columns = columns.select(&block) - else - selected_columns = [] - selectors.each do |selector| - case selector - when String, Symbol - column = find_column(selector) - if column.nil? - message = "unknown column: #{selector.inspect}: #{inspect}" - raise KeyError.new(message) - end - selected_columns << column - when Range - selected_columns.concat(columns[selector]) - else - column = columns[selector] - if column.nil? - message = "out of index (0..#{n_columns - 1}): " + - "#{selector.inspect}: #{inspect}" - raise IndexError.new(message) - end - selected_columns << column - end - end - selected_columns = selected_columns.select(&block) if block_given? 
- end - self.class.new(selected_columns) - end - # Experimental def group(*keys) Group.new(self, keys) diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb index 87f89355854..e94c26f2e32 100644 --- a/ruby/red-arrow/test/test-record-batch.rb +++ b/ruby/red-arrow/test/test-record-batch.rb @@ -136,5 +136,47 @@ def setup end end end + + sub_test_case("#[]") do + def setup + @record_batch = Arrow::RecordBatch.new(a: [true], + b: [true], + c: [true], + d: [true], + e: [true], + f: [true], + g: [true]) + end + + test("[String]") do + assert_equal(Arrow::Column.new(@record_batch, 0), + @record_batch["a"]) + end + + test("[Symbol]") do + assert_equal(Arrow::Column.new(@record_batch, 1), + @record_batch[:b]) + end + + test("[Integer]") do + assert_equal(Arrow::Column.new(@record_batch, 6), + @record_batch[-1]) + end + + test("[Range]") do + assert_equal(Arrow::RecordBatch.new(d: [true], + e: [true]), + @record_batch[3..4]) + end + + test("[[Symbol, String, Integer, Range]]") do + assert_equal(Arrow::RecordBatch.new(c: [true], + a: [true], + g: [true], + d: [true], + e: [true]), + @record_batch[[:c, "a", -1, 3..4]]) + end + end end end diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 5f6ba8b94be..d530be24a24 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -190,20 +190,45 @@ def setup end sub_test_case("#[]") do + def setup + @table = Arrow::Table.new(a: [true], + b: [true], + c: [true], + d: [true], + e: [true], + f: [true], + g: [true]) + end + test("[String]") do assert_equal(Arrow::Column.new(@table, 0), - @table["count"]) + @table["a"]) end test("[Symbol]") do assert_equal(Arrow::Column.new(@table, 1), - @table[:visible]) + @table[:b]) end test("[Integer]") do - assert_equal(Arrow::Column.new(@table, 1), + assert_equal(Arrow::Column.new(@table, 6), @table[-1]) end + + test("[Range]") do + assert_equal(Arrow::Table.new(d: [true], + e: [true]), + @table[3..4]) + end + + test("[[Symbol, String, Integer, Range]]") do + assert_equal(Arrow::Table.new(c: [true], + a: [true], + g: [true], + d: [true], + e: [true]), + @table[[:c, "a", -1, 3..4]]) + end end sub_test_case("#merge") do From 67b5bd28ad45e340326988fbe385806fa14c8120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Sep 2021 11:14:59 +0200 Subject: [PATCH 36/93] ARROW-13743: [CI] OSX job fails due to incompatible git and libcurl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #11095 from kszucs/ARROW-13743 Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- dev/tasks/conda-recipes/azure.osx.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/tasks/conda-recipes/azure.osx.yml b/dev/tasks/conda-recipes/azure.osx.yml index d3cbcbbb787..99bb76ba520 100755 --- a/dev/tasks/conda-recipes/azure.osx.yml +++ b/dev/tasks/conda-recipes/azure.osx.yml @@ -18,6 +18,7 @@ jobs: - script: | source activate base + conda config --set channel_priority strict conda install -n base -c conda-forge --quiet --yes conda-forge-ci-setup=3 conda-build displayName: 'Add conda-forge-ci-setup=3' @@ -27,7 +28,6 @@ jobs: /usr/bin/sudo -k displayName: Mangle homebrew - {{ macros.azure_checkout_arrow() }} - script: | @@ -37,7 +37,7 @@ jobs: setup_conda_rc ./ ./ ./.ci_support/${CONFIG}.yaml export CI=azure source run_conda_forge_build_setup - conda update --yes --quiet --override-channels -c conda-forge -c defaults --all + conda update 
--yes --quiet --override-channels -c conda-forge --all displayName: Configure conda and conda-build workingDirectory: arrow/dev/tasks/conda-recipes env: From 6dc272aa4b4377d6c597efef675534507b48e853 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 7 Sep 2021 12:19:15 +0200 Subject: [PATCH 37/93] ARROW-13810: [C++][Compute] Predicate IsAsciiCharacter allows invalid types and values Remove template from string predicate IsAsciiCharacter to prevent returning true for invalid types and values. Closes #11048 from edponce/ARROW-13810-Compute-Predicate-IsAsciiCharacter-allow Authored-by: Eduardo Ponce Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/kernels/scalar_string.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index aa953119d47..1f043adb0b5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -73,10 +73,7 @@ Status RegexStatus(const RE2& regex) { // IsAlpha/Digit etc -template -static inline bool IsAsciiCharacter(T character) { - return character < 128; -} +static inline bool IsAsciiCharacter(uint8_t character) { return character < 128; } static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { return (ascii_character >= 'a') && (ascii_character <= 'z'); @@ -1746,8 +1743,7 @@ struct IsNumericUnicode : CharacterPredicateUnicode { struct IsAscii { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_nascii_characters, Status*) { - return std::all_of(input, input + input_string_nascii_characters, - IsAsciiCharacter); + return std::all_of(input, input + input_string_nascii_characters, IsAsciiCharacter); } }; From 6c7c4f0a09a3f40d65999511c77d0f21473b4de8 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Tue, 7 Sep 2021 13:08:06 +0200 Subject: [PATCH 38/93] ARROW-13671: [Dev] Fix conda recipe on Arm 64k page system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #10963 from cyb70289/arm-64k-page Authored-by: Yibo Cai Signed-off-by: Krisztián Szűcs --- dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh index e07b4f758fe..9e4c02c5c6e 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh @@ -36,9 +36,13 @@ fi if [[ "${target_platform}" == "osx-arm64" ]]; then # We need llvm 11+ support in Arrow for this - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=OFF" + # Tell jemalloc to support 16K page size on apple arm64 silicon + EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=OFF -DARROW_JEMALLOC_LG_PAGE=14" sed -ie "s;protoc-gen-grpc.*$;protoc-gen-grpc=${BUILD_PREFIX}/bin/grpc_cpp_plugin\";g" ../src/arrow/flight/CMakeLists.txt - sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=14";g' ../cmake_modules/ThirdpartyToolchain.cmake +elif [[ "${target_platform}" == "linux-aarch64" ]]; then + # Tell jemalloc to support both 4k and 64k page arm64 systems + # See https://github.com/apache/arrow/pull/10940 + EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=ON -DARROW_JEMALLOC_LG_PAGE=16" else EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=ON" fi From 9064fa0cccd240d30ad243daded9d84825983c83 Mon Sep 17 00:00:00 2001 From: 
karldw
Date: Tue, 7 Sep 2021 09:52:00 -0500
Subject: [PATCH 39/93] ARROW-12981: [R] Install source package from CRAN alone

I took a stab at implementing the approach @nealrichardson laid out in [ARROW-12981](https://issues.apache.org/jira/browse/ARROW-12981?focusedCommentId=17400415#comment-17400415). Please let me know what you think, and if you'd like any changes!

I wrote some basic tests for the `download_optional_dependencies()` helper function, but it would be good to have more comprehensive install tests. These could be something like:

```sh
export LIBARROW_BINARY=false
export LIBARROW_BUILD=true
export LIBARROW_DOWNLOAD=false
export LIBARROW_MINIMAL=false

# Make sure offline, feature-light installation works
R -e "install.packages('arrow_x.y.z.p.tar.xz')"
R -e 'stopifnot(arrow::arrow_available(), isFALSE(arrow::arrow_info()$capabilities["parquet"]))'

# Download and install the thirdparty features
R -e "arrow::download_optional_dependencies('arrow-thirdparty')"
source arrow-thirdparty/DEFINE_ENV_VARS.sh
R -e "install.packages('arrow_x.y.z.p.tar.xz')"
R -e 'stopifnot(arrow::arrow_available(), isTRUE(arrow::arrow_info()$capabilities["parquet"]))'
```

Closes #11001 from karldw/fix-12981

Lead-authored-by: karldw
Co-authored-by: Jonathan Keane
Signed-off-by: Jonathan Keane
---
 dev/tasks/r/github.linux.offline.build.yml    | 117 +++++++
 dev/tasks/tasks.yml                           |  13 +
 r/.gitignore                                  |   8 +
 r/Makefile                                    |   7 +
 r/NAMESPACE                                   |   1 +
 r/R/install-arrow.R                           | 102 +++++-
 r/_pkgdown.yml                                |   1 +
 r/configure                                   |  21 +-
 r/inst/build_arrow_static.sh                  |   1 +
 r/man/create_package_with_all_dependencies.Rd |  70 ++++
 r/tools/nixlibs.R                             | 308 +++++++++++-------
 r/vignettes/developing.Rmd                    |  71 ++--
 r/vignettes/install.Rmd                       |  58 +++-
 13 files changed, 613 insertions(+), 165 deletions(-)
 create mode 100644 dev/tasks/r/github.linux.offline.build.yml
 create mode 100644 r/man/create_package_with_all_dependencies.Rd

diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml
new file mode 100644
index 00000000000..60685b18c5c
--- /dev/null
+++ b/dev/tasks/r/github.linux.offline.build.yml
@@ -0,0 +1,117 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: must set "Crossbow" as name to have the badge links working in the
+# github comment reports!
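+#
+# Overview (inferred from the job definitions that follow): the
+# "grab-dependencies" job packs the R source tarball together with the
+# thirdparty C++ dependency sources via create_package_with_all_dependencies(),
+# and the second job then installs that bundle with TEST_OFFLINE_BUILD=true to
+# verify that nothing needs to be downloaded at install time.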
+name: Crossbow + +on: + push: + branches: + - "*-github-*" + +jobs: + grab-dependencies: + name: "Download thirdparty dependencies" + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + env: + ARROW_R_DEV: "TRUE" + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - name: Free Up Disk Space + shell: bash + run: arrow/ci/scripts/util_cleanup.sh + - name: Fetch Submodules and Tags + shell: bash + run: cd arrow && ci/scripts/util_checkout.sh + - uses: r-lib/actions/setup-r@v1 + - name: Pull Arrow dependencies + run: | + cd arrow/r + # This is `make build`, but with no vignettes and not running `make doc` + cp ../NOTICE.txt inst/NOTICE.txt + rsync --archive --delete ../cpp tools/ + cp -p ../.env tools/ + cp -p ../NOTICE.txt tools/ + cp -p ../LICENSE.txt tools/ + R CMD build --no-build-vignettes --no-manual . + built_tar=$(ls -1 arrow*.tar.gz | head -n 1) + R -e "source('R/install-arrow.R'); create_package_with_all_dependencies(dest_file = 'arrow_with_deps.tar.gz', source_file = \"${built_tar}\")" + shell: bash + - name: Upload the third party dependency artifacts + uses: actions/upload-artifact@v2 + with: + name: thirdparty_deps + path: arrow/r/arrow_with_deps.tar.gz + + intall-offline: + name: "Install offline" + needs: [grab-dependencies] + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + env: + ARROW_R_DEV: TRUE + RSPM: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" + steps: + - name: Checkout Arrow + run: | + git clone --no-checkout {{ arrow.remote }} arrow + git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} + git -C arrow checkout FETCH_HEAD + git -C arrow submodule update --init --recursive + - uses: r-lib/actions/setup-r@v1 + - name: Download artifacts + uses: actions/download-artifact@v2 + with: + name: thirdparty_deps + path: arrow/r/ + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt install libcurl4-openssl-dev libssl-dev + - name: Install dependencies + run: | + install.packages(c("remotes", "glue", "sys")) + remotes::install_deps("arrow/r", dependencies = TRUE) + shell: Rscript {0} + - name: Install + env: + TEST_OFFLINE_BUILD: true + LIBARROW_MINIMAL: false + run: | + cd arrow/r + R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz + - name: Run the tests + run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' + - name: Dump test logs + run: cat arrow-tests/testthat.Rout* + if: always() + - name: Save the test output + uses: actions/upload-artifact@v2 + with: + name: test-output + path: arrow-tests/testthat.Rout* + if: always() diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 176d44ec35f..b2f0a1dcae8 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1033,6 +1033,19 @@ tasks: flags: '-e ARROW_SOURCE_HOME="/arrow" -e FORCE_BUNDLED_BUILD=TRUE -e LIBARROW_BUILD=TRUE -e ARROW_DEPENDENCY_SOURCE=SYSTEM' image: ubuntu-r-only-r + test-r-offline-minimal: + ci: azure + template: r/azure.linux.yml + params: + r_org: rocker + r_image: r-base + r_tag: latest + flags: '-e TEST_OFFLINE_BUILD=true' + + test-r-offline-maximal: + ci: github + template: r/github.linux.offline.build.yml + {% for r_org, r_image, r_tag in [("rhub", "ubuntu-gcc-release", 
"latest"), ("rocker", "r-base", "latest"), diff --git a/r/.gitignore b/r/.gitignore index 76e8a8dd0bd..fbc5c8c3bfd 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -18,3 +18,11 @@ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz extra-tests/files + +# C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here. +/tools/cpp/ +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up +# from cpp/, but again, they're just copies +/tools/.env +/tools/LICENSE.txt +/tools/NOTICE.txt diff --git a/r/Makefile b/r/Makefile index 7a51cbd5188..f493cc49ffe 100644 --- a/r/Makefile +++ b/r/Makefile @@ -36,8 +36,14 @@ test: deps: R -s -e 'lib <- Sys.getenv("R_LIB", .libPaths()[1]); install.packages("devtools", repo="https://cloud.r-project.org", lib=lib); devtools::install_dev_deps(lib=lib)' +# Note: files in tools are available at build time, but not at run time. The thirdparty +# cmake expects .env, NOTICE.txt, and LICENSE.txt to be available one level up from cpp/ build: doc cp ../NOTICE.txt inst/NOTICE.txt + rsync --archive --delete ../cpp tools/ + cp -p ../.env tools/ + cp -p ../NOTICE.txt tools/ + cp -p ../LICENSE.txt tools/ R CMD build . check: build @@ -56,4 +62,5 @@ clean: -rm src/Makevars.win -rm -rf arrow.Rcheck/ -rm -rf libarrow/ + -rm -rf tools/cpp/ tools/.env tools/NOTICE.txt tools/LICENSE.txt -find . -name "*.orig" -delete diff --git a/r/NAMESPACE b/r/NAMESPACE index 5e78d04de52..5164e7c9f20 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -201,6 +201,7 @@ export(codec_is_available) export(contains) export(copy_files) export(cpu_count) +export(create_package_with_all_dependencies) export(dataset_factory) export(date32) export(date64) diff --git a/r/R/install-arrow.R b/r/R/install-arrow.R index 63db8ede910..3e295c543cf 100644 --- a/r/R/install-arrow.R +++ b/r/R/install-arrow.R @@ -70,7 +70,6 @@ install_arrow <- function(nightly = FALSE, } } else { Sys.setenv( - LIBARROW_DOWNLOAD = "true", LIBARROW_BINARY = binary, LIBARROW_MINIMAL = minimal, ARROW_R_DEV = verbose, @@ -137,3 +136,104 @@ reload_arrow <- function() { message("Please restart R to use the 'arrow' package.") } } + + +#' Create a source bundle that includes all thirdparty dependencies +#' +#' @param dest_file File path for the new tar.gz package. Defaults to +#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version) +#' @param source_file File path for the input tar.gz package. Defaults to +#' downloading the package from CRAN (or whatever you have set as the first in +#' `getOption("repos")`) +#' @return The full path to `dest_file`, invisibly +#' +#' This function is used for setting up an offline build. If it's possible to +#' download at build time, don't use this function. Instead, let `cmake` +#' download the required dependencies for you. +#' These downloaded dependencies are only used in the build if +#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`. +#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds +#' +#' If you're using binary packages you shouldn't need to use this function. You +#' should download the appropriate binary from your package repository, transfer +#' that to the offline computer, and install that. Any OS can create the source +#' bundle, but it cannot be installed on Windows. (Instead, use a standard +#' Windows binary package.) 
+#' +#' Note if you're using RStudio Package Manager on Linux: If you still want to +#' make a source bundle with this function, make sure to set the first repo in +#' `options("repos")` to be a mirror that contains source packages (that is: +#' something other than the RSPM binary mirror URLs). +#' +#' ## Steps for an offline install with optional dependencies: +#' +#' ### Using a computer with internet access, pre-download the dependencies: +#' * Install the `arrow` package _or_ run +#' `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")` +#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")` +#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access +#' +#' ### On the computer without internet access, install the prepared package: +#' * Install the `arrow` package from the copied file +#' * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))` +#' * This installation will build from source, so `cmake` must be available +#' * Run [arrow_info()] to check installed capabilities +#' +#' +#' @examples +#' \dontrun{ +#' new_pkg <- create_package_with_all_dependencies() +#' # Note: this works when run in the same R session, but it's meant to be +#' # copied to a different computer. +#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +#' } +#' @export +create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) { + if (is.null(source_file)) { + pkg_download_dir <- tempfile() + dir.create(pkg_download_dir) + on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE) + message("Downloading Arrow source file") + downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source") + source_file <- downloaded[1, 2, drop = TRUE] + } + if (!file.exists(source_file) || !endsWith(source_file, "tar.gz")) { + stop("Arrow package .tar.gz file not found") + } + if (is.null(dest_file)) { + # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz + # (add 'with_deps' for clarity if the file was downloaded locally) + dest_file <- paste0(gsub(".tar.gz$", "", basename(source_file)), "_with_deps.tar.gz") + } + untar_dir <- tempfile() + on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE) + utils::untar(source_file, exdir = untar_dir) + tools_dir <- file.path(untar_dir, "arrow/tools") + download_dependencies_sh <- file.path(tools_dir, "cpp/thirdparty/download_dependencies.sh") + # If you change this path, also need to edit nixlibs.R + download_dir <- file.path(tools_dir, "thirdparty_dependencies") + dir.create(download_dir) + + message("Downloading files to ", download_dir) + download_successful <- system2(download_dependencies_sh, download_dir, stdout = FALSE) == 0 + if (!download_successful) { + stop("Failed to download thirdparty dependencies") + } + # Need to change directory to untar_dir so tar() will use relative paths. That + # means we'll need a full, non-relative path for dest_file. (extra_flags="-C" + # doesn't work with R's internal tar) + orig_wd <- getwd() + on.exit(setwd(orig_wd), add = TRUE) + # normalizePath() may return the input unchanged if dest_file doesn't exist, + # so create it first. 
+ file.create(dest_file) + dest_file <- normalizePath(dest_file, mustWork = TRUE) + setwd(untar_dir) + + message("Repacking tar.gz file to ", dest_file) + tar_successful <- utils::tar(dest_file, compression = "gz") == 0 + if (!tar_successful) { + stop("Failed to create new tar.gz file") + } + invisible(dest_file) +} diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 90d900ddf28..c0127a8b53a 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -175,6 +175,7 @@ reference: - arrow_available - install_arrow - install_pyarrow + - create_package_with_all_dependencies repo: jira_projects: [ARROW] diff --git a/r/configure b/r/configure index 88aef7e1d35..c36e13388c2 100755 --- a/r/configure +++ b/r/configure @@ -39,7 +39,7 @@ FORCE_AUTOBREW=`echo $FORCE_AUTOBREW | tr '[:upper:]' '[:lower:]'` FORCE_BUNDLED_BUILD=`echo $FORCE_BUNDLED_BUILD | tr '[:upper:]' '[:lower:]'` ARROW_USE_PKG_CONFIG=`echo $ARROW_USE_PKG_CONFIG | tr '[:upper:]' '[:lower:]'` LIBARROW_MINIMAL=`echo $LIBARROW_MINIMAL | tr '[:upper:]' '[:lower:]'` -LIBARROW_DOWNLOAD=`echo $LIBARROW_DOWNLOAD | tr '[:upper:]' '[:lower:]'` +TEST_OFFLINE_BUILD=`echo $TEST_OFFLINE_BUILD | tr '[:upper:]' '[:lower:]'` NOT_CRAN=`echo $NOT_CRAN | tr '[:upper:]' '[:lower:]'` VERSION=`grep '^Version' DESCRIPTION | sed s/Version:\ //` @@ -129,18 +129,15 @@ else # autobrew sets `PKG_LIBS`, `PKG_DIRS`, and `PKG_CFLAGS` fi else + # Set some default values/backwards compatibility - if [ "${LIBARROW_DOWNLOAD}" = "" ] && [ "${NOT_CRAN}" != "" ]; then - LIBARROW_DOWNLOAD=$NOT_CRAN; export LIBARROW_DOWNLOAD - fi - if [ "${LIBARROW_BINARY}" = "" ] && [ "${LIBARROW_DOWNLOAD}" != "" ]; then - LIBARROW_BINARY=$LIBARROW_DOWNLOAD; export LIBARROW_BINARY - fi - if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${LIBARROW_DOWNLOAD}" = "true" ]; then - LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL - fi - if [ "${LIBARROW_MINIMAL}" = "" ] && [ "${NOT_CRAN}" = "true" ]; then - LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + if [ "${NOT_CRAN}" = "true" ]; then + if [ "${LIBARROW_BINARY}" = "" ]; then + LIBARROW_BINARY=true; export LIBARROW_BINARY + fi + if [ "${LIBARROW_MINIMAL}" = "" ]; then + LIBARROW_MINIMAL=false; export LIBARROW_MINIMAL + fi fi # find openssl on macos. macOS ships with libressl. openssl is installable diff --git a/r/inst/build_arrow_static.sh b/r/inst/build_arrow_static.sh index 578d8b6e5b2..5f01ae0a75f 100755 --- a/r/inst/build_arrow_static.sh +++ b/r/inst/build_arrow_static.sh @@ -70,6 +70,7 @@ ${CMAKE} -DARROW_BOOST_USE_SHARED=OFF \ -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-$ARROW_DEFAULT_PARAM} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-$ARROW_DEFAULT_PARAM} \ + -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=${DEST_DIR} \ diff --git a/r/man/create_package_with_all_dependencies.Rd b/r/man/create_package_with_all_dependencies.Rd new file mode 100644 index 00000000000..b2da8c2491a --- /dev/null +++ b/r/man/create_package_with_all_dependencies.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/install-arrow.R +\name{create_package_with_all_dependencies} +\alias{create_package_with_all_dependencies} +\title{Create a source bundle that includes all thirdparty dependencies} +\usage{ +create_package_with_all_dependencies(dest_file = NULL, source_file = NULL) +} +\arguments{ +\item{dest_file}{File path for the new tar.gz package. 
Defaults to +\code{arrow_V.V.V_with_deps.tar.gz} in the current directory (\code{V.V.V} is the version)} + +\item{source_file}{File path for the input tar.gz package. Defaults to +downloading the package from CRAN (or whatever you have set as the first in +\code{getOption("repos")})} +} +\value{ +The full path to \code{dest_file}, invisibly + +This function is used for setting up an offline build. If it's possible to +download at build time, don't use this function. Instead, let \code{cmake} +download the required dependencies for you. +These downloaded dependencies are only used in the build if +\code{ARROW_DEPENDENCY_SOURCE} is unset, \code{BUNDLED}, or \code{AUTO}. +https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds + +If you're using binary packages you shouldn't need to use this function. You +should download the appropriate binary from your package repository, transfer +that to the offline computer, and install that. Any OS can create the source +bundle, but it cannot be installed on Windows. (Instead, use a standard +Windows binary package.) + +Note if you're using RStudio Package Manager on Linux: If you still want to +make a source bundle with this function, make sure to set the first repo in +\code{options("repos")} to be a mirror that contains source packages (that is: +something other than the RSPM binary mirror URLs). +\subsection{Steps for an offline install with optional dependencies:}{ +\subsection{Using a computer with internet access, pre-download the dependencies:}{ +\itemize{ +\item Install the \code{arrow} package \emph{or} run +\code{source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")} +\item Run \code{create_package_with_all_dependencies("my_arrow_pkg.tar.gz")} +\item Copy the newly created \code{my_arrow_pkg.tar.gz} to the computer without internet access +} +} + +\subsection{On the computer without internet access, install the prepared package:}{ +\itemize{ +\item Install the \code{arrow} package from the copied file +\itemize{ +\item \code{install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))} +\item This installation will build from source, so \code{cmake} must be available +} +\item Run \code{\link[=arrow_info]{arrow_info()}} to check installed capabilities +} +} + +} +} +\description{ +Create a source bundle that includes all thirdparty dependencies +} +\examples{ +\dontrun{ +new_pkg <- create_package_with_all_dependencies() +# Note: this works when run in the same R session, but it's meant to be +# copied to a different computer. 
+install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo")) +} +} diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index e28dae79f5d..d3bf9879500 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -30,16 +30,6 @@ options(.arrow.cleanup = character()) # To collect dirs to rm on exit on.exit(unlink(getOption(".arrow.cleanup"))) env_is <- function(var, value) identical(tolower(Sys.getenv(var)), value) -# * no download, build_ok: Only build with local git checkout -# * download_ok, no build: Only use prebuilt binary, if found -# * neither: Get the arrow-without-arrow package -# Download and build are OK unless you say not to -download_ok <- !env_is("LIBARROW_DOWNLOAD", "false") -build_ok <- !env_is("LIBARROW_BUILD", "false") -# But binary defaults to not OK -binary_ok <- !identical(tolower(Sys.getenv("LIBARROW_BINARY", "false")), "false") -# For local debugging, set ARROW_R_DEV=TRUE to make this script print more -quietly <- !env_is("ARROW_R_DEV", "true") try_download <- function(from_url, to_file) { status <- try( @@ -52,6 +42,26 @@ try_download <- function(from_url, to_file) { !inherits(status, "try-error") && status == 0 } +# For local debugging, set ARROW_R_DEV=TRUE to make this script print more +quietly <- !env_is("ARROW_R_DEV", "true") + +# Default is build from source, not download a binary +build_ok <- !env_is("LIBARROW_BUILD", "false") +binary_ok <- !(env_is("LIBARROW_BINARY", "false") || env_is("LIBARROW_BINARY", "")) + +# Check if we're doing an offline build. +# (Note that cmake will still be downloaded if necessary +# https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds) +download_ok <- !env_is("TEST_OFFLINE_BUILD", "true") && try_download("https://github.com", tempfile()) + +# This "tools/thirdparty_dependencies" path, within the tar file, might exist if +# create_package_with_all_dependencies() was run, or if someone has created it +# manually before running make build. 
+# If you change this path, you also need to edit +# `create_package_with_all_dependencies()` in install-arrow.R +thirdparty_dependency_dir <- Sys.getenv("ARROW_THIRDPARTY_DEPENDENCY_DIR", "tools/thirdparty_dependencies") + + download_binary <- function(os = identify_os()) { libfile <- tempfile() if (!is.null(os)) { @@ -82,7 +92,7 @@ download_binary <- function(os = identify_os()) { # * `TRUE` (not case-sensitive), to try to discover your current OS, or # * some other string, presumably a related "distro-version" that has binaries # built that work for your OS -identify_os <- function(os = Sys.getenv("LIBARROW_BINARY", Sys.getenv("LIBARROW_DOWNLOAD"))) { +identify_os <- function(os = Sys.getenv("LIBARROW_BINARY")) { if (tolower(os) %in% c("", "false")) { # Env var says not to download a binary return(NULL) @@ -193,6 +203,10 @@ system_release <- function() { read_system_release <- function() utils::head(readLines("/etc/system-release"), 1) +is_solaris <- function() { + tolower(Sys.info()[["sysname"]]) %in% "sunos" +} + #### end distro #### find_available_binary <- function(os) { @@ -209,73 +223,40 @@ find_available_binary <- function(os) { os } -download_source <- function() { - tf1 <- tempfile() - src_dir <- tempfile() - - # Given VERSION as x.y.z.p - p <- package_version(VERSION)[1, 4] - if (is.na(p)) { - # This is just x.y.z so download the official Apache release - if (apache_download(VERSION, tf1)) { - untar(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/apache-arrow-", VERSION, "/cpp") - } - } else if (p != 9000) { - # This is a custom dev version (x.y.z.9999) or a nightly (x.y.z.20210505) - # (Don't try to download on the default dev .9000 version) - if (nightly_download(VERSION, tf1)) { - unzip(tf1, exdir = src_dir) - unlink(tf1) - src_dir <- paste0(src_dir, "/cpp") - } - } - - if (dir.exists(src_dir)) { - cat("*** Successfully retrieved C++ source\n") - options(.arrow.cleanup = c(getOption(".arrow.cleanup"), src_dir)) - # These scripts need to be executable - system( - sprintf("chmod 755 %s/build-support/*.sh", src_dir), - ignore.stdout = quietly, ignore.stderr = quietly - ) - return(src_dir) - } else { - return(NULL) - } -} - -nightly_download <- function(version, destfile) { - source_url <- paste0(arrow_repo, "src/arrow-", version, ".zip") - try_download(source_url, destfile) -} - -apache_download <- function(version, destfile, n_mirrors = 3) { - apache_path <- paste0("arrow/arrow-", version, "/apache-arrow-", version, ".tar.gz") - apache_urls <- c( - # This returns a different mirror each time - rep("https://www.apache.org/dyn/closer.lua?action=download&filename=", n_mirrors), - "https://downloads.apache.org/" # The backup +find_local_source <- function() { + # We'll take the first of these that exists + # The first case probably occurs if we're in the arrow git repo + # The second probably occurs if we're installing the arrow R package + cpp_dir_options <- c( + file.path(Sys.getenv("ARROW_SOURCE_HOME", ".."), "cpp"), + "tools/cpp" ) - downloaded <- FALSE - for (u in apache_urls) { - downloaded <- try_download(paste0(u, apache_path), destfile) - if (downloaded) { - break + for (cpp_dir in cpp_dir_options) { + if (file.exists(file.path(cpp_dir, "src/arrow/api.h"))) { + cat(paste0("*** Found local C++ source: '", cpp_dir, "'\n")) + return(cpp_dir) } } - downloaded + NULL } -find_local_source <- function(arrow_home = Sys.getenv("ARROW_SOURCE_HOME", "..")) { - if (file.exists(paste0(arrow_home, "/cpp/src/arrow/api.h"))) { - # We're in a git checkout of arrow, so 
we can build it - cat("*** Found local C++ source\n") - return(paste0(arrow_home, "/cpp")) - } else { - return(NULL) +env_vars_as_string <- function(env_var_list) { + # Do some basic checks on env_var_list: + # Check that env_var_list has names, that those names are valid POSIX + # environment variables, and that none of the values contain `'`. + stopifnot( + length(env_var_list) == length(names(env_var_list)), + all(grepl("^[^0-9]", names(env_var_list))), + all(grepl("^[A-Z0-9_]+$", names(env_var_list))), + !any(grepl("'", env_var_list, fixed = TRUE)) + ) + env_var_string <- paste0(names(env_var_list), "='", env_var_list, "'", collapse = " ") + if (nchar(env_var_string) > 30000) { + # This could happen if the full paths in *_SOURCE_URL were *very* long. + # A more formal check would look at getconf ARG_MAX, but this shouldn't matter + cat("*** Warning: Environment variables are very long. This could cause issues on some shells.\n") } + env_var_string } build_libarrow <- function(src_dir, dst_dir) { @@ -320,25 +301,42 @@ build_libarrow <- function(src_dir, dst_dir) { BUILD_DIR = build_dir, DEST_DIR = dst_dir, CMAKE = cmake, + # EXTRA_CMAKE_FLAGS will often be "", but it's convenient later to have it defined + EXTRA_CMAKE_FLAGS = Sys.getenv("EXTRA_CMAKE_FLAGS"), # Make sure we build with the same compiler settings that R is using CC = R_CMD_config("CC"), CXX = paste(R_CMD_config("CXX11"), R_CMD_config("CXX11STD")), # CXXFLAGS = R_CMD_config("CXX11FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS") ) - env_vars <- paste0(names(env_var_list), '="', env_var_list, '"', collapse = " ") - env_vars <- with_s3_support(env_vars) - env_vars <- with_mimalloc(env_vars) - if (tolower(Sys.info()[["sysname"]]) %in% "sunos") { - # jemalloc doesn't seem to build on Solaris - # nor does thrift, so turn off parquet, - # and arrowExports.cpp requires parquet for dataset (ARROW-11994), so turn that off - # xsimd doesn't compile, so set SIMD level to NONE to skip it - # re2 and utf8proc do compile, - # but `ar` fails to build libarrow_bundled_dependencies, so turn them off - # so that there are no bundled deps - env_vars <- paste(env_vars, "ARROW_JEMALLOC=OFF ARROW_PARQUET=OFF ARROW_DATASET=OFF ARROW_WITH_RE2=OFF ARROW_WITH_UTF8PROC=OFF EXTRA_CMAKE_FLAGS=-DARROW_SIMD_LEVEL=NONE") + env_var_list <- with_s3_support(env_var_list) + env_var_list <- with_mimalloc(env_var_list) + # turn_off_thirdparty_features() needs to happen after with_mimalloc() and + # with_s3_support(), since those might turn features ON. + thirdparty_deps_unavailable <- !download_ok && + !dir.exists(thirdparty_dependency_dir) && + !env_is("ARROW_DEPENDENCY_SOURCE", "system") + if (is_solaris()) { + # Note that JSON support does work on Solaris, but will be turned off with + # the rest of the thirdparty dependencies. + # All other dependencies don't compile (e.g thrift, jemalloc, and xsimd) + # or do compile but `ar` fails to build + # libarrow_bundled_dependencies (e.g. re2 and utf8proc). 
+ env_var_list <- turn_off_thirdparty_features(env_var_list) + } else if (thirdparty_deps_unavailable) { + cat(paste0( + "*** Building C++ library from source, but downloading thirdparty dependencies\n", + " is not possible, so this build will turn off all thirdparty features.\n", + " See install vignette for details:\n", + " https://cran.r-project.org/web/packages/arrow/vignettes/install.html\n" + )) + env_var_list <- turn_off_thirdparty_features(env_var_list) + } else if (dir.exists(thirdparty_dependency_dir)){ + # Add the *_SOURCE_URL env vars + env_var_list <- set_thirdparty_urls(env_var_list) } + env_vars <- env_vars_as_string(env_var_list) + cat("**** arrow", ifelse(quietly, "", paste("with", env_vars)), "\n") status <- suppressWarnings(system( paste(env_vars, "inst/build_arrow_static.sh"), @@ -346,7 +344,11 @@ build_libarrow <- function(src_dir, dst_dir) { )) if (status != 0) { # It failed :( - cat("**** Error building Arrow C++. Re-run with ARROW_R_DEV=true for debug information.\n") + cat( + "**** Error building Arrow C++.", + ifelse(env_is("ARROW_R_DEV", "true"), "", "Re-run with ARROW_R_DEV=true for debug information."), + "\n" + ) } invisible(status) } @@ -373,7 +375,15 @@ ensure_cmake <- function() { ) cmake_tar <- tempfile() cmake_dir <- tempfile() - try_download(cmake_binary_url, cmake_tar) + download_successful <- try_download(cmake_binary_url, cmake_tar) + if (!download_successful) { + cat(paste0( + "*** cmake was not found locally and download failed.\n", + " Make sure cmake >= 3.10 is installed and available on your PATH,\n", + " or download ", cmake_binary_url, "\n", + " and define the CMAKE environment variable.\n" + )) + } untar(cmake_tar, exdir = cmake_dir) unlink(cmake_tar) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), cmake_dir)) @@ -413,53 +423,121 @@ cmake_version <- function(cmd = "cmake") { ) } -with_s3_support <- function(env_vars) { - arrow_s3 <- toupper(Sys.getenv("ARROW_S3")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - # but if ARROW_S3=OFF explicitly, we are definitely off, so override - if (toupper(Sys.getenv("ARROW_S3")) == "OFF") { - arrow_s3 <- FALSE +turn_off_thirdparty_features <- function(env_var_list) { + # Because these are done as environment variables (as opposed to build flags), + # setting these to "OFF" overrides any previous setting. We don't need to + # check the existing value. + turn_off <- c( + "ARROW_MIMALLOC" = "OFF", + "ARROW_JEMALLOC" = "OFF", + "ARROW_PARQUET" = "OFF", # depends on thrift + "ARROW_DATASET" = "OFF", # depends on parquet + "ARROW_S3" = "OFF", + "ARROW_WITH_BROTLI" = "OFF", + "ARROW_WITH_BZ2" = "OFF", + "ARROW_WITH_LZ4" = "OFF", + "ARROW_WITH_SNAPPY" = "OFF", + "ARROW_WITH_ZLIB" = "OFF", + "ARROW_WITH_ZSTD" = "OFF", + "ARROW_WITH_RE2" = "OFF", + "ARROW_WITH_UTF8PROC" = "OFF", + "ARROW_JSON" = "OFF", + # The syntax to turn off XSIMD is different. + # Pull existing value of EXTRA_CMAKE_FLAGS first (must be defined) + "EXTRA_CMAKE_FLAGS" = paste( + env_var_list[["EXTRA_CMAKE_FLAGS"]], + "-DARROW_SIMD_LEVEL=NONE -DARROW_RUNTIME_SIMD_LEVEL=NONE" + ) + ) + # Create a new env_var_list, with the values of turn_off set. + # replace() also adds new values if they didn't exist before + replace(env_var_list, names(turn_off), turn_off) +} + +set_thirdparty_urls <- function(env_var_list) { + # This function does *not* check if existing *_SOURCE_URL variables are set. + # The directory tools/thirdparty_dependencies is created by + # create_package_with_all_dependencies() and saved in the tar file. 
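+  # For example (the file names here are illustrative): a vendored
+  # "snappy-1.1.8.tar.gz" is mapped to ARROW_SNAPPY_URL, and
+  # "aws-sdk-cpp-1.8.133.tar.gz" to ARROW_AWSSDK_URL via the special
+  # handling below.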
+ files <- list.files(thirdparty_dependency_dir, full.names = FALSE) + url_env_varname <- toupper(sub("(.*?)-.*", "ARROW_\\1_URL", files)) + # Special handling for the aws dependencies, which have extra `-` + aws <- grepl("^aws", files) + url_env_varname[aws] <- sub( + "AWS_SDK_CPP", "AWSSDK", + gsub( + "-", "_", + sub( + "(AWS.*)-.*", "ARROW_\\1_URL", + toupper(files[aws]) + ) + ) + ) + full_filenames <- file.path(normalizePath(thirdparty_dependency_dir), files) + + env_var_list <- replace(env_var_list, url_env_varname, full_filenames) + if (!quietly) { + env_var_list <- replace(env_var_list, "ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") + } + env_var_list +} + +is_feature_requested <- function(env_varname, default = env_is("LIBARROW_MINIMAL", "false")) { + env_value <- tolower(Sys.getenv(env_varname)) + if (identical(env_value, "off")) { + # If e.g. ARROW_MIMALLOC=OFF explicitly, override default + requested <- FALSE + } else if (identical(env_value, "on")) { + requested <- TRUE + } else { + requested <- default + } + requested +} + +with_mimalloc <- function(env_var_list) { + arrow_mimalloc <- is_feature_requested("ARROW_MIMALLOC") + if (arrow_mimalloc) { + # User wants mimalloc. If they're using gcc, let's make sure the version is >= 4.9 + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { + cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") + arrow_mimalloc <- FALSE + } } + replace(env_var_list, "ARROW_MIMALLOC", ifelse(arrow_mimalloc, "ON", "OFF")) +} + +with_s3_support <- function(env_var_list) { + arrow_s3 <- is_feature_requested("ARROW_S3") if (arrow_s3) { # User wants S3 support. If they're using gcc, let's make sure the version is >= 4.9 # and make sure that we have curl and openssl system libs - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { + if (isTRUE(cmake_gcc_version(env_var_list) < "4.9")) { cat("**** S3 support not available for gcc < 4.9; building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("CURL", NULL, env_vars)) { + } else if (!cmake_find_package("CURL", NULL, env_var_list)) { # curl on macos should be installed, so no need to alter this for macos cat("**** S3 support requires libcurl-devel (rpm) or libcurl4-openssl-dev (deb); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE - } else if (!cmake_find_package("OpenSSL", "1.0.2", env_vars)) { + } else if (!cmake_find_package("OpenSSL", "1.0.2", env_var_list)) { cat("**** S3 support requires version >= 1.0.2 of openssl-devel (rpm), libssl-dev (deb), or openssl (brew); building with ARROW_S3=OFF\n") arrow_s3 <- FALSE } } - paste(env_vars, ifelse(arrow_s3, "ARROW_S3=ON", "ARROW_S3=OFF")) -} - -with_mimalloc <- function(env_vars) { - arrow_mimalloc <- toupper(Sys.getenv("ARROW_MIMALLOC")) == "ON" || tolower(Sys.getenv("LIBARROW_MINIMAL")) == "false" - if (arrow_mimalloc) { - # User wants mimalloc. 
If they're using gcc, let's make sure the version is >= 4.9 - if (isTRUE(cmake_gcc_version(env_vars) < "4.9")) { - cat("**** mimalloc support not available for gcc < 4.9; building with ARROW_MIMALLOC=OFF\n") - arrow_mimalloc <- FALSE - } - } - paste(env_vars, ifelse(arrow_mimalloc, "ARROW_MIMALLOC=ON", "ARROW_MIMALLOC=OFF")) + replace(env_var_list, "ARROW_S3", ifelse(arrow_s3, "ON", "OFF")) } -cmake_gcc_version <- function(env_vars) { +cmake_gcc_version <- function(env_var_list) { # This function returns NA if using a non-gcc compiler # Always enclose calls to it in isTRUE() or isFALSE() - vals <- cmake_cxx_compiler_vars(env_vars) + vals <- cmake_cxx_compiler_vars(env_var_list) if (!identical(vals[["CMAKE_CXX_COMPILER_ID"]], "GNU")) { return(NA) } package_version(vals[["CMAKE_CXX_COMPILER_VERSION"]]) } -cmake_cxx_compiler_vars <- function(env_vars) { +cmake_cxx_compiler_vars <- function(env_var_list) { + env_vars <- env_vars_as_string(env_var_list) info <- system(paste("export", env_vars, "&& $CMAKE --system-information"), intern = TRUE) info <- grep("^[A-Z_]* .*$", info, value = TRUE) vals <- as.list(sub('^.*? "?(.*?)"?$', "\\1", info)) @@ -467,12 +545,13 @@ cmake_cxx_compiler_vars <- function(env_vars) { vals[grepl("^CMAKE_CXX_COMPILER_?", names(vals))] } -cmake_find_package <- function(pkg, version = NULL, env_vars) { +cmake_find_package <- function(pkg, version = NULL, env_var_list) { td <- tempfile() dir.create(td) options(.arrow.cleanup = c(getOption(".arrow.cleanup"), td)) find_package <- paste0("find_package(", pkg, " ", version, " REQUIRED)") writeLines(find_package, file.path(td, "CMakeLists.txt")) + env_vars <- env_vars_as_string(env_var_list) cmake_cmd <- paste0( "export ", env_vars, " && cd ", td, @@ -501,12 +580,7 @@ if (!file.exists(paste0(dst_dir, "/include/arrow/api.h"))) { unlink(bin_file) } else if (build_ok) { # (2) Find source and build it - if (download_ok) { - src_dir <- download_source() - } - if (is.null(src_dir)) { - src_dir <- find_local_source() - } + src_dir <- find_local_source() if (!is.null(src_dir)) { cat("*** Building C++ libraries\n") build_libarrow(src_dir, dst_dir) diff --git a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd index 59c231724aa..d1d7998de32 100644 --- a/r/vignettes/developing.Rmd +++ b/r/vignettes/developing.Rmd @@ -50,13 +50,13 @@ This document is a work in progress and will grow and change as the Apache Arrow We welcome any feedback you have about things that are confusing or additions you would like to see here - please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) if you have any suggestions or requests. -# Developer environment setup +# Developer environment setup ## R-only {.tabset} Windows and macOS users who wish to contribute to the R package and don't need to alter libarrow (Arrow's C++ library) may be able to obtain a -recent version of the library without building from source. +recent version of the library without building from source. ### Linux @@ -71,7 +71,7 @@ nightly$ls("libarrow/bin") ``` Version numbers in that repository correspond to dates. -You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it. +You'll need to create a `libarrow` directory inside the R package directory and unzip the zip file containing the compiled libarrow binary files into it. 
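+
+As a concrete sketch of those two steps (the zip file name here is
+hypothetical; use whichever dated version you downloaded from the nightly
+repository):
+
+```shell
+# Run from the r/ subdirectory of your apache/arrow checkout
+mkdir libarrow
+unzip arrow-6.0.0.20210907.zip -d libarrow
+```
+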
### macOS On macOS, you can install libarrow using [Homebrew](https://brew.sh/): @@ -95,7 +95,7 @@ nightly$ls("libarrow/bin") ``` Version numbers in that repository correspond to dates. -You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package. +You can set the `RWINLIB_LOCAL` environment variable to point to the zip file containing libarrow before installing the arrow R package. ## R and C++ @@ -258,7 +258,7 @@ cmake \ .. ```
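+
+Once that build has been installed (for example via `make install` or
+`ninja install`), a minimal sketch of pointing the R package build at it;
+the `$HOME/arrow-dist` path is illustrative (substitute wherever your
+`CMAKE_INSTALL_PREFIX` points):
+
+```shell
+export ARROW_HOME=$HOME/arrow-dist
+export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH
+```
+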

    - + ## Installing a version of the R package with a specific git reference @@ -283,13 +283,13 @@ remotes::install_github("apache/arrow/r@bugfix", build = FALSE) Developers may wish to use this method of installing a specific commit separate from another Arrow development environment or system installation (e.g. we use this in [arrowbench](https://github.com/ursacomputing/arrowbench) -to install development versions of libarrow isolated from the system install). If -you already have libarrow installed system-wide, you may need to set +to install development versions of libarrow isolated from the system install). If +you already have libarrow installed system-wide, you may need to set some additional variables in order to isolate this build from your system libraries: * Setting the environment variable `FORCE_BUNDLED_BUILD` to `true` will skip the `pkg-config` search for libarrow and attempt to build from the same source at the repository+ref given. -* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: +* You may also need to set the Makevars `CPPFLAGS` and `LDFLAGS` to `""` in order to prevent the installation process from attempting to link to already installed system versions of libarrow. One way to do this temporarily is wrapping your `remotes::install_github()` call like so: ```{r} withr::with_makevars(list(CPPFLAGS = "", LDFLAGS = ""), remotes::install_github(...)) ``` @@ -304,7 +304,7 @@ You can load the R package via `devtools::load_all()`. ## Rebuilding the documentation -The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. +The R documentation uses the [`@examplesIf`](https://roxygen2.r-lib.org/articles/rd.html#functions) tag introduced in `roxygen2` version 7.1.1.9001, which hasn't yet been released on CRAN at the time of writing. If you are making changes which require updating the documentation, please install the development version of `roxygen2` from GitHub. ```{r} remotes::install_github("r-lib/roxygen2") @@ -326,7 +326,7 @@ pkgdown::build_site(preview=TRUE) The R code in the package follows [the tidyverse style](https://style.tidyverse.org/). On PR submission (and on pushes) our CI will run linting and will flag possible errors on the pull request with annotations. -To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run +To run the [lintr](https://github.com/jimhester/lintr) locally, install the lintr package (note, we currently use a fork that includes fixes not yet accepted upstream, see how lintr is being installed in the file `ci/docker/linux-apt-lint.dockerfile` for the current status) and then run ```{r} lintr::lint_package("arrow/r") @@ -360,12 +360,12 @@ C++ code in `src/`. This is because there are some features that are only enable and built conditionally during build time. 
 If you change C++ code in the R package, you will need to set the `ARROW_R_DEV` environment variable to `true` (optionally, add it to your `~/.Renviron` file to persist across sessions) so
-that the `data-raw/codegen.R` file is used for code generation. The `Makefile` 
+that the `data-raw/codegen.R` file is used for code generation. The `Makefile`
 commands also handle this automatically.
 
 We use Google C++ style in our C++ code. The easiest way to accomplish this is
-use an editors/IDE that formats your code for you. Many popular editors/IDEs 
-have support for running `clang-format` on C++ files when you save them. 
+to use an editor/IDE that formats your code for you. Many popular editors/IDEs
+have support for running `clang-format` on C++ files when you save them.
 Installing/enabling the appropriate plugin may save you much frustration.
 
 Check for style errors with
@@ -392,7 +392,7 @@ On macOS, you can get this by installing LLVM via Homebrew and running the scrip
 CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh
 ```
 
-_Note_ that the lint script requires Python 3 and the Python dependencies 
+_Note_ that the lint script requires Python 3 and the Python dependencies
 (note that `cmake_format` is pinned to a specific version):
 
 * autopep8
@@ -419,16 +419,16 @@ variables or other settings:
 * All tests are skipped on Linux if the package builds without the C++ libarrow.
   To make the build fail if libarrow is not available (as in, to test that the
   C++ build was successful), set `TEST_R_WITH_ARROW=true`
-  
+
 * Some tests are disabled unless `ARROW_R_DEV=true`
-  
+
 * Tests that require allocating >2GB of memory to test Large types are disabled
   unless `ARROW_LARGE_MEMORY_TESTS=true`
-  
+
 * Integration tests against a real S3 bucket are disabled unless credentials
   are set in `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`; these are
   available on request
-  
+
 * S3 tests using [MinIO](https://min.io/) locally are enabled if the
   `minio server` process is found running. If you're running MinIO with custom
   settings, you can set `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and
@@ -436,7 +436,7 @@ variables or other settings:
 
 ## Running checks
 
-You can run package checks by using `devtools::check()` and check test coverage 
+You can run package checks by using `devtools::check()` and check test coverage
 with `covr::package_coverage()`.
 
 ```r
@@ -458,10 +458,10 @@ R CMD check arrow_*.tar.gz --as-cran
 
 ## Running additional CI checks
 
-On a pull request, there are some actions you can trigger by commenting on the 
+On a pull request, there are some actions you can trigger by commenting on the
 PR. We have additional CI checks that run nightly and can be requested on demand
-using an internal tool called 
-[crossbow](https://arrow.apache.org/docs/developers/crossbow.html). 
+using an internal tool called
+[crossbow](https://arrow.apache.org/docs/developers/crossbow.html).
 A few important GitHub comment commands are shown below.
 
 #### Run all extended R CI tasks
 ```
 @github-actions crossbow submit -g r
 ```
 
 This runs each of the R-related CI tasks.
 
 ```
 @github-actions crossbow submit {task-name}
 ```
 
-See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml) 
+See the `r:` group definition near the beginning of the [crossbow configuration](https://github.com/apache/arrow/blob/master/dev/tasks/tasks.yml)
 for a list of glob expression patterns that match names of items in the `tasks:`
 list below it.
 
@@ -486,10 +486,24 @@ list below it.
 @github-actions autotune
 ```
 
-This will run and fix lint C++ linting errors, run R documentation (among other 
-cleanup tasks), run styler on any changed R code, and commit the resulting 
+This will run and fix C++ linting errors, run R documentation (among other
+cleanup tasks), run styler on any changed R code, and commit the resulting
 updates to the branch.
 
+# Summary of environment variables
+
+* See the user-facing [Install vignette](install.html) for a large number of
+  environment variables that determine how the build works and what features
+  get built.
+* `TEST_OFFLINE_BUILD`: When set to `true`, the build script will not download
+  the prebuilt C++ library binary.
+  It will turn off any features that require a download, unless they're available
+  in the `tools/cpp/thirdparty/download/` subfolder of the tar.gz file.
+  `create_package_with_all_dependencies()` creates that subfolder.
+  Regardless of this flag's value, `cmake` will be downloaded if it's unavailable.
+* `TEST_R_WITHOUT_LIBARROW`: When set to `true`, skip tests that would require
+  the C++ Arrow library (that is, almost everything).
+
 # Troubleshooting
 
 Note that after any change to libarrow, you must reinstall it and
@@ -519,8 +533,8 @@ To resolve this, try [rebuilding the Arrow library](#step-3-building-arrow).
 
 ## Multiple versions of libarrow
 
-If you are installing from a user-level directory, and you already have a 
-previous installation of libarrow in a system directory, you get you may get 
+If you are installing from a user-level directory, and you already have a
+previous installation of libarrow in a system directory, you may get
 errors like the following when you install the R package:
 
 ```
@@ -531,7 +545,7 @@ Error: package or namespace load failed for ‘arrow’ in dyn.load(file, DLLpath
 Reason: image not found
 ```
 
-If this happens, you need to make sure that you don't let R link to your system 
+If this happens, you need to make sure that you don't let R link to your system
 library when building arrow. You can do this a number of different ways:
 
 * Setting the `MAKEFLAGS` environment variable to `"LDFLAGS="` (see below for an example); this is the recommended way to accomplish this
@@ -588,4 +602,5 @@ guide](https://arrow.apache.org/docs/developers/cpp/building.html).
 
 ## Other installation issues
 
-There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information.
\ No newline at end of file
+There are a number of scripts that are triggered when the arrow R package is installed. For package users who are not interacting with the underlying code, these should all just work without configuration and pull in the most complete pieces (e.g. official binaries that we host). However, knowing about these scripts can help package developers troubleshoot if things go wrong in them or things go wrong in an install. See [the installation vignette](./install.html#how-dependencies-are-resolved) for more information.
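+
+For example, a hypothetical run that exercises the two test flags from the
+environment-variable summary above (the tarball name and paths are
+illustrative, not real defaults):
+
+```shell
+# Build/install without downloading the prebuilt C++ binary
+TEST_OFFLINE_BUILD=true R CMD INSTALL arrow_5.0.0.tar.gz
+# Then, from the r/ directory of a checkout, run the test suite while
+# skipping everything that needs libarrow
+TEST_R_WITHOUT_LIBARROW=true Rscript -e 'testthat::test_local(".")'
+```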

diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd
index 013e63a113e..66f3e8e2e6e 100644
--- a/r/vignettes/install.Rmd
+++ b/r/vignettes/install.Rmd
@@ -102,6 +102,50 @@ satisfy C++ dependencies.
 
 > Note that, unlike packages like `tensorflow`, `blogdown`, and others that require external dependencies, you do not need to run `install_arrow()` after a successful `arrow` installation.
 
+## Offline installation
+
+The `install-arrow.R` file also includes the `create_package_with_all_dependencies()`
+function. Normally, when installing on a computer with internet access, the
+build process will download third-party dependencies as needed.
+This function provides a way to download them in advance.
+Doing so may be useful when installing Arrow on a computer without internet access.
+Note that Arrow _can_ be installed on a computer without internet access without doing this, but
+many useful features will be disabled, as they depend on third-party components.
+More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every
+capability.
+One approach to add more capabilities in an offline install is to prepare a
+package with pre-downloaded dependencies. The
+`create_package_with_all_dependencies()` function does this preparation.
+
+If you're using binary packages you shouldn't need to follow these steps. You
+should download the appropriate binary from your package repository, transfer
+that to the offline computer, and install that. Any OS can create the source
+bundle, but it cannot be installed on Windows. (Instead, use a standard
+Windows binary package.)
+
+Note if you're using RStudio Package Manager on Linux: If you still want to
+make a source bundle with this function, make sure to set the first repo in
+`options("repos")` to be a mirror that contains source packages (that is:
+something other than the RSPM binary mirror URLs).
+
+### Using a computer with internet access, pre-download the dependencies:
+* Install the `arrow` package _or_ run
+  `source("https://raw.githubusercontent.com/apache/arrow/master/r/R/install-arrow.R")`
+* Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")`
+* Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access
+
+### On the computer without internet access, install the prepared package:
+* Install the `arrow` package from the copied file
+  * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))`
+  * This installation will build from source, so `cmake` must be available
+* Run `arrow_info()` to check installed capabilities
+
+#### Alternative, hands-on approach
+* Download the dependency files (`cpp/thirdparty/download_dependencies.sh` may be helpful)
+* Copy the directory of dependencies to the offline computer
+* Create the environment variable `ARROW_THIRDPARTY_DEPENDENCY_DIR` on the offline computer, pointing to the copied directory.
+* Install the `arrow` package as usual.
+
 ## S3 support
 
 The `arrow` package allows you to work with data in AWS S3 or in other cloud
@@ -156,10 +200,10 @@ If found, they will be downloaded and bundled when your R package compiles.
 
 For a list of supported distributions and versions, see the [arrow-r-nightly](https://github.com/ursa-labs/arrow-r-nightly/blob/master/README.md) project.
 
-If no binary is found, it will download the Arrow C++ source that matches the R package version
-(CRAN release or nightly build) and attempt to build it locally.
-If no matching source bundle is found, it will also look to see if you are in +If no C++ library binary is found, it will attempt to build it locally. +First, it will also look to see if you are in a checkout of the `apache/arrow` git repository and thus have the C++ source there. +Otherwise, it builds from the C++ files included in the package. Depending on your system, building Arrow C++ from source may be slow. For the specific mechanics of how all this works, see the R package `configure` script, @@ -329,11 +373,15 @@ Some features are optional when you build Arrow from source. With the exception * `ARROW_S3`: If set to `ON` S3 support will be built as long as the dependencies are met; if they are not met, the build script will turn this `OFF` * `ARROW_JEMALLOC` for the `jemalloc` memory allocator +* `ARROW_MIMALLOC` for the `mimalloc` memmory allocator * `ARROW_PARQUET` * `ARROW_DATASET` * `ARROW_JSON` for the JSON parsing library * `ARROW_WITH_RE2` for the RE2 regular expression library, used in some string compute functions * `ARROW_WITH_UTF8PROC` for the UTF8Proc string library, used in many other string compute functions +* `ARROW_JSON` for JSON parsing +* `ARROW_WITH_BROTLI`, `ARROW_WITH_BZ2`, `ARROW_WITH_LZ4`, `ARROW_WITH_SNAPPY`, `ARROW_WITH_ZLIB`, and `ARROW_WITH_ZSTD` for various compression algorithms + There are a number of other variables that affect the `configure` script and the bundled build script. By default, these are all unset. All boolean variables are case-insensitive. @@ -342,10 +390,6 @@ By default, these are all unset. All boolean variables are case-insensitive. won't look for Arrow libraries on your system and instead will look to download/build them. Use this if you have a version mismatch between installed system libraries and the version of the R package you're installing. -* `LIBARROW_DOWNLOAD`: Unless set to `false`, the build script - will attempt to download C++ binary or source bundles. - If you're in a checkout of the `apache/arrow` git repository - and want to build the C++ library from the local source, make this `false`. * `LIBARROW_BINARY`: If set to `true`, the script will try to download a binary C++ library built for your operating system. 
  You may also set it to some other string,

From 080a86b2150ae58c4a160bffd5ebb6363eae33fd Mon Sep 17 00:00:00 2001
From: William Malpica <16705032+wmalpica@users.noreply.github.com>
Date: Tue, 7 Sep 2021 09:59:08 -0500
Subject: [PATCH 40/93] Implemented review feedback and added more unit tests

---
 cpp/src/arrow/util/value_parsing.h       | 130 +++++++----------------
 cpp/src/arrow/util/value_parsing_test.cc |   6 ++
 2 files changed, 42 insertions(+), 94 deletions(-)

diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h
index 435081047f8..61801d43027 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -273,96 +273,36 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
 #undef PARSE_UNSIGNED_ITERATION
 #undef PARSE_UNSIGNED_ITERATION_LAST
 
-#define PARSE_HEX_ITERATION(C_TYPE) \
-  if (length > 0) { \
-    char val = *s; \
-    s++; \
-    result = static_cast<C_TYPE>(result << 4); \
-    length--; \
-    if (val >= '0' && val <= '9'){ \
-      result = static_cast<C_TYPE>(result | (val -'0')); \
-    } else if (val >= 'A' && val <= 'F'){ \
-      result = static_cast<C_TYPE>(result | (val -'A' + 10)); \
-    } else if (val >= 'a' && val <= 'f'){ \
-      result = static_cast<C_TYPE>(result | (val -'a' + 10)); \
-    } else { \
-      /* Non-digit */ \
-      return false; \
-    } \
-  } else { \
-    break; \
-  }
-
-
-inline bool ParseHex(const char* s, size_t length, uint8_t* out) {
-  uint8_t result = 0;
-
-  do {
-    PARSE_HEX_ITERATION(uint8_t);
-    PARSE_HEX_ITERATION(uint8_t);
-  } while (false);
-  *out = result;
-  return true;
-}
-
-inline bool ParseHex(const char* s, size_t length, uint16_t* out) {
-  uint16_t result = 0;
-  do {
-    PARSE_HEX_ITERATION(uint16_t);
-    PARSE_HEX_ITERATION(uint16_t);
-    PARSE_HEX_ITERATION(uint16_t);
-    PARSE_HEX_ITERATION(uint16_t);
-  } while (false);
-  *out = result;
-  return true;
-}
-inline bool ParseHex(const char* s, size_t length, uint32_t* out) {
-  uint32_t result = 0;
-  do {
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-    PARSE_HEX_ITERATION(uint32_t);
-  } while (false);
-  *out = result;
-  return true;
-}
-inline bool ParseHex(const char* s, size_t length, uint64_t* out) {
-  uint64_t result = 0;
-  do {
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-    PARSE_HEX_ITERATION(uint64_t);
-  } while (false);
+template <typename T>
+bool ParseHex(const char* s, size_t length, T* out) {
+  T result = 0;
+  int num_iterations = static_cast<int>(sizeof(T) * 2);
+  for (int i = 0; i < num_iterations; i++) {
+    if (length > 0) {
+      char val = *s;
+      s++;
+      result = static_cast<T>(result << 4);
+      length--;
+      if (val >= '0' && val <= '9') {
+        result = static_cast<T>(result | (val - '0'));
+      } else if (val >= 'A' && val <= 'F') {
+        result = static_cast<T>(result | (val - 'A' + 10));
+      } else if (val >= 'a' && val <= 'f') {
+        result = static_cast<T>(result | (val - 'a' + 10));
+      } else {
+        /* Non-digit */
+        return false;
+      }
+    } else {
+      break;
+    }
+  }
   *out = result;
   return true;
 }
 
-#undef 
PARSE_HEX_ITERATION - template struct StringToUnsignedIntConverterMixin { using value_type = typename ARROW_TYPE::c_type; @@ -372,7 +312,7 @@ struct StringToUnsignedIntConverterMixin { return false; } // If its starts with 0x then its hex - if (*s == '0' && *(s + 1) == 'x'){ + if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ length -= 2; s += 2; // lets make sure that the length of the string is not too big @@ -432,16 +372,9 @@ struct StringToSignedIntConverterMixin { if (ARROW_PREDICT_FALSE(length == 0)) { return false; } - if (*s == '-') { - negative = true; - s++; - if (--length == 0) { - return false; - } - } - + // If its starts with 0x then its hex - if (*s == '0' && *(s + 1) == 'x'){ + if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ length -= 2; s += 2; // lets make sure that the length of the string is not too big @@ -454,6 +387,15 @@ struct StringToSignedIntConverterMixin { *out = static_cast(unsigned_value); return true; } + + if (*s == '-') { + negative = true; + s++; + if (--length == 0) { + return false; + } + } + // Skip leading zeros while (length > 0 && *s == '0') { length--; diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index edc5223f0cd..cb4fdd1e2b1 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -230,6 +230,7 @@ TEST(StringConversion, ToInt32) { AssertConversion("0x123abc", 1194684); AssertConversion("0xA4b35", 674613); AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", -1); AssertConversionFails("0x23512ak"); } @@ -258,6 +259,7 @@ TEST(StringConversion, ToUInt32) { AssertConversion("0x123abc", 1194684); AssertConversion("0xA4b35", 674613); AssertConversion("0x7FFFfFfF", 2147483647); + AssertConversion("0XFFFFfFfF", 4294967295); AssertConversionFails("0x23512ak"); } @@ -282,6 +284,8 @@ TEST(StringConversion, ToInt64) { AssertConversion("0x5415a123ABC123cb", 6058926048274359243); AssertConversion("0xA4B35", 674613); AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", -1152921504606846975); + AssertConversion("0xfFFFFFFFFFFFFFFf", -1); AssertConversionFails("0x12345678901234567"); AssertConversionFails("0x23512ak"); } @@ -304,6 +308,8 @@ TEST(StringConversion, ToUInt64) { AssertConversion("0x5415a123ABC123cb", 6058926048274359243); AssertConversion("0xA4B35", 674613); AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); + AssertConversion("0XF000000000000001", 17293822569102704641ULL); + AssertConversion("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL); AssertConversionFails("0x12345678901234567"); AssertConversionFails("0x23512ak"); } From f40856a768f2c397082da70af034080994587807 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Tue, 7 Sep 2021 10:12:34 -0500 Subject: [PATCH 41/93] ARROW-13925: [R] Remove system installation devdocs jobs Closes #11103 from jonkeane/ARROW-13925 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- dev/tasks/r/github.devdocs.yml | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml index e4d6bfb6953..5591e6587ae 100644 --- a/dev/tasks/r/github.devdocs.yml +++ b/dev/tasks/r/github.devdocs.yml @@ -15,36 +15,22 @@ # specific language governing permissions and limitations # under the License. -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! 
-name: Crossbow +{% import 'macros.jinja' as macros with context %} -on: - push: - branches: - - "*-github-*" +{{ macros.github_header() }} jobs: devdocs: - name: 'R devdocs {{ "${{ matrix.os }}" }} system install: {{ "${{ matrix.system-install }}" }}' + name: 'R devdocs {{ "${{ matrix.os }}" }}' runs-on: {{ "${{ matrix.os }}" }} strategy: fail-fast: false matrix: os: [macOS-latest, ubuntu-20.04] - # should the install method install libarrow into a system directory - # or a temporary directory. old is the same as a temporary - # directory, but an old version of libarrow will be installed - # into a system directory first (to make sure we can link correctly when building) - system-install: [true, false] steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive + {{ macros.github_checkout_arrow()|indent }} + - uses: r-lib/actions/setup-r@v1 - uses: r-lib/actions/setup-pandoc@v1 - name: Install knitr, rmarkdown @@ -64,8 +50,6 @@ jobs: RUN_DEVDOCS: TRUE DEVDOCS_MACOS: {{ "${{contains(matrix.os, 'macOS')}}" }} DEVDOCS_UBUNTU: {{ "${{contains(matrix.os, 'ubuntu')}}" }} - DEVDOCS_SYSTEM_INSTALL: {{ "${{contains(matrix.system-install, 'true')}}" }} - DEVDOCS_PRIOR_SYSTEM_INSTALL: {{ "${{contains(matrix.system-install, 'old')}}" }} run: | # This isn't actually rendering the docs, but will save arrow/r/vignettes/script.sh # which can be sourced to install arrow. From 85d8175ea24b4dd99f108a673e9b63996d4f88cc Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Wed, 8 Sep 2021 06:02:20 +0900 Subject: [PATCH 42/93] ARROW-13919: [GLib] Add GArrowFunctionDoc Closes #11099 from kou/glib-function-doc Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- c_glib/arrow-glib/composite-data-type.cpp | 4 +- c_glib/arrow-glib/compute.cpp | 174 ++++++++++++++++++++++ c_glib/arrow-glib/compute.h | 29 ++++ c_glib/arrow-glib/compute.hpp | 5 + c_glib/arrow-glib/reader.cpp | 36 +++-- c_glib/test/test-function-doc.rb | 43 ++++++ 6 files changed, 277 insertions(+), 14 deletions(-) create mode 100644 c_glib/test/test-function-doc.rb diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp index 95cd283c1b3..fadcafe6b40 100644 --- a/c_glib/arrow-glib/composite-data-type.cpp +++ b/c_glib/arrow-glib/composite-data-type.cpp @@ -376,7 +376,7 @@ garrow_map_data_type_new(GArrowDataType *key_type, * garrow_map_data_type_get_key_type: * @map_data_type: A #GArrowMapDataType. * - * Return: (transfer full): The key type of the map. + * Returns: (transfer full): The key type of the map. * * Since: 0.17.0 */ @@ -395,7 +395,7 @@ garrow_map_data_type_get_key_type(GArrowMapDataType *map_data_type) * garrow_map_data_type_get_item_type: * @map_data_type: A #GArrowMapDataType. * - * Return: (transfer full): The item type of the map. + * Returns: (transfer full): The item type of the map. * * Since: 0.17.0 */ diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 40d7002e7c2..2f4a0de215c 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -126,6 +126,8 @@ G_BEGIN_DECLS * #GArrowFunctionOptions is a base class for all function options * classes such as #GArrowCastOptions. * + * #GArrowFunctionDoc is a class for function document. + * * #GArrowFunction is a class to process data. 
* * #GArrowExecuteNodeOptions is a base class for all execute node @@ -260,6 +262,145 @@ garrow_function_options_class_init(GArrowFunctionOptionsClass *klass) } +typedef struct GArrowFunctionDocPrivate_ { + arrow::compute::FunctionDoc *doc; +} GArrowFunctionDocPrivate; + +enum { + PROP_DOC = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowFunctionDoc, + garrow_function_doc, + G_TYPE_OBJECT) + +#define GARROW_FUNCTION_DOC_GET_PRIVATE(object) \ + static_cast( \ + garrow_function_doc_get_instance_private( \ + GARROW_FUNCTION_DOC(object))) + +static void +garrow_function_doc_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_FUNCTION_DOC_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DOC: + priv->doc = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_function_doc_init(GArrowFunctionDoc *object) +{ +} + +static void +garrow_function_doc_class_init(GArrowFunctionDocClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->set_property = garrow_function_doc_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("doc", + "Doc", + "The raw arrow::compute::FunctionDoc *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DOC, spec); +} + +/** + * garrow_function_doc_get_summary: + * @doc: A #GArrowFunctionDoc. + * + * Returns: A one-line summary of the function, using a verb. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 6.0.0 + */ +gchar * +garrow_function_doc_get_summary(GArrowFunctionDoc *doc) +{ + auto arrow_doc = garrow_function_doc_get_raw(doc); + return g_strndup(arrow_doc->summary.data(), + arrow_doc->summary.size()); +} + +/** + * garrow_function_doc_get_description: + * @doc: A #GArrowFunctionDoc. + * + * Returns: A detailed description of the function, meant to follow + * the summary. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 6.0.0 + */ +gchar * +garrow_function_doc_get_description(GArrowFunctionDoc *doc) +{ + auto arrow_doc = garrow_function_doc_get_raw(doc); + return g_strndup(arrow_doc->description.data(), + arrow_doc->description.size()); +} + +/** + * garrow_function_doc_get_arg_names: + * @doc: A #GArrowFunctionDoc. + * + * Returns: (array zero-terminated=1) (element-type utf8) (transfer full): + * Symbolic names (identifiers) for the function arguments. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. + * + * Since: 6.0.0 + */ +gchar ** +garrow_function_doc_get_arg_names(GArrowFunctionDoc *doc) +{ + auto arrow_doc = garrow_function_doc_get_raw(doc); + const auto &arrow_arg_names = arrow_doc->arg_names; + auto n = arrow_arg_names.size(); + auto arg_names = g_new(gchar *, n + 1); + for (size_t i = 0; i < n; ++i) { + arg_names[i] = g_strndup(arrow_arg_names[i].data(), + arrow_arg_names[i].size()); + } + arg_names[n] = NULL; + return arg_names; +} + +/** + * garrow_function_doc_get_options_class_name: + * @doc: A #GArrowFunctionDoc. + * + * Returns: Name of the options class, if any. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 6.0.0 + */ +gchar * +garrow_function_doc_get_options_class_name(GArrowFunctionDoc *doc) +{ + auto arrow_doc = garrow_function_doc_get_raw(doc); + return g_strndup(arrow_doc->options_class.data(), + arrow_doc->options_class.size()); +} + + typedef struct GArrowFunctionPrivate_ { std::shared_ptr function; } GArrowFunctionPrivate; @@ -403,6 +544,22 @@ garrow_function_execute(GArrowFunction *function, } } +/** + * garrow_function_get_doc: + * @function: A #GArrowFunction. + * + * Returns: (transfer full): The function documentation. + * + * Since: 6.0.0 + */ +GArrowFunctionDoc * +garrow_function_get_doc(GArrowFunction *function) +{ + auto arrow_function = garrow_function_get_raw(function); + const auto &arrow_doc = arrow_function->doc(); + return garrow_function_doc_new_raw(&arrow_doc); +} + typedef struct GArrowExecuteNodeOptionsPrivate_ { arrow::compute::ExecNodeOptions *options; @@ -3922,6 +4079,23 @@ garrow_function_options_get_raw(GArrowFunctionOptions *options) return priv->options; } + +GArrowFunctionDoc * +garrow_function_doc_new_raw(const arrow::compute::FunctionDoc *arrow_doc) +{ + return GARROW_FUNCTION_DOC(g_object_new(GARROW_TYPE_FUNCTION_DOC, + "doc", arrow_doc, + NULL)); +} + +arrow::compute::FunctionDoc * +garrow_function_doc_get_raw(GArrowFunctionDoc *doc) +{ + auto priv = GARROW_FUNCTION_DOC_GET_PRIVATE(doc); + return priv->doc; +} + + GArrowFunction * garrow_function_new_raw(std::shared_ptr *arrow_function) { diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 0c7424c7765..2171d6abd9a 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -51,6 +51,31 @@ struct _GArrowFunctionOptionsClass }; +#define GARROW_TYPE_FUNCTION_DOC (garrow_function_doc_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFunctionDoc, + garrow_function_doc, + GARROW, + FUNCTION_DOC, + GObject) +struct _GArrowFunctionDocClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +gchar * +garrow_function_doc_get_summary(GArrowFunctionDoc *doc); +GARROW_AVAILABLE_IN_6_0 +gchar * +garrow_function_doc_get_description(GArrowFunctionDoc *doc); +GARROW_AVAILABLE_IN_6_0 +gchar ** +garrow_function_doc_get_arg_names(GArrowFunctionDoc *doc); +GARROW_AVAILABLE_IN_6_0 +gchar * +garrow_function_doc_get_options_class_name(GArrowFunctionDoc *doc); + + #define GARROW_TYPE_FUNCTION (garrow_function_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowFunction, garrow_function, @@ -73,6 +98,10 @@ GArrowDatum *garrow_function_execute(GArrowFunction *function, GArrowExecuteContext *context, GError **error); +GARROW_AVAILABLE_IN_6_0 +GArrowFunctionDoc * +garrow_function_get_doc(GArrowFunction *function); + #define GARROW_TYPE_EXECUTE_NODE_OPTIONS (garrow_execute_node_options_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowExecuteNodeOptions, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 4adea1847ac..88f55d5329c 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -30,6 +30,11 @@ garrow_execute_context_get_raw(GArrowExecuteContext *context); arrow::compute::FunctionOptions * garrow_function_options_get_raw(GArrowFunctionOptions *options); +GArrowFunctionDoc * +garrow_function_doc_new_raw(const arrow::compute::FunctionDoc *arrow_doc); +arrow::compute::FunctionDoc * +garrow_function_doc_get_raw(GArrowFunctionDoc *doc); + GArrowFunction * garrow_function_new_raw(std::shared_ptr *arrow_function); std::shared_ptr diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 3e6539feb5a..98038248050 
100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -1360,10 +1360,13 @@ garrow_csv_read_options_set_null_values(GArrowCSVReadOptions *options, * garrow_csv_read_options_get_null_values: * @options: A #GArrowCSVReadOptions. * - * Return: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): - * The values to be processed as null. It's a %NULL-terminated string array. + * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): + * The values to be processed as null. + * * If the number of values is zero, this returns %NULL. - * It must be freed with g_strfreev() when no longer needed. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. * * Since: 0.14.0 */ @@ -1425,10 +1428,13 @@ garrow_csv_read_options_set_true_values(GArrowCSVReadOptions *options, * garrow_csv_read_options_get_true_values: * @options: A #GArrowCSVReadOptions. * - * Return: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): - * The values to be processed as true. It's a %NULL-terminated string array. + * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): + * The values to be processed as true. + * * If the number of values is zero, this returns %NULL. - * It must be freed with g_strfreev() when no longer needed. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. * * Since: 0.14.0 */ @@ -1490,10 +1496,13 @@ garrow_csv_read_options_set_false_values(GArrowCSVReadOptions *options, * garrow_csv_read_options_get_false_values: * @options: A #GArrowCSVReadOptions. * - * Return: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): - * The values to be processed as false. It's a %NULL-terminated string array. + * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): + * The values to be processed as false. + * * If the number of values is zero, this returns %NULL. - * It must be freed with g_strfreev() when no longer needed. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. * * Since: 0.14.0 */ @@ -1556,10 +1565,13 @@ garrow_csv_read_options_set_column_names(GArrowCSVReadOptions *options, * garrow_csv_read_options_get_column_names: * @options: A #GArrowCSVReadOptions. * - * Return: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): - * The column names. It's a %NULL-terminated string array. + * Returns: (nullable) (array zero-terminated=1) (element-type utf8) (transfer full): + * The column names. + * * If the number of values is zero, this returns %NULL. - * It must be freed with g_strfreev() when no longer needed. + * + * It's a %NULL-terminated string array. It must be freed with + * g_strfreev() when no longer needed. * * Since: 0.15.0 */ diff --git a/c_glib/test/test-function-doc.rb b/c_glib/test/test-function-doc.rb new file mode 100644 index 00000000000..7e624a5ab7c --- /dev/null +++ b/c_glib/test/test-function-doc.rb @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFunctionDoc < Test::Unit::TestCase + def setup + @doc = Arrow::Function.find("or").doc + end + + def test_summary + assert_equal("Logical 'or' boolean values", + @doc.summary) + end + + def test_description + assert_equal(<<-DESCRIPTION.chomp, @doc.description) +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + DESCRIPTION + end + + def test_arg_names + assert_equal(["x", "y"], @doc.arg_names) + end + + def test_options_class_name + doc = Arrow::Function.find("cast").doc + assert_equal("CastOptions", doc.options_class_name) + end +end From e396d4fa64429d8b7aa1a72118b941ad8e9b06be Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 8 Sep 2021 14:28:13 +0800 Subject: [PATCH 43/93] ARROW-13872: [Java] ExtensionTypeVector does not work with RangeEqualsVisitor This adds the ExtensionTypeVector to the VectorVisitor interface and implements that method for the existing visitors. Now when visitor like RangeEqualsVisitor that uses another ExtensionTypeVector, they can both use the visitor on the underlying storage vector. Closes #11073 from BryanCutler/java-extension-compare Authored-by: Bryan Cutler Signed-off-by: liyafan82 --- .../arrow/vector/ExtensionTypeVector.java | 2 +- .../vector/compare/RangeEqualsVisitor.java | 13 +++++ .../vector/compare/TypeEqualsVisitor.java | 6 +++ .../arrow/vector/compare/VectorVisitor.java | 3 ++ .../arrow/vector/util/VectorAppender.java | 9 ++++ .../validate/ValidateVectorBufferVisitor.java | 7 +++ .../validate/ValidateVectorDataVisitor.java | 7 +++ .../validate/ValidateVectorTypeVisitor.java | 18 +++++++ .../validate/ValidateVectorVisitor.java | 9 +++- .../vector/types/pojo/TestExtensionType.java | 48 +++++++++++++++++++ 10 files changed, 120 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java index 516077d8328..37222507e4c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ExtensionTypeVector.java @@ -261,6 +261,6 @@ public BufferAllocator getAllocator() { @Override public OUT accept(VectorVisitor visitor, IN value) { - return getUnderlyingVector().accept(visitor, value); + return visitor.visit(this, value); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 6805d7caf8e..35b4936e357 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -27,6 +27,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import 
org.apache.arrow.vector.complex.BaseRepeatedValueVector; @@ -214,6 +215,18 @@ public Boolean visit(NullVector left, Range range) { return true; } + @Override + public Boolean visit(ExtensionTypeVector left, Range range) { + if (!(right instanceof ExtensionTypeVector) || !validate(left)) { + return false; + } + ValueVector rightUnderlying = ((ExtensionTypeVector) right).getUnderlyingVector(); + TypeEqualsVisitor typeVisitor = new TypeEqualsVisitor(rightUnderlying); + RangeEqualsVisitor underlyingVisitor = + createInnerVisitor(left.getUnderlyingVector(), rightUnderlying, (l, r) -> typeVisitor.equals(l)); + return underlyingVisitor.rangeEquals(range); + } + protected RangeEqualsVisitor createInnerVisitor( ValueVector leftInner, ValueVector rightInner, BiFunction typeComparator) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 95db7924cd1..443ee1f96e2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -119,6 +120,11 @@ public Boolean visit(NullVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(ExtensionTypeVector left, Void value) { + return compareField(left.getField(), right.getField()); + } + private boolean compareField(Field leftField, Field rightField) { if (leftField == rightField) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index 14f3434736e..aee090706b3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -20,6 +20,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; @@ -54,5 +55,7 @@ public interface VectorVisitor { OUT visit(DenseUnionVector left, IN value); OUT visit(NullVector left, IN value); + + OUT visit(ExtensionTypeVector left, IN value); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 7703fed65a6..e5809e93ea8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import 
org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.compare.TypeEqualsVisitor; @@ -530,4 +531,12 @@ public ValueVector visit(NullVector deltaVector, Void value) { "The targetVector to append must have the same type as the targetVector being appended"); return targetVector; } + + @Override + public ValueVector visit(ExtensionTypeVector deltaVector, Void value) { + ValueVector targetUnderlying = ((ExtensionTypeVector) targetVector).getUnderlyingVector(); + VectorAppender underlyingAppender = new VectorAppender(targetUnderlying); + deltaVector.getUnderlyingVector().accept(underlyingAppender, null); + return targetVector; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index 46064c37bdd..d4abaa1945b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -24,6 +24,7 @@ import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.TypeLayout; @@ -236,4 +237,10 @@ public Void visit(DenseUnionVector vector, Void value) { public Void visit(NullVector vector, Void value) { return null; } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + vector.getUnderlyingVector().accept(this, value); + return null; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java index 23a0beeb51f..cdeb4f1eaa1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.compare.VectorVisitor; @@ -170,4 +171,10 @@ public Void visit(DenseUnionVector vector, Void value) { public Void visit(NullVector vector, Void value) { return null; } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + vector.getUnderlyingVector().accept(this, value); + return null; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index b7aa44c99f9..65795b46813 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.Decimal256Vector; import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FixedSizeBinaryVector; import 
org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.Float8Vector; @@ -146,6 +147,17 @@ private void validateTimeStampVector(ValueVector vector, TimeUnit expectedTimeUn } } + private void validateExtensionTypeVector(ExtensionTypeVector vector) { + validateOrThrow(vector.getField().getFieldType().getType() instanceof ArrowType.ExtensionType, + "Vector %s is not an extension type vector.", vector.getClass()); + validateOrThrow(vector.getField().getMetadata().containsKey(ArrowType.ExtensionType.EXTENSION_METADATA_KEY_NAME), + "Field %s does not have proper extension type metadata: %s", + vector.getField().getName(), + vector.getField().getMetadata()); + // Validate the storage vector type + vector.getUnderlyingVector().accept(this, null); + } + @Override public Void visit(BaseFixedWidthVector vector, Void value) { if (vector instanceof TinyIntVector) { @@ -357,4 +369,10 @@ public Void visit(NullVector vector, Void value) { validateVectorCommon(vector, ArrowType.Null.class); return null; } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + validateExtensionTypeVector(vector); + return null; + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java index 3e44c262d75..7e99b1f90fb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -39,7 +40,7 @@ /** * Visitor to validate vector (without validating data). * This visitor could be used for {@link ValueVector#accept(VectorVisitor, Object)} API, - * and also users could simply use {@link ValueVectorUtility#validate(FieldVector)}. + * and also users could simply use {@link ValueVectorUtility#validate(ValueVector)}. 
*/ public class ValidateVectorVisitor implements VectorVisitor { @@ -263,4 +264,10 @@ public Void visit(DenseUnionVector vector, Void value) { public Void visit(NullVector vector, Void value) { return null; } + + @Override + public Void visit(ExtensionTypeVector vector, Void value) { + vector.getUnderlyingVector().accept(this, value); + return null; + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java index 53f009cb761..8b2743210de 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestExtensionType.java @@ -17,6 +17,8 @@ package org.apache.arrow.vector.types.pojo; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -40,11 +42,15 @@ import org.apache.arrow.vector.FixedSizeBinaryVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.compare.Range; +import org.apache.arrow.vector.compare.RangeEqualsVisitor; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.ArrowFileReader; import org.apache.arrow.vector.ipc.ArrowFileWriter; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType.ExtensionType; +import org.apache.arrow.vector.util.VectorBatchAppender; +import org.apache.arrow.vector.validate.ValidateVectorVisitor; import org.junit.Assert; import org.junit.Test; @@ -230,6 +236,48 @@ public void roundtripLocation() throws IOException { } } + @Test + public void testVectorCompare() { + UuidType uuidType = new UuidType(); + ExtensionTypeRegistry.register(uuidType); + try (final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + UuidVector a1 = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator); + UuidVector a2 = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator); + UuidVector bb = (UuidVector) uuidType.getNewVector("a", FieldType.nullable(uuidType), allocator) + ) { + UUID u1 = UUID.randomUUID(); + UUID u2 = UUID.randomUUID(); + + // Test out type and vector validation visitors for an ExtensionTypeVector + ValidateVectorVisitor validateVisitor = new ValidateVectorVisitor(); + validateVisitor.visit(a1, null); + + a1.setValueCount(2); + a1.set(0, u1); + a1.set(1, u2); + + a2.setValueCount(2); + a2.set(0, u1); + a2.set(1, u2); + + bb.setValueCount(2); + bb.set(0, u2); + bb.set(1, u1); + + Range range = new Range(0, 0, a1.getValueCount()); + RangeEqualsVisitor visitor = new RangeEqualsVisitor(a1, a2); + assertTrue(visitor.rangeEquals(range)); + + visitor = new RangeEqualsVisitor(a1, bb); + assertFalse(visitor.rangeEquals(range)); + + // Test out vector appender + VectorBatchAppender.batchAppend(a1, a2, bb); + assertEquals(a1.getValueCount(), 6); + validateVisitor.visit(a1, null); + } + } + static class UuidType extends ExtensionType { @Override From 57e76e8e620a2a4dda96f1153437c77bfab7f990 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 8 Sep 2021 15:36:36 +0200 Subject: [PATCH 44/93] ARROW-13921: [Python][Packaging] Pin minimum setuptools version for the macos wheels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit There was a bug in setuptools which caused the recent nightly failures: https://github.com/ursacomputing/crossbow/runs/3521607291#step:10:269 Closes #11100 from kszucs/macos-wheels-setuptools Authored-by: Krisztián Szűcs Signed-off-by: Krisztián Szűcs --- ci/scripts/python_wheel_macos_build.sh | 1 + python/requirements-wheel-build.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 82e0339c9d0..1a52a2ad52b 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -53,6 +53,7 @@ export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[ export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" pip install \ + --upgrade \ --only-binary=:all: \ --target $PIP_SITE_PACKAGES \ --platform $PIP_TARGET_PLATFORM \ diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index b2878d2971c..c06935b8374 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,5 @@ cython>=0.29.11 +setuptools>=58 setuptools_scm wheel numpy==1.19.4; platform_system == "Linux" and platform_machine == "aarch64" From 97135bcc30d8a03fb77e11e32dc898b44df4d9a1 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Wed, 8 Sep 2021 08:43:11 -0500 Subject: [PATCH 45/93] Docs + lintr fix (#11107) --- r/man/write_dataset.Rd | 40 ++++++++++++++++++++++++++++++++++++++ r/vignettes/developing.Rmd | 1 - 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index f29a9ee2189..4e44f12e2fb 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -64,3 +64,43 @@ This function allows you to write a dataset. By writing to more efficient binary storage formats, and by specifying relevant partitioning, you can make it much faster to read and query. } +\examples{ +\dontshow{if (arrow_with_dataset() & arrow_with_parquet() & requireNamespace("dplyr", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# You can write datasets partitioned by the values in a column (here: "cyl"). +# This creates a structure of the form cyl=X/part-Z.parquet. +one_level_tree <- tempfile() +write_dataset(mtcars, one_level_tree, partitioning = "cyl") +list.files(one_level_tree, recursive = TRUE) + +# You can also partition by the values in multiple columns +# (here: "cyl" and "gear"). +# This creates a structure of the form cyl=X/gear=Y/part-Z.parquet. +two_levels_tree <- tempfile() +write_dataset(mtcars, two_levels_tree, partitioning = c("cyl", "gear")) +list.files(two_levels_tree, recursive = TRUE) + +# In the two previous examples we would have: +# X = {4,6,8}, the number of cylinders. +# Y = {3,4,5}, the number of forward gears. +# Z = {0,1,2}, the number of saved parts, starting from 0. + +# You can obtain the same result as as the previous examples using arrow with +# a dplyr pipeline. This will be the same as two_levels_tree above, but the +# output directory will be different. +two_levels_tree_2 <- tempfile() +mtcars \%>\% + group_by(cyl, gear) \%>\% + write_dataset(two_levels_tree_2) +list.files(two_levels_tree_2, recursive = TRUE) + +# And you can also turn off the Hive-style directory naming where the column +# name is included with the values by using `hive_style = FALSE`. + +# Write a structure X/Y/part-Z.parquet. 
+two_levels_tree_no_hive <- tempfile()
+mtcars \%>\%
+  group_by(cyl, gear) \%>\%
+  write_dataset(two_levels_tree_no_hive, hive_style = FALSE)
+list.files(two_levels_tree_no_hive, recursive = TRUE)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/vignettes/developing.Rmd b/r/vignettes/developing.Rmd
index d1d7998de32..5cff5e5608c 100644
--- a/r/vignettes/developing.Rmd
+++ b/r/vignettes/developing.Rmd
@@ -348,7 +348,6 @@ or in R:
 ```{r}
 # note the two excluded files which should not be styled
 styler::style_pkg(exclude_files = c("tests/testthat/latin1.R", "data-raw/codegen.R"))
-
 ```
 The styler package will fix many styling errors, though not all lintr errors are automatically fixable with styler. The list of files we intentionally do not style is in `r/.styler_excludes.R`.

From a081a0524d876f4adaaba97f2d9b725097c0091e Mon Sep 17 00:00:00 2001
From: William Malpica <16705032+wmalpica@users.noreply.github.com>
Date: Wed, 8 Sep 2021 10:35:13 -0500
Subject: [PATCH 46/93] checked for empty hex values. added scalar tests

---
 .../arrow/compute/kernels/scalar_cast_test.cc | 40 +++++++++++--------
 cpp/src/arrow/util/value_parsing.h            |  8 ++--
 cpp/src/arrow/util/value_parsing_test.cc      | 22 +++++++---
 3 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 90d41894578..b311ca5136f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -1453,34 +1453,37 @@ TEST(Cast, StringToBoolean) {
 TEST(Cast, StringToInt) {
   for (auto string_type : {utf8(), large_utf8()}) {
     for (auto signed_type : {int8(), int16(), int32(), int64()}) {
-      CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0"])"),
-                ArrayFromJSON(signed_type, "[0, null, 127, -1, 0]"));
+      CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"),
+                ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]"));
     }

     CheckCast(
-        ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0"])"),
-        ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0]"));
+        ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0",
+          "0X0", "0x7FFFFFFF", "0XFFFFfFfF", "0Xf0000000"])"),
+        ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]"));

     CheckCast(ArrayFromJSON(
-                  string_type,
-                  R"(["9223372036854775807", null, "-9223372036854775808", "0", "0"])"),
-              ArrayFromJSON(int64(),
-                            "[9223372036854775807, null, -9223372036854775808, 0, 0]"));
+                  string_type, R"(["9223372036854775807", null, "-9223372036854775808", "0",
+                    "0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"),
+              ArrayFromJSON( int64(),
+                            "[9223372036854775807, null, -9223372036854775808, 0, 0, 9223372036854775807, -1152921504606846975]"));

     for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) {
-      CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0"])"),
-                ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0]"));
+      CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f])"),
+                ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]"));
     }

     CheckCast(
-        ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0"])"),
-        ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0]"));
+        ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0",
+          "0x0", "0x7FFFFFf", "0xFFFFFFFF"])"),
+        ArrayFromJSON(uint32(),
+                      "[2147483647, null,
4294967295, 0, 0, 2147483647, 4294967295]")); CheckCast(ArrayFromJSON( - string_type, - R"(["9223372036854775807", null, "18446744073709551615", "0", "0"])"), - ArrayFromJSON(uint64(), - "[9223372036854775807, null, 18446744073709551615, 0, 0]")); + string_type, R"(["9223372036854775807", null, "18446744073709551615", "0", + "0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"), + ArrayFromJSON( uint64(), + "[9223372036854775807, null, 18446744073709551615, 0, 0, 9223372036854775807, 18446744073709551615]")); for (std::string not_int8 : { "z", @@ -1488,6 +1491,8 @@ TEST(Cast, StringToInt) { "128", "-129", "0.5", + "0x", + "0xfff", }) { auto options = CastOptions::Safe(int8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); @@ -1497,6 +1502,9 @@ TEST(Cast, StringToInt) { "256", "-1", "0.5", + "0x", + "0x3wa" + "0x123" }) { auto options = CastOptions::Safe(uint8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options); diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index 61801d43027..fc723441963 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -315,8 +315,9 @@ struct StringToUnsignedIntConverterMixin { if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ length -= 2; s += 2; + // lets make sure that the length of the string is not too big - if (!ARROW_PREDICT_TRUE(sizeof(value_type)*2 >= length)) { + if (!ARROW_PREDICT_TRUE(sizeof(value_type)*2 >= length && length > 0)) { return false; } if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { @@ -377,8 +378,9 @@ struct StringToSignedIntConverterMixin { if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ length -= 2; s += 2; + // lets make sure that the length of the string is not too big - if (!ARROW_PREDICT_TRUE(sizeof(unsigned_value)*2 >= length)) { + if (!ARROW_PREDICT_TRUE(sizeof(unsigned_value)*2 >= length && length > 0)) { return false; } if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { @@ -395,7 +397,7 @@ struct StringToSignedIntConverterMixin { return false; } } - + // Skip leading zeros while (length > 0 && *s == '0') { length--; diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index cb4fdd1e2b1..03869797bf3 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -123,9 +123,11 @@ TEST(StringConversion, ToInt8) { // Hex AssertConversion("0x0", 0); - AssertConversion("0x1A", 26); + AssertConversion("0X1A", 26); AssertConversion("0xb", 11); AssertConversion("0x7F", 127); + AssertConversion("0xFF", -1); + AssertConversionFails("0x"); AssertConversionFails("0x100"); AssertConversionFails("0x1g"); } @@ -152,6 +154,8 @@ TEST(StringConversion, ToUInt8) { AssertConversion("0x1A", 26); AssertConversion("0xb", 11); AssertConversion("0x7F", 127); + AssertConversion("0xFF", 255); + AssertConversionFails("0x"); AssertConversionFails("0x100"); AssertConversionFails("0x1g"); } @@ -174,9 +178,11 @@ TEST(StringConversion, ToInt16) { // Hex AssertConversion("0x0", 0); - AssertConversion("0x1aA", 426); + AssertConversion("0X1aA", 426); AssertConversion("0xb", 11); AssertConversion("0x7ffF", 32767); + AssertConversion("0XfffF", -1); + AssertConversionFails("0x"); AssertConversionFails("0x10000"); AssertConversionFails("0x1g"); } @@ -202,6 +208,8 @@ TEST(StringConversion, ToUInt16) { AssertConversion("0x1aA", 426); AssertConversion("0xb", 11); AssertConversion("0x7ffF", 32767); + 
AssertConversion("0xFffF", 65535); + AssertConversionFails("0x"); AssertConversionFails("0x10000"); AssertConversionFails("0x1g"); } @@ -230,7 +238,8 @@ TEST(StringConversion, ToInt32) { AssertConversion("0x123abc", 1194684); AssertConversion("0xA4b35", 674613); AssertConversion("0x7FFFfFfF", 2147483647); - AssertConversion("0XFFFFfFfF", -1); + AssertConversion("0XFFFFfFfF", -1); + AssertConversionFails("0X"); AssertConversionFails("0x23512ak"); } @@ -260,6 +269,7 @@ TEST(StringConversion, ToUInt32) { AssertConversion("0xA4b35", 674613); AssertConversion("0x7FFFfFfF", 2147483647); AssertConversion("0XFFFFfFfF", 4294967295); + AssertConversionFails("0X"); AssertConversionFails("0x23512ak"); } @@ -284,8 +294,9 @@ TEST(StringConversion, ToInt64) { AssertConversion("0x5415a123ABC123cb", 6058926048274359243); AssertConversion("0xA4B35", 674613); AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); - AssertConversion("0XF000000000000001", -1152921504606846975); - AssertConversion("0xfFFFFFFFFFFFFFFf", -1); + AssertConversion("0XF000000000000001", -1152921504606846975); + AssertConversion("0xfFFFFFFFFFFFFFFf", -1); + AssertConversionFails("0X"); AssertConversionFails("0x12345678901234567"); AssertConversionFails("0x23512ak"); } @@ -310,6 +321,7 @@ TEST(StringConversion, ToUInt64) { AssertConversion("0x7FFFFFFFFFFFFFFf", 9223372036854775807); AssertConversion("0XF000000000000001", 17293822569102704641ULL); AssertConversion("0xfFFFFFFFFFFFFFFf", 18446744073709551615ULL); + AssertConversionFails("0x"); AssertConversionFails("0x12345678901234567"); AssertConversionFails("0x23512ak"); } From 170a24fc5f9f6f9c7c7d15a716c5995f03204a4b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 8 Sep 2021 11:35:28 -0400 Subject: [PATCH 47/93] ARROW-13820: [R] Rename na.min_count to min_count and na.rm to skip_nulls Also contains ARROW-13821: [R] Handle na.rm in sd, var bindings The only other place we take nonstandard C++ options in make_compute_options is for a few types (like count) where there is an enum that controls options and we switch off based on a boolean. We can try to clean that up but I'm inclined to wait to see if anyone needs lower-level control. Closes #11079 from nealrichardson/na-options Authored-by: Neal Richardson Signed-off-by: Neal Richardson --- r/R/compute.R | 4 ++-- r/R/dplyr-functions.R | 14 ++++++-------- r/src/compute.cpp | 20 ++++++++++---------- r/tests/testthat/test-dplyr-summarize.R | 2 -- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/r/R/compute.R b/r/R/compute.R index 39940eedc8c..2d00bcf10e3 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -120,7 +120,7 @@ max.ArrowDatum <- function(..., na.rm = FALSE) { scalar_aggregate("min_max", ..., na.rm = na.rm)$GetFieldByName("max") } -scalar_aggregate <- function(FUN, ..., na.rm = FALSE, na.min_count = 0) { +scalar_aggregate <- function(FUN, ..., na.rm = FALSE, min_count = 0L) { a <- collect_arrays_from_dots(list(...)) if (FUN == "min_max" && na.rm && a$null_count == length(a)) { Array$create(data.frame(min = Inf, max = -Inf)) @@ -128,7 +128,7 @@ scalar_aggregate <- function(FUN, ..., na.rm = FALSE, na.min_count = 0) { # Inf/-Inf, which are type double. Since Arrow is type-stable # and does not do that, we handle this special case here. 
} else { - call_function(FUN, a, options = list(na.rm = na.rm, na.min_count = na.min_count)) + call_function(FUN, a, options = list(skip_nulls = na.rm, min_count = min_count)) } } diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index 72731216f50..d2f7892aee8 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -784,44 +784,42 @@ agg_funcs$sum <- function(x, na.rm = FALSE) { list( fun = "sum", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = list(skip_nulls = na.rm, min_count = 0L) ) } agg_funcs$any <- function(x, na.rm = FALSE) { list( fun = "any", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = list(skip_nulls = na.rm, min_count = 0L) ) } agg_funcs$all <- function(x, na.rm = FALSE) { list( fun = "all", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = list(skip_nulls = na.rm, min_count = 0L) ) } agg_funcs$mean <- function(x, na.rm = FALSE) { list( fun = "mean", data = x, - options = list(na.rm = na.rm, na.min_count = 0L) + options = list(skip_nulls = na.rm, min_count = 0L) ) } -# na.rm not currently passed in due to ARROW-13691 agg_funcs$sd <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "stddev", data = x, - options = list(ddof = ddof) + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) } -# na.rm not currently passed in due to ARROW-13691 agg_funcs$var <- function(x, na.rm = FALSE, ddof = 1) { list( fun = "variance", data = x, - options = list(ddof = ddof) + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 446e011f548..0f08b41e85d 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -177,11 +177,11 @@ std::shared_ptr make_compute_options( func_name == "hash_all") { using Options = arrow::compute::ScalarAggregateOptions; auto out = std::make_shared(Options::Defaults()); - if (!Rf_isNull(options["na.min_count"])) { - out->min_count = cpp11::as_cpp(options["na.min_count"]); + if (!Rf_isNull(options["min_count"])) { + out->min_count = cpp11::as_cpp(options["min_count"]); } - if (!Rf_isNull(options["na.rm"])) { - out->skip_nulls = cpp11::as_cpp(options["na.rm"]); + if (!Rf_isNull(options["skip_nulls"])) { + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } @@ -225,11 +225,11 @@ std::shared_ptr make_compute_options( cpp11::as_cpp( interpolation); } - if (!Rf_isNull(options["na.min_count"])) { - out->min_count = cpp11::as_cpp(options["na.min_count"]); + if (!Rf_isNull(options["min_count"])) { + out->min_count = cpp11::as_cpp(options["min_count"]); } - if (!Rf_isNull(options["na.rm"])) { - out->skip_nulls = cpp11::as_cpp(options["na.rm"]); + if (!Rf_isNull(options["skip_nulls"])) { + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } @@ -392,8 +392,8 @@ std::shared_ptr make_compute_options( if (!Rf_isNull(options["min_count"])) { out->min_count = cpp11::as_cpp(options["min_count"]); } - if (!Rf_isNull(options["na.rm"])) { - out->skip_nulls = cpp11::as_cpp(options["na.rm"]); + if (!Rf_isNull(options["skip_nulls"])) { + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 9f149673c5a..18596fcf30c 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -134,7 +134,6 @@ test_that("Group by sd on dataset", { tbl ) - skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") 
expect_dplyr_equal( input %>% group_by(some_grouping) %>% @@ -153,7 +152,6 @@ test_that("Group by var on dataset", { tbl ) - skip("ARROW-13691 - na.rm not yet implemented for VarianceOptions") expect_dplyr_equal( input %>% group_by(some_grouping) %>% From 7a23a07a5eaa324b6c528c64e035657d953c2a56 Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 8 Sep 2021 11:00:01 -0500 Subject: [PATCH 48/93] fixed style with clang-format --- .../arrow/compute/kernels/scalar_cast_test.cc | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index b311ca5136f..a948bf9e9ca 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1453,37 +1453,42 @@ TEST(Cast, StringToBoolean) { TEST(Cast, StringToInt) { for (auto string_type : {utf8(), large_utf8()}) { for (auto signed_type : {int8(), int16(), int32(), int64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"), - ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]")); + CheckCast( + ArrayFromJSON(string_type, R"(["0", null, "127", "-1", "0", "0x0", "0x7F"])"), + ArrayFromJSON(signed_type, "[0, null, 127, -1, 0, 0, 127]")); } - CheckCast( - ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", + CheckCast(ArrayFromJSON(string_type, R"(["2147483647", null, "-2147483648", "0", "0X0", "0x7FFFFFFF", "0XFFFFfFfF", "0Xf0000000"])"), - ArrayFromJSON(int32(), "[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]")); + ArrayFromJSON( + int32(), + "[2147483647, null, -2147483648, 0, 0, 2147483647, -1, -268435456]")); - CheckCast(ArrayFromJSON( - string_type, R"(["9223372036854775807", null, "-9223372036854775808", "0", + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "-9223372036854775808", "0", "0x0", "0x7FFFFFFFFFFFFFFf", "0XF000000000000001"])"), - ArrayFromJSON( int64(), - "[9223372036854775807, null, -9223372036854775808, 0, 0, 9223372036854775807, -1152921504606846975]")); + ArrayFromJSON(int64(), + "[9223372036854775807, null, -9223372036854775808, 0, 0, " + "9223372036854775807, -1152921504606846975]")); for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) { - CheckCast(ArrayFromJSON(string_type, R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f])"), + CheckCast(ArrayFromJSON(string_type, + R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f])"), ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]")); } CheckCast( ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0", "0x0", "0x7FFFFFf", "0xFFFFFFFF"])"), - ArrayFromJSON(uint32(), - "[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]")); + ArrayFromJSON(uint32(), + "[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]")); - CheckCast(ArrayFromJSON( - string_type, R"(["9223372036854775807", null, "18446744073709551615", "0", + CheckCast(ArrayFromJSON(string_type, + R"(["9223372036854775807", null, "18446744073709551615", "0", "0x0", "0x7FFFFFFFFFFFFFFf", "0xfFFFFFFFFFFFFFFf"])"), - ArrayFromJSON( uint64(), - "[9223372036854775807, null, 18446744073709551615, 0, 0, 9223372036854775807, 18446744073709551615]")); + ArrayFromJSON(uint64(), + "[9223372036854775807, null, 18446744073709551615, 0, 0, " + "9223372036854775807, 18446744073709551615]")); for (std::string not_int8 : { "z", @@ -1498,14 
+1503,9 @@ TEST(Cast, StringToInt) {
       CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options);
     }

-    for (std::string not_uint8 : {
-             "256",
-             "-1",
-             "0.5",
-             "0x",
-             "0x3wa"
-             "0x123"
-         }) {
+    for (std::string not_uint8 : {"256", "-1", "0.5", "0x",
+                                  "0x3wa"
+                                  "0x123"}) {
       auto options = CastOptions::Safe(uint8());
       CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options);
     }

From e5db0fc25ee9be4befa2e0ac93d317c09d53f6ae Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Wed, 8 Sep 2021 12:41:37 -0400
Subject: [PATCH 49/93] MINOR: [R] Fix broken doc example (#11110)

---
 r/R/dataset-write.R    | 1 +
 r/man/write_dataset.Rd | 1 +
 2 files changed, 2 insertions(+)

diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index 8f410e284c2..b17c5f39ba8 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -76,6 +76,7 @@
 #' # You can obtain the same result as as the previous examples using arrow with
 #' # a dplyr pipeline. This will be the same as two_levels_tree above, but the
 #' # output directory will be different.
+#' library(dplyr)
 #' two_levels_tree_2 <- tempfile()
 #' mtcars %>%
 #'   group_by(cyl, gear) %>%
diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd
index 4e44f12e2fb..6f36f8e72e8 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_dataset.Rd
@@ -87,6 +87,7 @@ list.files(two_levels_tree, recursive = TRUE)
 # You can obtain the same result as as the previous examples using arrow with
 # a dplyr pipeline. This will be the same as two_levels_tree above, but the
 # output directory will be different.
+library(dplyr)
 two_levels_tree_2 <- tempfile()
 mtcars \%>\%
   group_by(cyl, gear) \%>\%

From 9dd8b6abc34fee2e0eb069556d80e34270cca334 Mon Sep 17 00:00:00 2001
From: William Malpica <16705032+wmalpica@users.noreply.github.com>
Date: Wed, 8 Sep 2021 13:46:30 -0500
Subject: [PATCH 50/93] implemented some improvements

---
 cpp/src/arrow/util/value_parsing.h | 48 +++++++++++-------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h
index fc723441963..5677c27a09b 100644
--- a/cpp/src/arrow/util/value_parsing.h
+++ b/cpp/src/arrow/util/value_parsing.h
@@ -277,26 +277,22 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {

 template <typename T>
 bool ParseHex(const char* s, size_t length, T* out) {
+  // lets make sure that the length of the string is not too big
+  if (!ARROW_PREDICT_TRUE(sizeof(T)*2 >= length && length > 0)){
+    return false;
+  }
   T result = 0;
-  int num_iterations = (int)(sizeof(T)*2);
-  for (int i = 0; i < num_iterations; i++){
-    if (length > 0) {
-      char val = *s;
-      s++;
-      result = static_cast<T>(result << 4);
-      length--;
-      if (val >= '0' && val <= '9'){
-        result = static_cast<T>(result | (val -'0'));
-      } else if (val >= 'A' && val <= 'F'){
-        result = static_cast<T>(result | (val -'A' + 10));
-      } else if (val >= 'a' && val <= 'f'){
-        result = static_cast<T>(result | (val -'a' + 10));
+  for (size_t i = 0; i < length; i++){
+    result = static_cast<T>(result << 4);
+    if (s[i] >= '0' && s[i] <= '9'){
+      result = static_cast<T>(result | (s[i] -'0'));
+    } else if (s[i] >= 'A' && s[i] <= 'F'){
+      result = static_cast<T>(result | (s[i] -'A' + 10));
+    } else if (s[i] >= 'a' && s[i] <= 'f'){
+      result = static_cast<T>(result | (s[i] -'a' + 10));
     } else {
-      break;
+      /* Non-digit */
+      return false;
     }
   }
   *out = result;
@@ -311,15 +307,11 @@ struct StringToUnsignedIntConverterMixin {
     if (ARROW_PREDICT_FALSE(length == 0)) {
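      // an empty string is never a valid integer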
return false; } - // If its starts with 0x then its hex - if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))){ length -= 2; s += 2; - // lets make sure that the length of the string is not too big - if (!ARROW_PREDICT_TRUE(sizeof(value_type)*2 >= length && length > 0)) { - return false; - } if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { return false; } @@ -374,15 +366,11 @@ struct StringToSignedIntConverterMixin { return false; } - // If its starts with 0x then its hex - if (*s == '0' && ((*(s + 1) == 'x') || (*(s + 1) == 'X'))){ + // If it starts with 0x then its hex + if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))){ length -= 2; s += 2; - // lets make sure that the length of the string is not too big - if (!ARROW_PREDICT_TRUE(sizeof(unsigned_value)*2 >= length && length > 0)) { - return false; - } if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { return false; } From 31f80e5be0bf657d5470ad1caa9f29701e7f57ec Mon Sep 17 00:00:00 2001 From: William Malpica <16705032+wmalpica@users.noreply.github.com> Date: Wed, 8 Sep 2021 15:46:42 -0500 Subject: [PATCH 51/93] fixed clang format --- .../arrow/compute/kernels/scalar_cast_test.cc | 4 +-- cpp/src/arrow/util/value_parsing.h | 28 +++++++++---------- cpp/src/arrow/util/value_parsing_benchmark.cc | 13 ++++----- cpp/src/arrow/util/value_parsing_test.cc | 1 - 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a948bf9e9ca..635776bec46 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1503,9 +1503,7 @@ TEST(Cast, StringToInt) { CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_int8 + "\"]"), options); } - for (std::string not_uint8 : {"256", "-1", "0.5", "0x", - "0x3wa" - "0x123"}) { + for (std::string not_uint8 : {"256", "-1", "0.5", "0x", "0x3wa", "0x123"}) { auto options = CastOptions::Safe(uint8()); CheckCastFails(ArrayFromJSON(string_type, "[\"" + not_uint8 + "\"]"), options); } diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index 5677c27a09b..3a9de685a71 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -273,23 +273,21 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { #undef PARSE_UNSIGNED_ITERATION #undef PARSE_UNSIGNED_ITERATION_LAST - - template bool ParseHex(const char* s, size_t length, T* out) { // lets make sure that the length of the string is not too big - if (!ARROW_PREDICT_TRUE(sizeof(T)*2 >= length && length > 0)){ + if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) { return false; } T result = 0; - for (size_t i = 0; i < length; i++){ + for (size_t i = 0; i < length; i++) { result = static_cast(result << 4); - if (s[i] >= '0' && s[i] <= '9'){ - result = static_cast(result | (s[i] -'0')); - } else if (s[i] >= 'A' && s[i] <= 'F'){ - result = static_cast(result | (s[i] -'A' + 10)); - } else if (s[i] >= 'a' && s[i] <= 'f'){ - result = static_cast(result | (s[i] -'a' + 10)); + if (s[i] >= '0' && s[i] <= '9') { + result = static_cast(result | (s[i] - '0')); + } else if (s[i] >= 'A' && s[i] <= 'F') { + result = static_cast(result | (s[i] - 'A' + 10)); + } else if (s[i] >= 'a' && s[i] <= 'f') { + result = static_cast(result | (s[i] - 'a' + 10)); } else { /* Non-digit */ return 
false;
@@ -308,14 +306,14 @@ struct StringToUnsignedIntConverterMixin {
       return false;
     }
     // If it starts with 0x then its hex
-    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))){
+    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
       length -= 2;
       s += 2;

       if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) {
         return false;
       }
-      return true; 
+      return true;
     }
     // Skip leading zeros
     while (length > 0 && *s == '0') {
@@ -365,9 +363,9 @@ struct StringToSignedIntConverterMixin {
     if (ARROW_PREDICT_FALSE(length == 0)) {
       return false;
     }
-    
+
     // If it starts with 0x then its hex
-    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))){
+    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
       length -= 2;
       s += 2;
@@ -375,7 +373,7 @@
         return false;
       }
       *out = static_cast<value_type>(unsigned_value);
-      return true; 
+      return true;
     }

     if (*s == '-') {
diff --git a/cpp/src/arrow/util/value_parsing_benchmark.cc b/cpp/src/arrow/util/value_parsing_benchmark.cc
index 0fd0d1f8ec3..40d139316e5 100644
--- a/cpp/src/arrow/util/value_parsing_benchmark.cc
+++ b/cpp/src/arrow/util/value_parsing_benchmark.cc
@@ -60,18 +60,15 @@
 template <typename c_int>
 static std::vector<std::string> MakeHexStrings(int32_t num_items) {
   int32_t num_bytes = sizeof(c_int);
   const char* kAsciiTable = "0123456789ABCDEF";
-  std::vector<char> large_hex_chars(num_bytes*2 + 2);
-  large_hex_chars[0]='0';
-  large_hex_chars[1]='x';
-  for (int32_t i = 0; i < num_bytes*2; ++i) {
+  std::vector<char> large_hex_chars(num_bytes * 2 + 2);
+  large_hex_chars[0] = '0';
+  large_hex_chars[1] = 'x';
+  for (int32_t i = 0; i < num_bytes * 2; ++i) {
     large_hex_chars[i + 2] = kAsciiTable[i];
   }
   std::string large_hex(&large_hex_chars[0], large_hex_chars.size());

-  std::vector<std::string> base_strings = {"0x0",
-                                           "0xA5",
-                                           "0x5E",
-                                           large_hex};
+  std::vector<std::string> base_strings = {"0x0", "0xA5", "0x5E", large_hex};
   std::vector<std::string> strings;
   for (int32_t i = 0; i < num_items; ++i) {
     strings.push_back(base_strings[i % base_strings.size()]);
diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc
index 03869797bf3..ebbb733398d 100644
--- a/cpp/src/arrow/util/value_parsing_test.cc
+++ b/cpp/src/arrow/util/value_parsing_test.cc
@@ -241,7 +241,6 @@ TEST(StringConversion, ToInt32) {
   AssertConversion<Int32Type>("0XFFFFfFfF", -1);
   AssertConversionFails<Int32Type>("0X");
   AssertConversionFails<Int32Type>("0x23512ak");
-
 }

 TEST(StringConversion, ToUInt32) {

From 4666073c19b136f3669705173cdf56cb4703f969 Mon Sep 17 00:00:00 2001
From: William Malpica <16705032+wmalpica@users.noreply.github.com>
Date: Wed, 8 Sep 2021 16:29:50 -0500
Subject: [PATCH 52/93] fixed unit test

---
 cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 635776bec46..a6b2fb11233 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -1473,13 +1473,13 @@ TEST(Cast, StringToInt) {

     for (auto unsigned_type : {uint8(), uint16(), uint32(), uint64()}) {
       CheckCast(ArrayFromJSON(string_type,
-                              R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f])"),
+                              R"(["0", null, "127", "255", "0", "0X0", "0xff", "0x7f"])"),
                 ArrayFromJSON(unsigned_type, "[0, null, 127, 255, 0, 0, 255, 127]"));
     }

     CheckCast(
         ArrayFromJSON(string_type, R"(["2147483647", null, "4294967295", "0",
-                                       "0x0", "0x7FFFFFf", "0xFFFFFFFF"])"),
+                                       "0x0", "0x7FFFFFFf", "0xFFFFFFFF"])"),
ArrayFromJSON(uint32(), "[2147483647, null, 4294967295, 0, 0, 2147483647, 4294967295]")); From b0d89db3113f8575d89648106636d29d296090c5 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 9 Sep 2021 08:16:50 -0400 Subject: [PATCH 53/93] ARROW-13680: [C++] Create an asynchronous nursery to simplify capture logic This PR adds two new utilities. The first is an asynchronous smart pointer which makes it easier to ensure that asynchronous tasks are finished before an object is destroyed (and generally makes it safe to capture `this`) The second is an asynchronous task group which collects futures and can help to ensure that all futures are completed. It is similar to AllComplete except it doesn't require collecting all the futures at once. Combined, these two things can help give structured concurrency / nursery type control over asynchronous operations. I have used these utilities in #10955 if you would like to see an example of them in action. Closes #11084 from westonpace/experiment/async-smart-ptr Authored-by: Weston Pace Signed-off-by: David Li --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/async_util.cc | 82 ++++++++++++++ cpp/src/arrow/util/async_util.h | 132 ++++++++++++++++++++++ cpp/src/arrow/util/async_util_test.cc | 154 ++++++++++++++++++++++++++ 5 files changed, 370 insertions(+) create mode 100644 cpp/src/arrow/util/async_util.cc create mode 100644 cpp/src/arrow/util/async_util.h create mode 100644 cpp/src/arrow/util/async_util_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 637f3d1a54f..e06fad9a1de 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -190,6 +190,7 @@ set(ARROW_SRCS io/slow.cc io/stdio.cc io/transform.cc + util/async_util.cc util/basic_decimal.cc util/bit_block_counter.cc util/bit_run_reader.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index aa875ab6bee..1b14215ddd8 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -42,6 +42,7 @@ add_arrow_test(utility-test SOURCES align_util_test.cc async_generator_test.cc + async_util_test.cc bit_block_counter_test.cc bit_util_test.cc cache_test.cc diff --git a/cpp/src/arrow/util/async_util.cc b/cpp/src/arrow/util/async_util.cc new file mode 100644 index 00000000000..76c971f576e --- /dev/null +++ b/cpp/src/arrow/util/async_util.cc @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
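+
+// A minimal usage sketch of the AsyncTaskGroup defined below (illustrative
+// only; MakeWorkFuture() is a hypothetical helper returning a Future<>):
+//
+//   AsyncTaskGroup task_group;
+//   ARROW_RETURN_NOT_OK(task_group.AddTask(MakeWorkFuture()));
+//   ARROW_RETURN_NOT_OK(task_group.AddTask(MakeWorkFuture()));
+//   // Completes only after every task added above has finished
+//   Future<> all_done = task_group.WaitForTasksToFinish();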
+ +#include "arrow/util/async_util.h" + +#include "arrow/util/future.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace util { + +AsyncDestroyable::AsyncDestroyable() : on_closed_(Future<>::Make()) {} + +#ifndef NDEBUG +AsyncDestroyable::~AsyncDestroyable() { + DCHECK(constructed_correctly_) << "An instance of AsyncDestroyable must be created by " + "MakeSharedAsync or MakeUniqueAsync"; +} +#else +AsyncDestroyable::~AsyncDestroyable() = default; +#endif + +void AsyncDestroyable::Destroy() { + DoDestroy().AddCallback([this](const Status& st) { + on_closed_.MarkFinished(st); + delete this; + }); +} + +Status AsyncTaskGroup::AddTask(const Future<>& task) { + auto guard = mutex_.Lock(); + if (all_tasks_done_.is_finished()) { + return Status::Invalid("Attempt to add a task after the task group has completed"); + } + if (!err_.ok()) { + return err_; + } + // If the task is already finished there is nothing to track so lets save + // some work and return early + if (task.is_finished()) { + err_ &= task.status(); + return Status::OK(); + } + running_tasks_++; + guard.Unlock(); + task.AddCallback([this](const Status& st) { + auto guard = mutex_.Lock(); + err_ &= st; + if (--running_tasks_ == 0 && finished_adding_) { + guard.Unlock(); + all_tasks_done_.MarkFinished(err_); + } + }); + return Status::OK(); +} + +Future<> AsyncTaskGroup::WaitForTasksToFinish() { + auto guard = mutex_.Lock(); + finished_adding_ = true; + if (running_tasks_ == 0) { + all_tasks_done_.MarkFinished(err_); + return all_tasks_done_; + } + return all_tasks_done_; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/async_util.h b/cpp/src/arrow/util/async_util.h new file mode 100644 index 00000000000..31e5d09a86c --- /dev/null +++ b/cpp/src/arrow/util/async_util.h @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/future.h" +#include "arrow/util/mutex.h" + +namespace arrow { +namespace util { + +/// Custom deleter for AsyncDestroyable objects +template +struct DestroyingDeleter { + void operator()(T* p) { p->Destroy(); } +}; + +/// An object which should be asynchronously closed before it is destroyed +/// +/// Classes can extend this to ensure that the close method is called and completed +/// before the instance is deleted. This provides smart_ptr / delete semantics for +/// objects with an asynchronous destructor. +/// +/// Classes which extend this must be constructed using MakeSharedAsync or MakeUniqueAsync +class ARROW_EXPORT AsyncDestroyable { + public: + AsyncDestroyable(); + virtual ~AsyncDestroyable(); + + /// A future which will complete when the AsyncDestroyable has finished and is ready + /// to be deleted. 
+  ///
+  /// This can be used to ensure all work done by this object has been completed before
+  /// proceeding.
+  Future<> on_closed() { return on_closed_; }
+
+ protected:
+  /// Subclasses should override this and perform any cleanup. Once the future returned
+  /// by this method finishes then this object is eligible for destruction and any
+  /// reference to `this` will be invalid
+  virtual Future<> DoDestroy() = 0;
+
+ private:
+  void Destroy();
+
+  Future<> on_closed_;
+#ifndef NDEBUG
+  bool constructed_correctly_ = false;
+#endif
+
+  template <typename T>
+  friend struct DestroyingDeleter;
+  template <typename T, typename... Args>
+  friend std::shared_ptr<T> MakeSharedAsync(Args&&... args);
+  template <typename T, typename... Args>
+  friend std::unique_ptr<T, DestroyingDeleter<T>> MakeUniqueAsync(Args&&... args);
+};
+
+template <typename T, typename... Args>
+std::shared_ptr<T> MakeSharedAsync(Args&&... args) {
+  static_assert(std::is_base_of<AsyncDestroyable, T>::value,
+                "MakeSharedAsync only works with AsyncDestroyable types");
+  std::shared_ptr<T> ptr(new T(std::forward<Args>(args)...), DestroyingDeleter<T>());
+#ifndef NDEBUG
+  ptr->constructed_correctly_ = true;
+#endif
+  return ptr;
+}
+
+template <typename T, typename... Args>
+std::unique_ptr<T, DestroyingDeleter<T>> MakeUniqueAsync(Args&&... args) {
+  static_assert(std::is_base_of<AsyncDestroyable, T>::value,
+                "MakeUniqueAsync only works with AsyncDestroyable types");
+  std::unique_ptr<T, DestroyingDeleter<T>> ptr(new T(std::forward<Args>(args)...),
+                                               DestroyingDeleter<T>());
+#ifndef NDEBUG
+  ptr->constructed_correctly_ = true;
+#endif
+  return ptr;
+}
+
+/// A utility which keeps track of a collection of asynchronous tasks
+///
+/// This can be used to provide structured concurrency for asynchronous development.
+/// A task group created at a high level can be distributed amongst low level components
+/// which register work to be completed. The high level job can then wait for all work
+/// to be completed before cleaning up.
+class ARROW_EXPORT AsyncTaskGroup {
+ public:
+  /// Add a task to be tracked by this task group
+  ///
+  /// If a previous task has failed then adding a task will fail
+  ///
+  /// If WaitForTasksToFinish has been called and the returned future has been marked
+  /// completed then adding a task will fail.
+  Status AddTask(const Future<>& task);
+  /// A future that will be completed when all running tasks are finished.
+  ///
+  /// It is allowed for tasks to be added after this call provided the future has not yet
+  /// completed. This should be safe as long as the tasks being added are added as part
+  /// of a task that is tracked. As soon as the count of running tasks reaches 0 this
+  /// future will be marked complete.
+  ///
+  /// Any attempt to add a task after the returned future has completed will fail.
+  Future<> WaitForTasksToFinish();
+
+ private:
+  bool finished_adding_ = false;
+  int running_tasks_ = 0;
+  Status err_;
+  Future<> all_tasks_done_ = Future<>::Make();
+  util::Mutex mutex_;
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/async_util_test.cc b/cpp/src/arrow/util/async_util_test.cc
new file mode 100644
index 00000000000..f263ee548cf
--- /dev/null
+++ b/cpp/src/arrow/util/async_util_test.cc
@@ -0,0 +1,154 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/async_util.h" + +#include + +#include "arrow/result.h" +#include "arrow/testing/future_util.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { +namespace util { + +class GatingDestroyable : public AsyncDestroyable { + public: + GatingDestroyable(Future<> close_future, bool* destroyed) + : close_future_(std::move(close_future)), destroyed_(destroyed) {} + ~GatingDestroyable() override { *destroyed_ = true; } + + protected: + Future<> DoDestroy() override { return close_future_; } + + private: + Future<> close_future_; + bool* destroyed_; +}; + +template +void TestAsyncDestroyable(Factory factory) { + Future<> gate = Future<>::Make(); + bool destroyed = false; + bool on_closed = false; + { + auto obj = factory(gate, &destroyed); + obj->on_closed().AddCallback([&](const Status& st) { on_closed = true; }); + ASSERT_FALSE(destroyed); + } + ASSERT_FALSE(destroyed); + ASSERT_FALSE(on_closed); + gate.MarkFinished(); + ASSERT_TRUE(destroyed); + ASSERT_TRUE(on_closed); +} + +TEST(AsyncDestroyable, MakeShared) { + TestAsyncDestroyable([](Future<> gate, bool* destroyed) { + return MakeSharedAsync(gate, destroyed); + }); +} + +TEST(AsyncDestroyable, MakeUnique) { + TestAsyncDestroyable([](Future<> gate, bool* destroyed) { + return MakeUniqueAsync(gate, destroyed); + }); +} + +TEST(AsyncTaskGroup, Basic) { + AsyncTaskGroup task_group; + Future<> fut1 = Future<>::Make(); + Future<> fut2 = Future<>::Make(); + ASSERT_OK(task_group.AddTask(fut1)); + ASSERT_OK(task_group.AddTask(fut2)); + Future<> all_done = task_group.WaitForTasksToFinish(); + AssertNotFinished(all_done); + fut1.MarkFinished(); + AssertNotFinished(all_done); + fut2.MarkFinished(); + ASSERT_FINISHES_OK(all_done); +} + +TEST(AsyncTaskGroup, NoTasks) { + AsyncTaskGroup task_group; + ASSERT_FINISHES_OK(task_group.WaitForTasksToFinish()); +} + +TEST(AsyncTaskGroup, AddAfterDone) { + AsyncTaskGroup task_group; + ASSERT_FINISHES_OK(task_group.WaitForTasksToFinish()); + ASSERT_RAISES(Invalid, task_group.AddTask(Future<>::Make())); +} + +TEST(AsyncTaskGroup, AddAfterWaitButBeforeFinish) { + AsyncTaskGroup task_group; + Future<> task_one = Future<>::Make(); + ASSERT_OK(task_group.AddTask(task_one)); + Future<> finish_fut = task_group.WaitForTasksToFinish(); + AssertNotFinished(finish_fut); + Future<> task_two = Future<>::Make(); + ASSERT_OK(task_group.AddTask(task_two)); + AssertNotFinished(finish_fut); + task_one.MarkFinished(); + AssertNotFinished(finish_fut); + task_two.MarkFinished(); + AssertFinished(finish_fut); + ASSERT_FINISHES_OK(finish_fut); +} + +TEST(AsyncTaskGroup, Error) { + AsyncTaskGroup task_group; + Future<> failed_task = Future<>::MakeFinished(Status::Invalid("XYZ")); + ASSERT_OK(task_group.AddTask(failed_task)); + ASSERT_FINISHES_AND_RAISES(Invalid, task_group.WaitForTasksToFinish()); +} + +TEST(AsyncTaskGroup, TaskFinishesAfterError) { + AsyncTaskGroup task_group; + Future<> fut1 = Future<>::Make(); + ASSERT_OK(task_group.AddTask(fut1)); + ASSERT_OK(task_group.AddTask(Future<>::MakeFinished(Status::Invalid("XYZ")))); + Future<> finished_fut = task_group.WaitForTasksToFinish(); 
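+  // Even though one task has already failed, the group future must stay open
+  // until the task that is still running completes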
+ AssertNotFinished(finished_fut); + fut1.MarkFinished(); + ASSERT_FINISHES_AND_RAISES(Invalid, finished_fut); +} + +TEST(AsyncTaskGroup, AddAfterFailed) { + AsyncTaskGroup task_group; + ASSERT_OK(task_group.AddTask(Future<>::MakeFinished(Status::Invalid("XYZ")))); + ASSERT_RAISES(Invalid, task_group.AddTask(Future<>::Make())); + ASSERT_FINISHES_AND_RAISES(Invalid, task_group.WaitForTasksToFinish()); +} + +TEST(AsyncTaskGroup, FailAfterAdd) { + AsyncTaskGroup task_group; + Future<> will_fail = Future<>::Make(); + ASSERT_OK(task_group.AddTask(will_fail)); + Future<> added_later_and_passes = Future<>::Make(); + ASSERT_OK(task_group.AddTask(added_later_and_passes)); + will_fail.MarkFinished(Status::Invalid("XYZ")); + ASSERT_RAISES(Invalid, task_group.AddTask(Future<>::Make())); + Future<> finished_fut = task_group.WaitForTasksToFinish(); + AssertNotFinished(finished_fut); + added_later_and_passes.MarkFinished(); + AssertFinished(finished_fut); + ASSERT_FINISHES_AND_RAISES(Invalid, finished_fut); +} + +} // namespace util +} // namespace arrow From 4b5ed4eb5583cf24d8daff05a865c8d1cb616576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Percy=20Camilo=20Trive=C3=B1o=20Aucahuasi?= Date: Thu, 9 Sep 2021 08:27:22 -0400 Subject: [PATCH 54/93] ARROW-13138: [C++][R] Implement extract temporal components (year, month, day, etc) from date32/64 types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://issues.apache.org/jira/browse/ARROW-13138 Closes #11075 from aucahuasi/temporal-functions-date32 Authored-by: Percy Camilo Triveño Aucahuasi Signed-off-by: David Li --- .../arrow/compute/kernels/scalar_temporal.cc | 272 ++++++++++++------ .../compute/kernels/scalar_temporal_test.cc | 84 ++++-- docs/source/cpp/compute.rst | 22 +- r/tests/testthat/test-dplyr-lubridate.R | 96 ++++++- 4 files changed, 348 insertions(+), 126 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index d70411f8338..b60e4ef71f0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -122,13 +122,14 @@ struct ZonedLocalizer { // // Executor class for temporal component extractors, i.e. scalar kernels -// with the signature Timestamp -> +// with the signature temporal type -> // // The `Op` parameter is templated on the Duration (which depends on the timestamp // unit) and a Localizer class (depending on whether the timestamp has a // timezone defined). // -template