diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index a9cf01467f6..451dacf904e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -50,6 +50,11 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { : utf8_code_unit; } +template +static inline bool IsAsciiCharacter(T character) { + return character < 128; +} + // TODO: optional ascii validation struct AsciiLength { @@ -66,6 +71,7 @@ constexpr uint32_t kMaxCodepointLookup = 0xffff; // up to this codepoint is in a lookup table std::vector lut_upper_codepoint; std::vector lut_lower_codepoint; +std::vector lut_category; std::once_flag flag_case_luts; void EnsureLookupTablesFilled() { @@ -75,6 +81,7 @@ void EnsureLookupTablesFilled() { for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) { lut_upper_codepoint.push_back(utf8proc_toupper(i)); lut_lower_codepoint.push_back(utf8proc_tolower(i)); + lut_category.push_back(utf8proc_category(i)); } }); } @@ -97,8 +104,8 @@ struct UTF8Transform { } static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + EnsureLookupTablesFilled(); if (batch[0].kind() == Datum::ARRAY) { - EnsureLookupTablesFilled(); const ArrayData& input = *batch[0].array(); ArrayType input_boxed(batch[0].array()); ArrayData* output = out->mutable_array(); @@ -199,6 +206,10 @@ struct UTF8Lower : UTF8Transform> { } }; +#else + +void EnsureLookupTablesFilled() {} + #endif // ARROW_WITH_UTF8PROC using TransformFunc = std::function; @@ -322,9 +333,7 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch, } else { const auto& input = checked_cast(*batch[0].scalar()); if (input.is_valid) { - auto result = checked_pointer_cast(MakeNullScalar(out->type())); uint8_t result_value = 0; - result->is_valid = true; std::array offsets{0, static_cast(input.value->size())}; transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0, @@ -409,6 +418,413 @@ void AddBinaryContainsExact(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +// IsAlpha/Digit etc + +#ifdef ARROW_WITH_UTF8PROC + +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { + utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup + ? lut_category[codepoint] + : utf8proc_category(codepoint); + uint32_t general_category_bit = 1 << general_category; + // for e.g. undefined (but valid) codepoints, general_category == 0 == + // UTF8PROC_CATEGORY_CN + return (general_category != UTF8PROC_CATEGORY_CN) && + ((general_category_bit & mask) != 0); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), + categories...); +} + +static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) || + (static_cast(utf8proc_tolower(codepoint)) != codepoint)); +} + +static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { + // although this trick seems to work for upper case, this is not enough for lower case + // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the + // best we can do + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) || + ((static_cast(utf8proc_toupper(codepoint)) != codepoint) && + (static_cast(utf8proc_tolower(codepoint)) == codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { + // this seems to be a good workaround for utf8proc not having case information + // https://github.com/JuliaStrings/utf8proc/issues/195 + return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || + ((static_cast(utf8proc_toupper(codepoint)) == codepoint) && + (static_cast(utf8proc_tolower(codepoint)) != codepoint))) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT); +} + +static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory( + codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); +} + +static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { + // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. + // utf8proc has no support for this, this is the best we can do: + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { + // Formally this is not correct, but utf8proc does not allow us to query for Numerical + // properties, e.g. Numeric_Value and Numeric_Type + // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or + // Numeric_Type=Numeric. + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { + auto property = utf8proc_get_property(codepoint); + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || + property->bidi_class == UTF8PROC_BIDI_CLASS_WS || + property->bidi_class == UTF8PROC_BIDI_CLASS_B || + property->bidi_class == UTF8PROC_BIDI_CLASS_S; +} + +static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { + uint32_t general_category = utf8proc_category(codepoint); + return (general_category != UTF8PROC_CATEGORY_CN) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, + UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, + UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, + UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); +} + +#endif + +static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'a') && (ascii_character <= 'z'); +} + +static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'A') && (ascii_character <= 'Z'); +} + +static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + return IsLowerCaseCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); +} + +static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // same +} + +static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')) || + ((ascii_character >= 'a') && (ascii_character <= 'z')) || + ((ascii_character >= 'A') && (ascii_character <= 'Z')); +} + +static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')); +} + +static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || + (ascii_character == ' '); +} + +static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= ' ') && (ascii_character <= '~')); +} + +template +struct CharacterPredicateUnicode { + static bool Call(KernelContext* ctx, const uint8_t* input, + size_t input_string_ncodeunits) { + if (allow_empty && input_string_ncodeunits == 0) { + return true; + } + bool all; + bool any = false; + if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf( + input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) { + any |= Derived::PredicateCharacterAny(codepoint); + return Derived::PredicateCharacterAll(codepoint); + }))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input")); + return false; + } + return all & any; + } + + static inline bool PredicateCharacterAny(uint32_t) { + return true; // default condition make sure there is at least 1 charachter + } +}; + +template +struct CharacterPredicateAscii { + static bool Call(KernelContext* ctx, const uint8_t* input, + size_t input_string_ncodeunits) { + if (allow_empty && input_string_ncodeunits == 0) { + return true; + } + bool any = false; + // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii + // benchmark. I don't consider that worth it. + bool all = std::all_of(input, input + input_string_ncodeunits, + [&any](uint8_t ascii_character) { + any |= Derived::PredicateCharacterAny(ascii_character); + return Derived::PredicateCharacterAll(ascii_character); + }); + return all & any; + } + + static inline bool PredicateCharacterAny(uint8_t) { + return true; // default condition make sure there is at least 1 charachter + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsAlphaNumericUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsAlphaNumericCharacterUnicode(codepoint); + } +}; +#endif + +struct IsAlphaNumericAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsAlphaNumericCharacterAscii(ascii_character); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsAlphaUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsAlphaCharacterUnicode(codepoint); + } +}; +#endif + +struct IsAlphaAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsAlphaCharacterAscii(ascii_character); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsDecimalUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsDecimalCharacterUnicode(codepoint); + } +}; +#endif + +struct IsDecimalAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsDecimalCharacterAscii(ascii_character); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsDigitUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsDigitCharacterUnicode(codepoint); + } +}; + +struct IsNumericUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsNumericCharacterUnicode(codepoint); + } +}; +#endif + +struct IsAscii { + static bool Call(KernelContext* ctx, const uint8_t* input, + size_t input_string_nascii_characters) { + return std::all_of(input, input + input_string_nascii_characters, + IsAsciiCharacter); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsLowerUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint); + } + static inline bool PredicateCharacterAny(uint32_t codepoint) { + return IsCasedCharacterUnicode(codepoint); // at least 1 cased character + } +}; +#endif + +struct IsLowerAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterAscii(ascii_character) || + IsLowerCaseCharacterAscii(ascii_character); + } + static inline bool PredicateCharacterAny(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // at least 1 cased character + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsPrintableUnicode + : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint); + } +}; +#endif + +struct IsPrintableAscii + : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsPrintableCharacterAscii(ascii_character); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsSpaceUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsSpaceCharacterUnicode(codepoint); + } +}; +#endif + +struct IsSpaceAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsSpaceCharacterAscii(ascii_character); + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsTitleUnicode { + static bool Call(KernelContext* ctx, const uint8_t* input, + size_t input_string_ncodeunits) { + // rules: + // * 1: lower case follows cased + // * 2: upper case follows uncased + // * 3: at least 1 cased character (which logically should be upper/title) + bool rules_1_and_2; + bool previous_cased = false; // in LL, LU or LT + bool rule_3 = false; + bool status = + arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2, + [&previous_cased, &rule_3](uint32_t codepoint) { + if (IsLowerCaseCharacterUnicode(codepoint)) { + if (!previous_cased) return false; // rule 1 broken + previous_cased = true; + } else if (IsCasedCharacterUnicode(codepoint)) { + if (previous_cased) return false; // rule 2 broken + // next should be a lower case or uncased + previous_cased = true; + rule_3 = true; // rule 3 obeyed + } else { + // a non-cased char, like _ or 1 + // next should be upper case or more uncased + previous_cased = false; + } + return true; + }); + if (!ARROW_PREDICT_TRUE(status)) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input")); + return false; + } + return rules_1_and_2 & rule_3; + } +}; +#endif + +struct IsTitleAscii { + static bool Call(KernelContext* ctx, const uint8_t* input, + size_t input_string_ncodeunits) { + // rules: + // * 1: lower case follows cased + // * 2: upper case follows uncased + // * 3: at least 1 cased character (which logically should be upper/title) + bool rules_1_and_2 = true; + bool previous_cased = false; // in LL, LU or LT + bool rule_3 = false; + // we cannot rely on std::all_of because we need guaranteed order + for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) { + if (IsLowerCaseCharacterAscii(*c)) { + if (!previous_cased) { + // rule 1 broken + rules_1_and_2 = false; + break; + } + previous_cased = true; + } else if (IsCasedCharacterAscii(*c)) { + if (previous_cased) { + // rule 2 broken + rules_1_and_2 = false; + break; + } + // next should be a lower case or uncased + previous_cased = true; + rule_3 = true; // rule 3 obeyed + } else { + // a non-cased char, like _ or 1 + // next should be upper case or more uncased + previous_cased = false; + } + } + return rules_1_and_2 & rule_3; + } +}; + +#ifdef ARROW_WITH_UTF8PROC +struct IsUpperUnicode : CharacterPredicateUnicode { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint); + } + static inline bool PredicateCharacterAny(uint32_t codepoint) { + return IsCasedCharacterUnicode(codepoint); // at least 1 cased character + } +}; +#endif + +struct IsUpperAscii : CharacterPredicateAscii { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); + } + static inline bool PredicateCharacterAny(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // at least 1 cased character + } +}; + // ---------------------------------------------------------------------- // strptime string parsing @@ -479,15 +895,84 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi #endif +using StringPredicate = std::function; + +template +void ApplyPredicate(KernelContext* ctx, const ExecBatch& batch, StringPredicate predicate, + Datum* out) { + EnsureLookupTablesFilled(); + if (batch[0].kind() == Datum::ARRAY) { + const ArrayData& input = *batch[0].array(); + ArrayIterator input_it(input); + ArrayData* out_arr = out->mutable_array(); + ::arrow::internal::GenerateBitsUnrolled( + out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length, + [&]() -> bool { + util::string_view val = input_it(); + return predicate(ctx, reinterpret_cast(val.data()), val.size()); + }); + } else { + const auto& input = checked_cast(*batch[0].scalar()); + if (input.is_valid) { + bool boolean_result = + predicate(ctx, input.value->data(), static_cast(input.value->size())); + if (!ctx->status().ok()) { + // UTF decoding can lead to issues + return; + } + out->value = std::make_shared(boolean_result); + } + } +} + +template +void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry) { + auto func = std::make_shared(name, Arity::Unary()); + auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ApplyPredicate(ctx, batch, Predicate::Call, out); + }; + auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ApplyPredicate(ctx, batch, Predicate::Call, out); + }; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32))); + DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64))); + DCHECK_OK(registry->AddFunction(std::move(func))); +} + } // namespace void RegisterScalarStringAscii(FunctionRegistry* registry) { MakeUnaryStringBatchKernel("ascii_upper", registry); MakeUnaryStringBatchKernel("ascii_lower", registry); + + AddUnaryStringPredicate("binary_isascii", registry); + + AddUnaryStringPredicate("ascii_isalnum", registry); + AddUnaryStringPredicate("ascii_isalpha", registry); + AddUnaryStringPredicate("ascii_isdecimal", registry); + // no isdigic for ascii, since it is the same as isdecimal + AddUnaryStringPredicate("ascii_islower", registry); + // no isnumeric for ascii, since it is the same as isdecimal + AddUnaryStringPredicate("ascii_isprintable", registry); + AddUnaryStringPredicate("ascii_isspace", registry); + AddUnaryStringPredicate("ascii_istitle", registry); + AddUnaryStringPredicate("ascii_isupper", registry); #ifdef ARROW_WITH_UTF8PROC MakeUnaryStringUTF8TransformKernel("utf8_upper", registry); MakeUnaryStringUTF8TransformKernel("utf8_lower", registry); + AddUnaryStringPredicate("utf8_isalnum", registry); + AddUnaryStringPredicate("utf8_isalpha", registry); + AddUnaryStringPredicate("utf8_isdecimal", registry); + AddUnaryStringPredicate("utf8_isdigit", registry); + AddUnaryStringPredicate("utf8_islower", registry); + AddUnaryStringPredicate("utf8_isnumeric", registry); + AddUnaryStringPredicate("utf8_isprintable", registry); + AddUnaryStringPredicate("utf8_isspace", registry); + AddUnaryStringPredicate("utf8_istitle", registry); + AddUnaryStringPredicate("utf8_isupper", registry); + #endif + AddAsciiLength(registry); AddBinaryContainsExact(registry); AddStrptime(registry); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc index 46ee129b03c..01a32c71f34 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc @@ -57,6 +57,10 @@ static void AsciiUpper(benchmark::State& state) { UnaryStringBenchmark(state, "ascii_upper"); } +static void IsAlphaNumericAscii(benchmark::State& state) { + UnaryStringBenchmark(state, "ascii_isalnum"); +} + static void BinaryContainsExact(benchmark::State& state) { BinaryContainsExactOptions options("abac"); UnaryStringBenchmark(state, "binary_contains_exact", &options); @@ -70,14 +74,20 @@ static void Utf8Upper(benchmark::State& state) { static void Utf8Lower(benchmark::State& state) { UnaryStringBenchmark(state, "utf8_lower"); } + +static void IsAlphaNumericUnicode(benchmark::State& state) { + UnaryStringBenchmark(state, "utf8_isalnum"); +} #endif BENCHMARK(AsciiLower); BENCHMARK(AsciiUpper); +BENCHMARK(IsAlphaNumericAscii); BENCHMARK(BinaryContainsExact); #ifdef ARROW_WITH_UTF8PROC BENCHMARK(Utf8Lower); BENCHMARK(Utf8Upper); +BENCHMARK(IsAlphaNumericUnicode); #endif } // namespace compute diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 0989401d034..88a0258ee5f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -145,8 +145,172 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { CallFunction("utf8_lower", {invalid_input})); } +TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { + // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is + // UTF8PROC_CATEGORY_LO + this->CheckUnary("utf8_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", boolean(), + "[true, null, true, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsAlphaUnicode) { + // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is + // UTF8PROC_CATEGORY_LO + this->CheckUnary("utf8_isalpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(), + "[true, null, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsAscii) { + this->CheckUnary("binary_isascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(), + "[true, null, false, true]"); +} + +TYPED_TEST(TestStringKernels, IsDecimalUnicode) { + // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal) + this->CheckUnary("utf8_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + boolean(), "[true, null, true, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsDigitUnicode) { + // These are digits according to Python, but we don't have the information in + // utf8proc for this + // this->CheckUnary("utf8_isdigit", "[\"²\", \"①\"]", boolean(), "[true, + // true]"); +} + +TYPED_TEST(TestStringKernels, IsNumericUnicode) { + // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal) + this->CheckUnary("utf8_isnumeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + boolean(), "[true, null, true, true, false, false]"); + // These are numerical according to Python, but we don't have the information in + // utf8proc for this + // this->CheckUnary("utf8_isnumeric", "[\"㐅\", \"卌\"]", boolean(), + // "[true, null, true, true, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsLowerUnicode) { + // ٣ is arabic 3 (decimal), Φ capital + this->CheckUnary("utf8_islower", + "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", " + "\"With space\"]", + boolean(), + "[false, null, true, false, true, false, false, true, false]"); + // lower case character utf8proc does not know about + // this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true, + // true]"); +} + +TYPED_TEST(TestStringKernels, IsPrintableUnicode) { + // U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable + // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category + this->CheckUnary( + "utf8_isprintable", + "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(), + "[true, null, false, true, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsSpaceUnicode) { + // U+2008 (utf8: \xe2\x80\x88) is punctuaction space + this->CheckUnary("utf8_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), + "[true, null, true, true]"); + this->CheckUnary("utf8_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", + boolean(), "[false, null, false, false, true]"); +} + +TYPED_TEST(TestStringKernels, IsTitleUnicode) { + // ٣ is arabic 3 (decimal), Φ capital + this->CheckUnary("utf8_istitle", + "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", + boolean(), "[true, null, true, true, true, true, true]"); + this->CheckUnary( + "utf8_istitle", + "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]", + boolean(), "[false, null, false, false, false, false, false, false]"); +} + +// Older versions of utf8proc fail +#if !(UTF8PROC_VERSION_MAJOR <= 2 && UTF8PROC_VERSION_MINOR < 5) + +TYPED_TEST(TestStringKernels, IsUpperUnicode) { + // ٣ is arabic 3 (decimal), Φ capital + this->CheckUnary( + "utf8_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]", + boolean(), "[false, null, false, true, true, true, false, true, true]"); + // * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ + // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower + // case + // * U+1F88 - ᾈ - \E1\xBE\x88 - Greek Capital Letter Alpha with Psili and Prosgegrammeni + // - title case + // U+10400 - 𐐀 - \xF0x90x90x80 - Deseret Capital Letter Long - upper case + // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13 + // (not tested since it depends on the version of libutf8proc) + // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13 + this->CheckUnary("utf8_isupper", + "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]", + boolean(), "[true, true, true, false, true, false]"); +} + +#endif // UTF8PROC_VERSION_MINOR >= 5 + #endif // ARROW_WITH_UTF8PROC +TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) { + this->CheckUnary("ascii_isalnum", + "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]", + boolean(), "[false, null, false, false, false, false, false]"); + this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]", + boolean(), "[true, null, true, true, true, false]"); +} + +TYPED_TEST(TestStringKernels, IsAlphaAscii) { + this->CheckUnary("ascii_isalpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]", + boolean(), "[false, true, null, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsDecimalAscii) { + // ٣ is arabic 3 + this->CheckUnary("ascii_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + boolean(), "[true, null, false, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsLowerAscii) { + // ٣ is arabic 3 (decimal), φ lower greek + this->CheckUnary("ascii_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", + boolean(), "[false, null, true, false, true, false, false]"); +} +TYPED_TEST(TestStringKernels, IsPrintableAscii) { + // \xe2\x80\x88 is punctuaction space + this->CheckUnary("ascii_isprintable", + "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(), + "[true, null, false, true, false]"); +} + +TYPED_TEST(TestStringKernels, IsSpaceAscii) { + // \xe2\x80\x88 is punctuaction space + // Note: for ascii version, the non-ascii chars are seen as non-cased + this->CheckUnary("ascii_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), + "[true, null, true, true]"); + this->CheckUnary("ascii_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", + boolean(), "[false, null, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsTitleAscii) { + // ٣ is arabic 3 (decimal), Φ capital + // Note: for ascii version, the non-ascii chars are seen as non-cased + this->CheckUnary("ascii_istitle", + "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", + boolean(), "[true, null, true, true, true, false, false]"); + this->CheckUnary( + "ascii_istitle", + "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]", + boolean(), "[false, null, false, false, true, false, false, false]"); +} + +TYPED_TEST(TestStringKernels, IsUpperAscii) { + // ٣ is arabic 3 (decimal), Φ capital greek + this->CheckUnary("ascii_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", + boolean(), "[false, null, false, true, true, false, false]"); +} + TYPED_TEST(TestStringKernels, BinaryContainsExact) { BinaryContainsExactOptions options{"ab"}; this->CheckUnary("binary_contains_exact", "[]", boolean(), "[]", &options); diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 1775b19d0fe..d5875c4590b 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -366,5 +366,24 @@ static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last, return true; } +template +static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* result, + UnaryPredicate&& predicate) { + const uint8_t* i = first; + while (i < last) { + uint32_t codepoint = 0; + if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) { + return false; + } + + if (!predicate(codepoint)) { + *result = false; + return true; + } + } + *result = true; + return true; +} + } // namespace util } // namespace arrow diff --git a/docker-compose.yml b/docker-compose.yml index 62ddca0cfcd..89feef5eb21 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -319,6 +319,8 @@ services: ARROW_ORC: "OFF" ARROW_USE_ASAN: "ON" ARROW_USE_UBSAN: "ON" + # utf8proc 2.1.0 in Ubuntu Bionic has test failures + utf8proc_SOURCE: "BUNDLED" command: *cpp-command fedora-cpp: diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index ae7dae86d18..165895af7fb 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -103,6 +103,27 @@ def func(left, right): utf8_upper = _simple_unary_function('utf8_upper') utf8_lower = _simple_unary_function('utf8_lower') +binary_isascii = _simple_unary_function('binary_isascii') + +ascii_isalnum = _simple_unary_function('ascii_isalnum') +utf8_isalnum = _simple_unary_function('utf8_isalnum') +ascii_isalpha = _simple_unary_function('ascii_isalpha') +utf8_isalpha = _simple_unary_function('utf8_isalpha') +ascii_isdecimal = _simple_unary_function('ascii_isdecimal') +utf8_isdecimal = _simple_unary_function('utf8_isdecimal') +ascii_isdigit = ascii_isdecimal # alias +utf8_isdigit = _simple_unary_function('utf8_isdigit') +ascii_islower = _simple_unary_function('ascii_islower') +utf8_islower = _simple_unary_function('utf8_islower') +ascii_isnumeric = ascii_isdecimal # alias +utf8_isnumeric = _simple_unary_function('utf8_isnumeric') +ascii_isprintable = _simple_unary_function('ascii_isprintable') +utf8_isprintable = _simple_unary_function('utf8_isprintable') +ascii_istitle = _simple_unary_function('ascii_istitle') +utf8_istitle = _simple_unary_function('utf8_istitle') +ascii_isupper = _simple_unary_function('ascii_isupper') +utf8_isupper = _simple_unary_function('utf8_isupper') + is_valid = _simple_unary_function('is_valid') is_null = _simple_unary_function('is_null') diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 00486d54149..52107f70dfa 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -95,6 +95,128 @@ def test_binary_contains_exact(): assert expected.equals(result) +# We use isprintable to find about codepoints that Python doesn't know, but +# utfproc does (or in future version of Python the other way around). +# These codepoints cannot be compared between Arrow and the Python +# implementation. +def _find_new_unicode_codepoints(): + new = set() + for i in range(0x80, 0x11000): + c = chr(i) + if i in range(0xD800, 0xE000): + continue # bug? pyarrow doesn't allow utf16 surrogates + ar = pa.array([c]) + if pc.utf8_isprintable(ar)[0].as_py() != c.isprintable(): + new.add(i) + return new + + +new_unicode_codepoints = _find_new_unicode_codepoints() + +# Python claims there are not alpha, not sure why, they are in +# gc='Other Letter': https://graphemica.com/%E1%B3%B2 +unknown_issue_isalpha = {0x1cf2, 0x1cf3} +# utf8proc does not know if codepoints are lower case +utf8proc_issue_islower = {0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, + 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, + 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d, + 0x1d2e, 0x1d2f, 0x1d30, 0x1d31, 0x1d32, 0x1d33, + 0x1d34, 0x1d35, 0x1d36, 0x1d37, 0x1d38, 0x1d39, + 0x1d3a, 0x1d3b, 0x1d3c, 0x1d3d, 0x1d3e, 0x1d3f, + 0x1d40, 0x1d41, 0x1d42, 0x1d43, 0x1d44, 0x1d45, + 0x1d46, 0x1d47, 0x1d48, 0x1d49, 0x1d4a, 0x1d4b, + 0x1d4c, 0x1d4d, 0x1d4e, 0x1d4f, 0x1d50, 0x1d51, + 0x1d52, 0x1d53, 0x1d54, 0x1d55, 0x1d56, 0x1d57, + 0x1d58, 0x1d59, 0x1d5a, 0x1d5b, 0x1d5c, 0x1d5d, + 0x1d5e, 0x1d5f, 0x1d60, 0x1d61, 0x1d62, 0x1d63, + 0x1d64, 0x1d65, 0x1d66, 0x1d67, 0x1d68, 0x1d69, + 0x1d6a, 0x1d78, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, + 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, + 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, + 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, + 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, + 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, + 0x1dbd, 0x1dbe, 0x1dbf, 0x2071, 0x207f, 0x2090, + 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, + 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, + 0x2c7c, 0x2c7d, 0xa69c, 0xa69d, 0xa770, 0xa7f8, + 0xa7f9, 0xab5c, 0xab5d, 0xab5e, 0xab5f, } +# utf8proc does not store if a codepoint is numeric +numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, + 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, + 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, + 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, + 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, + 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, + 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, + 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, + 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, + 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, + 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, 0x10fc5, + 0x10fc6, 0x10fc7, 0x10fc8, 0x10fc9, 0x10fca, + 0x10fcb, } +# utf8proc has no no digit/numeric information +digit_info_missing = {0xb2, 0xb3, 0xb9, 0x1369, 0x136a, 0x136b, 0x136c, + 0x136d, 0x136e, 0x136f, 0x1370, 0x1371, 0x19da, 0x2070, + 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x2080, + 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, + 0x2088, 0x2089, 0x2460, 0x2461, 0x2462, 0x2463, 0x2464, + 0x2465, 0x2466, 0x2467, 0x2468, 0x2474, 0x2475, 0x2476, + 0x2477, 0x2478, 0x2479, 0x247a, 0x247b, 0x247c, 0x2488, + 0x2489, 0x248a, 0x248b, 0x248c, 0x248d, 0x248e, 0x248f, + 0x2490, 0x24ea, 0x24f5, 0x24f6, 0x24f7, 0x24f8, 0x24f9, + 0x24fa, 0x24fb, 0x24fc, 0x24fd, 0x24ff, 0x2776, 0x2777, + 0x2778, 0x2779, 0x277a, 0x277b, 0x277c, 0x277d, 0x277e, + 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786, + 0x2787, 0x2788, 0x278a, 0x278b, 0x278c, 0x278d, 0x278e, + 0x278f, 0x2790, 0x2791, 0x2792, 0x10a40, 0x10a41, + 0x10a42, 0x10a43, 0x10e60, 0x10e61, 0x10e62, 0x10e63, + 0x10e64, 0x10e65, 0x10e66, 0x10e67, 0x10e68, } +numeric_info_missing = {0x3405, 0x3483, 0x382a, 0x3b4d, 0x4e00, 0x4e03, + 0x4e07, 0x4e09, 0x4e5d, 0x4e8c, 0x4e94, 0x4e96, + 0x4ebf, 0x4ec0, 0x4edf, 0x4ee8, 0x4f0d, 0x4f70, + 0x5104, 0x5146, 0x5169, 0x516b, 0x516d, 0x5341, + 0x5343, 0x5344, 0x5345, 0x534c, 0x53c1, 0x53c2, + 0x53c3, 0x53c4, 0x56db, 0x58f1, 0x58f9, 0x5e7a, + 0x5efe, 0x5eff, 0x5f0c, 0x5f0d, 0x5f0e, 0x5f10, + 0x62fe, 0x634c, 0x67d2, 0x6f06, 0x7396, 0x767e, + 0x8086, 0x842c, 0x8cae, 0x8cb3, 0x8d30, 0x9621, + 0x9646, 0x964c, 0x9678, 0x96f6, 0xf96b, 0xf973, + 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, } + +codepoints_ignore = { + 'isalnum': numeric_info_missing | digit_info_missing | + unknown_issue_isalpha, + 'isalpha': unknown_issue_isalpha, + 'isdigit': digit_info_missing, + 'isnumeric': numeric_info_missing, + 'islower': utf8proc_issue_islower +} + + +@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', + 'isdecimal', 'isdigit', 'islower', + 'isnumeric', 'isprintable', + 'isspace', 'isupper', ]) +@pytest.mark.parametrize('variant', ['ascii', 'utf8']) +def test_string_py_compat_boolean(function_name, variant): + arrow_name = variant + "_" + function_name + py_name = function_name + ignore = codepoints_ignore.get(function_name, set()) |\ + new_unicode_codepoints + for i in range(128 if ascii else 0x11000): + if i in range(0xD800, 0xE000): + continue # bug? pyarrow doesn't allow utf16 surrogates + # the issues we know of, we skip + if i in ignore: + continue + c = chr(i) + if hasattr(pc, arrow_name): + ar = pa.array([c]) + assert getattr(pc, arrow_name)( + ar)[0].as_py() == getattr(c, py_name)() + + @pytest.mark.parametrize(('ty', 'values'), all_array_types) def test_take(ty, values): arr = pa.array(values, type=ty)