From 2cd1d8bd57294f8a9472b7ed4cfa87e9428c0a52 Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Tue, 7 Jul 2020 13:35:42 +0200 Subject: [PATCH 01/28] ARROW-9268: [C++] add string_is{alpnum,alpha...,upper} kernels --- .../arrow/compute/kernels/scalar_string.cc | 497 +++++++++++++++++- .../kernels/scalar_string_benchmark.cc | 10 + .../compute/kernels/scalar_string_test.cc | 140 +++++ cpp/src/arrow/util/utf8.h | 19 + python/pyarrow/compute.py | 22 + python/pyarrow/tests/test_compute.py | 44 ++ 6 files changed, 730 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index a9cf01467f6..d44c0eae217 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -50,6 +50,11 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { : utf8_code_unit; } +template +static inline bool IsAsciiCharacter(T character) { + return character < 128; +} + // TODO: optional ascii validation struct AsciiLength { @@ -183,6 +188,54 @@ struct UTF8Transform { } }; +template +struct BinaryToBoolean { + using offset_type = typename StringType::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + + static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].kind() == Datum::ARRAY) { + EnsureLookupTablesFilled(); + const ArrayData& input = *batch[0].array(); + ArrayType input_boxed(batch[0].array()); + ArrayData* out_arr = out->mutable_array(); + + // offset_type input_ncodeunits = input_boxed.total_values_length(); + offset_type input_nstrings = static_cast(input.length); + + FirstTimeBitmapWriter bitmap_writer(out_arr->buffers[1]->mutable_data(), + out_arr->offset, input.length); + for (int64_t i = 0; i < input_nstrings; i++) { + offset_type input_string_ncodeunits; + const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits); + offset_type encoded_nbytes; + bool 
boolean_result = + Derived::Predicate(ctx, input_string, input_string_ncodeunits); + if (!ctx->status().ok()) { + // the predicate may have set an error status on ctx (e.g. invalid UTF-8 input); stop at the first failure + return; + } + if (boolean_result) { + bitmap_writer.Set(); + } + bitmap_writer.Next(); + } + bitmap_writer.Finish(); + } else { + const auto& input = checked_cast(*batch[0].scalar()); + if (input.is_valid) { + offset_type data_nbytes = static_cast(input.value->size()); + bool boolean_result = Derived::Predicate(ctx, input.value->data(), data_nbytes); + if (!ctx->status().ok()) { + // the predicate may have set an error status on ctx (e.g. invalid UTF-8 input); leave the output untouched + return; + } + out->value = std::make_shared(boolean_result); + } + } + } +}; + template struct UTF8Upper : UTF8Transform> { inline static uint32_t TransformCodepoint(uint32_t codepoint) { @@ -322,9 +375,7 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch, } else { const auto& input = checked_cast(*batch[0].scalar()); if (input.is_valid) { - auto result = checked_pointer_cast(MakeNullScalar(out->type())); uint8_t result_value = 0; - result->is_valid = true; std::array offsets{0, static_cast(input.value->size())}; transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0, @@ -409,6 +460,413 @@ void AddBinaryContainsExact(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +// IsAlpha/Digit etc + +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) { + uint32_t general_category = 1 << utf8proc_category(codepoint); + // NOTE(review): general_category holds the single bit 1 << category, so it is never 0 and the != 0 test below cannot fail; presumably the intent was to reject category 0 (unassigned codepoints in utf8proc, confirm), which already fails the mask test unless bit 0 is part of mask + return (general_category != 0) && ((general_category & mask) != 0); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask, + utf8proc_category_t category, + Categories...
categories) { + return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...); +} + +template +static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, + utf8proc_category_t category, + Categories... categories) { + return HasAnyUnicodeGeneralCategory(codepoint, static_cast(1u << category), + categories...); +} + +static inline bool IsUpperCaseCharacterRoman(uint32_t codepoint) { + // Roman letter Ⅰ to Ⅿ are seen as capital (see 4.2 of Unicode spec) + // DerivedCoreProperties.txt should have this information, but it is not stored in + // the utf8proc library. + return (codepoint >= 0x2160) && (codepoint <= 0x216f); +} + +static inline bool IsUpperCaseCharacterCircled(uint32_t codepoint) { + // Circled letters Ⓐ-Ⓩ are seen as capital (see 4.2 of Unicode spec) + // DerivedCoreProperties.txt should have this information, but it is not stored in + // the utf8proc library. + return (codepoint >= 0x24b6) && (codepoint <= 0x24cf); +} + +static inline bool IsCasedCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) || + IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint); + ; +} + +static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL); +} + +static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) || + IsUpperCaseCharacterRoman(codepoint) || IsUpperCaseCharacterCircled(codepoint); +} + +static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU, + UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT, + UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO); +} + +static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'a') && (ascii_character <= 
'z'); +} + +static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { + return (ascii_character >= 'A') && (ascii_character <= 'Z'); +} + +static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + return IsLowerCaseCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); +} + +static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // same +} + +static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= '0') && (ascii_character <= '9')); +} + +static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { + // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. + // utf8proc has no support for this, this is the best we can do: + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); +} + +static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { + // Formally this is not correct, but utf8proc does not allow us to query for Numerical + // properties, e.g. Numeric_Value and Numeric_Type + // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or + // Numeric_Type=Numeric. 
+ return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO); +} + +static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) { + auto property = utf8proc_get_property(codepoint); + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) || + property->bidi_class == UTF8PROC_BIDI_CLASS_WS || + property->bidi_class == UTF8PROC_BIDI_CLASS_B || + property->bidi_class == UTF8PROC_BIDI_CLASS_S; +} + +static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || + (ascii_character == ' '); +} + +static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) { + uint32_t general_category = utf8proc_category(codepoint); + return (general_category != 0) && + !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC, + UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS, + UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS, + UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP); +} + +static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { + return ((ascii_character >= ' ') && (ascii_character <= '~')); +} + +template +struct CharacterPredicateUnicode + : BinaryToBoolean> { + using offset_type = typename StringType::offset_type; + static inline bool Predicate(KernelContext* ctx, const uint8_t* input, + offset_type input_string_ncodeunits) { + if (allow_empty && input_string_ncodeunits == 0) { + return true; + } + bool all; + bool any = false; + if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf( + input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) { + any |= Derived::PredicateCharacterAny(codepoint); + return Derived::PredicateCharacterAll(codepoint); + }))) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input")); + return false; + } + return all & any; + } + static inline bool PredicateCharacterAny(uint32_t) { + return true; // default condition: every character is accepted, so the string only needs at least 1 character + } +}; +
+template +struct CharacterPredicateAscii + : BinaryToBoolean> { + using offset_type = typename StringType::offset_type; + static inline bool Predicate(KernelContext* ctx, const uint8_t* input, + offset_type input_string_ncodeunits) { + if (allow_empty && input_string_ncodeunits == 0) { + return true; + } + bool any = false; + bool all = std::all_of(input, input + input_string_ncodeunits, + [&any](uint8_t ascii_character) { + any |= Derived::PredicateCharacterAny(ascii_character); + return Derived::PredicateCharacterAll(ascii_character); + }); + return all & any; + } + static inline bool PredicateCharacterAny(uint8_t) { + return true; // default condition: every character is accepted, so the string only needs at least 1 character + } +}; + +template +struct IsAlphaNumericUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsAlphaCharacterUnicode(codepoint) || IsDecimalCharacterUnicode(codepoint) || + IsNumericCharacterUnicode(codepoint) || IsDigitCharacterUnicode(codepoint); + } +}; + +template +struct IsAlphaNumericAscii + : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsAlphaCharacterAscii(codepoint) || IsDecimalCharacterAscii(codepoint); + } +}; + +template +struct IsAlphaUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsAlphaCharacterUnicode(codepoint); + } +}; + +template +struct IsAlphaAscii : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsAlphaCharacterAscii(ascii_character); + } +}; + +template +struct IsDecimalUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsDecimalCharacterUnicode(codepoint); + } +}; + +template +struct IsDecimalAscii : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return
IsDecimalCharacterAscii(ascii_character); + } +}; + +template +struct IsDigitUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsDigitCharacterUnicode(codepoint); + } +}; + +template +struct IsNumericUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsNumericCharacterUnicode(codepoint); + } +}; + +template +struct IsAscii : BinaryToBoolean> { + using offset_type = typename StringType::offset_type; + static bool Predicate(KernelContext* ctx, const uint8_t* input, + offset_type input_string_nascii_characters) { + return std::all_of(input, input + input_string_nascii_characters, + IsAsciiCharacter); + } +}; + +template +struct IsLowerUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint); + } + static inline bool PredicateCharacterAny(uint32_t codepoint) { + return IsCasedCharacterUnicode(codepoint); // at least 1 cased character + } +}; + +template +struct IsLowerAscii : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + // Only for cased character it needs to be lower case + return !IsCasedCharacterAscii(ascii_character) || + IsLowerCaseCharacterAscii(ascii_character); + } + static inline bool PredicateCharacterAny(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // at least 1 cased character + } +}; + +template +struct IsPrintableUnicode + : CharacterPredicateUnicode, + /*allow_empty=*/true> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint); + } +}; + +template +struct IsPrintableAscii + : CharacterPredicateAscii, + /*allow_empty=*/true> { + static inline bool PredicateCharacterAll(uint8_t 
ascii_character) { + return IsPrintableCharacterAscii(ascii_character); + } +}; + +template +struct IsSpaceUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + return IsSpaceCharacterUnicode(codepoint); + } +}; + +template +struct IsSpaceAscii : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + return IsSpaceCharacterAscii(ascii_character); + } +}; + +template +struct IsTitleUnicode : BinaryToBoolean> { + using offset_type = typename StringType::offset_type; + static bool Predicate(KernelContext* ctx, const uint8_t* input, + offset_type input_string_ncodeunits) { + // rules: + // * 1: lower case follows cased + // * 2: upper case follows uncased + // * 3: at least 1 cased character (which logically should be upper/title) + bool rules_1_and_2; + bool previous_cased = false; // in LL, LU or LT + bool rule_3 = false; + bool status = + arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2, + [&previous_cased, &rule_3](uint32_t codepoint) { + if (IsLowerCaseCharacterUnicode(codepoint)) { + if (!previous_cased) return false; // rule 1 broken + previous_cased = true; + } else if (IsCasedCharacterUnicode(codepoint)) { + if (previous_cased) return false; // rule 2 broken + // next should be a lower case or uncased + previous_cased = true; + rule_3 = true; // rule 3 obeyed + } else { + // a non-cased char, like _ or 1 + // next should be upper case or more uncased + previous_cased = false; + } + return true; + }); + if (!ARROW_PREDICT_TRUE(status)) { + ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input")); + return false; + } + return rules_1_and_2 & rule_3; + } +}; + +template +struct IsTitleAscii : BinaryToBoolean> { + using offset_type = typename StringType::offset_type; + static bool Predicate(KernelContext* ctx, const uint8_t* input, + offset_type input_string_ncodeunits) { + // rules: + // * 1: lower case follows cased + // * 2: 
upper case follows uncased + // * 3: at least 1 cased character (which logically should be upper/title) + bool rules_1_and_2 = true; + bool previous_cased = false; // whether the previous character was a cased ASCII letter (a-z or A-Z) + bool rule_3 = false; + // we cannot rely on std::all_of because we need guaranteed order + for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) { + if (IsLowerCaseCharacterAscii(*c)) { + if (!previous_cased) { + // rule 1 broken + rules_1_and_2 = false; + break; + } + previous_cased = true; + } else if (IsCasedCharacterAscii(*c)) { + if (previous_cased) { + // rule 2 broken + rules_1_and_2 = false; + break; + } + // next should be a lower case or uncased + previous_cased = true; + rule_3 = true; // rule 3 obeyed + } else { + // a non-cased char, like _ or 1 + // next should be upper case or more uncased + previous_cased = false; + } + } + return rules_1_and_2 & rule_3; + } +}; + +template +struct IsUpperUnicode + : CharacterPredicateUnicode> { + static inline bool PredicateCharacterAll(uint32_t codepoint) { + // Only cased characters need to be upper case; uncased characters always pass + return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint); + } + static inline bool PredicateCharacterAny(uint32_t codepoint) { + return IsCasedCharacterUnicode(codepoint); // at least 1 cased character + } +}; + +template +struct IsUpperAscii : CharacterPredicateAscii> { + static inline bool PredicateCharacterAll(uint8_t ascii_character) { + // Only cased characters need to be upper case; uncased characters always pass + return !IsCasedCharacterAscii(ascii_character) || + IsUpperCaseCharacterAscii(ascii_character); + } + static inline bool PredicateCharacterAny(uint8_t ascii_character) { + return IsCasedCharacterAscii(ascii_character); // at least 1 cased character + } +}; + // ---------------------------------------------------------------------- // strptime string parsing @@ -477,6 +935,16 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
DCHECK_OK(registry->AddFunction(std::move(func))); } +template